Example #1
def get_train_data(data,
                   tokenizer,
                   lm,
                   num_examples=None,
                   mask=True,
                   distant_source=None):
    if data == "beforeafter":
        print("Using BeforeAfter train examples from Gigaword.")
        exs, data = beforeafter_examples(tokenizer,
                                         lm=lm,
                                         num_examples=num_examples,
                                         mask=mask,
                                         during=False)
    elif data == "beforeafter_yelp":
        print("Using BeforeAfter train examples from Yelp.")
        exs, data = beforeafter_examples(tokenizer,
                                         lm=lm,
                                         ext="_yelp",
                                         num_examples=num_examples,
                                         mask=mask)
    elif data == "matres":
        print("Using MATRES train examples.")
        exs, data, feats = matres_train_examples(tokenizer, lm=lm)
        if num_examples:
            idxs = random.sample(list(range(len(exs))), num_examples)
            exs = [exs[i] for i in idxs]
            feats = [feats[i] for i in idxs]
            data = make_tensor_dataset(feats, model=lm)
    elif data == "distant":
        print("Using DistantTimex train examples.")
        exs, data = distant_train_examples(tokenizer,
                                           lm=lm,
                                           source=distant_source,
                                           mask=mask,
                                           num_examples=num_examples)
    elif data == "udst":
        print("Using UDS-T train examples.")
        exs, data, feats = udst(tokenizer, lm=lm, split="train")
        if num_examples:
            idxs = random.sample(list(range(len(exs))), num_examples)
            exs = [exs[i] for i in idxs]
            feats = [feats[i] for i in idxs]
            data = make_tensor_dataset(feats, model=lm)
    elif data.endswith(".pkl"):
        inputs = pickle.load(open(data, "rb"))
        exs = inputs["exs"]
        data = inputs["data"]
    else:
        raise RuntimeError("Please specifify valid data source.")
    return exs, data
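A minimal usage sketch (not from the repo): driving get_train_data with a Hugging Face RoBERTa tokenizer. The checkpoint name and sample size are illustrative assumptions.

# Hypothetical driver for get_train_data; assumes the `transformers` package
# and that lm="roberta" matches this repo's model-naming convention.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
exs, data = get_train_data("matres", tokenizer, lm="roberta", num_examples=100)
print(len(exs), len(data))  # parallel example list and the returned dataset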
Example #2
def get_dummy_data(tokenizer, lm):
    sent1 = "Today I went to the store.".split()
    sent2 = "I came home.".split()
    # load_data.py:IndexedExamplePartial
    #        - for single-sentence examples sent1&sent2 are the same
    #            --> pass in SAME LIST (reference/pointer) for both
    ex = IndexedExamplePartial(
        label="BEFORE",  # {AFTER, BEFORE, EQUALS, VAGUE}
        sent1=sent1,
        sent2=sent2,
        tags1=None,  # none, unless you want to mask timexes
        tags2=None,
        e1_idx=2,  # "went" = sent1[e1_idx]
        e2_idx=1,  # "came" = sent2[e2_idx]
        doc_name=None)  # specify doc_name if you need it later

    exs = [ex]

    # load_data.py:convert_distant_examples_to_features
    #     - should automatically generate the right model-specific
    #       input features according to tokenizer type
    feats = convert_distant_examples_to_features(examples=exs,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=MAX_SEQ_LENGTH,
                                                 doc_stride=DOC_STRIDE)

    data = make_tensor_dataset(feats, model=lm)
    return exs, data
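A small follow-up sketch, assuming make_tensor_dataset returns a torch-style TensorDataset (as its name suggests): batch the dummy example to inspect the generated feature tensors.

# Sketch only: iterate the dummy dataset with a DataLoader; assumes torch.
from torch.utils.data import DataLoader

exs, data = get_dummy_data(tokenizer, lm="roberta")
loader = DataLoader(data, batch_size=1)
for batch in loader:
    print([t.shape for t in batch])  # one tensor per feature column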
Example #3
def matres_train_examples(tokenizer,
                          lm='roberta',
                          mask_events=False,
                          mask_context=False):
    train_examples, _ = matres_examples()

    train_examples, train_features = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context)
    train_data = make_tensor_dataset(train_features, model=lm)
    return train_examples, train_data, train_features
Example #4
def udst(tokenizer,
         lm='roberta',
         split="train",
         example_dir="udst/all_annotations/",
         mask_events=False,
         mask_context=False):
    exs = parse_udst.get_examples(example_dir=example_dir, split=split)
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=False,
                                              mask_events=mask_events,
                                              mask_context=mask_context)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data, feats
Example #5
def matres_dev_examples(tokenizer,
                        lm='roberta',
                        mask_events=False,
                        mask_context=False):
    _, dev_examples = matres_examples()

    dev_examples, dev_features = convert_examples_to_features(
        examples=dev_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
        id_prefix="md")
    dev_data = make_tensor_dataset(dev_features, model=lm)
    return dev_examples, dev_data
Example #6
def udst_majority(tokenizer,
                  lm='roberta',
                  example_dir="udst/all_annotations/",
                  split="dev",
                  mask_events=False,
                  ties=True):
    exs = parse_udst.get_majority_examples(example_dir=example_dir,
                                           split=split,
                                           ties=ties)
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=False,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
Example #7
def distant_test_examples(tokenizer,
                          lm='roberta',
                          train=False,
                          mask=False,
                          mask_events=False):
    with open('timex/orig/test_exs.pkl', 'rb') as f:
        exs = pickle.load(f)
    if mask:
        mask = 'distant'
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=mask,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
Example #8
def matres_test_examples(tokenizer,
                         lm='roberta',
                         mask_events=False,
                         mask_context=False):
    loader = MatresLoader()
    examples = loader.read_test_examples(doc_dir="timebank/te3-platinum/",
                                         rel_dir="timebank/MATRES/")

    examples, features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
        id_prefix="mt")
    data = make_tensor_dataset(features, model=lm)
    return examples, data
Example #9
def distant_train_examples(tokenizer,
                           lm='roberta',
                           source=None,
                           ext='',
                           num_examples=None,
                           mask=False,
                           random_mask=False,
                           mask_events=False):
    with open('timex/orig/train_exs.pkl', 'rb') as f:
        exs = pickle.load(f)
    if source == "even":
        # draw an even share per source; integer division keeps the count an int
        exs = filter_distant_source(exs,
                                    num_examples=num_examples // 6,
                                    source="afp")
    else:
        exs = filter_distant_source(exs, source=source)

    if num_examples:
        if num_examples > len(exs):
            more_examples = _distant_parsed_examples(
                tokenizer,
                source=source,
                ext='',
                num_examples=num_examples - len(exs))
            exs += more_examples
        exs = exs[:num_examples]

    if random_mask:
        exs = apply_random_mask(exs, tokenizer)
    if mask:
        mask = 'distant'
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=mask,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
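One way this loader can feed the ".pkl" branch of get_train_data in Example #1 (a sketch; the cache filename is hypothetical): pickle the returned exs/data under the keys that branch expects.

# Hypothetical caching step: the keys "exs"/"data" mirror the .pkl branch
# of get_train_data (Example #1); the filename is illustrative.
import pickle

exs, data = distant_train_examples(tokenizer, lm="roberta", num_examples=1000)
with open("distant_train_cache.pkl", "wb") as f:
    pickle.dump({"exs": exs, "data": data}, f)
# later: exs, data = get_train_data("distant_train_cache.pkl", tokenizer, lm="roberta")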
Example #10
def beforeafter_examples(tokenizer,
                         lm='roberta',
                         ext='',
                         num_examples=None,
                         mask=False,
                         during=True):
    g_train_examples = get_beforeafter_examples(
        EXAMPLE_DIR="beforeafter/examples" + ext + "/",
        num_examples=num_examples,
        during=during)
    if mask:
        mask = 'beforeafter'
        print(mask)
    g_train_examples, g_train_features = convert_examples_to_features(
        examples=g_train_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask=mask)  # honor the mask flag ('beforeafter' when masking is enabled)
    g_train_data = make_tensor_dataset(g_train_features, model=lm)

    return g_train_examples, g_train_data
Example #11
def distant_parsed_examples(tokenizer,
                            lm='roberta',
                            ext='',
                            num_examples=None,
                            mask=False,
                            random_mask=False,
                            mask_events=False):
    exs = _distant_parsed_examples(tokenizer,
                                   ext=ext,
                                   num_examples=num_examples)
    if random_mask:
        exs = apply_random_mask(exs, tokenizer)
    if mask:
        mask = 'distant'
    print(len(exs), mask)
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=mask,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data