def get_train_data(data, tokenizer, lm, num_examples=None, mask=True, distant_source=None):
    if data == "beforeafter":
        print("Using BeforeAfter train examples from Gigaword.")
        exs, data = beforeafter_examples(tokenizer, lm=lm, num_examples=num_examples,
                                         mask=mask, during=False)
    elif data == "beforeafter_yelp":
        print("Using BeforeAfter train examples from Yelp.")
        exs, data = beforeafter_examples(tokenizer, lm=lm, ext="_yelp",
                                         num_examples=num_examples, mask=mask)
    elif data == "matres":
        print("Using MATRES train examples.")
        exs, data, feats = matres_train_examples(tokenizer, lm=lm)
        if num_examples:
            # Subsample a fixed number of examples and rebuild the tensor dataset.
            idxs = random.sample(list(range(len(exs))), num_examples)
            exs = [exs[i] for i in idxs]
            feats = [feats[i] for i in idxs]
            data = make_tensor_dataset(feats, model=lm)
    elif data == "distant":
        print("Using DistantTimex train examples.")
        exs, data = distant_train_examples(tokenizer, lm=lm, source=distant_source,
                                           mask=mask, num_examples=num_examples)
    elif data == "udst":
        print("Using UDS-T train examples.")
        exs, data, feats = udst(tokenizer, lm=lm, split="train")
        if num_examples:
            # Subsample a fixed number of examples and rebuild the tensor dataset.
            idxs = random.sample(list(range(len(exs))), num_examples)
            exs = [exs[i] for i in idxs]
            feats = [feats[i] for i in idxs]
            data = make_tensor_dataset(feats, model=lm)
    elif data.endswith(".pkl"):
        # Load pre-computed examples and tensor dataset from a pickle dump.
        inputs = pickle.load(open(data, "rb"))
        exs = inputs["exs"]
        data = inputs["data"]
    else:
        raise RuntimeError("Please specify a valid data source.")
    return exs, data
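# Usage sketch (illustrative, not part of the original module): building a training
# set for a RoBERTa-style model. Loading the tokenizer via HuggingFace AutoTokenizer
# and the "cached/train.pkl" path are assumptions about the surrounding training
# script; get_train_data only needs a tokenizer compatible with
# convert_examples_to_features.
#
#   from transformers import AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("roberta-base")
#   # 1,000 randomly subsampled MATRES pairs as a TensorDataset
#   exs, train_data = get_train_data("matres", tokenizer, lm="roberta", num_examples=1000)
#   # or: reload a previously pickled {"exs": ..., "data": ...} dump
#   exs, train_data = get_train_data("cached/train.pkl", tokenizer, lm="roberta")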
def get_dummy_data(tokenizer, lm):
    sent1 = "Today I went to the store.".split()
    sent2 = "I came home.".split()
    # load_data.py:IndexedExamplePartial
    # - for single-sentence examples sent1 & sent2 are the same
    #   --> pass in the SAME LIST (reference/pointer) for both
    ex = IndexedExamplePartial(
        label="BEFORE",    # {AFTER, BEFORE, EQUALS, VAGUE}
        sent1=sent1,
        sent2=sent2,
        tags1=None,        # None, unless you want to mask timexes
        tags2=None,
        e1_idx=2,          # "went" = sent1[e1_idx]
        e2_idx=1,          # "came" = sent2[e2_idx]
        doc_name=None)     # specify doc_name if you need it later
    exs = [ex]
    # load_data.py:convert_distant_examples_to_features
    # - should automatically generate the right model-specific
    #   input features according to tokenizer type
    feats = convert_distant_examples_to_features(examples=exs,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=MAX_SEQ_LENGTH,
                                                 doc_stride=DOC_STRIDE)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
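# Example use of the dummy pair above (illustrative, not part of the original
# module): make_tensor_dataset returns a torch TensorDataset, so the result can be
# batched with a DataLoader for a quick smoke test of a model's forward pass.
# The "roberta-base" checkpoint is an assumption.
#
#   from torch.utils.data import DataLoader
#   from transformers import AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("roberta-base")
#   exs, data = get_dummy_data(tokenizer, lm="roberta")
#   loader = DataLoader(data, batch_size=1)
#   batch = next(iter(loader))  # tuple of input tensors for the single dummy pair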
def matres_train_examples(tokenizer, lm='roberta', mask_events=False, mask_context=False):
    train_examples, _ = matres_examples()
    train_examples, train_features = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context)
    train_data = make_tensor_dataset(train_features, model=lm)
    return train_examples, train_data, train_features
def udst(tokenizer, lm='roberta', split="train", example_dir="udst/all_annotations/",
         mask_events=False, mask_context=False):
    exs = parse_udst.get_examples(example_dir=example_dir, split=split)
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=False,
                                              mask_events=mask_events,
                                              mask_context=mask_context)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data, feats
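# Illustrative call (assumption about the caller, not part of the original module):
# the same loader serves train/dev/test by changing `split`, e.g. for evaluation on
# the UDS-T dev annotations.
#
#   dev_exs, dev_data, dev_feats = udst(tokenizer, lm="roberta", split="dev")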
def matres_dev_examples(tokenizer, lm='roberta', mask_events=False, mask_context=False):
    _, dev_examples = matres_examples()
    dev_examples, dev_features = convert_examples_to_features(
        examples=dev_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
        id_prefix="md")
    dev_data = make_tensor_dataset(dev_features, model=lm)
    return dev_examples, dev_data
def udst_majority(tokenizer, lm='roberta', example_dir="udst/all_annotations/",
                  split="dev", mask_events=False, ties=True):
    exs = parse_udst.get_majority_examples(example_dir=example_dir, split=split, ties=ties)
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=False,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
def distant_test_examples(tokenizer, lm='roberta', train=False, mask=False, mask_events=False):
    with open('timex/orig/test_exs.pkl', 'rb') as f:
        exs = pickle.load(f)
    if mask:
        mask = 'distant'
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=mask,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
def matres_test_examples(tokenizer, lm='roberta', mask_events=False, mask_context=False):
    loader = MatresLoader()
    examples = loader.read_test_examples(doc_dir="timebank/te3-platinum/",
                                         rel_dir="timebank/MATRES/")
    examples, features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
        id_prefix="mt")
    data = make_tensor_dataset(features, model=lm)
    return examples, data
def distant_train_examples(tokenizer, lm='roberta', source=None, ext='', num_examples=None,
                           mask=False, random_mask=False, mask_events=False):
    with open('timex/orig/train_exs.pkl', 'rb') as f:
        exs = pickle.load(f)
    if source == "even":
        # Split the requested count evenly across sources (integer count per source).
        exs = filter_distant_source(exs, num_examples=num_examples // 6, source="afp")
    else:
        exs = filter_distant_source(exs, source)
    if num_examples:
        if num_examples > len(exs):
            # Top up with freshly parsed examples if the pickled set is too small.
            more_examples = _distant_parsed_examples(tokenizer, source=source, ext='',
                                                     num_examples=num_examples - len(exs))
            exs += more_examples
        exs = exs[:num_examples]
    if random_mask:
        exs = apply_random_mask(exs, tokenizer)
    if mask:
        mask = 'distant'
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=mask,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
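# Illustrative calls (assumptions about the training script, not part of the
# original module): "afp" is one of the source tags accepted by
# filter_distant_source, and source="even" divides the requested count across
# sources as in the branch above.
#
#   exs, data = distant_train_examples(tokenizer, lm="roberta", source="afp",
#                                      num_examples=5000, mask=True)
#   exs, data = distant_train_examples(tokenizer, lm="roberta", source="even",
#                                      num_examples=6000)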
def beforeafter_examples(tokenizer, lm='roberta', ext='', num_examples=None, mask=False, during=True):
    g_train_examples = get_beforeafter_examples(
        EXAMPLE_DIR="beforeafter/examples" + ext + "/",
        num_examples=num_examples,
        during=during)
    if mask:
        mask = 'beforeafter'
    print(mask)
    g_train_examples, g_train_features = convert_examples_to_features(
        examples=g_train_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask=mask)  # pass the mask flag through rather than hard-coding 'beforeafter'
    g_train_data = make_tensor_dataset(g_train_features, model=lm)
    return g_train_examples, g_train_data
def distant_parsed_examples(tokenizer, lm='roberta', ext='', num_examples=None,
                            mask=False, random_mask=False, mask_events=False):
    exs = _distant_parsed_examples(tokenizer, ext=ext, num_examples=num_examples)
    if random_mask:
        exs = apply_random_mask(exs, tokenizer)
    if mask:
        mask = 'distant'
    print(len(exs), mask)
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=mask,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data