def read(self, file_path: str):
    instances = []
    ds = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    ds.read()

    for instance in tqdm.tqdm(ds.data):
        if instance is None:
            continue

        if not self._sentence_level:
            pages = set(ev[0] for ev in instance["evidence"])
            premise = " ".join([self.db.get_doc_text(p) for p in pages])
        else:
            lines = set([self.get_doc_line(d[0], d[1]) for d in instance['evidence']])
            premise = " ".join(lines)

        if len(premise.strip()) == 0:
            premise = ""

        hypothesis = instance["claim"]
        label = instance["label_text"]
        instances.append(self.text_to_instance(premise, hypothesis, label))

    if not instances:
        raise ConfigurationError(
            "No instances were read from the given filepath {}. "
            "Is the path correct?".format(file_path))

    return Dataset(instances)
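# Note: read() above eagerly materialises a Dataset, while _read() below is the
# lazy AllenNLP-style counterpart that yields Instances one at a time and can
# additionally append NER-derived facts to the premise.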
def _read(self, file_path: str) -> Iterator[Instance]:
    ds = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    ds.read()

    for instance in tqdm.tqdm(ds.data):
        if instance is None:
            continue

        if not self._sentence_level:
            pages = set(ev[0] for ev in instance["evidence"])
            premise = " ".join([self.db.get_doc_text(p) for p in pages])
        else:
            lines = set([self.get_doc_line(d[0], d[1]) for d in instance['evidence']])
            premise = " ".join(lines)

        if self._ner_facts:
            # Append the NER-derived facts; note the separating space so the first
            # fact is not fused onto the last evidence token.
            premise = premise + " " + " ".join(instance['fact'])

        if len(premise.strip()) == 0:
            premise = ""

        hypothesis = instance["claim"]
        label = instance["label_text"]
        ner_missing = instance["ner_missing"]
        yield self.text_to_instance(premise, hypothesis, label, ner_missing)
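# For orientation, a minimal sketch of what text_to_instance could look like in an
# AllenNLP-style reader. The tokenizer/indexer attributes and the way ner_missing
# is stored are illustrative assumptions, not the project's confirmed implementation.
from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField, MetadataField

def text_to_instance(self, premise: str, hypothesis: str,
                     label: str = None, ner_missing: bool = False) -> Instance:
    fields = {
        "premise": TextField(self._tokenizer.tokenize(premise), self._token_indexers),
        "hypothesis": TextField(self._tokenizer.tokenize(hypothesis), self._token_indexers),
        # Stash the flag as metadata so the model can inspect it without indexing it.
        "metadata": MetadataField({"ner_missing": ner_missing}),
    }
    if label is not None:
        fields["label"] = LabelField(label)
    return Instance(fields)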
def annotation_on_the_fly(self, file_path, run_name, objUOFADataReader):
    print("do_annotation_on_the_fly == true")
    # DELETE THE annotated file IF IT EXISTS every time before the loop
    # self.delete_if_exists(head_file)
    # self.delete_if_exists(body_file)
    if run_name == "train":
        print("run_name == train")
        head_file = objUOFADataReader.ann_head_tr
        body_file = objUOFADataReader.ann_body_tr
    elif run_name == "dev":
        print("run_name == dev")
        head_file = objUOFADataReader.ann_head_dev
        body_file = objUOFADataReader.ann_body_dev

    ds = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    ds.read()

    instances = []
    counter = 0  # was previously used before assignment inside the loop

    for instance in tqdm.tqdm(ds.data):
        counter = counter + 1
        if instance is None:
            continue

        if not self._sentence_level:
            pages = set(ev[0] for ev in instance["evidence"])
            premise = " ".join([self.db.get_doc_text(p) for p in pages])
        else:
            lines = set([self.get_doc_line(d[0], d[1]) for d in instance['evidence']])
            premise = " ".join(lines)

        if len(premise.strip()) == 0:
            premise = ""

        hypothesis = instance["claim"]
        label = instance["label_text"]
        premise_ann, hypothesis_ann = self.uofa_annotate(
            hypothesis, premise, counter, objUOFADataReader, head_file, body_file)
        instances.append(self.text_to_instance(premise_ann, hypothesis_ann, label))

    return instances
def read(self, file_path: str):
    instances = []
    ds = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    ds.read()

    for instance in tqdm.tqdm(ds.data):
        if instance is None:
            continue

        for page in set([ev[0] for ev in instance['evidence']]):
            claim = instance['claim'].strip()
            paragraph = self.db.get_doc_text(page)
            tokenized_paragraph = self._wiki_tokenizer.tokenize(paragraph)
            evidences = set([ev[1] for ev in instance['evidence'] if ev[0] == page])
            lines = self.db.get_doc_lines(page)

            if any(ev < 0 for ev in evidences):
                # No usable evidence line for this page: fall back to an empty span.
                span_ends = [0]
                span_starts = [0]
                evidence_texts = [""]
            else:
                evidence_texts = [lines.split("\n")[line].split("\t")[1]
                                  for line in evidences]
                span_starts = [paragraph.index(evidence_text)
                               for evidence_text in evidence_texts]
                # End offset is start plus the length of that evidence sentence
                # (the original used len(evidence_texts), the length of the list).
                span_ends = [start + len(evidence_text)
                             for start, evidence_text in zip(span_starts, evidence_texts)]

            inst = self.text_to_instance(claim, paragraph,
                                         zip(span_starts, span_ends),
                                         evidence_texts, tokenized_paragraph)
            instances.append(inst)

    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))

    return Dataset(instances)
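# A tiny self-contained check of the span arithmetic above, on hypothetical data:
# start + len(sentence) gives the exclusive character end offset, so slicing the
# paragraph with the computed span recovers the evidence sentence exactly.
paragraph = "Paris is in France. It is the capital."
evidence_texts = ["It is the capital."]
span_starts = [paragraph.index(t) for t in evidence_texts]
span_ends = [s + len(t) for s, t in zip(span_starts, evidence_texts)]
assert paragraph[span_starts[0]:span_ends[0]] == evidence_texts[0]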
if args.sentence:
    logger.info("Model is Sentence level")
    ffns.append(SentenceLevelTermFrequencyFeatureFunction(db, naming=mname))
else:
    logger.info("Model is Document level")
    ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

f = Features(mname, ffns)
jlr = JSONLineReader()

formatter = FEVERGoldFormatter(None, FEVERLabelSchema(), filtering=args.filtering)
train_ds = DataSet(file=args.train, reader=jlr, formatter=formatter)
dev_ds = DataSet(file=args.dev, reader=jlr, formatter=formatter)

train_ds.read()
dev_ds.read()

test_ds = None
if args.test is not None:
    test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
    test_ds.read()

train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
f.save_vocab(mname)

input_shape = train_feats[0].shape[1]
model = SimpleMLP(input_shape, 100, 3)

if gpu():
    model.cuda()  # completed to match the same pattern in the other entry points below
waseem_de_composite = CompositeDataset(name="waseem_composite_dev")
for dataset in datasets_de:
    dataset.read()
    waseem_de_composite.add(dataset)

waseem_te_composite = CompositeDataset(name="waseem_composite_test")
for dataset in datasets_te:
    dataset.read()
    waseem_te_composite.add(dataset)

davidson_tr = DataSet(os.path.join("data", "davidson.tr.csv"),
                      reader=csvreader, formatter=df, name="davidson_train")
davidson_tr.read()

davidson_dv = DataSet(os.path.join("data", "davidson.dv.csv"),
                      reader=csvreader, formatter=df, name="davidson_dev")
davidson_dv.read()

davidson_te = DataSet(os.path.join("data", "davidson.te.csv"),
                      reader=csvreader, formatter=df, name="davidson_test")
davidson_te.read()

features = Features(get_feature_functions(mname))
primary_train_fs, aux_train_fs, dev_fs, test_fs_primary, test_fs_aux = features.load(
    scores = []
    for line in lines:
        scores.append(cl.similarity(nlp(line)))
    return scores


db = FeverDocDB("data/fever/fever.db")
idx = set(db.get_doc_ids())
jlr = JSONLineReader()

formatter = FEVERSentenceTextFormatter(idx, db, RelatedLabelSchema())
dev_ds = DataSet(file="data/fever-data/dev.jsonl", reader=jlr, formatter=formatter)
dev_ds.read()


def doc_lines(db, doc):
    lines = db.get_doc_lines(doc)
    return [line.split("\t")[1] if len(line.split("\t")) > 1 else ""
            for line in lines.split("\n")]


# thresh = 0.8
y_true = []
y_scores = []
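# The y_true / y_scores accumulators (together with the commented-out threshold)
# suggest a precision-recall sweep over the similarity scores follows. A hedged
# sketch of that evaluation, assuming scikit-learn is available:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_true, y_scores)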
    waseem_tr_composite.add(dataset)

waseem_de_composite = CompositeDataset(name="waseem_composite_dev")
for dataset in datasets_de:
    dataset.read()
    waseem_de_composite.add(dataset)

waseem_te_composite = CompositeDataset(name="waseem_composite_test")
for dataset in datasets_te:
    dataset.read()
    waseem_te_composite.add(dataset)

davidson_te = DataSet(os.path.join("data", "davidson.te.csv"),
                      reader=csvreader, formatter=df, name="davidson_test")
davidson_te.read()

features = Features(get_feature_functions(mname))
train_fs, dev_fs, test_fs_primary, test_fs_aux = features.load(waseem_tr_composite,
                                                               waseem_de_composite,
                                                               waseem_te_composite,
                                                               davidson_te)

print("Number of features in primary: {0}".format(train_fs[0].shape[1]))

model = MLP(train_fs[0].shape[1], get_model_shape(), 3)

if gpu():
    model.cuda()

# Guard against TRAIN being unset: os.getenv returns None, and None.lower()
# raises AttributeError.
if model_exists(mname) and (os.getenv("TRAIN") or "").lower() not in ["y", "1", "t", "yes"]:
    model.load_state_dict(torch.load("models/{0}.model".format(mname)))
else:
    train(model, train_fs, 50, 1e-3, 45, dev=dev_fs,
logger.info("Model name is {0}".format(mname)) ffns = [] if args.sentence: logger.info("Model is Sentence level") ffns.append(SentenceLevelTermFrequencyFeatureFunction(db, naming=mname)) else: logger.info("Model is Document level") ffns.append(TermFrequencyFeatureFunction(db, naming=mname)) f = Features(mname, ffns) f.load_vocab(mname) jlr = JSONLineReader() formatter = FEVERGoldFormatter(None, FEVERLabelSchema()) test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter) test_ds.read() feats = f.lookup(test_ds) input_shape = feats[0].shape[1] model = SimpleMLP(input_shape, 100, 3) if gpu(): model.cuda() model.load_state_dict(torch.load("models/{0}.model".format(mname))) print_evaluation(model, feats, FEVERLabelSchema(), args.log)
df = DavidsonFormatter(DavidsonToZLabelSchema(), preprocessing=pp)

davidson_tr_dataset = DataSet(os.path.join("data", "davidson.tr.csv"),
                              formatter=df, reader=csvreader, name="davidson_train")
davidson_dv_dataset = DataSet(os.path.join("data", "davidson.dv.csv"),
                              formatter=df, reader=csvreader, name="davidson_dev")
davidson_te_dataset = DataSet(os.path.join("data", "davidson.te.csv"),
                              formatter=df, reader=csvreader, name="davidson_test")

davidson_tr_dataset.read()
davidson_dv_dataset.read()
davidson_te_dataset.read()

features = Features(get_feature_functions(mname))
train_fs, dev_fs, test_fs = features.load(davidson_tr_dataset,
                                          davidson_dv_dataset,
                                          davidson_te_dataset)

print("Number of features: {0}".format(train_fs[0].shape[1]))

model = MLP(train_fs[0].shape[1], get_model_shape(), 3)

if gpu():
    model.cuda()

# Same TRAIN guard as above; the membership list is completed to match the
# pattern used elsewhere in this file.
if model_exists(mname) and (os.getenv("TRAIN") or "").lower() not in ["y", "1", "t", "yes"]: