def read(self, file_path: str):
    """Read a FEVER data file and build labelled premise/hypothesis instances.

    The premise is assembled either from whole evidence pages (document
    level) or from the cited evidence sentences (sentence level).

    Raises:
        ConfigurationError: if the file yields no usable instances.
    """
    dataset = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    dataset.read()

    instances = []
    for item in tqdm.tqdm(dataset.data):
        # Formatter may emit None for filtered/unusable examples.
        if item is None:
            continue

        if self._sentence_level:
            # Join the individual cited evidence sentences (deduplicated).
            sentence_set = {self.get_doc_line(ev[0], ev[1]) for ev in item['evidence']}
            premise = " ".join(sentence_set)
        else:
            # Concatenate the full text of every cited page (deduplicated).
            page_set = {ev[0] for ev in item["evidence"]}
            premise = " ".join([self.db.get_doc_text(page) for page in page_set])

        # Normalize whitespace-only premises to the empty string.
        if len(premise.strip()) == 0:
            premise = ""

        instances.append(self.text_to_instance(premise, item["claim"], item["label_text"]))

    if not instances:
        raise ConfigurationError(
            "No instances were read from the given filepath {}. "
            "Is the path correct?".format(file_path))
    return Dataset(instances)
def _read(self, file_path: str) -> Iterator[Instance]:
    """Yield one ``Instance`` per usable FEVER example in ``file_path``.

    Premises come from whole pages (document level) or from the cited
    evidence lines (sentence level); NER facts are appended when enabled.
    """
    dataset = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    dataset.read()

    for item in tqdm.tqdm(dataset.data):
        # Formatter may emit None for filtered/unusable examples.
        if item is None:
            continue

        if self._sentence_level:
            # Join the individual cited evidence sentences (deduplicated).
            evidence_set = {self.get_doc_line(ev[0], ev[1]) for ev in item['evidence']}
            premise = " ".join(evidence_set)
        else:
            # Concatenate the full text of every cited page (deduplicated).
            page_set = {ev[0] for ev in item["evidence"]}
            premise = " ".join([self.db.get_doc_text(page) for page in page_set])

        if self._ner_facts:
            # NOTE(review): no separator is inserted between the premise and
            # the first appended fact — confirm this concatenation is intended.
            premise = premise + " ".join(item['fact'])

        # Normalize whitespace-only premises to the empty string.
        if len(premise.strip()) == 0:
            premise = ""

        yield self.text_to_instance(premise, item["claim"], item["label_text"],
                                    item["ner_missing"])
def annotation_on_the_fly(self, file_path, run_name, objUOFADataReader):
    """Annotate claim/evidence pairs on the fly and return the instances.

    Args:
        file_path: path to the FEVER data file to read.
        run_name: which split is being processed; must be "train" or "dev"
            (selects the annotation head/body files on objUOFADataReader).
        objUOFADataReader: reader object holding annotation file paths.

    Returns:
        A list of instances built from the annotated premise/hypothesis.

    Raises:
        ValueError: if ``run_name`` is neither "train" nor "dev".
    """
    print("do_annotation_on_the_fly == true")
    # DELETE THE annotated file IF IT EXISTS every time before the loop
    # self.delete_if_exists(head_file)
    # self.delete_if_exists(body_file)
    if (run_name == "train"):
        print("run_name == train")
        head_file = objUOFADataReader.ann_head_tr
        body_file = objUOFADataReader.ann_body_tr
    elif (run_name == "dev"):
        print("run_name == dev")
        head_file = objUOFADataReader.ann_head_dev
        body_file = objUOFADataReader.ann_body_dev
    else:
        # BUG FIX: head_file/body_file were left unbound for any other
        # run_name, causing a NameError deep in the loop; fail fast instead.
        raise ValueError(
            "annotation_on_the_fly: unsupported run_name {!r} "
            "(expected 'train' or 'dev')".format(run_name))

    ds = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    ds.read()
    instances = []
    # BUG FIX: counter was read before assignment (UnboundLocalError on the
    # first loop iteration); initialize it before the loop. It counts every
    # raw record, including ones skipped as None, matching the original
    # increment-at-top placement.
    counter = 0
    for instance in tqdm.tqdm(ds.data):
        counter = counter + 1
        if instance is None:
            continue
        if not self._sentence_level:
            # Document level: concatenate the full text of every cited page.
            pages = set(ev[0] for ev in instance["evidence"])
            premise = " ".join([self.db.get_doc_text(p) for p in pages])
        else:
            # Sentence level: join the individual cited evidence lines.
            lines = set([
                self.get_doc_line(d[0], d[1]) for d in instance['evidence']
            ])
            premise = " ".join(lines)
        # Normalize whitespace-only premises to the empty string.
        if len(premise.strip()) == 0:
            premise = ""
        hypothesis = instance["claim"]
        label = instance["label_text"]
        premise_ann, hypothesis_ann = self.uofa_annotate(
            hypothesis, premise, counter, objUOFADataReader, head_file,
            body_file)
        instances.append(
            self.text_to_instance(premise_ann, hypothesis_ann, label))
    return instances
def read(self, file_path: str):
    """Read FEVER examples and create one instance per cited evidence page.

    For each page cited by an example, the claim, the page's full text, and
    the character spans of the cited evidence sentences within that text are
    packed into an instance.

    Raises:
        ConfigurationError: if the file yields no usable instances.
    """
    instances = []
    ds = FEVERDataSet(file_path, reader=self.reader, formatter=self.formatter)
    ds.read()
    for instance in tqdm.tqdm(ds.data):
        if instance is None:
            continue
        for page in set([ev[0] for ev in instance['evidence']]):
            claim = instance['claim'].strip()
            paragraph = self.db.get_doc_text(page)
            tokenized_paragraph = self._wiki_tokenizer.tokenize(paragraph)
            # Line numbers of evidence sentences cited on this page.
            evidences = set([ev[1] for ev in instance['evidence'] if ev[0] == page])
            lines = self.db.get_doc_lines(page)
            if any(ev < 0 for ev in evidences):
                # Negative line ids mark "no evidence": emit an empty span.
                span_ends = [0]
                span_starts = [0]
                evidence_texts = [""]
            else:
                # Doc lines are "\n"-separated, "\t"-delimited; field 1 is
                # the sentence text.
                evidence_texts = [lines.split("\n")[line].split("\t")[1]
                                  for line in evidences]
                span_starts = [paragraph.index(evidence_text)
                               for evidence_text in evidence_texts]
                # BUG FIX: the span end must add the length of each evidence
                # *string* (len(evidence_text)), not the length of the
                # evidence_texts list — the original produced spans of a
                # constant, wrong width.
                span_ends = [start + len(evidence_text)
                             for start, evidence_text in zip(span_starts, evidence_texts)]
            inst = self.text_to_instance(claim, paragraph,
                                         zip(span_starts, span_ends),
                                         evidence_texts, tokenized_paragraph)
            instances.append(inst)
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
# Choose feature functions by granularity: sentence-level TF features when
# --sentence is passed, otherwise document-level term frequencies.
ffns = []
if args.sentence:
    logger.info("Model is Sentence level")
    ffns.append(SentenceLevelTermFrequencyFeatureFunction(db, naming=mname))
else:
    logger.info("Model is Document level")
    ffns.append(TermFrequencyFeatureFunction(db,naming=mname))

f = Features(mname,ffns)
jlr = JSONLineReader()
formatter = FEVERGoldFormatter(None, FEVERLabelSchema(),filtering=args.filtering)

# Read train/dev (and optional test) splits from the paths given on the CLI.
train_ds = DataSet(file=args.train, reader=jlr, formatter=formatter)
dev_ds = DataSet(file=args.dev, reader=jlr, formatter=formatter)
train_ds.read()
dev_ds.read()

test_ds = None
if args.test is not None:
    test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
    test_ds.read()

# Featurize all splits, persist the learned vocabulary under the model name,
# and take the feature dimensionality for the downstream model input layer.
train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
f.save_vocab(mname)
input_shape = train_feats[0].shape[1]
    # (tail of an argument-definition call — presumably parser.add_argument —
    # that begins above this chunk; kept verbatim)
    '-o', '--overrides', type=str, default="",
    help='a HOCON structure used to override the experiment configuration')
args = parser.parse_args()

logger.info("Load DB")
db = FeverDocDB(args.db)
jlr = JSONLineReader()
formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

logger.info("Read datasets")
# NOTE(review): the "{0}".format(1) page counts are hard-coded to 1 here —
# confirm that is intended rather than a CLI parameter.
train_ds = DataSet(file="data/fever/train.ns.pages.p{0}.jsonl".format(1),
                   reader=jlr,
                   formatter=formatter)
dev_ds = DataSet(file="data/fever/dev.ns.pages.p{0}.jsonl".format(1),
                 reader=jlr,
                 formatter=formatter)
train_ds.read()
dev_ds.read()

logger.info("Generate vocab for TF-IDF")
# Building the TF-IDF vocabulary is a side effect of inform(); tf is not
# passed on explicitly — presumably eval_model relies on state set up here.
tf = XTermFrequencyFeatureFunction(db)
tf.inform(train_ds.data, dev_ds.data)

logger.info("Eval")
eval_model(db, args)
    # (tail of a label-mapping dict in a formatter construction that begins
    # above this chunk; kept verbatim)
    0: 0,
    1: 1,
    2: 2,
    3: 0 })

# Davidson data formatter with the same preprocessing; maps its three labels
# through an identity mapping.
df = DavidsonFormatter(DavidsonToZLabelSchema(),
                       preprocessing=pp,
                       mapping={
                           0: 0,
                           1: 1,
                           2: 2
                       })

# Training datasets: three Waseem files share one formatter; the Waseem-Hovy
# file uses formatter2 (its labels are remapped).
datasets_tr = [
    DataSet(file=sexism_file_tr, reader=jlr, formatter=formatter, name=None),
    DataSet(file=racism_file_tr, reader=jlr, formatter=formatter, name=None),
    DataSet(file=neither_file_tr, reader=jlr, formatter=formatter, name=None),
    DataSet(file=waseem_hovy_tr, reader=jlr, formatter=formatter2, name=None)
]
        # (tail of a record-dict built by a function that begins above this
        # chunk; kept verbatim)
        "claim": line["Headline"],
        "evidence": line["Body ID"],
        "label": annotation
    }


if __name__ == "__main__":
    # FNC-1 setup: article bodies for both train and competition-test splits.
    bodies = Bodies("data/fnc-1/train_bodies.csv",
                    "data/fnc-1/competition_test_bodies.csv")
    f = Features([FNCTermFrequencyFeatureFunction(bodies)])
    csvr = CSVReader()
    formatter = FNCFormatter(FNCLabelSchema())

    train_ds = DataSet(file="data/fnc-1/train_stances.csv",
                       reader=csvr,
                       formatter=formatter)
    test_ds = DataSet(file="data/fnc-1/competition_test_stances.csv",
                      reader=csvr,
                      formatter=formatter)
    train_ds.read()
    test_ds.read()

    # No dev split: pass None and ignore the middle return value.
    train_feats, _, test_feats = f.load(train_ds, None, test_ds)
    input_shape = train_feats[0].shape[1]

    # 4-way classifier (FNC has four stance labels); move to GPU if available.
    model = SimpleMLP(input_shape, 100, 4)
    if gpu():
        model.cuda()
def wmd_sim(claim, lines):
    """Return a similarity score between ``claim`` and each string in ``lines``.

    Uses the module-level ``nlp`` pipeline (presumably spaCy — its
    ``similarity`` API is used; confirm against the missing definition).
    """
    cl = nlp(claim)
    scores = []
    for line in lines:
        scores.append(cl.similarity(nlp(line)))
    return scores


# Load the FEVER doc DB and the dev split at sentence-text granularity.
db = FeverDocDB("data/fever/fever.db")
idx = set(db.get_doc_ids())
jlr = JSONLineReader()
formatter = FEVERSentenceTextFormatter(idx, db, RelatedLabelSchema())
dev_ds = DataSet(file="data/fever-data/dev.jsonl",
                 reader=jlr,
                 formatter=formatter)
dev_ds.read()


def doc_lines(db, doc):
    """Return the sentence texts of ``doc`` from ``db``.

    Doc lines are "\\n"-separated and "\\t"-delimited; field 1 is the
    sentence text, with an empty string substituted for malformed lines.
    """
    lines = db.get_doc_lines(doc)
    return [
        line.split("\t")[1] if len(line.split("\t")) > 1 else ""
        for line in lines.split("\n")
    ]


#thresh = 0.8
logger.info("Model name is {0}".format(mname))

# Choose feature functions by granularity, mirroring the training script.
ffns = []
if args.sentence:
    logger.info("Model is Sentence level")
    ffns.append(SentenceLevelTermFrequencyFeatureFunction(db, naming=mname))
else:
    logger.info("Model is Document level")
    ffns.append(TermFrequencyFeatureFunction(db, naming=mname))

# Reuse the vocabulary persisted at training time rather than refitting.
f = Features(mname, ffns)
f.load_vocab(mname)

jlr = JSONLineReader()
formatter = FEVERGoldFormatter(None, FEVERLabelSchema())

test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
test_ds.read()

# Featurize the test set with the loaded vocabulary only (no fitting).
feats = f.lookup(test_ds)
input_shape = feats[0].shape[1]

# 3-way classifier (FEVER label schema); restore trained weights and score.
model = SimpleMLP(input_shape, 100, 3)
if gpu():
    model.cuda()
model.load_state_dict(torch.load("models/{0}.model".format(mname)))
print_evaluation(model, feats, FEVERLabelSchema(), args.log)
# Dev/test file paths for the Waseem and Waseem-Hovy hate-speech corpora.
waseem_hovy_de = os.path.join("data","amateur_expert.dv.json")
sexism_file_te = os.path.join("data","waseem_s.te.json")
racism_file_te = os.path.join("data","waseem_r.te.json")
neither_file_te = os.path.join("data","waseem_n.te.json")
waseem_hovy_te = os.path.join("data","amateur_expert.te.json")

csvreader = CSVReader(encoding="ISO-8859-1")
jlr = JSONLineReader()

# Waseem files share one formatter; Waseem-Hovy labels are remapped (3 -> 0),
# and Davidson labels pass through an identity mapping.
formatter = TextAnnotationFormatter(WaseemLabelSchema(),preprocessing=pp)
formatter2 = TextAnnotationFormatter(WaseemHovyLabelSchema(),preprocessing=pp,mapping={0:0,1:1,2:2,3:0})
df = DavidsonFormatter(DavidsonToZLabelSchema(),preprocessing=pp,mapping={0:0,1:1,2:2})

datasets_tr = [
    DataSet(file=sexism_file_tr, name=None, reader=jlr, formatter=formatter),
    DataSet(file=racism_file_tr, name=None, reader=jlr, formatter=formatter),
    DataSet(file=neither_file_tr, name=None, reader=jlr, formatter=formatter),
    DataSet(file=waseem_hovy_tr, name=None, reader=jlr, formatter=formatter2)
]
datasets_de = [
    DataSet(file=sexism_file_de, name=None, reader=jlr, formatter=formatter),
    DataSet(file=racism_file_de, name=None, reader=jlr, formatter=formatter),
    DataSet(file=neither_file_de, name=None, reader=jlr, formatter=formatter),
    DataSet(file=waseem_hovy_de, name=None,reader=jlr, formatter=formatter2)
]
# (datasets_te continues past the end of this chunk; kept verbatim)
datasets_te = [
    DataSet(file=sexism_file_te, name=None, reader=jlr, formatter=formatter),
    DataSet(file=racism_file_te, name=None, reader=jlr, formatter=formatter),
# CLI args: page count for dev/test files and negative-sampling doc size for
# the training file; both are interpolated into the dataset filenames below.
maxdoc = sys.argv[1]
ns_docsize = sys.argv[2]

db = FeverDocDB("data/fever/fever.db")
idx = set(db.get_doc_ids())

# Model naming encodes the 2-way task and both sampling parameters.
mname = "2way-p{0}-{1}".format(maxdoc, ns_docsize)
f = Features([SentenceTermFrequencyFeatureFunction(db, naming=mname)])
jlr = JSONLineReader()
formatter = FEVERSentenceFormatter(idx, db, RelatedLabelSchema())

train_ds = DataSet(
    file="data/fever/train.ns.pages.p{0}.jsonl".format(ns_docsize),
    reader=jlr,
    formatter=formatter)
dev_ds = DataSet(file="data/fever/dev.pages.p{0}.jsonl".format(maxdoc),
                 reader=jlr,
                 formatter=formatter)
test_ds = DataSet(file="data/fever/test.pages.p{0}.jsonl".format(maxdoc),
                  reader=jlr,
                  formatter=formatter)
train_ds.read()
dev_ds.read()
test_ds.read()

# Featurize all splits and take the feature dimensionality for the model.
train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
input_shape = train_feats[0].shape[1]
    # (tail of a function that begins above this chunk — it ensures the
    # models directory exists and reports whether a saved model is present;
    # kept verbatim)
    if not os.path.exists("models"):
        os.mkdir("models")
    return os.path.exists(os.path.join("models", "{0}.model".format(mname)))


if __name__ == "__main__":
    SimpleRandom.set_seeds()

    # Model name encodes embedding/size variants of experiment 7.
    mname = "expt7" + ("emb" if is_embedding_model() else "") + ("large" if is_large_model() else "")

    csvreader = CSVReader(encoding="ISO-8859-1")
    df = DavidsonFormatter(DavidsonToZLabelSchema(), preprocessing=pp)

    # Davidson corpus train/dev/test splits.
    davidson_tr_dataset = DataSet(os.path.join("data", "davidson.tr.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_train")
    davidson_dv_dataset = DataSet(os.path.join("data", "davidson.dv.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_dev")
    davidson_te_dataset = DataSet(os.path.join("data", "davidson.te.csv"),
                                  formatter=df,
                                  reader=csvreader,
                                  name="davidson_test")
    davidson_tr_dataset.read()
    davidson_dv_dataset.read()
    davidson_te_dataset.read()

    features = Features(get_feature_functions(mname))
# Combined FEVER + FNC-1 experiment: FEVER bodies come from the DrQA doc DB,
# FNC bodies from the two stance-corpus CSV files.
db = FeverDocDB("data/fever/drqa.db")
idx = set(db.get_doc_ids())

fnc_bodies = Bodies("data/fnc-1/train_bodies.csv",
                    "data/fnc-1/competition_test_bodies.csv")
fever_bodies = db

f = Features(
    [FeverOrFNCTermFrequencyFeatureFunction(fever_bodies, fnc_bodies)])
csvr = CSVReader()
jlr = JSONLineReader()
fnc_formatter = FNCFormatter2(FNCSimpleLabelSchema())
fever_formatter = FEVERPredictionsFormatter(idx, FEVERLabelSchema())

# Train/dev on FNC stances; "test" here is the FEVER dev predictions file.
train_ds = DataSet(file="data/fnc-1/train_stances.csv",
                   reader=csvr,
                   formatter=fnc_formatter)
dev_ds = DataSet(file="data/fnc-1/competition_test_stances.csv",
                 reader=csvr,
                 formatter=fnc_formatter)
test_ds = DataSet(file="data/fever/fever.dev.pages.p5.jsonl",
                  reader=jlr,
                  formatter=fever_formatter)
train_ds.read()
test_ds.read()
dev_ds.read()

# Featurize all splits and take the feature dimensionality for the model.
train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
input_shape = train_feats[0].shape[1]