def __init__(self, db: FeverDocDB, wiki_tokenizer: Tokenizer = None, claim_tokenizer: Tokenizer = None, token_indexers: Dict[str, TokenIndexer] = None) -> None:
    """Set up tokenizers, indexers, the FEVER doc DB and the line reader.

    Any tokenizer/indexer left as None falls back to a default
    (``WordTokenizer`` / single-id ``tokens`` indexer).
    """
    self.db = db
    self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
    self._claim_tokenizer = claim_tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    # The formatter is given every known page id so it can resolve evidence pages.
    self.formatter = FEVERSentenceFormatter(set(self.db.get_doc_ids()),
                                            FEVERLabelSchema())
    self.reader = JSONLineReader()
class FEVERSentenceRelatednessFormatter(FeverFormatter):
    """Formats FEVER lines as claim/evidence-sentence pairs with a binary
    related/unrelated label (collapsed from the original FEVER label space)."""

    def __init__(self, idx, db, ls):
        super().__init__(idx, ls)
        self.label_schema = ls
        # Reference schema used to map raw FEVER annotations.
        self.ols = FEVERLabelSchema()
        self.db = db

    def format_line(self, line):
        """Return a dict with the tokenized claim, evidence sentences and the
        binary relatedness label for one FEVER jsonl record."""
        annotation = line["label"]
        if annotation is None:
            annotation = line["verifiable"]

        # Anything that is not "not enough info" counts as related evidence.
        if self.ols.get_id(annotation) != self.ols.get_id("not enough info"):
            annotation = "related"
        else:
            annotation = "unrelated"

        evidence_texts = []
        claim = self.tokenize(line['claim']).strip()

        for page in {ev[2] for ev in line['evidence']}:
            # BUG FIX: the page name lives at index 2 of each evidence tuple
            # (index 3 is the sentence number); the original filtered on
            # ev[1] (the evidence id), which never matches a page name and
            # always produced an empty sentence set.
            sentence_ids = {ev[3] for ev in line['evidence'] if ev[2] == page}
            doc_lines = self.db.get_doc_lines(page)

            if any(sid < 0 for sid in sentence_ids):
                evidence_texts = [""]
            else:
                evidence_texts = [
                    doc_lines.split("\n")[sid].split("\t")[1].split()
                    for sid in sentence_ids
                ]
            # NOTE(review): evidence_texts is overwritten on each page
            # iteration, so only the last page's sentences are returned —
            # confirm whether accumulation across pages was intended.

        return {
            "claim": claim,
            "sentences": evidence_texts,
            "label": self.label_schema.get_id(annotation),
            "label_text": annotation,
        }
def __init__(self, db: FeverDocDB, sentence_level=False, wiki_tokenizer: Tokenizer = None, claim_tokenizer: Tokenizer = None, token_indexers: Dict[str, TokenIndexer] = None, ner_facts=False, filtering: str = None) -> None:
    """Configure tokenizers, token indexers and the gold formatter.

    Defaults to ``WordTokenizer`` for both claim and wiki text, and to an
    ELMo character indexer plus a single-id ``tokens`` indexer.
    """
    self._sentence_level = sentence_level
    self._ner_facts = ner_facts
    self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
    self._claim_tokenizer = claim_tokenizer or WordTokenizer()
    if token_indexers:
        self._token_indexers = token_indexers
    else:
        self._token_indexers = {'elmo': ELMoTokenCharactersIndexer(),
                                'tokens': SingleIdTokenIndexer()}
    self.db = db
    # Formatter gets all known page ids plus the requested evidence filtering.
    self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()),
                                        FEVERLabelSchema(),
                                        filtering=filtering)
    self.reader = JSONLineReader()
mname = args.model
logger.info("Model name is {0}".format(mname))

# Choose the feature function matching the requested granularity.
if args.sentence:
    logger.info("Model is Sentence level")
    feature_fn = SentenceLevelTermFrequencyFeatureFunction(db, naming=mname)
else:
    logger.info("Model is Document level")
    feature_fn = TermFrequencyFeatureFunction(db, naming=mname)
ffns = [feature_fn]

f = Features(mname, ffns)
jlr = JSONLineReader()
formatter = FEVERGoldFormatter(None, FEVERLabelSchema(), filtering=args.filtering)

# Build and read the train/dev datasets (test is optional).
train_ds = DataSet(file=args.train, reader=jlr, formatter=formatter)
dev_ds = DataSet(file=args.dev, reader=jlr, formatter=formatter)
train_ds.read()
dev_ds.read()

test_ds = None
if args.test is not None:
    test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
    test_ds.read()

train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
f.save_vocab(mname)
help=("String option specifying tokenizer type to use " "(e.g. 'corenlp')")) parser.add_argument('--num-workers', type=int, default=None, help='Number of CPU processes (for tokenizing, etc)') args = parser.parse_args() doc_freqs = None if args.use_precomputed: _, metadata = utils.load_sparse_csr(args.model) doc_freqs = metadata['doc_freqs'].squeeze() db = FeverDocDB("data/fever/fever.db") jlr = JSONLineReader() formatter = FEVERGoldFormatter(set(), FEVERLabelSchema()) jlr = JSONLineReader() with open(args.in_file, "r") as f, open( "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl".format( args.split, args.max_page, args.max_sent, "precomputed" if args.use_precomputed else "not_precomputed"), "w+") as out_file: lines = jlr.process(f) #lines = tf_idf_claims_batch(lines) for line in tqdm(lines): line = tf_idf_claim(line) out_file.write(json.dumps(line) + "\n")
def __init__(self, db: FeverDocDB) -> None:
    """Store the doc DB and build the gold formatter and jsonl reader."""
    self.db = db
    self.reader = JSONLineReader()
    # All known page ids are handed to the formatter for evidence resolution.
    self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()),
                                        FEVERLabelSchema())
logger.info("Model name is {0}".format(mname))

# Select the feature function for the requested granularity.
if args.sentence:
    logger.info("Model is Sentence level")
    ffns = [SentenceLevelTermFrequencyFeatureFunction(db, naming=mname)]
else:
    logger.info("Model is Document level")
    ffns = [TermFrequencyFeatureFunction(db, naming=mname)]

f = Features(mname, ffns)
f.load_vocab(mname)

jlr = JSONLineReader()
formatter = FEVERGoldFormatter(None, FEVERLabelSchema())

test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
test_ds.read()

feats = f.lookup(test_ds)
# Input dimensionality is taken from the feature matrix's second axis.
input_shape = feats[0].shape[1]

model = SimpleMLP(input_shape, 100, 3)
if gpu():
    model.cuda()
model.load_state_dict(torch.load("models/{0}.model".format(mname)))
print_evaluation(model, feats, FEVERLabelSchema(), args.log)
def __init__(self, idx, db, ls):
    """Initialise the formatter with a page index, doc DB and label schema."""
    super().__init__(idx, ls)
    self.db = db
    self.label_schema = ls
    # Reference schema kept for mapping raw FEVER annotations.
    self.ols = FEVERLabelSchema()
# Choose the feature function for the requested granularity.
if args.sentence:
    logger.info("Model is Sentence level")
    ffns = [SentenceLevelTermFrequencyFeatureFunction(db, naming=mname)]
else:
    logger.info("Model is Document level")
    ffns = [TermFrequencyFeatureFunction(db, naming=mname)]

f = Features(mname, ffns)
jlr = JSONLineReader()
formatter = FEVERGoldFormatter(None, FEVERLabelSchema(), filtering=args.filtering)

# Build and read the train/dev datasets (test is optional).
train_ds = DataSet(file=args.train, reader=jlr, formatter=formatter)
dev_ds = DataSet(file=args.dev, reader=jlr, formatter=formatter)
train_ds.read()
dev_ds.read()

test_ds = None
if args.test is not None:
    test_ds = DataSet(file=args.test, reader=jlr, formatter=formatter)
    test_ds.read()

train_feats, dev_feats, test_feats = f.load(train_ds, dev_ds, test_ds)
if __name__ == "__main__":
    db = FeverDocDB("data/fever/drqa.db")
    idx = set(db.get_doc_ids())

    fnc_bodies = Bodies("data/fnc-1/train_bodies.csv",
                        "data/fnc-1/competition_test_bodies.csv")
    fever_bodies = db

    f = Features(
        [FeverOrFNCTermFrequencyFeatureFunction(fever_bodies, fnc_bodies)])

    csvr = CSVReader()
    jlr = JSONLineReader()
    fnc_formatter = FNCFormatter2(FNCSimpleLabelSchema())
    fever_formatter = FEVERPredictionsFormatter(idx, FEVERLabelSchema())

    # FNC stances provide train/dev; FEVER dev pages serve as the test split.
    train_ds = DataSet(file="data/fnc-1/train_stances.csv",
                       reader=csvr, formatter=fnc_formatter)
    dev_ds = DataSet(file="data/fnc-1/competition_test_stances.csv",
                     reader=csvr, formatter=fnc_formatter)
    test_ds = DataSet(file="data/fever/fever.dev.pages.p5.jsonl",
                      reader=jlr, formatter=fever_formatter)

    train_ds.read()
    test_ds.read()
    dev_ds.read()
LogHelper.get_logger(__name__)

parser = argparse.ArgumentParser()
parser.add_argument('db', type=str, help='/path/to/saved/db.db')
# BUG FIX: help text was a copy-paste of the db argument's help string.
parser.add_argument('in_file', type=str, help='/path/to/input/file.jsonl')
parser.add_argument('--max_page', type=int)
parser.add_argument('--max_sent', type=int)
parser.add_argument('--split', type=str)
parser.add_argument("--filtering", type=str, default=None)
args = parser.parse_args()

# BUG FIX: the positional `db` argument was parsed but then ignored in favour
# of a hard-coded path; honour the path supplied on the command line.
db = FeverDocDB(args.db)

# BUG FIX: JSONLineReader was instantiated twice; one instance suffices.
jlr = JSONLineReader()
formatter = FEVERGoldFormatter(set(), FEVERLabelSchema(), filtering=args.filtering)

train_ds = DataSet(file="data/fever/train.ns.pages.p{0}.jsonl".format(1),
                   reader=jlr, formatter=formatter)
dev_ds = DataSet(file="data/fever/dev.pages.p{0}.jsonl".format(args.max_page),
                 reader=jlr, formatter=formatter)

train_ds.read()
dev_ds.read()

# Fit term-frequency statistics on the train+dev data before processing input.
tf = XTermFrequencyFeatureFunction(db)
tf.inform(train_ds.data, dev_ds.data)

with open(args.in_file, "r") as f:
    lines = jlr.process(f)