from retrieval.sentence import FEVERSentenceRelatednessFormatter, FEVERSentenceFormatter, FEVERSentenceTextFormatter from scripts.retrieval.sentence.mlp_train import RelatedLabelSchema nlp = spacy.load("en", create_pipeline=wmd.WMD.create_spacy_pipeline) def wmd_sim(claim, lines): cl = nlp(claim) scores = [] for line in lines: scores.append(cl.similarity(nlp(line))) return scores db = FeverDocDB("data/fever/fever.db") idx = set(db.get_doc_ids()) jlr = JSONLineReader() formatter = FEVERSentenceTextFormatter(idx, db, RelatedLabelSchema()) dev_ds = DataSet(file="data/fever-data/dev.jsonl", reader=jlr, formatter=formatter) dev_ds.read() def doc_lines(db, doc): lines = db.get_doc_lines(doc) return [ line.split("\t")[1] if len(line.split("\t")) > 1 else "" for line in lines.split("\n")
if len(evidence_pages) < k: samples = random.sample(doc_ids, k - len(evidence_pages)) for sample in samples: evidence_pages.add(sample) elif len(evidence_pages) >= k: samples = random.sample(evidence_pages, k) evidence_pages = set(samples) return evidence_pages path = os.getcwd() path = re.sub("/src.*", "", path) db = FeverDocDB(os.path.join(path, "data/fever/fever.db")) doc_ids = db.get_doc_ids() doc_ids = doc_ids[1:] jlr = JSONLineReader() # with open(os.path.join(path, "data/fever-data/train.jsonl"), "r") as f: # with open(os.path.join(path, 'data/fever/train.p5.jsonl'), "w") as f2: # lines = f.readlines() # for line in lines: # js = json.loads(line) # pages = sample_doc(js,doc_ids,k=5) # js['predicted_pages'] = list(pages) # f2.write(json.dumps(js)+"\n") with open(os.path.join(path, "data/fever-data/dev.jsonl"), "r") as f: with open(os.path.join(path, "data/fever/dev.p5.jsonl"), "w") as f2: lines = f.readlines() for line in lines:
voc_dict[v] = i + 2 voc_dict['PAD'] = 0 voc_dict['UNK'] = 1 return voc_dict if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('db', help='/path/to/db/file') parser.add_argument('output', help='/path/to/output/pickle/file') args = parser.parse_args() LogHelper.setup() logger = LogHelper.get_logger("generate_vocab_all_wiki") db = FeverDocDB(args.db) vocab = set() for doc in tqdm(db.get_doc_ids()): lines = db.get_doc_lines(doc) lines = lines.split("\n") for line in lines: segments = line.split("\t") if len(segments) < 2: continue line = segments[1] if line.strip() == "": continue tokens = set(token.lower() for token in tokenize(clean_text(line))) vocab.update(tokens) logger.info("total size of vocab: " + str(len(vocab))) vocab_dict = vocab_map(vocab) del vocab with open(args.output, 'wb') as f: