Example #1
import spacy
import wmd  # wmd-relax: word-mover's-distance pipeline for spaCy

# assumed project-internal imports from the FEVER baselines codebase
from common.dataset.data_set import DataSet
from common.dataset.reader import JSONLineReader
from retrieval.fever_doc_db import FeverDocDB
from retrieval.sentence import FEVERSentenceRelatednessFormatter, FEVERSentenceFormatter, FEVERSentenceTextFormatter
from scripts.retrieval.sentence.mlp_train import RelatedLabelSchema

nlp = spacy.load("en", create_pipeline=wmd.WMD.create_spacy_pipeline)


def wmd_sim(claim, lines):
    cl = nlp(claim)
    scores = []
    for line in lines:
        scores.append(cl.similarity(nlp(line)))
    return scores


db = FeverDocDB("data/fever/fever.db")
idx = set(db.get_doc_ids())

jlr = JSONLineReader()
formatter = FEVERSentenceTextFormatter(idx, db, RelatedLabelSchema())
dev_ds = DataSet(file="data/fever-data/dev.jsonl",
                 reader=jlr,
                 formatter=formatter)

dev_ds.read()


def doc_lines(db, doc):
    # each stored line is "<line_no>\t<text>[\t<anchor>...]"; keep only the text
    lines = db.get_doc_lines(doc)
    return [
        line.split("\t")[1] if len(line.split("\t")) > 1 else ""
        for line in lines.split("\n")
    ]
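For orientation, a usage sketch combining the two helpers above (the page id and claim are made up):

# rank every line of one wiki page against a claim by WMD similarity
claim = "Barack Obama was the 44th President of the United States."
lines = doc_lines(db, "Barack_Obama")
scores = wmd_sim(claim, lines)
best = max(range(len(scores)), key=lambda i: scores[i])
print(best, lines[best])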
Example #2
import json
import os
import random
import re

# assumed project-internal imports (FEVER baselines codebase)
from common.dataset.reader import JSONLineReader
from retrieval.fever_doc_db import FeverDocDB


# Reconstructed header (the snippet starts mid-function): the signature follows
# the call further below; the evidence-page extraction assumes the FEVER
# evidence format [[annotation_id, evidence_id, page, line], ...].
def sample_doc(js, doc_ids, k=5):
    evidence_pages = set(ev[2] for evs in js.get("evidence", [])
                         for ev in evs if ev[2] is not None)
    if len(evidence_pages) < k:
        # pad the gold pages with random documents up to k candidates
        samples = random.sample(doc_ids, k - len(evidence_pages))
        for sample in samples:
            evidence_pages.add(sample)
    elif len(evidence_pages) >= k:
        # subsample to exactly k (list(): random.sample rejects sets on 3.11+)
        samples = random.sample(list(evidence_pages), k)
        evidence_pages = set(samples)
    return evidence_pages
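A quick sanity check of the padding behaviour on a toy record (not a real FEVER sample); the point of this random baseline is that downstream sentence selection always receives exactly k candidate pages:

# two gold evidence pages get padded with three random documents
random.seed(0)
toy = {"evidence": [[[0, 0, "Page_A", 0]], [[1, 1, "Page_B", 3]]]}
pages = sample_doc(toy, ["Doc_%d" % i for i in range(100)], k=5)
assert len(pages) == 5 and {"Page_A", "Page_B"} <= pages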


path = os.getcwd()
path = re.sub("/src.*", "", path)
db = FeverDocDB(os.path.join(path, "data/fever/fever.db"))
doc_ids = db.get_doc_ids()
doc_ids = doc_ids[1:]
jlr = JSONLineReader()
# with open(os.path.join(path, "data/fever-data/train.jsonl"), "r") as f:
#     with open(os.path.join(path, 'data/fever/train.p5.jsonl'), "w") as f2:
#         lines = f.readlines()
#         for line in lines:
#             js = json.loads(line)
#             pages = sample_doc(js,doc_ids,k=5)
#             js['predicted_pages'] = list(pages)
#             f2.write(json.dumps(js)+"\n")

with open(os.path.join(path, "data/fever-data/dev.jsonl"), "r") as f:
    with open(os.path.join(path, "data/fever/dev.p5.jsonl"), "w") as f2:
        lines = f.readlines()
        for line in lines:
            js = json.loads(line)
            pages = sample_doc(js, doc_ids, k=5)
            js['predicted_pages'] = list(pages)
            f2.write(json.dumps(js) + "\n")
Example #3

import argparse
import pickle

from tqdm import tqdm

# assumed project-internal imports (FEVER baselines codebase); tokenize and
# clean_text are the project's preprocessing helpers (module not shown here)
from common.util.log_helper import LogHelper
from retrieval.fever_doc_db import FeverDocDB


def vocab_map(vocab):
    # assign every token an integer id, reserving 0 for padding and 1 for
    # out-of-vocabulary tokens (function head reconstructed; the snippet
    # starts mid-loop)
    voc_dict = {}
    for i, v in enumerate(vocab):
        voc_dict[v] = i + 2
    voc_dict['PAD'] = 0
    voc_dict['UNK'] = 1
    return voc_dict
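A tiny check of the id scheme (ids 0 and 1 are reserved, real tokens start at 2):

d = vocab_map({"alpha", "beta"})
assert d['PAD'] == 0 and d['UNK'] == 1
assert sorted(d[t] for t in ("alpha", "beta")) == [2, 3]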


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('db', help='/path/to/db/file')
    parser.add_argument('output', help='/path/to/output/pickle/file')
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("generate_vocab_all_wiki")
    db = FeverDocDB(args.db)
    vocab = set()
    for doc in tqdm(db.get_doc_ids()):
        lines = db.get_doc_lines(doc)
        lines = lines.split("\n")
        for line in lines:
            segments = line.split("\t")
            if len(segments) < 2:
                continue
            line = segments[1]
            if line.strip() == "":
                continue
            tokens = set(token.lower() for token in tokenize(clean_text(line)))
            vocab.update(tokens)
    logger.info("total size of vocab: " + str(len(vocab)))
    vocab_dict = vocab_map(vocab)
    del vocab
    with open(args.output, 'wb') as f:
        pickle.dump(vocab_dict, f)
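Downstream, the pickled dict can be loaded to map tokens to ids, falling back to UNK for unseen words (the file name here is hypothetical):

import pickle

with open("vocab_all_wiki.p", "rb") as f:
    voc = pickle.load(f)
ids = [voc.get(tok.lower(), voc['UNK']) for tok in "The quick brown fox".split()]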