Esempio n. 1
0
        test_queries = extractTopics(config["test_queries"]) if config["test_queries_format"] == "trec" \
            else extract_trec_million_queries(config["test_queries"])
        queries = {**train_queries, **test_queries}
    print("{n} queries to process.".format(n=len(queries)))

    queries_text = {}
    q_times = defaultdict(int)
    print("Pre-process queries ...")
    for q in tqdm(queries):
        q_text = clean(queries[q], "krovetz", {})
        q_times[q_text] += 1
        queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join([q_text, str(q_times[q_text])])

    out_trec_f = join(config["output_folder"], "trec_corpus.txt")
    out_t = codecs.open(out_trec_f, "w", encoding='utf8')
    qrels = get_qrels(config["relevance_judgements"])
    print("Qrels : ", list(qrels.keys())[0:10])

    print("Extraction of contextual content ...")
    for fold in tqdm(os.listdir(config["split_query_folders"])):  # fold_0 ... fold_5
        save = join(config["output_folder"], fold)
        if not os.path.exists(save):
            os.mkdir(save)
        for phase in os.listdir(join(config["split_query_folders"], fold)):  # train, test, valdi
            corpus_file = open(join(save, "corpus_"+phase.split(".")[0].replace('_', '')+".txt"), 'w')
            phase_queries = open(join(config["split_query_folders"], join(fold, phase))).read().split('\n')  # queries list
            for q_id in phase_queries:
                out_t.write(q_id+' '+queries_text[q_id]+'\n')  # write the trec corpus
                passages = open(join(config["retrieved_passages"], q_id)).readlines()[:config["top_k"]]  # top k passages
                # get passages text:
                unique_documents = {}
    print("Preprocess queries ...")
    for q in tqdm(queries):
        q_text = clean(queries[q], "krovetz", {})
        q_times[q_text] += 1
        queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join(
            [q_text, str(q_times[q_text])])

    out_trec_f = join(config["output_folder"], "trec_corpus.txt")
    out_t = codecs.open(out_trec_f, "w", encoding='utf8')

    print("Collection2Text ...")
    nl = 0
    relations = []

    if config["from_qrels"]:
        qrels = get_qrels(config["relevance_judgements"]
                          )  # qrels[(q, doc)] = rel q:str, rel:int
        ranked_documents = set([e[1] for e in qrels])
        if bool(config["rerank_run"]):
            ranked_documents = ranked_documents.union(
                get_docs_from_run(config["rerank_run"]))
        print("totalling: %d documents" % len(ranked_documents))
        nl = save_corpus(queries_text, ranked_documents, index, id2token,
                         externalDocId, out_t)

        logging.info("Corpus file saved to " + out_trec_f + " with " +
                     str(nl) + " lines")

        relations = [((e[0], e[1]), qrels[e])
                     for e in qrels]  # same content (q, doc):rel, q:int
        logging.info('From relevance judgements : ' +
                     config["relevance_judgements"])
Esempio n. 3
0
    print("Reading index ...")
    index = pyndri.Index(config["indexed_data"])
    _, id2token, _ = index.get_dictionary()
    externelDocId = {}
    for doc in range(index.document_base(), index.maximum_document()):
        extD, _ = index.document(doc)
        externelDocId[extD] = doc
    queries = extractTopics(config["queries"])
    queries_text = {}
    for q in queries:
        queries_text[q] = clean(queries[q], "krovetz", {})

    out_trec_f = join(config["output_folder"], "trec_corpus.txt")
    out_t = codecs.open(out_trec_f, "w", encoding='utf8')
    qrels = get_qrels(
        config["relevance_judgements"]
    )  # dictionary: qrels[(q,doc)] = rel with q and rel are ints

    print("Collection2Text ...")
    nl = 0
    relations = []
    if bool(config["relevance_judgements"]) and not bool(
            config["run_file"]) and not bool(config["runs_folder"]):
        ranked_documents = set([e[1] for e in qrels])
        print("totalling: %d documents" % len(ranked_documents))
        nl = save_corpus(queries_text, ranked_documents, index, id2token,
                         externelDocId, out_t)
        logging.info("Corpus file saved to " + out_trec_f + " with " +
                     str(nl) + " lines")

        relations = [(e, qrels[e]) for e in qrels]  # same content
Esempio n. 4
0
# Script entry point: build a TREC-style qrels file ("qrels.mz") that contains
# one line per (query, document) pair found in the relation file, labelled
# with the relevance read from the qrels file, or 0 when no label is found.
if __name__ == '__main__':
    args = docopt.docopt("""
		Usage:
		    relation2qrels.py --r=<relation_file> --q=<qrels_file> --o=<output_folder>

		Example:
		    relation2qrels.py --r=<relation_file> --q=<qrels_file> 

		Options:
		    --r=<relation_file>	Relation file of MatchZoo.
		    --q=<qrels_file>	Trec like qrels file.
		    --o=<output_folder>	Where constructed file whil be stored.

		""")

    print("Qrels extraction ...")
    qrels = get_qrels(args["--q"])
    relations = read_relations(args["--r"])
    out = join(args["--o"], "qrels.mz")

    # Collect every row before touching the output file, so that a failure
    # during the lookups does not leave a truncated/empty qrels.mz behind.
    # A set is used so that a pair listed twice is only written once.
    qrels_relations = set()
    for q in tqdm(relations):
        for d in relations[q]:
            try:
                # Judgements are looked up with the query id converted to int.
                label = qrels[(int(q), d)]
            except (KeyError, ValueError, TypeError):
                # Best effort: a pair without a usable judgement gets label 0.
                # Only lookup/conversion failures are absorbed here, so that
                # Ctrl-C and SystemExit are no longer swallowed.
                label = 0
            qrels_relations.add((q, d, label))

    # Sort on (query, document) so the output is reproducible between runs;
    # plain set iteration order is arbitrary. The label is left out of the
    # sort key because each pair carries exactly one label.
    ordered_rows = sorted(qrels_relations, key=lambda row: (row[0], row[1]))
    with open(out, 'w') as f:
        f.writelines(
            "{q} 0 {d} {r}\n".format(q=row[0], d=row[1], r=row[2])
            for row in ordered_rows)
    print("Finished.")
Esempio n. 5
0
    out_f = join(args["--o"], "corpus.txt")
    out_r = join(args["--o"], "relation.txt")
    out_q = join(args["--o"], args["--d"] + "qrels.txt")
    out = codecs.open(out_f, "w", encoding='utf8')
    out2 = codecs.open(out_r, "w", encoding='utf8')
    out3 = codecs.open(out_q, "w", encoding='utf8')

    nl = 0
    nl2 = 0

    # construct relation.txt from input data:

    if bool(args["--r"]) and not bool(args["--rank"]) and not bool(
            args["--ranklist"]):
        print("From Qrels file: " + args["--r"])
        qrels = get_qrels(args["--r"])
        ranked_documents = set([e[1] for e in qrels])
        nl = save_corpus(queries_text, ranked_documents, index, id2token,
                         externelDocId, out)
        """
		print("Append qrels ...")
		for q,d in tqdm(itertools.product(list(queries_text.keys()),list(externelDocId.keys()))):
			if (q,d) not in qrels:
				qrels[(q,d)] = '0'
		"""
        print("Construct relation.txt ...")
        for c in tqdm(collections.OrderedDict(sorted(qrels.items()))):
            #out3.write("{q} 0 {d} {r}\n".format(q=c[0], d=c[1], r=qrels[c]))
            out2.write("{r} {q} {d}\n".format(r=qrels[c], q=c[0], d=c[1]))
            nl2 += 1
        Options:
            relevance=<relevance_judgments>    Give the relevance judgments files .
            sets=<query_sets>    Give the queries sets folder .
            --n=<files_name_start>    String with whom all names of the different sets start with .
            --o=<output_folder>    Where results should be stored .
            
        """

    print(json.dumps(config, indent=2))

    # get relevance judgments
    print("Relevance judgments ...")
    judgments = {}
    labels = set()
    if os.path.isfile(config["relevance"]):
        judgments, labels = get_qrels(config["relevance"])
    elif os.path.isdir(config["relevance"]):
        for file in os.listdir(config["relevance"]):
            judgment_file, new_labels = get_qrels(
                os.path.join(config["relevance"], file))
            judgments.update(judgment_file)
            labels = labels | new_labels
    print("Judged: ", len(judgments), list(judgments.keys())[:10], labels)
    # print(judgments)

    # get queries sets:
    print("Queries sets ...")
    sets = {}
    for file in os.listdir(config["sets"]):
        if os.path.isfile(os.path.join(config["sets"], file)):
            if file.startswith(config["name"]):
    queries = extractTopics(config["queries"])
    queries_text = {}
    q_times = defaultdict(int)
    for q in queries:
        q_text = clean(queries[q], "krovetz", {})
        q_times[q_text] += 1
        queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join(
            [q_text, str(q_times[q_text])])

    out_trec_f = join(config["output_folder"], "trec_corpus.txt")
    out_t = codecs.open(out_trec_f, "w", encoding='utf8')

    qrels = {}
    if bool(config["relevance_judgements"]):
        qrels = get_qrels(
            config["relevance_judgements"]
        )  # dictionary: "qrels[(q,doc)]:rel" with q and rel are ints

    print("Collection2Text ...")
    nl = 0
    relations = []
    logging.info("From a set of runs in " + config["runs_folder"])

    ranked_documents = set()
    for f in os.listdir(config["runs_folder"]):
        ranked_documents = ranked_documents.union(
            get_docs_from_run(join(config["runs_folder"], f)))
    if bool(config["relevance_judgements"]):
        ranked_documents = ranked_documents.union(
            set([e[1] for e in get_qrels(config["relevance_judgements"])]))
    if bool(config["run_file"]):