test_queries = extractTopics(config["test_queries"]) if config["test_queries_format"] == "trec" \
    else extract_trec_million_queries(config["test_queries"])
queries = {**train_queries, **test_queries}
print("{n} queries to process.".format(n=len(queries)))

queries_text = {}
q_times = defaultdict(int)
print("Pre-process queries ...")
for q in tqdm(queries):
    q_text = clean(queries[q], "krovetz", {})
    q_times[q_text] += 1
    # disambiguate duplicate query texts by appending an occurrence counter
    queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join([q_text, str(q_times[q_text])])

out_trec_f = join(config["output_folder"], "trec_corpus.txt")
out_t = codecs.open(out_trec_f, "w", encoding='utf8')

qrels = get_qrels(config["relevance_judgements"])
print("Qrels : ", list(qrels.keys())[0:10])

print("Extraction of contextual content ...")
for fold in tqdm(os.listdir(config["split_query_folders"])):  # fold_0 ... fold_5
    save = join(config["output_folder"], fold)
    if not os.path.exists(save):
        os.mkdir(save)
    for phase in os.listdir(join(config["split_query_folders"], fold)):  # train, test, valid
        corpus_file = open(join(save, "corpus_" + phase.split(".")[0].replace('_', '') + ".txt"), 'w')
        # list of query ids for this phase
        phase_queries = open(join(config["split_query_folders"], join(fold, phase))).read().split('\n')
        for q_id in phase_queries:
            out_t.write(q_id + ' ' + queries_text[q_id] + '\n')  # write the trec corpus
            # top k retrieved passages for this query
            passages = open(join(config["retrieved_passages"], q_id)).readlines()[:config["top_k"]]
            # get passages text:
            unique_documents = {}
print("Preprocess queries ...") for q in tqdm(queries): q_text = clean(queries[q], "krovetz", {}) q_times[q_text] += 1 queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join( [q_text, str(q_times[q_text])]) out_trec_f = join(config["output_folder"], "trec_corpus.txt") out_t = codecs.open(out_trec_f, "w", encoding='utf8') print("Collection2Text ...") nl = 0 relations = [] if config["from_qrels"]: qrels = get_qrels(config["relevance_judgements"] ) # qrels[(q, doc)] = rel q:str, rel:int ranked_documents = set([e[1] for e in qrels]) if bool(config["rerank_run"]): ranked_documents = ranked_documents.union( get_docs_from_run(config["rerank_run"])) print("totalling: %d documents" % len(ranked_documents)) nl = save_corpus(queries_text, ranked_documents, index, id2token, externalDocId, out_t) logging.info("Corpus file saved to " + out_trec_f + " with " + str(nl) + " lines") relations = [((e[0], e[1]), qrels[e]) for e in qrels] # same content (q, doc):rel, q:int logging.info('From relevance judgements : ' + config["relevance_judgements"])
print("Reading index ...") index = pyndri.Index(config["indexed_data"]) _, id2token, _ = index.get_dictionary() externelDocId = {} for doc in range(index.document_base(), index.maximum_document()): extD, _ = index.document(doc) externelDocId[extD] = doc queries = extractTopics(config["queries"]) queries_text = {} for q in queries: queries_text[q] = clean(queries[q], "krovetz", {}) out_trec_f = join(config["output_folder"], "trec_corpus.txt") out_t = codecs.open(out_trec_f, "w", encoding='utf8') qrels = get_qrels( config["relevance_judgements"] ) # dictionary: qrels[(q,doc)] = rel with q and rel are ints print("Collection2Text ...") nl = 0 relations = [] if bool(config["relevance_judgements"]) and not bool( config["run_file"]) and not bool(config["runs_folder"]): ranked_documents = set([e[1] for e in qrels]) print("totalling: %d documents" % len(ranked_documents)) nl = save_corpus(queries_text, ranked_documents, index, id2token, externelDocId, out_t) logging.info("Corpus file saved to " + out_trec_f + " with " + str(nl) + " lines") relations = [(e, qrels[e]) for e in qrels] # same content
if __name__ == '__main__':
    args = docopt.docopt("""
    Usage:
        relation2qrels.py --r=<relation_file> --q=<qrels_file> --o=<output_folder>

    Example:
        relation2qrels.py --r=<relation_file> --q=<qrels_file>

    Options:
        --r=<relation_file>   Relation file of MatchZoo.
        --q=<qrels_file>      TREC-like qrels file.
        --o=<output_folder>   Where the constructed file will be stored.
    """)

    print("Qrels extraction ...")
    qrels = get_qrels(args["--q"])
    relations = read_relations(args["--r"])

    qrels_relations = set()
    out = join(args["--o"], "qrels.mz")
    with open(out, 'w') as f:
        for q in tqdm(relations):
            for d in relations[q]:
                try:
                    r = qrels[(int(q), d)]
                except (KeyError, ValueError):
                    r = 0  # unjudged (query, document) pairs default to non-relevant
                qrels_relations.add((q, d, r))
        for r in qrels_relations:
            f.write("{q} 0 {d} {r}\n".format(q=r[0], d=r[1], r=r[2]))
    print("Finished.")
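# read_relations() is used in the script above without being shown. As a hedged sketch,
# it is assumed to parse a MatchZoo-style relation file whose lines look like
# "label query_id doc_id" (the same layout written to relation.txt in the next excerpt)
# into a query -> documents mapping; the name read_relations_sketch is illustrative.
from collections import defaultdict

def read_relations_sketch(relation_path):
    """Group document ids by query id from a MatchZoo-style relation file."""
    relations = defaultdict(list)
    with open(relation_path) as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 3:
                _, q_id, doc_id = parts[0], parts[1], parts[2]
                relations[q_id].append(doc_id)
    return relations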
out_f = join(args["--o"], "corpus.txt") out_r = join(args["--o"], "relation.txt") out_q = join(args["--o"], args["--d"] + "qrels.txt") out = codecs.open(out_f, "w", encoding='utf8') out2 = codecs.open(out_r, "w", encoding='utf8') out3 = codecs.open(out_q, "w", encoding='utf8') nl = 0 nl2 = 0 # construct relation.txt from input data: if bool(args["--r"]) and not bool(args["--rank"]) and not bool( args["--ranklist"]): print("From Qrels file: " + args["--r"]) qrels = get_qrels(args["--r"]) ranked_documents = set([e[1] for e in qrels]) nl = save_corpus(queries_text, ranked_documents, index, id2token, externelDocId, out) """ print("Append qrels ...") for q,d in tqdm(itertools.product(list(queries_text.keys()),list(externelDocId.keys()))): if (q,d) not in qrels: qrels[(q,d)] = '0' """ print("Construct relation.txt ...") for c in tqdm(collections.OrderedDict(sorted(qrels.items()))): #out3.write("{q} 0 {d} {r}\n".format(q=c[0], d=c[1], r=qrels[c])) out2.write("{r} {q} {d}\n".format(r=qrels[c], q=c[0], d=c[1])) nl2 += 1
Options:
    relevance=<relevance_judgments>   Relevance judgments file or folder.
    sets=<query_sets>                 Folder containing the query sets.
    --n=<files_name_start>            Prefix with which the names of the different sets start.
    --o=<output_folder>               Where results should be stored.
"""
print(json.dumps(config, indent=2))

# get relevance judgments
print("Relevance judgments ...")
judgments = {}
labels = set()
if os.path.isfile(config["relevance"]):
    judgments, labels = get_qrels(config["relevance"])
elif os.path.isdir(config["relevance"]):
    for file in os.listdir(config["relevance"]):
        judgment_file, new_labels = get_qrels(os.path.join(config["relevance"], file))
        judgments.update(judgment_file)
        labels = labels | new_labels
print("Judged: ", len(judgments), list(judgments.keys())[:10], labels)
# print(judgments)

# get queries sets:
print("Queries sets ...")
sets = {}
for file in os.listdir(config["sets"]):
    if os.path.isfile(os.path.join(config["sets"], file)):
        if file.startswith(config["name"]):
queries = extractTopics(config["queries"]) queries_text = {} q_times = defaultdict(int) for q in queries: q_text = clean(queries[q], "krovetz", {}) q_times[q_text] += 1 queries_text[q] = q_text if q_times[q_text] == 1 else ' '.join( [q_text, str(q_times[q_text])]) out_trec_f = join(config["output_folder"], "trec_corpus.txt") out_t = codecs.open(out_trec_f, "w", encoding='utf8') qrels = {} if bool(config["relevance_judgements"]): qrels = get_qrels( config["relevance_judgements"] ) # dictionary: "qrels[(q,doc)]:rel" with q and rel are ints print("Collection2Text ...") nl = 0 relations = [] logging.info("From a set of runs in " + config["runs_folder"]) ranked_documents = set() for f in os.listdir(config["runs_folder"]): ranked_documents = ranked_documents.union( get_docs_from_run(join(config["runs_folder"], f))) if bool(config["relevance_judgements"]): ranked_documents = ranked_documents.union( set([e[1] for e in get_qrels(config["relevance_judgements"])])) if bool(config["run_file"]):