def generate_pairs_from_qrels(qrel_file, topk_rank=20, num_random=5000):
    """Lazily yield one pair per ordered couple of judgments with differing relevance.

    The qrels file is loaded through ``TrecQrel``. For every topic it
    reports, the rows of ``qrels_data`` whose ``'query'`` column equals that
    topic are converted to plain dicts. Every ordered combination of two of
    those dicts is then inspected, and whenever the first one's ``'rel'``
    value is strictly greater than the second one's, the result of
    ``__generate_single_qrel_pair`` for that combination is yielded.

    Args:
        qrel_file: Passed unchanged to ``TrecQrel``; presumably the path to
            a TREC-style qrels file (confirm against ``TrecQrel``).
        topk_rank: Not used by this function; kept for interface
            compatibility.
        num_random: Not used by this function; kept for interface
            compatibility.

    Yields:
        Whatever ``__generate_single_qrel_pair(all_topic_judgments,
        higher_judgment, lower_judgment)`` returns, once per qualifying
        ordered combination, topic by topic.
    """
    qrels = TrecQrel(qrel_file)
    for topic in tqdm(qrels.topics()):
        frame = qrels.qrels_data
        topic_rows = frame[frame['query'] == topic]
        topic_judgments = [row.to_dict() for _, row in topic_rows.iterrows()]

        # Full cross product (including an item with itself); the strict
        # comparison below filters out equal-relevance combinations.
        for preferred in topic_judgments:
            for other in topic_judgments:
                if not preferred['rel'] > other['rel']:
                    continue
                yield __generate_single_qrel_pair(topic_judgments, preferred, other)
from trectools import TrecQrel from elasticsearch import Elasticsearch, NotFoundError import os import codecs es = Elasticsearch(["40.68.209.241:9200"]) qrels = TrecQrel("./data/clef-dynamic-topic-subset-click-data.txt") nmissing = 0 npages = 0 for topic in qrels.topics(): if not os.path.exists(os.path.join("data", "topic%d" % (topic))): os.makedirs(os.path.join("data", "topic%d" % (topic))) os.makedirs(os.path.join("data", "topic%d" % (topic), "pos")) os.makedirs(os.path.join("data", "topic%d" % (topic), "neg")) for docid in qrels.get_document_names_for_topic(topic): npages += 1 print "Downloading %s" % (docid) try: r = es.get(index="clueweb12_docs", id=docid) except NotFoundError: print "Missing: %s" % (docid) nmissing += 1 continue if qrels.get_judgement(docid, topic): outpath = os.path.join("data", "topic%d" % (topic), "pos", docid) else: