Example #1
0
def generate_pairs_from_qrels(qrel_file, topk_rank=20, num_random=5000):
    """Lazily yield preference pairs built from the judgements in a qrel file.

    For every topic in the qrels, each ordered pair of judgement rows in
    which the first row has a strictly greater ``rel`` value than the second
    is handed to ``__generate_single_qrel_pair`` together with the full list
    of that topic's rows, and the helper's result is yielded.

    Args:
        qrel_file: Passed unchanged to ``TrecQrel``.
        topk_rank: Not referenced anywhere in this function body.
        num_random: Not referenced anywhere in this function body.

    Yields:
        Whatever ``__generate_single_qrel_pair`` returns for each pair.
    """
    judgements = TrecQrel(qrel_file)
    for topic in tqdm(judgements.topics()):
        table = judgements.qrels_data
        topic_rows = table[table['query'] == topic]
        rows = [row.to_dict() for _, row in topic_rows.iterrows()]
        # Walk the full cross product; rows with equal 'rel' (including a
        # row paired with itself) never satisfy the strict comparison.
        for better in rows:
            for worse in rows:
                if better['rel'] > worse['rel']:
                    yield __generate_single_qrel_pair(rows, better, worse)
Example #2
0
from trectools import TrecQrel
from elasticsearch import Elasticsearch, NotFoundError
import os
import codecs

# Elasticsearch client bound to a single hard-coded host:port.
es = Elasticsearch(["40.68.209.241:9200"])
# Qrels loaded from a path relative to the current working directory.
qrels = TrecQrel("./data/clef-dynamic-topic-subset-click-data.txt")

# Running totals updated by the download loop below: nmissing counts
# documents whose lookup raised NotFoundError, npages counts every document
# the loop attempts.
nmissing = 0
npages = 0

for topic in qrels.topics():
    if not os.path.exists(os.path.join("data", "topic%d" % (topic))):
        os.makedirs(os.path.join("data", "topic%d" % (topic)))
        os.makedirs(os.path.join("data", "topic%d" % (topic), "pos"))
        os.makedirs(os.path.join("data", "topic%d" % (topic), "neg"))

    for docid in qrels.get_document_names_for_topic(topic):
        npages += 1

        print "Downloading %s" % (docid)
        try:
            r = es.get(index="clueweb12_docs", id=docid)
        except NotFoundError:
            print "Missing: %s" % (docid)
            nmissing += 1
            continue

        if qrels.get_judgement(docid, topic):
            outpath = os.path.join("data", "topic%d" % (topic), "pos", docid)
        else: