Esempio n. 1
0
    def test_trec_run_read(self):
        """Load a run file with TrecRun, save it again, and compare with the verify file."""
        source_file = 'tests/resources/simple_trec_run_read.txt'
        expected_file = 'tests/resources/simple_trec_run_read_verify.txt'

        # Read the run and immediately write it back out to the test output path.
        TrecRun(filepath=source_file).save_to_txt(self.output_path)

        files_match = filecmp.cmp(expected_file, self.output_path)
        self.assertTrue(files_match)
Esempio n. 2
0
def bm25(qid, query, docs, index_path):
    """Fuse a fresh search run with an existing document run for one topic.

    Searches the index at ``index_path`` for ``query`` (up to 1000 hits),
    writes the de-duplicated hits to ``run-passage-{qid}.txt`` and the
    supplied ``docs`` to ``run-doc-{qid}.txt`` (both relative to the current
    working directory), fuses the two files with ``pyserini.fusion`` using
    the ``rrf`` method into ``run-rrf-{qid}.txt`` and reads the fused
    ranking back.

    Args:
        qid: Topic id; used in every run line and in the file names.
        query: Query text passed to the searcher.
        docs: Iterable of mappings providing ``"docid"``, ``"rank"`` and
            ``"score"`` for the run that is fused with the search hits.
        index_path: Index location handed to ``SimpleSearcher``.

    Returns:
        A list of ``[qid, docid, rank]`` lists taken from the fused run.

    Raises:
        subprocess.CalledProcessError: If the fusion command exits with a
            non-zero status.
    """
    import subprocess
    import sys

    searcher = SimpleSearcher(index_path)
    hits = searcher.search(query, 1000)

    passage_run_path = f'run-passage-{qid}.txt'
    doc_run_path = f'run-doc-{qid}.txt'
    fused_run_path = f'run-rrf-{qid}.txt'

    # Ranks are assigned only to the first occurrence of each docid, so the
    # written run has contiguous ranks even when the searcher repeats a docid.
    rank = 1
    seen_docids = set()
    with open(passage_run_path, 'w') as writer:
        for hit in hits:
            if hit.docid in seen_docids:
                continue
            writer.write(f'{qid} Q0 {hit.docid} {rank} {hit.score:.5f} pyserini\n')
            rank += 1
            seen_docids.add(hit.docid)

    with open(doc_run_path, 'w') as writer:
        for doc in docs:
            writer.write(f'{qid} Q0 {doc["docid"]} {doc["rank"]} {doc["score"]} base\n')

    # Argument list instead of a shell string: no quoting issues with qid, and
    # check=True surfaces a failed fusion instead of silently reading a stale
    # or missing output file below. sys.executable keeps the child process in
    # the same interpreter/environment as the caller.
    subprocess.run(
        [
            sys.executable, '-m', 'pyserini.fusion',
            '--method', 'rrf',
            '--runs', passage_run_path, doc_run_path,
            '--output', fused_run_path,
            '--runtag', 'test',
        ],
        check=True,
    )
    fused_run = TrecRun(fused_run_path)

    return [
        [qid, row["docid"], row["rank"]]
        for _, row in fused_run.get_docs_by_topic(qid).iterrows()
    ]
Esempio n. 3
0
def _strip_text_tags(raw):
    """Remove one leading ``<TEXT>`` and one trailing ``</TEXT>`` tag from ``raw``."""
    # str.lstrip/rstrip treat their argument as a *set of characters*, which
    # would also eat document text starting/ending with '<', '>', '/', 'T',
    # 'E' or 'X'. Strip the exact tags instead.
    open_tag, close_tag = '<TEXT>', '</TEXT>'
    if raw.startswith(open_tag):
        raw = raw[len(open_tag):]
    if raw.endswith(close_tag):
        raw = raw[:-len(close_tag)]
    return raw


def main(args):
    """Rerank (or pass through) a base run and write the result to a file.

    Reads the following attributes from ``args``:

    * ``cache``: optional directory, created if it does not exist, and
      forwarded to ``rerank``.
    * ``queries``: passed to ``load_queries``; each returned row is indexed
      as ``row[0]`` (query id, converted with ``int``) and ``row[1]``
      (query text).
    * ``input``: path of the base run loaded with ``TrecRun``.
    * ``bm25`` / ``ance`` / ``identity``: method flags. ``bm25`` and ``ance``
      select the reranker name; whenever ``identity`` is set the base run
      order is passed through unchanged.
    * ``output``: path of the file to write; one ``qid<TAB>docid<TAB>rank``
      line per result.

    Exits via ``sys.exit`` when none of the three method flags is set.
    """
    if args.cache and not os.path.exists(args.cache):
        # makedirs also creates missing parents; exist_ok guards against the
        # directory appearing between the check above and this call.
        os.makedirs(args.cache, exist_ok=True)

    # Load queries:
    queries = load_queries(args.queries)
    # Load base run to rerank:
    base_run = TrecRun(args.input)

    if args.bm25:
        reranker = 'bm25'
    elif args.ance:
        reranker = 'ance'
    elif args.identity:
        # Identity mode never calls rerank(); bind the name explicitly anyway.
        reranker = None
    else:
        sys.exit('Unknown reranking method!')

    # SimpleSearcher is only needed to fetch document texts for reranking, so
    # identity mode skips opening the prebuilt index altogether.
    searcher = None
    if not args.identity:
        searcher = SimpleSearcher.from_prebuilt_index('msmarco-doc')

    output = []

    for cnt, row in enumerate(queries, start=1):
        qid = int(row[0])
        query = row[1]
        print(f'{cnt} {qid} {query}')
        qid_results = base_run.get_docs_by_topic(qid)

        # Don't actually do reranking, just pass along the base run:
        if args.identity:
            output.extend(
                [qid, docid, rank]
                for rank, docid in enumerate(qid_results['docid'].tolist(), start=1))
            continue

        # Gather results for reranking:
        results_to_rerank = [
            {
                'docid': result['docid'],
                'rank': result['rank'],
                'score': result['score'],
                'text': _strip_text_tags(searcher.doc(result['docid']).raw()),
            }
            for _, result in qid_results.iterrows()
        ]

        # Perform the actual reranking:
        output.extend(
            rerank(args.cache, qid, query, results_to_rerank, reranker))

    # Write the output run file:
    with open(args.output, 'w') as writer:
        for r in output:
            writer.write(f'{r[0]}\t{r[1]}\t{r[2]}\n')
Esempio n. 4
0
    def test_retain_qrels(self):
        """Apply ``retain_qrels`` with ``clone=True`` and compare the saved run with the verify file."""
        expected_file = 'tests/resources/simple_trec_run_keep_verify.txt'

        qrels = Qrels('tools/topics-and-qrels/qrels.covid-round1.txt')
        run = TrecRun('tests/resources/simple_trec_run_filter.txt')

        # clone=True: the saved result comes from the object retain_qrels returns.
        retained = run.retain_qrels(qrels, clone=True)
        retained.save_to_txt(output_path=self.output_path)

        files_match = filecmp.cmp(expected_file, self.output_path)
        self.assertTrue(files_match)
Esempio n. 5
0
    def test_trec_run_topics(self):
        """Check the topic set of the sample run and that each topic has five documents."""
        run_file = os.path.join(
            self.root, 'tests/resources/simple_trec_run_msmarco_doc1.txt')
        expected_topics = {320792, 174249, 1090270, 1101279}

        run = TrecRun(filepath=run_file)
        self.assertEqual(run.topics(), expected_topics)

        # Every topic in the fixture is expected to carry exactly five rows.
        for topic_id in run.topics():
            topic_docs = run.get_docs_by_topic(topic_id)
            self.assertEqual(len(topic_docs), 5)
Esempio n. 6
0
 def test_normalize_scores(self):
     """Rescore a run with ``RescoreMethod.NORMALIZE`` and compare the saved output with the verify file."""
     source_file = os.path.join(
         self.root, 'tests/resources/simple_trec_run_fusion_1.txt')
     expected_file = os.path.join(
         self.root, 'tests/resources/simple_trec_run_normalize_verify.txt')

     run = TrecRun(source_file)
     # The object returned by rescore() is the one written to disk.
     rescored = run.rescore(RescoreMethod.NORMALIZE)
     rescored.save_to_txt(self.output_path)

     files_match = filecmp.cmp(expected_file, self.output_path)
     self.assertTrue(files_match)
Esempio n. 7
0
    def test_discard_qrels(self):
        """Apply ``discard_qrels`` with ``clone=False`` and compare the saved run with the verify file."""
        run_file = os.path.join(
            self.root, 'tests/resources/simple_trec_run_filter.txt')
        qrels_file = os.path.join(
            self.root, 'tools/topics-and-qrels/qrels.covid-round1.txt')
        expected_file = os.path.join(
            self.root, 'tests/resources/simple_trec_run_remove_verify.txt')

        run = TrecRun(run_file)
        qrels = Qrels(qrels_file)

        # clone=False is passed through unchanged; the returned object is saved.
        filtered = run.discard_qrels(qrels, clone=False)
        filtered.save_to_txt(output_path=self.output_path)

        files_match = filecmp.cmp(expected_file, self.output_path)
        self.assertTrue(files_match)