def test_simple_fusion_searcher(self):
    index_dirs = ['indexes/lucene-index-cord19-abstract-2020-05-01/',
                  'indexes/lucene-index-cord19-full-text-2020-05-01/',
                  'indexes/lucene-index-cord19-paragraph-2020-05-01/']
    searcher = SimpleFusionSearcher(index_dirs, method=FusionMethod.RRF)

    runs, topics = [], get_topics('covid-round2')
    for topic in tqdm(sorted(topics.keys())):
        query = topics[topic]['question'] + ' ' + topics[topic]['query']
        hits = searcher.search(query, k=10000, query_generator=None, strip_segment_id=True, remove_dups=True)
        docid_score_pair = [(hit.docid, hit.score) for hit in hits]
        run = TrecRun.from_search_results(docid_score_pair, topic=topic)
        runs.append(run)

    all_topics_run = TrecRun.concat(runs)
    all_topics_run.save_to_txt(output_path='runs/fused.txt', tag='reciprocal_rank_fusion_k=60')

    # Only keep topic, docid, and rank: scores may differ slightly due to floating-point precision
    # and underlying library versions.
    # TODO: We should probably do this in Python rather than shelling out, for better portability.
    # This has also proven to be a somewhat brittle test, see https://github.com/castorini/pyserini/issues/947
    # As a stopgap for the above issue, we restrict the comparison to the top 100 ranks only.
    os.system("""awk '$4 <= 100 {print $1" "$3" "$4}' runs/fused.txt > runs/this.txt""")
    os.system("""awk '$4 <= 100 {print $1" "$3" "$4}' runs/anserini.covid-r2.fusion1.txt > runs/that.txt""")

    self.assertTrue(filecmp.cmp('runs/this.txt', 'runs/that.txt'))
def test_trec_run_read(self):
    input_path = 'tests/resources/simple_trec_run_read.txt'
    verify_path = 'tests/resources/simple_trec_run_read_verify.txt'

    run = TrecRun(filepath=input_path)
    run.save_to_txt(self.output_path)
    self.assertTrue(filecmp.cmp(verify_path, self.output_path))
def bm25(qid, query, docs, index_path):
    s = SimpleSearcher(index_path)
    hits = s.search(query, 1000)

    n = 1
    seen_docids = {}
    # Write the passage-level BM25 run, skipping duplicate docids.
    with open(f'run-passage-{qid}.txt', 'w') as writer:
        for i in range(0, len(hits)):
            if hits[i].docid in seen_docids:
                continue
            writer.write(f'{qid} Q0 {hits[i].docid} {n} {hits[i].score:.5f} pyserini\n')
            n = n + 1
            seen_docids[hits[i].docid] = 1

    # Write the document-level base run passed in via `docs`.
    with open(f'run-doc-{qid}.txt', 'w') as writer:
        for doc in docs:
            writer.write(f'{qid} Q0 {doc["docid"]} {doc["rank"]} {doc["score"]} base\n')
            n = n + 1

    # Fuse the passage-level and document-level runs with reciprocal rank fusion.
    os.system(f'python -m pyserini.fusion --method rrf --runs run-passage-{qid}.txt run-doc-{qid}.txt ' +
              f'--output run-rrf-{qid}.txt --runtag test')

    fused_run = TrecRun(f'run-rrf-{qid}.txt')
    output = []
    for idx, r in fused_run.get_docs_by_topic(qid).iterrows():
        output.append([qid, r["docid"], r["rank"]])

    return output
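# Usage sketch for bm25() above (not from the original source): the qid, query, document
# entries, and index path are made-up placeholders. `docs` must be a list of dicts with
# 'docid', 'rank', and 'score' keys, matching what the function reads.
base_docs = [
    {'docid': 'D1555982', 'rank': 1, 'score': 14.89},
    {'docid': 'D2286511', 'rank': 2, 'score': 13.12},
]
fused = bm25(1037798, 'who is robert gray', base_docs, 'indexes/msmarco-passage/')
# Each entry of `fused` is [qid, docid, rank] in fused (RRF) order.
print(fused[:3])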
def main(args):
    if args.cache and not os.path.exists(args.cache):
        os.mkdir(args.cache)

    # Load queries:
    queries = load_queries(args.queries)
    # Load base run to rerank:
    base_run = TrecRun(args.input)

    # SimpleSearcher to fetch document texts.
    searcher = SimpleSearcher.from_prebuilt_index('msmarco-doc')

    output = []

    if args.bm25:
        reranker = 'bm25'
    elif args.ance:
        reranker = 'ance'
    elif not args.identity:
        sys.exit('Unknown reranking method!')

    cnt = 1
    for row in queries:
        qid = int(row[0])
        query = row[1]
        print(f'{cnt} {qid} {query}')
        qid_results = base_run.get_docs_by_topic(qid)

        # Don't actually do reranking, just pass along the base run:
        if args.identity:
            rank = 1
            for docid in qid_results['docid'].tolist():
                output.append([qid, docid, rank])
                rank = rank + 1
            cnt = cnt + 1
            continue

        # Gather results for reranking:
        results_to_rerank = []
        for index, result in qid_results.iterrows():
            raw_doc = searcher.doc(result['docid']).raw()
            # Strip the literal <TEXT>...</TEXT> wrapper. Note that str.lstrip/rstrip treat
            # their argument as a character set and could eat document text, so use
            # removeprefix/removesuffix (Python 3.9+) instead.
            raw_doc = raw_doc.removeprefix('<TEXT>').removesuffix('</TEXT>')
            results_to_rerank.append({'docid': result['docid'],
                                      'rank': result['rank'],
                                      'score': result['score'],
                                      'text': raw_doc})

        # Perform the actual reranking:
        output.extend(rerank(args.cache, qid, query, results_to_rerank, reranker))
        cnt = cnt + 1

    # Write the output run file:
    with open(args.output, 'w') as writer:
        for r in output:
            writer.write(f'{r[0]}\t{r[1]}\t{r[2]}\n')
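# A minimal argparse sketch consistent with the attributes that main() reads above
# (args.cache, args.queries, args.input, args.output, args.bm25, args.ance, args.identity).
# The help strings, defaults, and mutually-exclusive grouping are assumptions, not taken
# from the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Rerank a base MS MARCO doc run.')
    parser.add_argument('--queries', required=True, help='File with queries to rerank.')
    parser.add_argument('--input', required=True, help='Base TREC run to rerank.')
    parser.add_argument('--output', required=True, help='Path to the output run file.')
    parser.add_argument('--cache', default='', help='Optional cache directory.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--bm25', action='store_true', help='Rerank with BM25.')
    group.add_argument('--ance', action='store_true', help='Rerank with ANCE.')
    group.add_argument('--identity', action='store_true', help='Pass the base run through unchanged.')

    main(parser.parse_args())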
def test_retain_qrels(self):
    run = TrecRun('tests/resources/simple_trec_run_filter.txt')
    qrels = Qrels('tools/topics-and-qrels/qrels.covid-round1.txt')

    run.retain_qrels(qrels, clone=True).save_to_txt(output_path=self.output_path)
    self.assertTrue(filecmp.cmp('tests/resources/simple_trec_run_keep_verify.txt', self.output_path))
def test_trec_run_topics(self):
    input_path = os.path.join(self.root, 'tests/resources/simple_trec_run_msmarco_doc1.txt')

    run = TrecRun(filepath=input_path)
    self.assertEqual(run.topics(), {320792, 174249, 1090270, 1101279})

    for topic in run.topics():
        self.assertEqual(len(run.get_docs_by_topic(topic)), 5)
def test_normalize_scores(self):
    run = TrecRun(os.path.join(self.root, 'tests/resources/simple_trec_run_fusion_1.txt'))

    run.rescore(RescoreMethod.NORMALIZE).save_to_txt(self.output_path)
    self.assertTrue(filecmp.cmp(os.path.join(self.root, 'tests/resources/simple_trec_run_normalize_verify.txt'),
                                self.output_path))
def search(self, q: Union[str, JQuery], k: int = 10,
           query_generator: JQueryGenerator = None,
           strip_segment_id=False, remove_dups=False) -> List[JLuceneSearcherResult]:
    trec_runs, docid_to_search_result = list(), dict()

    # Search each underlying index, remembering every hit so the original result objects
    # can be reattached to the fused ranking afterwards.
    for searcher in self.searchers:
        docid_score_pair = list()
        hits = searcher.search(q, k=k, query_generator=query_generator,
                               strip_segment_id=strip_segment_id, remove_dups=remove_dups)
        for hit in hits:
            docid_to_search_result[hit.docid] = hit
            docid_score_pair.append((hit.docid, hit.score))

        run = TrecRun.from_search_results(docid_score_pair)
        trec_runs.append(run)

    if self.method == FusionMethod.RRF:
        fused_run = reciprocal_rank_fusion(trec_runs, rrf_k=60, depth=1000, k=1000)
    else:
        raise NotImplementedError()

    return self.convert_to_search_result(fused_run, docid_to_search_result)
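# Usage sketch for the search() method above. The index directories and query are
# placeholders; any set of compatible Lucene indexes will do, and only RRF is currently
# supported as the fusion method.
fusion_searcher = SimpleFusionSearcher(['indexes/index-a/', 'indexes/index-b/'],
                                       method=FusionMethod.RRF)
for hit in fusion_searcher.search('incubation period of COVID-19', k=10):
    print(hit.docid, hit.score)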
def reciprocal_rank_fusion(runs: List[TrecRun], rrf_k: int = 60, depth: int = None, k: int = None):
    """Perform reciprocal rank fusion on a list of ``TrecRun`` objects.
    Implementation follows the Cormack et al. (SIGIR 2009) paper titled
    "Reciprocal Rank Fusion Outperforms Condorcet and Individual Rank Learning Methods."

    Parameters
    ----------
    runs : List[TrecRun]
        List of ``TrecRun`` objects.
    rrf_k : int
        Parameter to avoid vanishing importance of lower-ranked documents. Note that this is different from the
        *k* in top *k* retrieval; set to 60 by default, per Cormack et al.
    depth : int
        Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that
        the complete list of results is considered.
    k : int
        Length of final results list. Set to ``None`` by default, which indicates that the union of all input
        documents is ranked.

    Returns
    -------
    TrecRun
        Output ``TrecRun`` that combines input runs via reciprocal rank fusion.
    """
    # TODO: Add option to *not* clone runs, thus making the method destructive, but also more efficient.
    rrf_runs = [run.clone().rescore(method=RescoreMethod.RRF, rrf_k=rrf_k) for run in runs]

    return TrecRun.merge(rrf_runs, AggregationMethod.SUM, depth=depth, k=k)
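# Worked illustration (not from the original source) of the score that
# reciprocal_rank_fusion() produces: each run contributes 1 / (rrf_k + rank) for a
# document, and the per-run contributions are summed. A document ranked 3rd in one run
# and 10th in another, with the default rrf_k = 60, therefore scores:
rrf_k = 60
fused_score = 1 / (rrf_k + 3) + 1 / (rrf_k + 10)
print(round(fused_score, 4))  # ~0.0302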
def average(runs: List[TrecRun], depth: int = None, k: int = None):
    """Perform fusion by averaging on a list of ``TrecRun`` objects.

    Parameters
    ----------
    runs : List[TrecRun]
        List of ``TrecRun`` objects.
    depth : int
        Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that
        the complete list of results is considered.
    k : int
        Length of final results list. Set to ``None`` by default, which indicates that the union of all input
        documents is ranked.

    Returns
    -------
    TrecRun
        Output ``TrecRun`` that combines input runs via averaging.
    """
    scaled_runs = [run.clone().rescore(method=RescoreMethod.SCALE, scale=(1 / len(runs))) for run in runs]

    return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k)
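# Usage sketch for average() above; the run file paths are placeholders. Each run's scores
# are scaled by 1/len(runs) and then summed, so the fused score is the arithmetic mean of
# the per-run scores (a document missing from a run contributes 0 for that run).
run_a = TrecRun('runs/run.bm25.txt')
run_b = TrecRun('runs/run.dense.txt')
averaged = average([run_a, run_b], depth=1000, k=1000)
averaged.save_to_txt('runs/run.average.txt', tag='average')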
def test_discard_qrels(self):
    run = TrecRun(os.path.join(self.root, 'tests/resources/simple_trec_run_filter.txt'))
    qrels = Qrels(os.path.join(self.root, 'tools/topics-and-qrels/qrels.covid-round1.txt'))

    run.discard_qrels(qrels, clone=False).save_to_txt(output_path=self.output_path)
    self.assertTrue(filecmp.cmp(os.path.join(self.root, 'tests/resources/simple_trec_run_remove_verify.txt'),
                                self.output_path))
def convert_to_search_result(run: TrecRun,
                             docid_to_search_result: Dict[str, JSimpleSearcherResult]) -> List[JSimpleSearcherResult]:
    search_results = []

    for _, _, docid, _, score, _ in run.to_numpy():
        search_result = docid_to_search_result[docid]
        search_result.score = score
        search_results.append(search_result)

    return search_results
def interpolation(runs: List[TrecRun], alpha: float = 0.5, depth: int = None, k: int = None):
    """Perform fusion by interpolation on a list of exactly two ``TrecRun`` objects.
    new_score = first_run_score * alpha + (1 - alpha) * second_run_score.

    Parameters
    ----------
    runs : List[TrecRun]
        List of ``TrecRun`` objects. Exactly two runs.
    alpha : float
        Weight applied to the first run; (1 - alpha) is applied to the second run.
    depth : int
        Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that
        the complete list of results is considered.
    k : int
        Length of final results list. Set to ``None`` by default, which indicates that the union of all input
        documents is ranked.

    Returns
    -------
    TrecRun
        Output ``TrecRun`` that combines input runs via interpolation.
    """
    if len(runs) != 2:
        raise Exception('Interpolation must be performed on exactly two runs.')

    scaled_runs = []
    scaled_runs.append(runs[0].clone().rescore(method=RescoreMethod.SCALE, scale=alpha))
    scaled_runs.append(runs[1].clone().rescore(method=RescoreMethod.SCALE, scale=(1 - alpha)))

    return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k)
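# Usage sketch for interpolation() above: the run paths and alpha value are placeholders.
# With alpha = 0.4, a document scoring 12.0 in the first run and 3.0 in the second fuses
# to 0.4 * 12.0 + 0.6 * 3.0 = 6.6.
sparse_run = TrecRun('runs/run.bm25.txt')
dense_run = TrecRun('runs/run.dense.txt')
fused = interpolation([sparse_run, dense_run], alpha=0.4, depth=1000, k=1000)
fused.save_to_txt('runs/run.interpolated.txt', tag='interpolation_alpha=0.4')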