def test_get_collection_from_index_path():
    index_path_1 = "robust04/something/anserini/foo"
    index_path_2 = "/foo/bar"

    collection = Collection.get_collection_from_index_path(index_path_1)
    assert collection.name == "robust04"

    collection = Collection.get_collection_from_index_path(index_path_2)
    assert collection is None
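# The test above only pins down the observable behaviour: an index path that contains a
# known collection name resolves to that collection, and anything else resolves to None.
# A minimal sketch with the same behaviour follows; the registry dict and the standalone
# function are hypothetical illustrations, not capreolus's actual implementation.
_KNOWN_COLLECTIONS = {"robust04": object()}  # hypothetical registry: name -> collection object

def _get_collection_from_index_path(index_path):
    """Return the registered collection whose name appears as a path segment, else None."""
    for name, coll in _KNOWN_COLLECTIONS.items():
        if name in index_path.split("/"):
            return coll
    return None

assert _get_collection_from_index_path("robust04/something/anserini/foo") is _KNOWN_COLLECTIONS["robust04"]
assert _get_collection_from_index_path("/foo/bar") is None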
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index,
                              dummy_collection_config):
    # This test asserts very little, but it is still useful: it makes sure the code at least runs end to end
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "unigram",
        "passagelen": 3,
        "slicelen": 20,
        "tfchannel": True,
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {
        "s1": {
            "train_qids": ["301"],
            "predict": {
                "dev": ["301"],
                "test": ["301"]
            }
        }
    }
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    extractor = DeepTileExtractor(tmpdir,
                                  tmpdir,
                                  pipeline_config,
                                  index=trec_index,
                                  collection=collection,
                                  benchmark=benchmark)

    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(extractor, "get_magnitude_embeddings",
                        fake_magnitude_embedding)
    extractor.build_from_benchmark(True)
    assert extractor.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }

    assert extractor.itos == {v: k for k, v in extractor.stoi.items()}
def test_transform_qid_posdocid_negdocid_with_negdoc(tmpdir, trec_index, dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.itos[1] = "dummy"
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": ["dummy", "dummy", "dummy", "hello", "world", "greetings", "from", "outer", "space"],
    }

    transformed = feature.transform_qid_posdocid_negdocid("301", "LA010189-0001", "LA010189-0001")

    # stoi only knows the word 'dummy', so every other word is mapped to 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert numpy.array_equal(transformed["query"], [1, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["posdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["negdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0, 0, 0])
def test_transform_qid_posdocid_negdocid_only_posdoc(tmpdir, trec_index,
                                                     dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "datamode": "unigram",
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = BagOfWords(tmpdir,
                         tmpdir,
                         pipeline_config,
                         index=trec_index,
                         collection=collection,
                         benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.stoi["doc"] = 2
    feature.itos[1] = "dummy"
    feature.itos[2] = "doc"
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": [
            "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
            "outer", "space"
        ],
    }
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001")
    # stoi only knows the words 'dummy' and 'doc', so every other word is mapped to index 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] is None

    # Right now there are only 3 words in the vocabulary - "<pad>", "dummy" and "doc"
    assert numpy.array_equal(transformed["query"], [0, 1, 1])
    assert numpy.array_equal(transformed["posdoc"], [
        6, 3, 0
    ])  # There  are 6 unknown words in the doc, so all of them is encoded as 0
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0])

    # Learn another word
    feature.stoi["hello"] = 3
    feature.itos[3] = "hello"
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001")
    # The posdoc transformation changes to reflect the new word
    assert numpy.array_equal(transformed["posdoc"], [5, 3, 0, 1])
    def do_query(config, query_string, pipeline, index, tokenizer, embedding_holder, model_class, trained_weight_path=None):
        """
        1. Do a BM25 search to get the top results. This is an optimization: we don't want to feed the
           entire dataset to the NIR model's forward pass
        2. Based on the documents retrieved above, create an embedding layer to be used in the NIR model
        3. Instantiate an NIR model, load the trained weights, and do a forward pass over the retrieved docs
        4. Return the documents with the highest scores
        """
        # 1. Do bm25 search and tokenize the results
        # TODO: Handle the case where bm25 search returns no results
        api_start = time.time()
        doc_ids, docs = BM25View.do_query(query_string, index)

        all_tokens = NeuralQueryView.get_tokens_from_docs_and_query(tokenizer, docs, query_string)

        # 2. Form an embedding layer and doc and query features
        embeddings = torch.from_numpy(embedding_holder.create_indexed_embedding_layer_from_tokens(all_tokens)).to(pipeline.device)
        doc_features = NeuralQueryView.create_tensor_from_docs(docs, tokenizer, embedding_holder, config["maxdoclen"]).to(
            pipeline.device
        )
        query_features, query_idf_features = NeuralQueryView.create_tensor_from_query_string(
            query_string, index, tokenizer, embedding_holder, len(doc_features), config["maxqlen"]
        )
        query_features = query_features.to(pipeline.device)
        query_idf_features = query_idf_features.to(pipeline.device)

        # 3. Do a forward pass of the NIR model class, and get the max scoring document
        # TODO: Remove the dependence of NIR reranker on pipeline. Pass the configs explicitly
        model_instance = model_class(embeddings, None, config)
        model_instance.build()
        model_instance.to(pipeline.device)
        # model_instance = model_class.alternate_init(pipeline, embeddings, config).to(pipeline.device)
        if trained_weight_path is not None:
            model_instance.load(trained_weight_path)
        scores = model_instance.test(query_features, query_idf_features, doc_features)
        max_scoring_doc_ids = [doc_ids[i] for i in reversed(torch.argsort(scores))]
        max_scoring_doc_ids = max_scoring_doc_ids[:NUM_RESULTS_TO_SHOW]
        # TODO: Get this in a single go, instead of calling `getdoc()` repeatedly
        docs = [index.getdoc(doc_id) for doc_id in max_scoring_doc_ids]
        # Trimming to first 250 chars. TODO: Make this configurable
        docs = [doc[:250] for doc in docs]

        relevance_fetch_start = time.time()
        collection = Collection.get_collection_from_index_path(index.index_path)
        relevances = collection.get_relevance(query_string, max_scoring_doc_ids) if collection is not None else [0] * len(max_scoring_doc_ids)
        result_dicts = NeuralQueryView.construct_result_dicts(max_scoring_doc_ids, docs, relevances)
        relevance_fetch_stop = time.time()

        api_stop = time.time()
        logger.debug("Took {0} seconds to get the most relevant doc".format(api_stop - api_start))
        logger.debug(
            "Determining the relevance of the fetched document took {0} seconds".format(
                relevance_fetch_stop - relevance_fetch_start
            )
        )
        return result_dicts
    def get(self, request, *args, **kwargs):
        query = request.GET.dict()["q"]
        target_index = request.GET.dict()["target_index"]
        collection = Collection.get_collection_from_index_path(target_index)

        if collection is None:
            return JsonResponse([], safe=False)
        else:
            suggestions = collection.get_query_suggestions(query)
            return JsonResponse(suggestions, safe=False)
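# Usage sketch for the suggestion view above. The "/suggest/" route and the sample query are
# assumptions for illustration; only the "q" and "target_index" parameters come from the view itself.
from django.test import Client

client = Client()
response = client.get("/suggest/", {"q": "hubble", "target_index": "/path/to/robust04/index"})
suggestions = response.json()  # JSON list of suggestions; [] when the index path maps to no known collection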
@pytest.fixture
def trec_index(request, tmpdir):
    """
    Build an index based on sample data and create an AnseriniIndex instance based on it
    """
    indir = os.path.join(COLLECTIONS["dummy"].basepath, "dummy")
    outdir = os.path.join(tmpdir, "index")
    anserini_fat_jar = Anserini.get_fat_jar()
    cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name=IndexCollection io.anserini.index.IndexCollection  -collection TrecCollection -generator JsoupGenerator -threads 1 -input {indir} -index {outdir} -storeTransformedDocs"
    os.system(cmd)
    collection = Collection(dummy_collection_config())
    anserini_index = AnseriniIndex(collection, outdir,
                                   os.path.join(tmpdir, "index_cache"))
    anserini_index.open()
    return anserini_index
def test_cross_validated_ranking(trec_index, dummy_collection_config, tmpdir):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2
    }

    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    test_ranking = bm25_run.crossvalidated_ranking(["301"], ["301"])

    assert test_ranking["301"]["LA010189-0001"] > 0
    assert test_ranking["301"]["LA010189-0002"] > 0
def test_bm25grid_create(trec_index, dummy_collection_config, tmpdir):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2
    }

    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()

    # Make sure that the searcher file is generated
    assert os.path.isfile(os.path.join(tmpdir, "searcher", "done"))
    def get_bm25_results(query_string, target_index, b, k1):
        index_class = Index.get_index_from_index_path(target_index)
        index = index_class(target_index)

        bm25_kwargs = {"n": NUM_RESULTS_TO_SHOW}
        if b is not None:
            bm25_kwargs["b"] = b
        if k1 is not None:
            bm25_kwargs["k1"] = k1

        doc_ids, docs = BM25View.do_query(query_string, index, **bm25_kwargs)
        docs = [doc[:250] for doc in docs]
        collection = Collection.get_collection_from_index_path(index.index_path)
        relevances = collection.get_relevance(query_string, doc_ids) if collection is not None else [0] * len(doc_ids)
        result_dicts = NeuralQueryView.construct_result_dicts(doc_ids, docs, relevances)
        return result_dicts
def test_anserini_large_collections(dummy_collection_config, tmpdir):
    # raise Exception("TODO: Fix the jnius issue")
    collection = Collection(dummy_collection_config)
    collection.is_large_collection = True
    index = AnseriniIndex(collection, tmpdir, tmpdir)
    config = {"indexstops": False, "stemmer": "anserini", "maxthreads": 1}

    # Deliberately not calling index.create()
    docs = index.get_docs(["LA010189-0001", "LA010189-0002"])
    assert len(docs) == 2
    assert docs == [
        "Dummy Dummy Dummy Hello world, greetings from outer space!",
        "Dummy Dummy Dummy Hello world, greetings from outer space!",
    ]

    collection.is_large_collection = False
    # Because we would be trying to read from an index that is not present
    with pytest.raises(Exception):
        docs = index.get_docs(["LA010189-0001", "LA010189-0002"])
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "reranker": "KNRM",
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {"s1": {"train_qids": ["301"], "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    # Prevents a download when the unit tests are run
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    monkeypatch.setattr(feature, "get_magnitude_embeddings", fake_magnitude_embedding)

    feature.build_from_benchmark("glove6b", True)
    assert feature.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }

    assert feature.itos == {v: k for k, v in feature.stoi.items()}
    assert numpy.array_equal(feature.embeddings[0], [0, 0, 0, 0, 0, 0, 0, 0])
    assert feature.embeddings.shape == (9, 8)
def test_build_from_benchmark_with_trigram(monkeypatch, tmpdir, trec_index,
                                           dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "trigram",
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {
        "s1": {
            "train_qids": ["301"],
            "predict": {
                "dev": ["301"],
                "test": ["301"]
            }
        }
    }
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = BagOfWords(tmpdir,
                         tmpdir,
                         pipeline_config,
                         index=trec_index,
                         collection=collection,
                         benchmark=benchmark)

    feature.build_from_benchmark(True)
    assert feature.stoi == {
        "<pad>": 0,
        "#du": 1,
        "dum": 2,
        "umm": 3,
        "mmy": 4,
        "my#": 5,
        "#do": 6,
        "doc": 7,
        "oc#": 8,
        "#he": 9,
        "hel": 10,
        "ell": 11,
        "llo": 12,
        "lo#": 13,
        "#wo": 14,
        "wor": 15,
        "orl": 16,
        "rld": 17,
        "ld#": 18,
        "#gr": 19,
        "gre": 20,
        "ree": 21,
        "eet": 22,
        "eti": 23,
        "tin": 24,
        "ing": 25,
        "ngs": 26,
        "gs#": 27,
        "#fr": 28,
        "fro": 29,
        "rom": 30,
        "om#": 31,
        "#ou": 32,
        "out": 33,
        "ute": 34,
        "ter": 35,
        "er#": 36,
        "#sp": 37,
        "spa": 38,
        "pac": 39,
        "ace": 40,
        "ce#": 41,
    }

    assert feature.itos == {v: k for k, v in feature.stoi.items()}
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index,
                              dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "unigram",
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {
        "s1": {
            "train_qids": ["301"],
            "predict": {
                "dev": ["301"],
                "test": ["301"]
            }
        }
    }
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = BagOfWords(tmpdir,
                         tmpdir,
                         pipeline_config,
                         index=trec_index,
                         collection=collection,
                         benchmark=benchmark)

    feature.build_from_benchmark(True)
    assert feature.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }

    assert feature.itos == {v: k for k, v in feature.stoi.items()}
    assert feature.embeddings == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }
def test_transform_qid_posdocid_negdocid(monkeypatch, tmpdir, trec_index,
                                         dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
    }
    bm25_run = BM25Grid(trec_index, collection,
                        os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {
        "s1": {
            "train_qids": ["301"],
            "predict": {
                "dev": ["301"],
                "test": ["301"]
            }
        }
    }
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = BertText(tmpdir,
                       tmpdir,
                       pipeline_config,
                       index=trec_index,
                       collection=collection,
                       benchmark=benchmark)
    feature.build_from_benchmark()
    transformed = feature.transform_qid_posdocid_negdocid(
        "301", "LA010189-0001", "LA010189-0001")

    assert np.array_equal(
        transformed["postoks"],
        [
            101, 24369, 9986, 0, 0, 0, 102, 24369, 24369, 24369, 7592, 2088,
            1010, 14806, 2015, 2013, 6058, 102
        ],
    )
    assert np.array_equal(
        transformed["posmask"],
        [1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.array_equal(
        transformed["possegs"],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.array_equal(transformed["posqmask"], [1, 1, 0, 0, 0])
    assert np.array_equal(transformed["posdmask"],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

    assert np.array_equal(
        transformed["negtoks"],
        [
            101, 24369, 9986, 0, 0, 0, 102, 24369, 24369, 24369, 7592, 2088,
            1010, 14806, 2015, 2013, 6058, 102
        ],
    )
    assert np.array_equal(
        transformed["negmask"],
        [1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.array_equal(
        transformed["negsegs"],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.array_equal(transformed["negqmask"], [1, 1, 0, 0, 0])
    assert np.array_equal(transformed["negdmask"],
                          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert transformed["qid"] == "301"