Example 1
def test_tokenize_text_with_calculate_idf(dummy_collection_config, trec_index, tmpdir):
    toks_list = [["to", "be", "or", "not", "to", "be"]]
    feature = EmbedText(tmpdir, tmpdir, {}, index=trec_index)
    feature.build_stoi(toks_list, True, True)
    assert feature.stoi == {"<pad>": 0, "to": 1, "be": 2, "or": 3, "not": 4}

    assert feature.idf == {"be": 1.791759469228055, "not": 1.791759469228055, "or": 1.791759469228055, "to": 1.791759469228055}
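
All four terms share the same IDF value here, which equals ln(6) = 1.791759469228055. That is consistent with a smoothed formula such as log((N + 1) / (df + 1)) computed over a small dummy collection in which none of the terms occur; both the formula and the collection size below are assumptions, not something the test confirms.

import math

# Assumed: N = 5 documents in the dummy collection, df = 0 for every term
N, df = 5, 0
print(math.log((N + 1) / (df + 1)))  # 1.791759469228055
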
Example 2
def test_tokenize_text(trec_index, tmpdir):
    toks_list = [["to", "be", "or", "not", "to", "be"]]
    feature = EmbedText(tmpdir, tmpdir, {}, index=trec_index)
    feature.build_stoi(toks_list, True, False)
    assert feature.stoi == {"<pad>": 0, "to": 1, "be": 2, "or": 3, "not": 4}

    assert feature.idf == {}
Example 3
def test_create_embedding_matrix(monkeypatch, tmpdir, trec_index):
    feature = EmbedText(tmpdir, tmpdir, {"reranker": "KNRM"}, index=trec_index)
    feature.stoi = {"<pad>": 0, "hello": 1, "world": 2}

    # Prevents a download when the unit tests are run
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(feature, "get_magnitude_embeddings", fake_magnitude_embedding)
    matrix = feature.create_embedding_matrix("glove6b")

    # We cannot assert the entire matrix: since there are no downloaded embeddings, the embedding for a word
    # would be random each time we run the test
    assert matrix.shape == (3, 8)
    assert numpy.array_equal(matrix[0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
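
Only the <pad> row of the matrix is deterministic, which is why the test asserts matrix[0] alone. A minimal sketch of that pattern (the 8-dimensional vectors and random out-of-vocabulary rows are assumptions mirroring the test, not the real embedding lookup):

import numpy as np

stoi = {"<pad>": 0, "hello": 1, "world": 2}
dim = 8
matrix = np.random.random((len(stoi), dim))  # stand-in for pretrained vectors
matrix[stoi["<pad>"]] = 0.0                  # the padding row is always zeroed
assert matrix.shape == (3, 8)
assert np.array_equal(matrix[0], np.zeros(dim))
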
Example 4
def test_train_sampler(monkeypatch, tmpdir):
    benchmark = DummyBenchmark()
    extractor = EmbedText(
        {"tokenizer": {"keepstops": True}}, provide={"collection": benchmark.collection, "benchmark": benchmark}
    )
    training_judgments = benchmark.qrels.copy()
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(training_judgments, training_judgments, extractor)

    def mock_id2vec(*args, **kwargs):
        return {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1]), "negdoc": np.array([2, 2, 2, 2])}

    monkeypatch.setattr(EmbedText, "id2vec", mock_id2vec)
    dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
    for idx, batch in enumerate(dataloader):
        assert len(batch["query"]) == 32
        assert len(batch["posdoc"]) == 32
        assert len(batch["negdoc"]) == 32
        assert np.array_equal(batch["query"][0], np.array([1, 2, 3, 4]))
        assert np.array_equal(batch["query"][30], np.array([1, 2, 3, 4]))
        assert np.array_equal(batch["posdoc"][0], np.array([1, 1, 1, 1]))
        assert np.array_equal(batch["posdoc"][30], np.array([1, 1, 1, 1]))
        assert np.array_equal(batch["negdoc"][0], np.array([2, 2, 2, 2]))
        assert np.array_equal(batch["negdoc"][30], np.array([2, 2, 2, 2]))

        # Just making sure that the dataloader can do multiple iterations
        if idx > 3:
            break
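
The break at idx > 3 suggests TrainTripletSampler yields triplets indefinitely rather than exhausting the tiny qrels set. A minimal sketch of that behavior (a toy IterableDataset, not Capreolus' sampler) showing why the DataLoader keeps producing full 32-item batches:

import numpy as np
import torch

class ToyTripletSampler(torch.utils.data.IterableDataset):
    def __iter__(self):
        while True:  # infinite stream; the consumer decides when to stop
            yield {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1]), "negdoc": np.array([2, 2, 2, 2])}

loader = torch.utils.data.DataLoader(ToyTripletSampler(), batch_size=32)
batch = next(iter(loader))
assert batch["query"].shape == (32, 4)
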
Example 5
def test_pred_sampler(monkeypatch, tmpdir):
    benchmark = DummyBenchmark()
    extractor = EmbedText({"tokenizer": {
        "keepstops": True
    }},
                          provide={"collection": benchmark.collection})
    search_run = {"301": {"LA010189-0001": 50, "LA010189-0002": 100}}
    pred_dataset = PredSampler()
    pred_dataset.prepare(benchmark.qrels, search_run, extractor)

    def mock_id2vec(*args, **kwargs):
        return {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1])}

    monkeypatch.setattr(EmbedText, "id2vec", mock_id2vec)
    dataloader = torch.utils.data.DataLoader(pred_dataset, batch_size=2)
    for idx, batch in enumerate(dataloader):
        print(idx, batch)
        assert len(batch["query"]) == 2
        assert len(batch["posdoc"]) == 2
        assert batch.get("negdoc") is None
        assert np.array_equal(batch["query"][0], np.array([1, 2, 3, 4]))
        assert np.array_equal(batch["query"][1], np.array([1, 2, 3, 4]))
        assert np.array_equal(batch["posdoc"][0], np.array([1, 1, 1, 1]))
        assert np.array_equal(batch["posdoc"][1], np.array([1, 1, 1, 1]))
Example 6
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "reranker": "KNRM",
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {"s1": {"train_qids": ["301"], "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    # Prevents a download when the unit tests are run
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    monkeypatch.setattr(feature, "get_magnitude_embeddings", fake_magnitude_embedding)

    feature.build_from_benchmark("glove6b", True)
    assert feature.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }

    assert feature.itos == {v: k for k, v in feature.stoi.items()}
    assert numpy.array_equal(feature.embeddings[0], [0, 0, 0, 0, 0, 0, 0, 0])
    assert feature.embeddings.shape == (9, 8)
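
The asserted stoi is consistent with ids assigned in first-seen order, with <pad> reserved at index 0, and itos is simply the inverse mapping. A minimal sketch (the token order below is inferred from the asserted ids, not taken from the collection):

docs = [["dummy", "doc"], ["hello", "world", "greetings", "from", "outer", "space"]]
stoi = {"<pad>": 0}
for toks in docs:
    for tok in toks:
        stoi.setdefault(tok, len(stoi))  # assign the next free id on first sight
itos = {v: k for k, v in stoi.items()}
assert stoi["space"] == 8 and itos[0] == "<pad>"
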
Example 7
def test_transform_qid_posdocid_negdocid_with_negdoc(tmpdir, trec_index, dummy_collection_config):
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.itos[1] = "dummy"
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": ["dummy", "dummy", "dummy", "hello", "world", "greetings", "from", "outer", "space"],
    }

    transformed = feature.transform_qid_posdocid_negdocid("301", "LA010189-0001", "LA010189-0001")

    # stoi only knows the word 'dummy', so every other word is mapped to 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert numpy.array_equal(transformed["query"], [1, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["posdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["negdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0, 0, 0])
Example 8
def test_embedtext_id2vec(monkeypatch):
    def fake_load_embeddings(self):
        vocab = ["<pad>", "lessdummy", "dummy", "doc", "hello", "greetings", "world", "from", "outer", "space"]
        self.embeddings = np.random.random((len(vocab), 50))
        self.embeddings[0, :] = 0
        self.stoi = {term: idx for idx, term in enumerate(vocab)}
        self.itos = {v: k for k, v in self.stoi.items()}

    monkeypatch.setattr(EmbedText, "_load_pretrained_embeddings",
                        fake_load_embeddings)

    benchmark = DummyBenchmark()
    extractor_cfg = {"name": "embedtext", "embeddings": "glove6b", "calcidf": True, "maxqlen": MAXQLEN, "maxdoclen": MAXDOCLEN}
    extractor = EmbedText(extractor_cfg, provide={"collection": DummyCollection(), "benchmark": benchmark})

    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())

    extractor.preprocess(qids, docids, benchmark.topics[benchmark.query_type])

    docid1, docid2 = docids[0], docids[1]
    data = extractor.id2vec(qid, docid1, docid2)
    q, d1, d2, idf = [data[k] for k in ["query", "posdoc", "negdoc", "idfs"]]

    assert q.shape[0] == idf.shape[0]

    topics = benchmark.topics[benchmark.query_type]
    # emb_path = "glove/light/glove.6B.300d"
    # fullemb = Magnitude(MagnitudeUtils.download_model(emb_path))

    assert len(q) == MAXQLEN
    assert len(d1) == MAXDOCLEN
    assert len(d2) == MAXDOCLEN

    assert len([w for w in q if w.sum() != 0]) == len(topics[qid].strip().split()[:MAXQLEN])
    assert len([w for w in d1 if w.sum() != 0]) == len(extractor.index.get_doc(docid1).strip().split()[:MAXDOCLEN])
    assert len([w for w in d2 if w.sum() != 0]) == len(extractor.index.get_doc(docid2).strip().split()[:MAXDOCLEN])

    # check MissingDocError
    error_thrown = False
    try:
        extractor.id2vec(qid, "0000000", "111111")
    except MissingDocError as err:
        error_thrown = True
        assert err.related_qid == qid
        assert err.missed_docid == "0000000"

    assert error_thrown
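
The error_thrown flag pattern at the end can be written more idiomatically with pytest.raises; this sketch reuses extractor and qid from the test above and assumes MissingDocError exposes the related_qid and missed_docid attributes asserted there:

import pytest

with pytest.raises(MissingDocError) as exc_info:
    extractor.id2vec(qid, "0000000", "111111")
assert exc_info.value.related_qid == qid
assert exc_info.value.missed_docid == "0000000"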