def test_tokenize_text_with_calculate_idf(dummy_collection_config, trec_index, tmpdir):
    """Check that build_stoi fills both ``stoi`` and ``idf`` when the last flag is True."""
    feature = EmbedText(tmpdir, tmpdir, {}, index=trec_index)
    tokenized_docs = [["to", "be", "or", "not", "to", "be"]]

    feature.build_stoi(tokenized_docs, True, True)

    # "<pad>" is expected at index 0; the four distinct terms take ids 1..4.
    expected_stoi = {"<pad>": 0, "to": 1, "be": 2, "or": 3, "not": 4}
    assert feature.stoi == expected_stoi

    # Every term is expected to carry the same IDF value against the test index.
    expected_idf = dict.fromkeys(("be", "not", "or", "to"), 1.791759469228055)
    assert feature.idf == expected_idf
def test_tokenize_text(trec_index, tmpdir):
    """Check that build_stoi fills ``stoi`` but leaves ``idf`` empty when the last flag is False."""
    feature = EmbedText(tmpdir, tmpdir, {}, index=trec_index)
    tokenized_docs = [["to", "be", "or", "not", "to", "be"]]

    feature.build_stoi(tokenized_docs, True, False)

    # "<pad>" is expected at index 0; the four distinct terms take ids 1..4.
    expected_stoi = {"<pad>": 0, "to": 1, "be": 2, "or": 3, "not": 4}
    assert feature.stoi == expected_stoi

    # No IDF values should have been recorded.
    assert feature.idf == {}
def test_create_embedding_matrix(monkeypatch, tmpdir, trec_index):
    """Check the embedding matrix shape and that the "<pad>" row is all zeros."""
    feature = EmbedText(tmpdir, tmpdir, {"reranker": "KNRM"}, index=trec_index)
    feature.stoi = {"<pad>": 0, "hello": 1, "world": 2}

    # Stub the embedding loader so that running the unit tests does not trigger a download.
    monkeypatch.setattr(feature, "get_magnitude_embeddings", lambda *args, **kwargs: Magnitude(None))

    matrix = feature.create_embedding_matrix("glove6b")

    # The whole matrix cannot be asserted: with no downloaded embeddings, the vector for a
    # word would be random each time the test is run. Only the shape and the padding row
    # are checked.
    assert matrix.shape == (3, 8)
    assert numpy.array_equal(matrix[0], [0.0] * 8)
def test_train_sampler(monkeypatch, tmpdir):
    """Check that TrainTripletSampler batches the mocked query/posdoc/negdoc vectors via a DataLoader."""
    benchmark = DummyBenchmark()
    extractor = EmbedText(
        {"tokenizer": {"keepstops": True}},
        provide={"collection": benchmark.collection, "benchmark": benchmark},
    )

    training_judgments = benchmark.qrels.copy()
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(training_judgments, training_judgments, extractor)

    # Vectors that the patched id2vec hands back for every sample.
    expected = {
        "query": np.array([1, 2, 3, 4]),
        "posdoc": np.array([1, 1, 1, 1]),
        "negdoc": np.array([2, 2, 2, 2]),
    }

    def mock_id2vec(*args, **kwargs):
        # Return a fresh dict with fresh arrays on each call.
        return {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1]), "negdoc": np.array([2, 2, 2, 2])}

    monkeypatch.setattr(EmbedText, "id2vec", mock_id2vec)

    batch_size = 32
    dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
    for idx, batch in enumerate(dataloader):
        for field in expected:
            assert len(batch[field]) == batch_size

        # Spot-check the first and the second-to-last sample of each field.
        for field, vector in expected.items():
            assert np.array_equal(batch[field][0], vector)
            assert np.array_equal(batch[field][30], vector)

        # Just making sure that the dataloader can do multiple iterations
        if idx >= 4:
            break
def test_pred_sampler(monkeypatch, tmpdir):
    """Check that PredSampler batches the mocked query/posdoc vectors and yields no "negdoc" entry."""
    benchmark = DummyBenchmark()
    extractor = EmbedText({"tokenizer": {"keepstops": True}}, provide={"collection": benchmark.collection})

    search_run = {"301": {"LA010189-0001": 50, "LA010189-0002": 100}}
    pred_dataset = PredSampler()
    pred_dataset.prepare(benchmark.qrels, search_run, extractor)

    # Vectors that the patched id2vec hands back for every sample.
    expected = {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1])}

    def mock_id2vec(*args, **kwargs):
        # Return a fresh dict with fresh arrays on each call.
        return {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1])}

    monkeypatch.setattr(EmbedText, "id2vec", mock_id2vec)

    batch_size = 2
    dataloader = torch.utils.data.DataLoader(pred_dataset, batch_size=batch_size)
    for idx, batch in enumerate(dataloader):
        print(idx, batch)

        for field in expected:
            assert len(batch[field]) == batch_size

        # The mocked id2vec supplies no negative document, so none should appear in the batch.
        assert batch.get("negdoc") is None

        for field, vector in expected.items():
            assert np.array_equal(batch[field][0], vector)
            assert np.array_equal(batch[field][1], vector)
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    """Check that build_from_benchmark populates ``stoi``, ``itos`` and ``embeddings`` for the dummy collection."""

    def fake_magnitude_embedding(*args, **kwargs):
        # Stub loader: prevents a download when the unit tests are run.
        return Magnitude(None)

    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "reranker": "KNRM",
    }
    collection = Collection(dummy_collection_config)

    searcher_dir = os.path.join(tmpdir, "searcher")
    bm25_run = BM25Grid(trec_index, collection, searcher_dir, pipeline_config)
    bm25_run.create()

    # A single fold in which the same query id is used for train, dev and test.
    folds = {"s1": {"train_qids": ["301"], "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    monkeypatch.setattr(feature, "get_magnitude_embeddings", fake_magnitude_embedding)
    feature.build_from_benchmark("glove6b", True)

    # Expected vocabulary, listed in id order ("<pad>" is id 0).
    expected_vocab = ["<pad>", "dummy", "doc", "hello", "world", "greetings", "from", "outer", "space"]
    assert feature.stoi == {term: idx for idx, term in enumerate(expected_vocab)}

    # itos must be the exact inverse of stoi.
    assert feature.itos == {idx: term for term, idx in feature.stoi.items()}

    # The padding row is all zeros; there is one 8-dimensional row per vocabulary entry.
    assert numpy.array_equal(feature.embeddings[0], [0] * 8)
    assert feature.embeddings.shape == (9, 8)
def test_transform_qid_posdocid_negdocid_with_negdoc(tmpdir, trec_index, dummy_collection_config):
    """Check transform_qid_posdocid_negdocid when a negative document id is supplied.

    The same document id is passed as both the positive and the negative document, so the
    two transformed documents are expected to be identical.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)

    # Teach the vocabulary a single word so every other token falls back to 0.
    feature.stoi["dummy"] = 1
    feature.itos[1] = "dummy"

    # One entry is enough: the original dict literal listed this key twice, and the
    # second occurrence simply overwrote the first.
    docid = "LA010189-0001"
    feature.doc_id_to_doc_toks = {
        docid: ["dummy", "dummy", "dummy", "hello", "world", "greetings", "from", "outer", "space"],
    }

    transformed = feature.transform_qid_posdocid_negdocid("301", docid, docid)

    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"

    # stoi only knows about the word 'dummy', so the transformation of every other word is 0.
    # The query has maxqlen (5) entries and each document has maxdoclen (10) entries.
    assert numpy.array_equal(transformed["query"], [1, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["posdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["negdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0, 0, 0])
def test_embedtext_id2vec(monkeypatch):
    """Check the output of EmbedText.id2vec and that unknown doc ids raise MissingDocError."""

    def fake_load_embeddings(self):
        # Install a small random embedding table instead of loading pretrained vectors.
        vocab = ["<pad>", "lessdummy", "dummy", "doc", "hello", "greetings", "world", "from", "outer", "space"]
        self.embeddings = np.random.random((len(vocab), 50))
        self.embeddings[0] = 0  # the "<pad>" row is all zeros
        self.stoi = dict(zip(vocab, range(len(vocab))))
        self.itos = dict(enumerate(vocab))

    def count_nonzero_rows(rows):
        # Number of rows whose entries do not sum to zero.
        return sum(1 for row in rows if row.sum() != 0)

    monkeypatch.setattr(EmbedText, "_load_pretrained_embeddings", fake_load_embeddings)

    benchmark = DummyBenchmark()
    extractor = EmbedText(
        {
            "name": "embedtext",
            "embeddings": "glove6b",
            "calcidf": True,
            "maxqlen": MAXQLEN,
            "maxdoclen": MAXDOCLEN,
        },
        provide={"collection": DummyCollection(), "benchmark": benchmark},
    )

    topics = benchmark.topics[benchmark.query_type]
    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())
    extractor.preprocess(qids, docids, topics)

    docid1, docid2 = docids[0], docids[1]
    data = extractor.id2vec(qid, docid1, docid2)
    q = data["query"]
    d1 = data["posdoc"]
    d2 = data["negdoc"]
    idf = data["idfs"]

    # One IDF value per query position.
    assert q.shape[0] == idf.shape[0]

    # Outputs have the configured maximum lengths.
    assert len(q) == MAXQLEN
    assert len(d1) == MAXDOCLEN
    assert len(d2) == MAXDOCLEN

    # The number of non-zero rows matches the number of (truncated) whitespace-separated tokens.
    assert count_nonzero_rows(q) == len(topics[qid].strip().split()[:MAXQLEN])
    assert count_nonzero_rows(d1) == len(extractor.index.get_doc(docid1).strip().split()[:MAXDOCLEN])
    assert count_nonzero_rows(d2) == len(extractor.index.get_doc(docid2).strip().split()[:MAXDOCLEN])

    # Unknown doc ids must raise MissingDocError naming the query and the first missing doc.
    caught = None
    try:
        extractor.id2vec(qid, "0000000", "111111")
    except MissingDocError as err:
        caught = err
    assert caught is not None
    assert caught.related_qid == qid
    assert caught.missed_docid == "0000000"