def test_transform_qid_posdocid_negdocid_with_negdoc(tmpdir, trec_index, dummy_collection_config):
    """EmbedText.transform_qid_posdocid_negdocid should id-encode query/doc tokens.

    Only 'dummy' is in the vocabulary (stoi), so every other token must be
    encoded as 0 in the query, posdoc, and negdoc arrays.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.itos[1] = "dummy"
    # FIX: the original dict literal listed the "LA010189-0001" key twice; the
    # duplicate silently overwrote the first identical entry, so one entry is equivalent.
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": ["dummy", "dummy", "dummy", "hello", "world", "greetings", "from", "outer", "space"],
    }
    transformed = feature.transform_qid_posdocid_negdocid("301", "LA010189-0001", "LA010189-0001")
    # stoi only knows about the word 'dummy'. So the transformation of every other word is set as 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert numpy.array_equal(transformed["query"], [1, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["posdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["negdoc"], [1, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0, 0, 0])
def test_deeptileextractor_build_from_benchmark(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    """Smoke test for DeepTileExtractor.build_from_benchmark plus a vocabulary check.

    FIX: renamed from ``test_build_from_benchmark`` — three tests in this file
    shared that name, so the later definitions shadowed the earlier ones and
    pytest only collected the last. Unique names make all three run.
    """
    # Kind of a useless test - not asserting much here. Still useful since it
    # makes sure that the code at least runs.
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "unigram",
        "passagelen": 3,
        "slicelen": 20,
        "tfchannel": True,
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {"s1": {"train_qids": ["301"], "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)
    extractor = DeepTileExtractor(
        tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark
    )

    # Stub out the embedding download so the test stays offline.
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(extractor, "get_magnitude_embeddings", fake_magnitude_embedding)
    extractor.build_from_benchmark(True)
    assert extractor.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }
    # itos must be the exact inverse mapping of stoi
    assert extractor.itos == {v: k for k, v in extractor.stoi.items()}
def test_transform_qid_posdocid_negdocid_only_posdoc(tmpdir, trec_index, dummy_collection_config):
    """BagOfWords transform with no negdoc: unknown tokens are counted in slot 0.

    Also checks that growing the vocabulary (adding 'hello') changes the
    subsequent posdoc encoding accordingly.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "datamode": "unigram",
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    feature = BagOfWords(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    feature.stoi["dummy"] = 1
    feature.stoi["doc"] = 2
    feature.itos[1] = "dummy"
    feature.itos[2] = "doc"
    # FIX: the original dict literal listed the "LA010189-0001" key twice; the
    # duplicate silently overwrote the first identical entry, so one entry is equivalent.
    feature.doc_id_to_doc_toks = {
        "LA010189-0001": ["dummy", "dummy", "dummy", "hello", "world", "greetings", "from", "outer", "space"],
    }
    transformed = feature.transform_qid_posdocid_negdocid("301", "LA010189-0001")
    # stoi only knows about the word 'dummy'. So the transformation of every other word is set as 0
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] is None
    # Right now we have only 3 words in the vocabulary - "<pad>", "dummy" and "doc"
    assert numpy.array_equal(transformed["query"], [0, 1, 1])
    # There are 6 unknown words in the doc, so all of them are encoded as 0
    assert numpy.array_equal(transformed["posdoc"], [6, 3, 0])
    assert numpy.array_equal(transformed["query_idf"], [0, 0, 0])

    # Learn another word
    feature.stoi["hello"] = 3
    feature.itos[3] = "hello"
    transformed = feature.transform_qid_posdocid_negdocid("301", "LA010189-0001")
    # The posdoc transformation changes to reflect the new word
    assert numpy.array_equal(transformed["posdoc"], [5, 3, 0, 1])
def test_embedtext_build_from_benchmark(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    """EmbedText.build_from_benchmark should build stoi/itos and an embedding matrix.

    FIX: renamed from ``test_build_from_benchmark`` — three tests in this file
    shared that name, so the later definitions shadowed the earlier ones and
    pytest only collected the last. Unique names make all three run.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "reranker": "KNRM",
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {"s1": {"train_qids": ["301"], "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)

    # Prevents an embedding download while the unit tests run
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    feature = EmbedText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    monkeypatch.setattr(feature, "get_magnitude_embeddings", fake_magnitude_embedding)
    feature.build_from_benchmark("glove6b", True)
    assert feature.stoi == {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }
    # itos must be the exact inverse mapping of stoi
    assert feature.itos == {v: k for k, v in feature.stoi.items()}
    # Row 0 is the <pad> embedding and should be all zeros
    assert numpy.array_equal(feature.embeddings[0], [0, 0, 0, 0, 0, 0, 0, 0])
    # 9 vocabulary entries, 8-dimensional embeddings
    assert feature.embeddings.shape == (9, 8)
def test_build_from_benchmark_with_trigram(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    """In trigram datamode, BagOfWords builds a character-trigram vocabulary.

    Every word in the dummy collection is padded with '#' and split into
    overlapping 3-grams; stoi must contain exactly those trigrams plus <pad>.
    NOTE(review): the ``monkeypatch`` fixture is accepted but never used here —
    possibly a leftover; confirm before removing.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "trigram",
    }
    searcher_dir = os.path.join(tmpdir, "searcher")
    bm25_run = BM25Grid(trec_index, collection, searcher_dir, pipeline_config)
    bm25_run.create()
    folds = {"s1": {"train_qids": ["301"], "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)
    feature = BagOfWords(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    feature.build_from_benchmark(True)

    expected_stoi = {
        "<pad>": 0,
        "#du": 1,
        "dum": 2,
        "umm": 3,
        "mmy": 4,
        "my#": 5,
        "#do": 6,
        "doc": 7,
        "oc#": 8,
        "#he": 9,
        "hel": 10,
        "ell": 11,
        "llo": 12,
        "lo#": 13,
        "#wo": 14,
        "wor": 15,
        "orl": 16,
        "rld": 17,
        "ld#": 18,
        "#gr": 19,
        "gre": 20,
        "ree": 21,
        "eet": 22,
        "eti": 23,
        "tin": 24,
        "ing": 25,
        "ngs": 26,
        "gs#": 27,
        "#fr": 28,
        "fro": 29,
        "rom": 30,
        "om#": 31,
        "#ou": 32,
        "out": 33,
        "ute": 34,
        "ter": 35,
        "er#": 36,
        "#sp": 37,
        "spa": 38,
        "pac": 39,
        "ace": 40,
        "ce#": 41,
    }
    assert feature.stoi == expected_stoi
    # itos must be the exact inverse mapping of stoi
    assert feature.itos == {idx: trigram for trigram, idx in feature.stoi.items()}
def test_bagofwords_build_from_benchmark(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    """BagOfWords.build_from_benchmark should build the unigram vocabulary.

    FIX: renamed from ``test_build_from_benchmark`` — three tests in this file
    shared that name, so the later definitions shadowed the earlier ones and
    pytest only collected the last. Unique names make all three run.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "unigram",
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {"s1": {"train_qids": ["301"], "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)
    feature = BagOfWords(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    feature.build_from_benchmark(True)
    expected_vocab = {
        "<pad>": 0,
        "dummy": 1,
        "doc": 2,
        "hello": 3,
        "world": 4,
        "greetings": 5,
        "from": 6,
        "outer": 7,
        "space": 8,
    }
    assert feature.stoi == expected_vocab
    # itos must be the exact inverse mapping of stoi
    assert feature.itos == {v: k for k, v in feature.stoi.items()}
    # NOTE(review): for BagOfWords, ``embeddings`` appears to be the same
    # token->id dict rather than a numeric matrix (unlike EmbedText) — confirm
    # this is the intended contract of build_from_benchmark.
    assert feature.embeddings == expected_vocab
def test_transform_qid_posdocid_negdocid(monkeypatch, tmpdir, trec_index, dummy_collection_config):
    """BertText transform should produce WordPiece ids, masks and segment ids.

    FIX: the final ``negdocid`` assertion was physically split across two
    source lines (``... == `` / ``"LA010189-0001" assert ...``), which is a
    syntax error; the statement is reassembled here.
    """
    collection = Collection(dummy_collection_config)
    pipeline_config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
    }
    bm25_run = BM25Grid(trec_index, collection, os.path.join(tmpdir, "searcher"), pipeline_config)
    bm25_run.create()
    folds = {"s1": {"train_qids": ["301"], "predict": {"dev": ["301"], "test": ["301"]}}}
    benchmark = Robust04Benchmark(bm25_run, collection, pipeline_config)
    benchmark.create_and_store_train_and_pred_pairs(folds)
    feature = BertText(tmpdir, tmpdir, pipeline_config, index=trec_index, collection=collection, benchmark=benchmark)
    feature.build_from_benchmark()
    transformed = feature.transform_qid_posdocid_negdocid("301", "LA010189-0001", "LA010189-0001")

    # Positive doc: [CLS] query (padded to maxqlen) [SEP] doc tokens [SEP]
    assert np.array_equal(
        transformed["postoks"],
        [101, 24369, 9986, 0, 0, 0, 102, 24369, 24369, 24369, 7592, 2088, 1010, 14806, 2015, 2013, 6058, 102],
    )
    assert np.array_equal(transformed["posmask"], [1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.array_equal(transformed["possegs"], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.array_equal(transformed["posqmask"], [1, 1, 0, 0, 0])
    assert np.array_equal(transformed["posdmask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

    # Negative doc is the same document here, so its encoding is identical.
    assert np.array_equal(
        transformed["negtoks"],
        [101, 24369, 9986, 0, 0, 0, 102, 24369, 24369, 24369, 7592, 2088, 1010, 14806, 2015, 2013, 6058, 102],
    )
    assert np.array_equal(transformed["negmask"], [1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.array_equal(transformed["negsegs"], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert np.array_equal(transformed["negqmask"], [1, 1, 0, 0, 0])
    assert np.array_equal(transformed["negdmask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert transformed["qid"] == "301"