def test_bagofwords_id2vec(tmpdir, dummy_index):
    """BagOfWords.id2vec in unigram mode: tokens absent from the vocab are encoded as id 0 (pad)."""
    benchmark = DummyBenchmark({})
    tokenizer = AnseriniTokenizer({"name": "anserini", "keepstops": True, "stemmer": "none"})
    extractor = BagOfWords(
        {"name": "bagofwords", "datamode": "unigram", "maxqlen": 4, "maxdoclen": 800, "usecache": False},
        provide={"index": dummy_index, "tokenizer": tokenizer, "benchmark": benchmark},
    )

    # Hand-build a minimal vocabulary instead of running preprocess():
    # only "<pad>", "dummy" and "doc" are known terms.
    extractor.stoi = {extractor.pad_tok: extractor.pad, "dummy": 1, "doc": 2}
    extractor.itos = {extractor.pad: extractor.pad_tok, 1: "dummy", 2: "doc"}
    extractor.idf = defaultdict(lambda: 0)
    # extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics["title"])
    extractor.qid2toks = {"301": ["dummy", "doc"]}
    doc_terms = ["dummy", "dummy", "dummy", "hello", "world", "greetings", "from", "outer", "space"]
    extractor.docid2toks = {
        "LA010189-0001": list(doc_terms),
        "LA010189-0002": list(doc_terms),
    }

    transformed = extractor.id2vec("301", "LA010189-0001", "LA010189-0001")

    # stoi only knows about the words 'dummy' and 'doc', so every other word is mapped to 0.
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert np.array_equal(transformed["query"], [0, 1, 1])
    assert np.array_equal(transformed["posdoc"], [6, 3, 0])
    assert np.array_equal(transformed["negdoc"], [6, 3, 0])
    assert np.array_equal(transformed["query_idf"], [0, 0, 0])
def test_bagofwords_id2vec_trigram(tmpdir, dummy_index):
    """BagOfWords.id2vec in trigram mode: character trigrams absent from the vocab are encoded as id 0 (pad)."""
    benchmark = DummyBenchmark({})
    tokenizer = AnseriniTokenizer({"name": "anserini", "keepstops": True, "stemmer": "none"})
    extractor = BagOfWords(
        {"name": "bagofwords", "datamode": "trigram", "maxqlen": 4, "maxdoclen": 800, "usecache": False},
        provide={"index": dummy_index, "tokenizer": tokenizer, "benchmark": benchmark},
    )

    extractor.idf = defaultdict(lambda: 0)
    # extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics["title"])
    extractor.qid2toks = {"301": ["dummy", "doc"]}
    doc_terms = ["dummy", "dummy", "dummy", "hello", "world", "greetings", "from", "outer", "space"]
    extractor.docid2toks = {
        "LA010189-0001": list(doc_terms),
        "LA010189-0002": list(doc_terms),
    }
    # Hand-build a minimal trigram vocabulary instead of running preprocess():
    # only the leading trigrams of "dummy" are known.
    extractor.stoi = {extractor.pad_tok: extractor.pad, "#du": 1, "dum": 2, "umm": 3}
    extractor.itos = {extractor.pad: extractor.pad_tok, 1: "#du", 2: "dum", 3: "umm"}

    transformed = extractor.id2vec("301", "LA010189-0001")

    # stoi only knows trigrams from 'dummy', so every other trigram is mapped to 0.
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    # No negative doc was supplied, so no negdocid is present.
    assert transformed.get("negdocid") is None
    assert np.array_equal(transformed["query"], [5, 1, 1, 1])
    # The unknown trigrams in the doc are all encoded as 0 (the pad slot).
    assert np.array_equal(transformed["posdoc"], [39, 3, 3, 3])
    assert np.array_equal(transformed["query_idf"], [0, 0, 0, 0])

    # Learn three more trigrams and re-encode.
    extractor.stoi.update({"mmy": 4, "my#": 5, "#he": 6})
    extractor.itos.update({4: "mmy", 5: "my#", 6: "#he"})
    transformed = extractor.id2vec("301", "LA010189-0001")
    # The posdoc transformation changes to reflect the newly learned trigrams.
    assert np.array_equal(transformed["posdoc"], [32, 3, 3, 3, 3, 3, 1])