# Example 1
def test_bagofwords_id2vec(tmpdir, dummy_index):
    """BagOfWords.id2vec (unigram mode): terms outside the seeded vocab count into slot 0."""
    benchmark = DummyBenchmark({})
    tokenizer = AnseriniTokenizer({"name": "anserini", "keepstops": True, "stemmer": "none"})
    extractor_config = {
        "name": "bagofwords",
        "datamode": "unigram",
        "maxqlen": 4,
        "maxdoclen": 800,
        "usecache": False,
    }
    extractor = BagOfWords(
        extractor_config,
        provide={"index": dummy_index, "tokenizer": tokenizer, "benchmark": benchmark},
    )

    # Seed a minimal vocabulary: the pad token plus "dummy" and "doc".
    extractor.stoi = {extractor.pad_tok: extractor.pad, "dummy": 1, "doc": 2}
    extractor.itos = {extractor.pad: extractor.pad_tok, 1: "dummy", 2: "doc"}
    extractor.idf = defaultdict(lambda: 0)

    extractor.qid2toks = {"301": ["dummy", "doc"]}
    doc_tokens = [
        "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
        "outer", "space"
    ]
    extractor.docid2toks = {
        "LA010189-0001": list(doc_tokens),
        "LA010189-0002": list(doc_tokens),
    }

    transformed = extractor.id2vec("301", "LA010189-0001", "LA010189-0001")

    # stoi only knows "dummy" and "doc"; every other token is mapped to index 0.
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed["negdocid"] == "LA010189-0001"
    assert np.array_equal(transformed["query"], [0, 1, 1])
    assert np.array_equal(transformed["posdoc"], [6, 3, 0])
    assert np.array_equal(transformed["negdoc"], [6, 3, 0])
    assert np.array_equal(transformed["query_idf"], [0, 0, 0])
# Example 2
def test_bagofwords_id2vec_trigram(tmpdir, dummy_index):
    """BagOfWords.id2vec (trigram mode): unknown character trigrams count into slot 0,
    and the encoding reflects vocabulary entries added between calls."""
    benchmark = DummyBenchmark({})
    tokenizer = AnseriniTokenizer({"name": "anserini", "keepstops": True, "stemmer": "none"})
    extractor_config = {
        "name": "bagofwords",
        "datamode": "trigram",
        "maxqlen": 4,
        "maxdoclen": 800,
        "usecache": False,
    }
    extractor = BagOfWords(
        extractor_config,
        provide={"index": dummy_index, "tokenizer": tokenizer, "benchmark": benchmark},
    )

    extractor.stoi = {extractor.pad_tok: extractor.pad}
    extractor.itos = {extractor.pad: extractor.pad_tok}
    extractor.idf = defaultdict(lambda: 0)

    extractor.qid2toks = {"301": ["dummy", "doc"]}
    doc_tokens = [
        "dummy", "dummy", "dummy", "hello", "world", "greetings", "from",
        "outer", "space"
    ]
    extractor.docid2toks = {
        "LA010189-0001": list(doc_tokens),
        "LA010189-0002": list(doc_tokens),
    }

    # Seed only the leading trigrams of "dummy" ("#du", "dum", "umm").
    for index, trigram in enumerate(["#du", "dum", "umm"], start=1):
        extractor.stoi[trigram] = index
        extractor.itos[index] = trigram

    transformed = extractor.id2vec("301", "LA010189-0001")

    # With no negative doc id passed, no negdocid entry is produced.
    assert transformed["qid"] == "301"
    assert transformed["posdocid"] == "LA010189-0001"
    assert transformed.get("negdocid") is None

    # All trigrams outside the seeded vocabulary accumulate in slot 0.
    assert np.array_equal(transformed["query"], [5, 1, 1, 1])
    assert np.array_equal(transformed["posdoc"], [
        39, 3, 3, 3
    ])
    assert np.array_equal(transformed["query_idf"], [0, 0, 0, 0])

    # Teach the extractor three more trigrams ("mmy", "my#", "#he").
    for index, trigram in enumerate(["mmy", "my#", "#he"], start=4):
        extractor.stoi[trigram] = index
        extractor.itos[index] = trigram

    transformed = extractor.id2vec("301", "LA010189-0001")
    # The posdoc encoding now has entries for the newly learned trigrams.
    assert np.array_equal(transformed["posdoc"], [32, 3, 3, 3, 3, 3, 1])