Esempio n. 1
0
def test_build_from_benchmark_with_trigram(monkeypatch, tmpdir, trec_index,
                                           dummy_collection_config):
    """Check the vocabulary BagOfWords exposes when "datamode" is "trigram".

    The test creates a BM25Grid run under ``tmpdir``, stores train/predict
    pairs for a single fold through Robust04Benchmark, calls
    ``build_from_benchmark(True)`` on the feature and then asserts on the
    resulting ``stoi`` and ``itos`` mappings.
    """
    dummy_collection = Collection(dummy_collection_config)
    config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "trigram",
    }

    searcher_dir = os.path.join(tmpdir, "searcher")
    searcher = BM25Grid(trec_index, dummy_collection, searcher_dir, config)
    searcher.create()

    # A single fold in which query "301" is used for train, dev and test.
    single_fold = {
        "s1": {
            "train_qids": ["301"],
            "predict": {"dev": ["301"], "test": ["301"]},
        }
    }
    robust04 = Robust04Benchmark(searcher, dummy_collection, config)
    robust04.create_and_store_train_and_pred_pairs(single_fold)

    feature = BagOfWords(
        tmpdir,
        tmpdir,
        config,
        index=trec_index,
        collection=dummy_collection,
        benchmark=robust04,
    )
    feature.build_from_benchmark(True)

    # Expected vocabulary in index order: the pad token first, then the
    # '#'-delimited character trigrams, each listed at the index it must map to.
    expected_tokens = [
        "<pad>",
        "#du", "dum", "umm", "mmy", "my#",
        "#do", "doc", "oc#",
        "#he", "hel", "ell", "llo", "lo#",
        "#wo", "wor", "orl", "rld", "ld#",
        "#gr", "gre", "ree", "eet", "eti", "tin", "ing", "ngs", "gs#",
        "#fr", "fro", "rom", "om#",
        "#ou", "out", "ute", "ter", "er#",
        "#sp", "spa", "pac", "ace", "ce#",
    ]
    expected_stoi = {
        token: position for position, token in enumerate(expected_tokens)
    }
    assert feature.stoi == expected_stoi

    # itos must be the exact inverse of stoi.
    inverted = {index: token for token, index in feature.stoi.items()}
    assert feature.itos == inverted
Esempio n. 2
0
def test_build_from_benchmark(monkeypatch, tmpdir, trec_index,
                              dummy_collection_config):
    """Check the vocabulary BagOfWords exposes when "datamode" is "unigram".

    The test creates a BM25Grid run under ``tmpdir``, stores train/predict
    pairs for a single fold through Robust04Benchmark, calls
    ``build_from_benchmark(True)`` on the feature and then asserts on the
    resulting ``stoi``, ``itos`` and ``embeddings`` attributes.
    """
    dummy_collection = Collection(dummy_collection_config)
    config = {
        "indexstops": True,
        "maxthreads": 1,
        "stemmer": "anserini",
        "bmax": 0.2,
        "k1max": 0.2,
        "maxqlen": 5,
        "maxdoclen": 10,
        "keepstops": True,
        "rundocsonly": False,
        "datamode": "unigram",
    }

    searcher_dir = os.path.join(tmpdir, "searcher")
    searcher = BM25Grid(trec_index, dummy_collection, searcher_dir, config)
    searcher.create()

    # A single fold in which query "301" is used for train, dev and test.
    single_fold = {
        "s1": {
            "train_qids": ["301"],
            "predict": {"dev": ["301"], "test": ["301"]},
        }
    }
    robust04 = Robust04Benchmark(searcher, dummy_collection, config)
    robust04.create_and_store_train_and_pred_pairs(single_fold)

    feature = BagOfWords(
        tmpdir,
        tmpdir,
        config,
        index=trec_index,
        collection=dummy_collection,
        benchmark=robust04,
    )
    feature.build_from_benchmark(True)

    # Expected vocabulary in index order, with the pad token at index 0.
    expected_words = [
        "<pad>",
        "dummy",
        "doc",
        "hello",
        "world",
        "greetings",
        "from",
        "outer",
        "space",
    ]
    expected_vocab = {
        word: position for position, word in enumerate(expected_words)
    }
    assert feature.stoi == expected_vocab

    # itos must be the exact inverse of stoi.
    inverted = {index: word for word, index in feature.stoi.items()}
    assert feature.itos == inverted

    # The embeddings attribute is asserted against the same word -> index map.
    assert feature.embeddings == expected_vocab