Example 1
def test_tokenize_text(trec_index, tmpdir):
    toks_list = [["to", "be", "or", "not", "to", "be"]]
    feature = BagOfWords(tmpdir,
                         tmpdir, {"datamode": "unigram"},
                         index=trec_index)
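    # The final positional flag controls IDF computation; with False,
    # feature.idf stays empty (see the assertion below).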
    feature.build_stoi(toks_list, True, False)
    assert feature.stoi == {"<pad>": 0, "to": 1, "be": 2, "or": 3, "not": 4}

    assert feature.idf == {}
Example 2
def test_tokenize_text_with_calculate_idf(dummy_collection_config, trec_index,
                                          tmpdir):
    toks_list = [["to", "be", "or", "not", "to", "be"]]
    feature = BagOfWords(tmpdir,
                         tmpdir, {"datamode": "unigram"},
                         index=trec_index)
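    # Same vocabulary build as Example 1, but the final flag (True)
    # also computes per-term IDF values.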
    feature.build_stoi(toks_list, True, True)
    assert feature.stoi == {"<pad>": 0, "to": 1, "be": 2, "or": 3, "not": 4}

    assert feature.idf == {
        "be": 1.791759469228055,
        "not": 1.791759469228055,
        "or": 1.791759469228055,
        "to": 1.791759469228055
    }
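(Aside: the repeated constant is exactly the natural log of 6; it presumably comes from the document statistics of the trec_index fixture, and each of the four terms occurs in the same number of documents, so they all share one IDF value.)

import math
assert math.isclose(math.log(6), 1.791759469228055)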
Example 3
def test_tokenize_text_trigram(trec_index, tmpdir):
    toks_list = [["to", "be", "or", "not", "to", "be"]]
    feature = BagOfWords(tmpdir,
                         tmpdir, {"datamode": "trigram"},
                         index=trec_index)
    feature.build_stoi(toks_list, True, False)

    # trigrams would be: ['#to', 'to#', '#be', 'be#', '#or', 'or#', '#no', 'not', 'ot#']
    assert feature.stoi == {
        "<pad>": 0,
        "#to": 1,
        "to#": 2,
        "#be": 3,
        "be#": 4,
        "#or": 5,
        "or#": 6,
        "#no": 7,
        "not": 8,
        "ot#": 9
    }

    assert feature.idf == {}
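For reference, the trigram vocabulary asserted above is what you get by padding each token with '#' and sliding a three-character window over it. A minimal sketch (char_trigrams is a hypothetical helper for illustration, not part of BagOfWords):

def char_trigrams(token):
    # Pad with '#' so word boundaries show up in the n-grams,
    # then take every three-character window.
    padded = f"#{token}#"
    return [padded[i:i + 3] for i in range(len(padded) - 2)]

assert char_trigrams("to") == ["#to", "to#"]
assert char_trigrams("not") == ["#no", "not", "ot#"]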