Beispiel #1
0
def test_get_standardizing_inverse():
    """Check a few lookups in the mapping built by get_standardizing_inverse.

    The mapping is built from VOCABULARY_FILE using standardize_text with
    Porter stemming; each stemmed key is expected to yield the listed term.
    """

    def _porter_standardize(raw):
        # Standardizer handed to get_standardizing_inverse.
        return tokenization.standardize_text(raw, stemming="porter_stemmer")

    inverse = tokenization.get_standardizing_inverse(
        VOCABULARY_FILE, _porter_standardize
    )
    # Stemmed key -> term expected from the inverse mapping.
    expected_pairs = {
        "memori": "memory",
        "work memori": "working memory",
        "nerv": "nerves",
    }
    for stemmed, original in expected_pairs.items():
        assert inverse[stemmed] == original
Beispiel #2
0
def test_standardize_text():
    """Check standardize_text (default arguments) on one sample string.

    The sample mixes upper/lower case, a hyphen, a colon, dashes, a newline
    and a tab; the expected result is pinned below.
    """
    raw = "One a the Word abcd-eft: --\nhello\t 1240"
    expected = "one word abcd eft hello 1240"
    standardized = tokenization.standardize_text(raw)
    assert standardized == expected