Exemple #1
0
def test_add_vocab_to_vocab():
    """Check that ``+`` merges two vocabularies, before and after finalization.

    Unfinalized operands: the sum must be unfinalized and its frequencies
    must be the word-wise totals of both operands.
    Finalized operands: the sum must be finalized, carry the specials of both
    operands, and expose seven entries in ``itos``.
    """
    data1 = ["w1", "w2", "w3"]
    data2 = ["a1", "a2"]
    data3 = ["w1", "a2"]
    expected_freq = {"w1": 2, "w2": 1, "w3": 1, "a1": 1, "a2": 2}

    voc1 = vocab.Vocab()
    voc1 += data1
    voc1 += data3

    voc2 = vocab.Vocab()
    voc2 += data2

    voc = voc1 + voc2
    assert not voc.is_finalized
    # Compare the whole mapping at once: iterating only over the words that
    # are present in the result would pass vacuously if the merge dropped a
    # word (or returned no frequencies at all).
    assert voc.get_freqs() == expected_freq

    voc3 = vocab.Vocab(specials=vocab.UNK())
    voc3 += data1
    voc3 += data3
    voc3.finalize()

    voc4 = vocab.Vocab(specials=vocab.PAD())
    voc4 += data2
    voc4.finalize()

    voc = voc3 + voc4
    assert set(voc.specials) == {
        vocab.PAD(),
        vocab.UNK(),
    }
    assert voc.is_finalized
    # Five distinct words across data1/data2/data3 plus the two specials.
    assert len(voc.itos) == 7
Exemple #2
0
def test_special_vocab_symbols():
    """Check the string form and equality of the PAD and UNK specials."""
    pad_default = vocab.PAD()
    unk_default = vocab.UNK()
    assert str(pad_default) == "<PAD>"
    assert str(unk_default) == "<UNK>"

    pad_custom = vocab.PAD("<my_pad>")
    unk_custom = vocab.UNK("<my_unk>")
    assert str(pad_custom) == "<my_pad>"
    assert str(unk_custom) == "<my_unk>"

    # A special built with custom text still compares equal to the default
    # one of the same kind; this holds due to the overloaded hash/eq.
    assert pad_custom == pad_default
    assert unk_custom == unk_default
Exemple #3
0
def test_iadd_vocab_to_vocab():
    """Check that ``+=`` with another vocab merges frequencies and specials."""
    expected_freqs = {"w1": 2, "w2": 1, "w3": 1, "a1": 1, "a2": 1}

    target = vocab.Vocab(specials=vocab.PAD())
    target += ["w1", "w2", "w3"]

    other = vocab.Vocab(specials=vocab.UNK())
    other += ["a1", "a2", "w1"]

    # In-place addition of a whole vocabulary rather than a word list.
    target += other

    assert target.get_freqs() == expected_freqs
    # The specials of both operands must be present afterwards.
    for special in (vocab.PAD(), vocab.UNK()):
        assert special in target.specials
Exemple #4
0
def test_get_stoi_for_unknown_word_default_unk():
    """Check that a word never added to the vocab numericalizes to 1.

    The vocab is built with ``[PAD, UNK]`` as specials, so UNK is the second
    special in the list.
    """
    words = ["tree", "plant", "grass"]
    voc = vocab.Vocab(specials=[vocab.PAD(), vocab.UNK()])

    # Add the words in two steps; "plant" is counted a second time.
    voc = voc + set(words)
    voc = voc + {"plant"}
    voc.finalize()

    unknown_index = voc.numericalize("unknown")
    assert unknown_index == 1
Exemple #5
0
def test_size_after_final_with_specials():
    """Check that a finalized vocab's length counts words and specials."""
    words = ["tree", "plant", "grass"]
    special_symbols = [vocab.PAD(), vocab.UNK()]
    expected_size = len(words) + len(special_symbols)

    voc = vocab.Vocab(specials=special_symbols)
    # "plant" is added twice; the expected size counts it only once.
    voc = voc + set(words)
    voc = voc + {"plant"}
    voc.finalize()

    assert len(voc) == expected_size
Exemple #6
0
def test_max_size_with_specials():
    """Check that ``max_size=2`` caps a finalized vocab with two specials at 2."""
    size_limit = 2
    words = ["tree", "plant", "grass"]

    voc = vocab.Vocab(max_size=size_limit, specials=[vocab.PAD(), vocab.UNK()])
    # Three distinct words are offered even though the limit is two.
    voc = voc + set(words)
    voc = voc + {"plant"}
    voc.finalize()

    assert len(voc) == size_limit
Exemple #7
0
def test_vocab_static_constructors():
    """Check that ``Vocab.from_itos`` / ``Vocab.from_stoi`` rebuild a vocab.

    A finalized vocab is rebuilt from its ``itos`` and then from its ``stoi``;
    each rebuilt vocab must expose the same ``itos``, ``stoi`` and
    ``specials`` as the source vocab.
    """
    words = ["tree", "plant", "grass"]
    voc = vocab.Vocab(specials=[vocab.PAD(), vocab.UNK()])
    voc = voc + set(words)
    voc = voc + {"plant"}
    voc.finalize()

    def assert_same_mappings(rebuilt):
        # Only the frequencies will be different because we don't transfer
        # this information, so a full vocab-to-vocab equality would fail.
        # Perhaps split equality checks for vocab on before/after
        # finalization?
        assert rebuilt.itos == voc.itos
        assert rebuilt.stoi == voc.stoi
        assert rebuilt.specials == voc.specials

    assert_same_mappings(vocab.Vocab.from_itos(voc.itos))
    assert_same_mappings(vocab.Vocab.from_stoi(voc.stoi))
Exemple #8
0
    data = ["tree", "plant", "grass"]
    voc = voc + set(data)
    voc.finalize()

    # Tree is in vocab
    assert len(voc.numericalize("tree")) == 1
    # Apple isn't in vocab
    assert len(voc.numericalize("apple")) == 0
    # Try with list argument
    assert len(voc.numericalize(["tree", "apple"])) == 1


@pytest.mark.parametrize(
    "default_instance, second_default_instance, custom_instance",
    [
        (vocab.UNK(), vocab.UNK(), vocab.UNK("<my_unknown>")),
        (vocab.PAD(), vocab.PAD(), vocab.PAD("<my_pad>")),
        (vocab.BOS(), vocab.BOS(), vocab.BOS("<my_bos>")),
        (vocab.EOS(), vocab.EOS(), vocab.EOS("<my_eos>")),
    ],
)
def test_specials_uniqueness(
    default_instance, second_default_instance, custom_instance
):
    """Check that ``Vocab`` rejects two specials of the same kind.

    Both a pair of default instances and a default paired with a custom-text
    instance must raise ``ValueError``.
    """
    clashing_pairs = (
        [default_instance, second_default_instance],
        [default_instance, custom_instance],
    )
    for clashing_specials in clashing_pairs:
        with pytest.raises(ValueError):
            vocab.Vocab(specials=clashing_specials)


def test_specials_get_pad_symbol():
    voc = vocab.Vocab(specials=(vocab.PAD(),))