def test_specials_get_pad_symbol():
    voc = vocab.Vocab(specials=(vocab.PAD(),))
    data = ["tree", "plant", "grass"]
    voc = voc + set(data)
    assert voc.get_padding_index() == 0
    voc.finalize()
    assert voc.itos[0] == vocab.PAD()


def test_add_vocab_to_vocab():
    data1 = ["w1", "w2", "w3"]
    data2 = ["a1", "a2"]
    data3 = ["w1", "a2"]
    expected_freq = {"w1": 2, "w2": 1, "w3": 1, "a1": 1, "a2": 2}

    voc1 = vocab.Vocab()
    voc1 += data1
    voc1 += data3

    voc2 = vocab.Vocab()
    voc2 += data2

    voc = voc1 + voc2
    assert not voc.is_finalized
    for word in voc.get_freqs():
        assert voc.get_freqs()[word] == expected_freq[word]

    voc3 = vocab.Vocab(specials=vocab.UNK())
    voc3 += data1
    voc3 += data3
    voc3.finalize()

    voc4 = vocab.Vocab(specials=vocab.PAD())
    voc4 += data2
    voc4.finalize()

    voc = voc3 + voc4
    assert set(voc.specials) == {
        vocab.PAD(),
        vocab.UNK(),
    }
    assert voc.is_finalized
    assert len(voc.itos) == 7


def test_special_vocab_symbols():
    assert str(vocab.PAD()) == "<PAD>"
    assert str(vocab.UNK()) == "<UNK>"
    assert str(vocab.PAD("<my_pad>")) == "<my_pad>"
    assert str(vocab.UNK("<my_unk>")) == "<my_unk>"

    # These hold due to overloaded hash/eq
    assert vocab.PAD("<my_pad>") == vocab.PAD()
    assert vocab.UNK("<my_unk>") == vocab.UNK()


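# The two equality asserts above hold even though the surface strings differ.
# Below is a minimal sketch of the underlying idea, assuming specials compare
# and hash by their token class rather than by string value. This is an
# illustrative assumption, not the library's actual implementation; _Special
# and _MyPad are hypothetical names used only for this sketch.
def test_special_eq_hash_concept_sketch():
    class _Special(str):
        default_value = "<SPECIAL>"

        def __new__(cls, token=None):
            # Fall back to the class default when no custom string is given.
            return super().__new__(cls, token if token is not None else cls.default_value)

        def __eq__(self, other):
            # Two tokens are equal iff they are specials of the same class.
            return type(self) is type(other)

        def __hash__(self):
            return hash(type(self))

    class _MyPad(_Special):
        default_value = "<PAD>"

    # Specials of the same class compare equal even with different surface strings ...
    assert _MyPad("<my_pad>") == _MyPad()
    # ... while the string value itself is preserved.
    assert str(_MyPad("<my_pad>")) == "<my_pad>"
    # Hash follows equality, so a set collapses specials of the same class.
    assert len({_MyPad(), _MyPad("<my_pad>")}) == 1

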
def test_iadd_vocab_to_vocab():
    data1 = ["w1", "w2", "w3"]
    data2 = ["a1", "a2", "w1"]
    expected_freqs = {"w1": 2, "w2": 1, "w3": 1, "a1": 1, "a2": 1}

    voc1 = vocab.Vocab(specials=vocab.PAD())
    voc1 += data1

    voc2 = vocab.Vocab(specials=vocab.UNK())
    voc2 += data2

    voc1 += voc2
    assert voc1.get_freqs() == expected_freqs
    assert all(spec in voc1.specials for spec in (vocab.PAD(), vocab.UNK()))


def test_get_stoi_for_unknown_word_default_unk():
    specials = [vocab.PAD(), vocab.UNK()]
    voc = vocab.Vocab(specials=specials)
    data = ["tree", "plant", "grass"]
    voc = (voc + set(data)) + {"plant"}
    voc.finalize()

    # Unknown words map to the UNK index (PAD sits at index 0, UNK at index 1).
    assert voc.numericalize("unknown") == 1


def test_size_after_final_with_specials():
    specials = [vocab.PAD(), vocab.UNK()]
    voc = vocab.Vocab(specials=specials)
    data = ["tree", "plant", "grass"]
    voc = (voc + set(data)) + {"plant"}
    voc.finalize()
    assert len(voc) == len(data) + len(specials)


def test_max_size_with_specials():
    voc = vocab.Vocab(
        max_size=2,
        specials=[vocab.PAD(), vocab.UNK()],
    )
    data = ["tree", "plant", "grass"]
    voc = (voc + set(data)) + {"plant"}
    voc.finalize()
    assert len(voc) == 2


def test_vocab_static_constructors():
    specials = [vocab.PAD(), vocab.UNK()]
    voc = vocab.Vocab(specials=specials)
    data = ["tree", "plant", "grass"]
    voc = (voc + set(data)) + {"plant"}
    voc.finalize()

    itos2voc = vocab.Vocab.from_itos(voc.itos)
    # Only the frequencies will differ because we don't transfer that
    # information, so a full itos2voc == voc check would fail. Perhaps split
    # equality checks for vocab on before/after finalization?
    assert itos2voc.itos == voc.itos
    assert itos2voc.stoi == voc.stoi
    assert itos2voc.specials == voc.specials

    stoi2voc = vocab.Vocab.from_stoi(voc.stoi)
    assert stoi2voc.itos == voc.itos
    assert stoi2voc.stoi == voc.stoi
    assert stoi2voc.specials == voc.specials


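# The comment in test_vocab_static_constructors notes that frequencies are not
# carried over by the static constructors. A minimal sketch of why: an itos
# list and a stoi mapping only encode tokens and their indices, so each can be
# rebuilt losslessly from the other, but neither contains counts. This is an
# illustration of the data involved, not the library's from_itos/from_stoi
# implementation.
def test_itos_stoi_roundtrip_sketch():
    itos = ["<PAD>", "<UNK>", "grass", "plant", "tree"]

    # stoi can always be derived from itos ...
    stoi = {token: index for index, token in enumerate(itos)}
    # ... and itos can be derived back from stoi, but no frequency information
    # exists in either structure, so counts cannot be reconstructed from them.
    rebuilt_itos = [token for token, _ in sorted(stoi.items(), key=lambda kv: kv[1])]

    assert rebuilt_itos == itos
    assert stoi["grass"] == 2

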
    voc = voc + set(data)
    voc.finalize()

    # Tree is in vocab
    assert len(voc.numericalize("tree")) == 1
    # Apple isn't in vocab
    assert len(voc.numericalize("apple")) == 0
    # Try with a list argument; only the in-vocab token is numericalized
    assert len(voc.numericalize(["tree", "apple"])) == 1


@pytest.mark.parametrize(
    "default_instance, second_default_instance, custom_instance",
    [
        (vocab.UNK(), vocab.UNK(), vocab.UNK("<my_unknown>")),
        (vocab.PAD(), vocab.PAD(), vocab.PAD("<my_pad>")),
        (vocab.BOS(), vocab.BOS(), vocab.BOS("<my_bos>")),
        (vocab.EOS(), vocab.EOS(), vocab.EOS("<my_eos>")),
    ],
)
def test_specials_uniqueness(default_instance, second_default_instance, custom_instance):
    # Two specials of the same type are rejected, regardless of their string values.
    with pytest.raises(ValueError):
        vocab.Vocab(specials=[default_instance, second_default_instance])

    with pytest.raises(ValueError):
        vocab.Vocab(specials=[default_instance, custom_instance])


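# test_specials_uniqueness above expects the constructor to reject two specials
# of the same type. The check below is a hypothetical sketch of that kind of
# validation, assuming uniqueness is defined per special class; the real
# constructor's validation logic may differ.
def test_duplicate_special_detection_sketch():
    def _validate(specials):
        if len({type(special) for special in specials}) != len(specials):
            raise ValueError("Multiple specials of the same type are not allowed.")

    # Distinct special types pass the check ...
    _validate([vocab.PAD(), vocab.UNK()])
    # ... while two specials of the same type are rejected.
    with pytest.raises(ValueError):
        _validate([vocab.PAD(), vocab.PAD("<my_pad>")])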