def test_vectors_deduplicate():
    data = OPS.asarray([[1, 1], [2, 2], [3, 4], [1, 1], [3, 4]], dtype="f")
    v = Vectors(data=data, keys=["a1", "b1", "c1", "a2", "c2"])
    vocab = Vocab()
    vocab.vectors = v
    # duplicate vectors do not use the same keys
    assert (vocab.vectors.key2row[v.strings["a1"]] !=
            vocab.vectors.key2row[v.strings["a2"]])
    assert (vocab.vectors.key2row[v.strings["c1"]] !=
            vocab.vectors.key2row[v.strings["c2"]])
    vocab.deduplicate_vectors()
    # there are three unique vectors
    assert vocab.vectors.shape[0] == 3
    # the uniqued data is the same as the deduplicated data
    assert_equal(
        numpy.unique(OPS.to_numpy(vocab.vectors.data), axis=0),
        OPS.to_numpy(vocab.vectors.data),
    )
    # duplicate vectors use the same keys now
    assert (vocab.vectors.key2row[v.strings["a1"]] == vocab.vectors.key2row[
        v.strings["a2"]])
    assert (vocab.vectors.key2row[v.strings["c1"]] == vocab.vectors.key2row[
        v.strings["c2"]])
    # deduplicating again makes no changes
    vocab_b = vocab.to_bytes()
    vocab.deduplicate_vectors()
    assert vocab_b == vocab.to_bytes()
def test_lookups_to_from_bytes_via_vocab():
    table_name = "test"
    vocab = Vocab()
    vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
    assert table_name in vocab.lookups
    vocab_bytes = vocab.to_bytes()
    new_vocab = Vocab()
    new_vocab.from_bytes(vocab_bytes)
    assert len(new_vocab.lookups) == len(vocab.lookups)
    assert table_name in new_vocab.lookups
    table = new_vocab.lookups.get_table(table_name)
    assert len(table) == 2
    assert table["hello"] == "world"
    assert new_vocab.to_bytes() == vocab_bytes
Exemple #3
0
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    vocab1_b = vocab1.to_bytes()
    vocab2_b = vocab2.to_bytes()
    if strings1 == strings2:
        assert vocab1_b == vocab2_b
    else:
        assert vocab1_b != vocab2_b
    vocab1 = vocab1.from_bytes(vocab1_b)
    assert vocab1.to_bytes() == vocab1_b
    new_vocab1 = Vocab().from_bytes(vocab1_b)
    assert new_vocab1.to_bytes() == vocab1_b
    assert len(new_vocab1.strings) == len(strings1) + 1  # adds _SP
    assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + ["_SP"])
Exemple #4
0
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    vocab1_b = vocab1.to_bytes()
    vocab2_b = vocab2.to_bytes()
    if strings1 == strings2:
        assert vocab1_b == vocab2_b
    else:
        assert vocab1_b != vocab2_b
    vocab1 = vocab1.from_bytes(vocab1_b)
    assert vocab1.to_bytes() == vocab1_b
    new_vocab1 = Vocab().from_bytes(vocab1_b)
    assert new_vocab1.to_bytes() == vocab1_b
    assert len(new_vocab1) == len(strings1)
    assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    vocab1_b = vocab1.to_bytes()
    vocab2_b = vocab2.to_bytes()
    if strings1 == strings2:
        assert vocab1_b == vocab2_b
    else:
        assert vocab1_b != vocab2_b
    vocab1 = vocab1.from_bytes(vocab1_b)
    assert vocab1.to_bytes() == vocab1_b
    new_vocab1 = Vocab().from_bytes(vocab1_b)
    assert new_vocab1.to_bytes() == vocab1_b
    assert len(new_vocab1) == len(strings1)
    assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
Exemple #6
0
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    vocab2 = vocab2.from_bytes(vocab1.to_bytes())
    assert vocab2[strings[0]].norm_ == lex_attr
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    vocab2 = vocab2.from_bytes(vocab1.to_bytes())
    assert vocab2[strings[0]].norm_ == lex_attr
Exemple #8
0
def test_pickle_vocab(strings, lex_attr):
    vocab = Vocab(strings=strings)
    ops = get_current_ops()
    vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
    vocab.vectors = vectors
    vocab[strings[0]].norm_ = lex_attr
    vocab_pickled = pickle.dumps(vocab)
    vocab_unpickled = pickle.loads(vocab_pickled)
    assert vocab.to_bytes() == vocab_unpickled.to_bytes()
    assert vocab_unpickled.vectors.mode == "floret"
Exemple #9
0
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    # Reported in #2153
    vocab = Vocab(strings=strings)
    vocab.from_bytes(vocab.to_bytes())
    assert len(vocab.strings) == len(strings) + 1  # adds _SP
Exemple #10
0
def test_serialize_vocab(en_vocab, text):
    text_hash = en_vocab.strings.add(text)
    vocab_bytes = en_vocab.to_bytes(exclude=["lookups"])
    new_vocab = Vocab().from_bytes(vocab_bytes)
    assert new_vocab.strings[text_hash] == text
    assert new_vocab.to_bytes(exclude=["lookups"]) == vocab_bytes
Exemple #11
0
def test_pickle_vocab(strings, lex_attr):
    vocab = Vocab(strings=strings)
    vocab[strings[0]].norm_ = lex_attr
    vocab_pickled = pickle.dumps(vocab)
    vocab_unpickled = pickle.loads(vocab_pickled)
    assert vocab.to_bytes() == vocab_unpickled.to_bytes()
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    # Reported in #2153
    vocab = Vocab(strings=strings)
    length = len(vocab)
    vocab.from_bytes(vocab.to_bytes())
    assert len(vocab) == length