def test_vectors_deduplicate():
    """Verify that ``Vocab.deduplicate_vectors`` collapses identical rows.

    The vector table is built with five keys over three distinct rows
    ("a1"/"a2" and "c1"/"c2" carry identical data). The test checks that the
    duplicate keys point at different rows beforehand, share a row afterwards,
    that the table ends up with three rows equal to ``numpy.unique`` of
    itself, and that a second deduplication leaves the serialized vocab
    unchanged.
    """
    rows = [[1, 1], [2, 2], [3, 4], [1, 1], [3, 4]]
    keys = ["a1", "b1", "c1", "a2", "c2"]
    duplicate_pairs = (("a1", "a2"), ("c1", "c2"))

    table = Vectors(data=OPS.asarray(rows, dtype="f"), keys=keys)
    vocab = Vocab()
    vocab.vectors = table

    def row_of(key):
        # Look the table up through the vocab on every call, so the result
        # reflects whatever ``vocab.vectors`` holds at that moment.
        return vocab.vectors.key2row[table.strings[key]]

    # Before deduplication, keys with identical data map to distinct rows.
    for first, second in duplicate_pairs:
        assert row_of(first) != row_of(second)

    vocab.deduplicate_vectors()

    # Only the three unique rows remain.
    assert vocab.vectors.shape[0] == 3

    # The deduplicated table equals its own row-wise unique form.
    unique_rows = numpy.unique(OPS.to_numpy(vocab.vectors.data), axis=0)
    current_rows = OPS.to_numpy(vocab.vectors.data)
    assert_equal(unique_rows, current_rows)

    # After deduplication, keys with identical data share one row.
    for first, second in duplicate_pairs:
        assert row_of(first) == row_of(second)

    # Running deduplication a second time must not alter the vocab.
    serialized_before = vocab.to_bytes()
    vocab.deduplicate_vectors()
    serialized_after = vocab.to_bytes()
    assert serialized_before == serialized_after
# Example #2
# 0
def test_pickle_vocab(strings, lex_attr):
    """Round-trip a ``Vocab`` through pickle and compare the results.

    The vocab is built from ``strings``, given a 10x10 all-zero vector table
    in "floret" mode, and has ``lex_attr`` assigned as the ``norm_`` of the
    lexeme for ``strings[0]``. After pickling and unpickling, the serialized
    bytes must match the original and the vector mode must still be "floret".
    """
    original = Vocab(strings=strings)
    backend = get_current_ops()
    zero_table = backend.xp.zeros((10, 10))
    original.vectors = Vectors(data=zero_table, mode="floret", hash_count=1)
    original[strings[0]].norm_ = lex_attr

    # The pickled payload is produced by this test itself.
    restored = pickle.loads(pickle.dumps(original))

    assert original.to_bytes() == restored.to_bytes()
    assert restored.vectors.mode == "floret"