Exemple #1
0
def test_get_vector_dimension():
    """The reported vector dimension matches the stored vectors (dim 3)."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path)
        word_vectors.load_all()
        # Hoist the dimension once; both checks refer to the same value.
        dim = word_vectors.get_vector_dim()
        assert dim == word_vectors["."].shape[0]
        assert dim == 3
Exemple #2
0
def test_basic_not_initialized():
    """Lookups on a storage that was never loaded raise RuntimeError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        uninitialized = vectorizer.WordVectors(path=path)
        # Both the __getitem__ and the explicit API path must refuse lookups.
        with pytest.raises(RuntimeError):
            uninitialized["."]
        with pytest.raises(RuntimeError):
            uninitialized.token_to_vector(".")
Exemple #3
0
def test_basic_max_vectors_bigger_than_num_lines():
    """A max_vectors larger than the file caps at the actual vector count."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path, max_vectors=20)
        word_vectors.load_all()
        assert len(word_vectors._vectors) == 4
        # Every token from the fixture data must have been loaded.
        for token in (".", "'", ":", ","):
            assert token in word_vectors._vectors
Exemple #4
0
def test_basic_path_none_cache_doesnt_exist(tmpdir):
    """Loading with no vector file and a missing cache raises ValueError."""
    cache_path = os.path.join(tmpdir, "cache.t")
    # Sanity: the temp dir exists but the cache file does not.
    assert os.path.exists(tmpdir)
    assert not os.path.exists(cache_path)

    word_vectors = vectorizer.WordVectors(path=None, cache_path=cache_path)
    with pytest.raises(ValueError):
        word_vectors.load_all()
Exemple #5
0
def test_basic_token_none():
    """Without a default-vector function, a None token raises ValueError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path,
                                              default_vector_function=None)
        word_vectors.load_all()
        # Both lookup entry points must reject None.
        with pytest.raises(ValueError):
            word_vectors[None]
        with pytest.raises(ValueError):
            word_vectors.token_to_vector(None)
Exemple #6
0
def test_basic_max_vectors_vocab():
    """load_vocab honours max_vectors, keeping only the first entries."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path, max_vectors=2)
        word_vectors.load_vocab([".", ":", ","])
        assert len(word_vectors._vectors) == 2
        # Only the first two vocab entries fit under max_vectors.
        for kept in (".", ":"):
            assert kept in word_vectors._vectors
        for dropped in ("'", ","):
            assert dropped not in word_vectors._vectors
Exemple #7
0
def test_get_embedding_matrix(tokens, expected_matrix, expected_shape):
    """get_embedding_matrix builds a matrix matching the expected values."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path)
        word_vectors.load_all()

        matrix = word_vectors.get_embedding_matrix(vocab=tokens)
        assert matrix.shape == expected_shape
        # Exact-match tolerance: rtol=0 leaves only the tiny absolute slack.
        assert np.allclose(a=matrix, b=expected_matrix, rtol=0, atol=1e-6)
Exemple #8
0
def test_basic_token_default():
    """Unknown tokens fall back to the zeros default-vector function."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(
            path=path,
            default_vector_function=vectorizer.zeros_default_vector)
        word_vectors.load_all()
        # "a" is not in the fixture data, so the default kicks in.
        assert "a" not in word_vectors._vectors
        assert word_vectors["a"].shape == (3, )
        assert np.allclose(a=word_vectors.token_to_vector("a"),
                           b=np.zeros(3), rtol=0, atol=1.0e-6)
Exemple #9
0
def test_load_plain_text():
    """Plain-text (binary=False) vector files load correctly."""
    filename = "test.txt"
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, filename)
        with open(file_path, mode="w") as out_file:
            assert os.path.exists(file_path)
            out_file.writelines(BASIC_VECT_DATA_PLAIN)

        storage = vectorizer.WordVectors(file_path, binary=False)
        storage.load_all()

    # The storage stays usable after the temp dir is gone.
    assert len(storage) == 4

    for token, expected_vec in BASIC_VECT_DATA_DICT.items():
        assert np.all(expected_vec == storage[token])
Exemple #10
0
def test_basic_load_all_vectors():
    """load_all reads every vector and lookups return the stored values."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path)
        word_vectors.load_all()
        assert len(word_vectors._vectors) == 4
        # Shape checks via both lookup entry points.
        assert word_vectors["."].shape == (3, )
        assert word_vectors.token_to_vector(",").shape == (3, )
        # Value checks against the fixture dictionary.
        assert np.allclose(a=word_vectors["."],
                           b=BASIC_VECT_DATA_DICT["."],
                           rtol=0, atol=1.0e-6)
        assert np.allclose(a=word_vectors.token_to_vector(","),
                           b=BASIC_VECT_DATA_DICT[","],
                           rtol=0, atol=1.0e-6)
Exemple #11
0
def test_basic_cache_vocab():
    """load_vocab writes exactly the requested tokens to the cache file."""
    with tempfile.TemporaryDirectory() as base:
        with create_temp_vect_file(vect_file_name="vect1",
                                   file_data=BASIC_VECT_DATA,
                                   base_dir=base) as path:
            assert os.path.exists(path)
            cache_path = os.path.join(base, "cache.t")
            assert not os.path.exists(cache_path)

            word_vectors = vectorizer.WordVectors(path=path,
                                                  cache_path=cache_path)
            word_vectors.load_vocab([".", ":", ","])

            # Loading the vocab must have created the cache file.
            assert os.path.exists(cache_path)

            # One cached line per vocab token.
            with open(cache_path, "rb") as cache_file:
                assert len(cache_file.readlines()) == 3
Exemple #12
0
def test_basic_cache_max_vectors(tmpdir):
    """With max_vectors set, only that many vectors reach the cache file."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA,
                               base_dir=tmpdir) as path:
        assert os.path.exists(path)
        cache_path = os.path.join(tmpdir, "cache.t")
        assert not os.path.exists(cache_path)

        word_vectors = vectorizer.WordVectors(path=path,
                                              max_vectors=2,
                                              cache_path=cache_path)
        word_vectors.load_all()
        assert os.path.exists(cache_path)

        with open(cache_path, "rb") as cache_file:
            cached_lines = cache_file.readlines()
        assert len(cached_lines) == 2
        # First cached line: token, then its vector components.
        word, *values = cached_lines[0].split(b" ")
        assert word == b"."
        assert len(values) == 3
Exemple #13
0
def test_basic_load_vocab():
    """load_vocab loads only the vocab tokens; other tokens raise KeyError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path,
                                              default_vector_function=None)
        word_vectors.load_vocab(vocab=[".", ":"])
        assert len(word_vectors._vectors) == 2
        # Shape checks via both lookup entry points.
        assert word_vectors["."].shape == (3, )
        assert word_vectors.token_to_vector(":").shape == (3, )
        # Value checks against the fixture dictionary.
        assert np.allclose(a=word_vectors[":"],
                           b=BASIC_VECT_DATA_DICT[":"],
                           rtol=0, atol=1.0e-6)
        assert np.allclose(a=word_vectors.token_to_vector("."),
                           b=BASIC_VECT_DATA_DICT["."],
                           rtol=0, atol=1.0e-6)

        # "," exists in the file but was not in the vocab, and there is no
        # default-vector fallback, so every lookup path must fail.
        with pytest.raises(KeyError):
            word_vectors[","]
        with pytest.raises(KeyError):
            word_vectors.token_to_vector(",")
Exemple #14
0
def test_basic_diff_dimensions():
    """Vectors of mismatched dimensionality make load_all raise RuntimeError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=DIFF_DIM_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path)
        with pytest.raises(RuntimeError):
            word_vectors.load_all()
Exemple #15
0
def test_basic_load_vocab_none():
    """Passing vocab=None to load_vocab raises ValueError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path)
        with pytest.raises(ValueError):
            word_vectors.load_vocab(vocab=None)
Exemple #16
0
def test_get_vector_dim_not_initialized_vector_storage():
    """get_vector_dim on a storage that was never loaded raises RuntimeError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        word_vectors = vectorizer.WordVectors(path=path)
        with pytest.raises(RuntimeError):
            word_vectors.get_vector_dim()
Exemple #17
0
def test_basic_both_paths_none():
    """With neither a vector path nor a cache path, load_all raises ValueError."""
    word_vectors = vectorizer.WordVectors(path=None, cache_path=None)
    with pytest.raises(ValueError):
        word_vectors.load_all()