Ejemplo n.º 1
0
def test_get_vector_dimension():
    """The reported vector dimension matches the loaded vectors (dim 3)."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path)
        storage.load_all()
        dim = storage.get_vector_dim()
        assert dim == storage["."].shape[0]
        assert dim == 3
Ejemplo n.º 2
0
def test_basic_not_initialized():
    """Looking up tokens before any load raises RuntimeError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path)
        # Both lookup entry points must fail on an unloaded storage.
        for lookup in (lambda: storage["."],
                       lambda: storage.token_to_vector(".")):
            with pytest.raises(RuntimeError):
                lookup()
Ejemplo n.º 3
0
def test_basic_max_vectors_bigger_than_num_lines():
    """A max_vectors larger than the file still loads every vector."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path, max_vectors=20)
        storage.load_all()
        assert len(storage._vectors) == 4
        for token in (".", "'", ":", ","):
            assert token in storage._vectors
Ejemplo n.º 4
0
def test_basic_path_none_cache_doesnt_exist(tmpdir):
    """With path=None and a missing cache file, load_all raises ValueError."""
    cache_path = os.path.join(tmpdir, "cache.t")
    assert os.path.exists(tmpdir)
    assert not os.path.exists(cache_path)

    storage = vectorizer.WordVectors(path=None, cache_path=cache_path)
    with pytest.raises(ValueError):
        storage.load_all()
Ejemplo n.º 5
0
def test_basic_token_none():
    """A None token raises ValueError when no default vector function is set."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path,
                                         default_vector_function=None)
        storage.load_all()
        # Both lookup entry points must reject None.
        for lookup in (lambda: storage[None],
                       lambda: storage.token_to_vector(None)):
            with pytest.raises(ValueError):
                lookup()
Ejemplo n.º 6
0
def test_basic_max_vectors_vocab():
    """With max_vectors=2 only the first two vocab tokens receive vectors."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path, max_vectors=2)
        storage.load_vocab([".", ":", ","])
        assert len(storage._vectors) == 2
        for token in (".", ":"):
            assert token in storage._vectors
        for token in ("'", ","):
            assert token not in storage._vectors
Ejemplo n.º 7
0
def test_get_embedding_matrix(tokens, expected_matrix, expected_shape):
    """An embedding matrix built from a vocab has the expected shape/values."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path)
        storage.load_all()

        matrix = storage.get_embedding_matrix(vocab=tokens)
        assert matrix.shape == expected_shape
        assert np.allclose(a=matrix,
                           b=expected_matrix,
                           rtol=0,
                           atol=1e-6)
Ejemplo n.º 8
0
def test_basic_token_default():
    """Unknown tokens fall back to the zeros default vector."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(
            path=path,
            default_vector_function=vectorizer.zeros_default_vector)
        storage.load_all()
        # "a" is not in the data file, so the default kicks in.
        assert "a" not in storage._vectors
        assert storage["a"].shape == (3, )
        assert np.allclose(a=storage.token_to_vector("a"),
                           b=np.zeros(3),
                           rtol=0,
                           atol=1.0e-6)
Ejemplo n.º 9
0
def test_load_plain_text():
    """A plain-text (binary=False) vector file loads all four vectors."""
    filename = "test.txt"
    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, filename)
        with open(file_path, mode="w") as file:
            assert os.path.exists(file_path)
            file.writelines(BASIC_VECT_DATA_PLAIN)

        vec_storage = vectorizer.WordVectors(file_path, binary=False)
        vec_storage.load_all()

    assert len(vec_storage) == 4

    # Every token in the fixture must round-trip exactly.
    for token in BASIC_VECT_DATA_DICT:
        assert np.all(BASIC_VECT_DATA_DICT[token] == vec_storage[token])
Ejemplo n.º 10
0
def test_basic_load_all_vectors():
    """load_all reads every vector with the right shape and contents."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path)
        storage.load_all()
        assert len(storage._vectors) == 4

        dot_vector = storage["."]
        comma_vector = storage.token_to_vector(",")
        assert dot_vector.shape == (3, )
        assert comma_vector.shape == (3, )
        assert np.allclose(a=dot_vector,
                           b=BASIC_VECT_DATA_DICT["."],
                           rtol=0,
                           atol=1.0e-6)
        assert np.allclose(a=comma_vector,
                           b=BASIC_VECT_DATA_DICT[","],
                           rtol=0,
                           atol=1.0e-6)
Ejemplo n.º 11
0
def test_basic_cache_vocab():
    """load_vocab writes a cache file with one line per vocab token."""
    with tempfile.TemporaryDirectory() as base:
        with create_temp_vect_file(vect_file_name="vect1",
                                   file_data=BASIC_VECT_DATA,
                                   base_dir=base) as vect_path:
            assert os.path.exists(vect_path)
            cache_path = os.path.join(base, "cache.t")
            assert not os.path.exists(cache_path)

            storage = vectorizer.WordVectors(path=vect_path,
                                             cache_path=cache_path)
            storage.load_vocab([".", ":", ","])

            # Loading the vocab must have materialized the cache file.
            assert os.path.exists(cache_path)
            with open(cache_path, "rb") as cache_file:
                assert len(cache_file.readlines()) == 3
Ejemplo n.º 12
0
def test_basic_cache_max_vectors(tmpdir):
    """The cache holds only max_vectors entries, in original file order."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA,
                               base_dir=tmpdir) as vect_path:
        assert os.path.exists(vect_path)
        cache_path = os.path.join(tmpdir, "cache.t")
        assert not os.path.exists(cache_path)

        storage = vectorizer.WordVectors(path=vect_path,
                                         max_vectors=2,
                                         cache_path=cache_path)
        storage.load_all()
        assert os.path.exists(cache_path)

        with open(cache_path, "rb") as cache_file:
            lines = cache_file.readlines()
        assert len(lines) == 2
        # First cached entry is the first token of the data file.
        word, *values = lines[0].split(b" ")
        assert word == b"."
        assert len(values) == 3
Ejemplo n.º 13
0
def test_basic_load_vocab():
    """load_vocab restricts lookups to exactly the given tokens."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path,
                                         default_vector_function=None)
        storage.load_vocab(vocab=[".", ":"])
        assert len(storage._vectors) == 2
        assert storage["."].shape == (3, )
        assert storage.token_to_vector(":").shape == (3, )
        assert np.allclose(a=storage[":"],
                           b=BASIC_VECT_DATA_DICT[":"],
                           rtol=0,
                           atol=1.0e-6)
        assert np.allclose(a=storage.token_to_vector("."),
                           b=BASIC_VECT_DATA_DICT["."],
                           rtol=0,
                           atol=1.0e-6)

        # Tokens outside the vocab are missing and have no default fallback.
        for lookup in (lambda: storage[","],
                       lambda: storage.token_to_vector(",")):
            with pytest.raises(KeyError):
                lookup()
Ejemplo n.º 14
0
def test_basic_diff_dimensions():
    """Vectors of differing dimensionality make load_all raise RuntimeError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=DIFF_DIM_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path)
        with pytest.raises(RuntimeError):
            storage.load_all()
Ejemplo n.º 15
0
def test_basic_load_vocab_none():
    """Passing vocab=None to load_vocab raises ValueError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path)
        with pytest.raises(ValueError):
            storage.load_vocab(vocab=None)
Ejemplo n.º 16
0
def test_get_vector_dim_not_initialized_vector_storage():
    """Querying the vector dimension before loading raises RuntimeError."""
    with create_temp_vect_file(vect_file_name="vect1",
                               file_data=BASIC_VECT_DATA) as path:
        storage = vectorizer.WordVectors(path=path)
        with pytest.raises(RuntimeError):
            storage.get_vector_dim()
Ejemplo n.º 17
0
def test_basic_both_paths_none():
    """With neither a vector path nor a cache path, load_all raises ValueError."""
    storage = vectorizer.WordVectors(path=None, cache_path=None)
    with pytest.raises(ValueError):
        storage.load_all()