Exemple #1
0
def test_token_cooccurrence_vectorizer_window_args():
    """An explicit ``power=0.75`` window argument must not change the result.

    The test fits one vectorizer with the plain ``"variable"`` window and
    one that additionally passes ``window_args={"power": 0.75}``, and
    requires the two sparse outputs to agree in every entry.
    NOTE(review): this implies 0.75 is the effective default power --
    confirm against the library.
    """
    default_model = TokenCooccurrenceVectorizer(window_functions="variable")
    explicit_model = TokenCooccurrenceVectorizer(
        window_functions="variable",
        window_args={"power": 0.75},
    )

    default_matrix = default_model.fit_transform(token_data)
    explicit_matrix = explicit_model.fit_transform(token_data)

    # Elementwise inequality of sparse matrices; no stored entries means
    # the two results are identical.
    mismatches = default_matrix != explicit_matrix
    assert mismatches.nnz == 0
def test_token_cooccurrence_vectorizer_info_window():
    """Smoke-test the ``"information"`` window, then pin two known cells.

    First checks that fitting with ``window_function="information"`` yields
    a sparse matrix. Then refits a radius-1, ``"after"``-oriented
    vectorizer on ``token_data`` and checks two specific entries.
    """
    info_model = TokenCooccurrenceVectorizer(window_function="information")
    info_matrix = info_model.fit_transform(token_data)
    assert scipy.sparse.issparse(info_matrix)

    after_model = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="after"
    )
    counts = after_model.fit_transform(token_data)
    # (row, column) -> expected value, checked in the order listed.
    expected_cells = (((0, 2), 8), ((1, 0), 6))
    for (row, col), expected in expected_cells:
        assert counts[row, col] == expected
def test_token_cooccurrence_vectorizer_max_freq():
    """Fit with ``max_frequency=0.2`` and verify the output is sparse.

    Afterwards a separate radius-1, ``"after"``-oriented vectorizer is
    fitted on ``token_data`` and two of its entries are checked.
    """
    capped_model = TokenCooccurrenceVectorizer(max_frequency=0.2)
    capped_matrix = capped_model.fit_transform(token_data)
    assert scipy.sparse.issparse(capped_matrix)

    after_model = TokenCooccurrenceVectorizer(
        window_radius=1,
        window_orientation="after",
    )
    after_matrix = after_model.fit_transform(token_data)
    first_cell = after_matrix[0, 2]
    assert first_cell == 8
    second_cell = after_matrix[1, 0]
    assert second_cell == 6
Exemple #4
0
def test_token_cooccurrence_vectorizer_variable_window():
    """The ``"variable"`` window function must produce a sparse matrix.

    The second part fits an un-normalised, radius-1, ``"after"``-oriented
    vectorizer on ``token_data`` and pins two entries of the result.
    """
    variable_model = TokenCooccurrenceVectorizer(window_functions="variable")
    variable_matrix = variable_model.fit_transform(token_data)
    assert scipy.sparse.issparse(variable_matrix)

    after_settings = {
        "window_radii": 1,
        "window_orientations": "after",
        "normalize_windows": False,
    }
    after_model = TokenCooccurrenceVectorizer(**after_settings)
    after_matrix = after_model.fit_transform(token_data)
    assert after_matrix[0, 2] == 8
    assert after_matrix[1, 0] == 6
Exemple #5
0
def test_cooccurrence_vectorizer_epsilon():
    """Check the effect of the ``epsilon`` parameter on the output.

    * With ``epsilon=1e-11`` the dense result must be close to the
      ``epsilon=0`` result after ``normalize(..., axis=0, norm="l1")``.
    * With ``epsilon=1`` the result must have no stored entries.
    """
    zero_eps_model = TokenCooccurrenceVectorizer(epsilon=0)
    tiny_eps_model = TokenCooccurrenceVectorizer(epsilon=1e-11)
    unit_eps_model = TokenCooccurrenceVectorizer(epsilon=1)

    zero_eps_dense = zero_eps_model.fit_transform(token_data).toarray()
    expected = normalize(zero_eps_dense, axis=0, norm="l1")
    observed = tiny_eps_model.fit_transform(token_data).toarray()
    assert np.allclose(expected, observed)

    emptied = unit_eps_model.fit_transform(token_data)
    assert emptied.nnz == 0
Exemple #6
0
def test_token_cooccurrence_vectorizer_fixed_tokens():
    """Fit with a caller-supplied ``token_dictionary`` and expect sparse output.

    The second part fits an un-normalised, radius-1, ``"after"``-oriented
    vectorizer (no fixed dictionary) and pins two entries of its result.
    """
    fixed_vocabulary = {1: 0, 2: 1, 3: 2}
    fixed_model = TokenCooccurrenceVectorizer(token_dictionary=fixed_vocabulary)
    fixed_matrix = fixed_model.fit_transform(token_data)
    assert scipy.sparse.issparse(fixed_matrix)

    after_model = TokenCooccurrenceVectorizer(
        window_radii=1,
        window_orientations="after",
        normalize_windows=False,
    )
    after_matrix = after_model.fit_transform(token_data)
    assert after_matrix[0, 2] == 8
    assert after_matrix[1, 0] == 6
def test_token_cooccurrence_vectorizer_text():
    """``fit_transform`` and ``transform`` must agree on ``text_token_data``.

    Checked for a default vectorizer and for a radius-1,
    ``"after"``-oriented one; the latter also has two entries pinned.
    """
    default_model = TokenCooccurrenceVectorizer()
    fitted = default_model.fit_transform(text_token_data)
    assert scipy.sparse.issparse(fitted)
    retransformed = default_model.transform(text_token_data)
    # No stored entries in the inequality matrix means the two are equal.
    assert (fitted != retransformed).nnz == 0

    after_model = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="after"
    )
    after_fitted = after_model.fit_transform(text_token_data)
    after_retransformed = after_model.transform(text_token_data)
    differences = after_fitted != after_retransformed
    assert differences.nnz == 0
    assert after_fitted[1, 2] == 8
    assert after_fitted[0, 1] == 6
Exemple #8
0
def test_token_cooccurrence_vectorizer_kernel_args():
    """An explicit ``p=0.9`` geometric-kernel argument must not change the result.

    Both vectorizers use the ``"geometric"`` kernel with ``normalize=True``
    and ``mask_string="MASK"``; only the second passes ``p`` explicitly.
    NOTE(review): the expected equality implies 0.9 is the effective
    default for ``p`` -- confirm against the library.
    """
    shared_settings = {"kernel_functions": "geometric", "mask_string": "MASK"}
    implicit_model = TokenCooccurrenceVectorizer(
        kernel_args={"normalize": True}, **shared_settings
    )
    explicit_model = TokenCooccurrenceVectorizer(
        kernel_args={"normalize": True, "p": 0.9}, **shared_settings
    )

    implicit_matrix = implicit_model.fit_transform(token_data)
    explicit_matrix = explicit_model.fit_transform(token_data)
    mismatches = implicit_matrix != explicit_matrix
    assert mismatches.nnz == 0
Exemple #9
0
def test_cooccurrence_vectorizer_coo_mem_limit():
    """Setting ``coo_initial_memory="1k"`` must not change the output.

    Two otherwise identical vectorizers (fixed window, ``n_iter=0``,
    un-normalised) are fitted on one seeded random sequence of 100
    integers drawn from ``[0, 10)``; their dense outputs must be close.
    """
    shared_settings = {
        "window_functions": "fixed",
        "n_iter": 0,
        "normalize_windows": False,
    }
    limited_model = TokenCooccurrenceVectorizer(
        coo_initial_memory="1k", **shared_settings
    )
    default_model = TokenCooccurrenceVectorizer(**shared_settings)

    # Fixed seed so both models see the same reproducible document.
    np.random.seed(42)
    document = [np.random.randint(0, 10) for _ in range(100)]
    data = [document]

    limited_dense = limited_model.fit_transform(data).toarray()
    default_dense = default_model.fit_transform(data).toarray()
    assert np.allclose(limited_dense, default_dense)
def test_equality_of_CooccurrenceVectorizers(
    min_token_occurrences,
    max_token_occurrences,
    min_document_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
):
    """Tree and token vectorizers must agree on equivalent input.

    A ``LabelledTreeCooccurrenceVectorizer`` fitted on ``seq_tree_sequence``
    is compared against a ``TokenCooccurrenceVectorizer`` fitted on
    ``text_token_data_permutation``. For each model ``fit_transform`` must
    match ``transform``, and the two models must match each other.

    The arguments are forwarded to both constructors; they are presumably
    supplied by pytest parametrization that is not visible in this snippet.
    """
    # Settings that both constructors accept under the same keyword.
    shared_settings = {
        "window_radius": window_radius,
        "window_orientation": window_orientation,
        "kernel_function": kernel_function,
        "min_occurrences": min_token_occurrences,
        "max_occurrences": max_token_occurrences,
        "mask_string": mask_string,
    }
    tree_model = LabelledTreeCooccurrenceVectorizer(
        max_tree_frequency=max_document_frequency,
        min_tree_occurrences=min_document_occurrences,
        **shared_settings,
    )
    seq_model = TokenCooccurrenceVectorizer(
        max_document_frequency=max_document_frequency,
        min_document_occurrences=min_document_occurrences,
        **shared_settings,
    )

    # Fitted outputs of the two models agree.
    tree_fitted = tree_model.fit_transform(seq_tree_sequence).toarray()
    seq_fitted = seq_model.fit_transform(text_token_data_permutation).toarray()
    assert np.allclose(tree_fitted, seq_fitted)

    # Tree model: fit_transform agrees with transform.
    tree_refitted = tree_model.fit_transform(seq_tree_sequence).toarray()
    tree_transformed = tree_model.transform(seq_tree_sequence).toarray()
    assert np.allclose(tree_refitted, tree_transformed)

    # Token model: fit_transform agrees with transform.
    seq_refitted = seq_model.fit_transform(text_token_data_permutation).toarray()
    seq_transformed = seq_model.transform(text_token_data_permutation).toarray()
    assert np.allclose(seq_refitted, seq_transformed)

    # Transformed outputs of the two models agree.
    tree_final = tree_model.transform(seq_tree_sequence).toarray()
    seq_final = seq_model.transform(text_token_data_permutation).toarray()
    assert np.allclose(tree_final, seq_final)
Exemple #11
0
def test_reverse_cooccurrence_vectorizer():
    """An ``"after"`` window must be the transpose of a ``"before"`` window.

    Two un-normalised radius-2 vectorizers with the ``"harmonic"`` kernel
    and no mask string are fitted on ``text_token_data``; they differ only
    in window orientation. The transposed ``"after"`` matrix must be close
    to the ``"before"`` matrix.
    """
    seq_model1 = TokenCooccurrenceVectorizer(
        window_radii=2,
        window_orientations="after",
        kernel_functions="harmonic",
        mask_string=None,
        normalize_windows=False,
    )
    seq_model2 = TokenCooccurrenceVectorizer(
        window_radii=2,
        window_orientations="before",
        kernel_functions="harmonic",
        mask_string=None,
        normalize_windows=False,
    )
    # Compare the dense arrays directly; the stray trailing commas that
    # wrapped each operand in a one-element tuple have been removed.
    reversed_after = seq_model1.fit_transform(text_token_data).toarray().T
    before = seq_model2.fit_transform(text_token_data).toarray()
    assert np.allclose(reversed_after, before)
Exemple #12
0
def test_token_cooccurrence_vectorizer_offset(kernel_function):
    """A radius-2 window must equal radius-1 plus radius-2-with-offset-1.

    All three vectorizers are un-normalised and share ``kernel_function``
    (presumably supplied by pytest parametrization not visible here). The
    sum of the radius-1 result and the radius-2 result with
    ``kernel_args={"offset": 1}`` must be close to the plain radius-2 result.
    """
    inner_model = TokenCooccurrenceVectorizer(
        kernel_functions=kernel_function,
        window_radii=1,
        normalize_windows=False,
    )
    full_model = TokenCooccurrenceVectorizer(
        kernel_functions=kernel_function,
        window_radii=2,
        normalize_windows=False,
    )
    outer_model = TokenCooccurrenceVectorizer(
        window_radii=2,
        kernel_functions=kernel_function,
        kernel_args={"offset": 1},
        normalize_windows=False,
    )

    inner_part = inner_model.fit_transform(token_data)
    outer_part = outer_model.fit_transform(token_data)
    combined = (inner_part + outer_part).toarray()
    expected = full_model.fit_transform(token_data).toarray()
    assert np.allclose(combined, expected)
Exemple #13
0
def test_equality_of_CooccurrenceVectorizers(
    n_iter,
    normalize_windows,
    kernel_function,
    n_threads,
):
    """Token, timed-token and multiset vectorizers must give the same matrix.

    Each model is built with the same two-window configuration (radii
    ``[1, 3]``, window functions ``["fixed", "variable"]``) and fitted on
    its own variant of the tiny dataset. The token model's fitted output is
    the baseline that every other ``fit_transform`` / ``transform`` result
    must be close to.

    The arguments are forwarded to the constructors; they are presumably
    supplied by pytest parametrization that is not visible in this snippet.
    """
    window_radius = [1, 3]
    window_function = ["fixed", "variable"]

    shared_settings = {
        "window_radii": window_radius,
        "kernel_functions": kernel_function,
        "window_functions": window_function,
        "n_iter": n_iter,
        "normalize_windows": normalize_windows,
        "n_threads": n_threads,
    }
    model1 = TokenCooccurrenceVectorizer(**shared_settings)
    model2 = TimedTokenCooccurrenceVectorizer(**shared_settings)
    model3 = MultiSetCooccurrenceVectorizer(**shared_settings)

    base_result = model1.fit_transform(tiny_token_data).toarray()

    # The other two models, fitted on their own data format, match the baseline.
    other_fits = (
        (model2, timed_tiny_token_data),
        (model3, tiny_multi_token_data),
    )
    for model, dataset in other_fits:
        assert np.allclose(
            base_result,
            model.fit_transform(dataset).toarray(),
        )

    # Transforming the training data with each fitted model matches as well.
    all_transforms = (
        (model1, tiny_token_data),
        (model2, timed_tiny_token_data),
        (model3, tiny_multi_token_data),
    )
    for model, dataset in all_transforms:
        assert np.allclose(
            base_result,
            model.transform(dataset).toarray(),
        )
Exemple #14
0
def test_equality_of_Tree_and_Token_CooccurrenceVectorizers(
    min_token_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
    nullify_mask,
):
    """Tree and token vectorizers must agree on equivalent input.

    A ``LabelledTreeCooccurrenceVectorizer`` fitted on ``seq_tree_sequence``
    is compared against an un-normalised ``TokenCooccurrenceVectorizer``
    fitted on ``text_token_data_permutation``. The fitted outputs must
    match, and for each model ``fit_transform`` must match ``transform``.

    The arguments are forwarded to both constructors; they are presumably
    supplied by pytest parametrization that is not visible in this snippet.
    """
    # Mask nullification is only requested when a mask string is set.
    effective_nullify_mask = nullify_mask and mask_string is not None

    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_tree_frequency=max_document_frequency,
        mask_string=mask_string,
        nullify_mask=effective_nullify_mask,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radii=window_radius,
        window_orientations=window_orientation,
        kernel_functions=kernel_function,
        min_occurrences=min_token_occurrences,
        max_document_frequency=max_document_frequency,
        mask_string=mask_string,
        normalize_windows=False,
        nullify_mask=effective_nullify_mask,
    )
    # Fitted outputs of the two models agree.
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        seq_model.fit_transform(text_token_data_permutation).toarray(),
    )
    # Tree model: fit_transform agrees with transform.
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        tree_model.transform(seq_tree_sequence).toarray(),
    )
    # Token model: fit_transform agrees with transform.
    assert np.allclose(
        seq_model.fit_transform(text_token_data_permutation).toarray(),
        seq_model.transform(text_token_data_permutation).toarray(),
    )
def test_token_cooccurrence_vectorizer_orientation():
    """Check how the window orientations relate to one another.

    * ``"directional"`` with radius 1 yields a ``(4, 8)`` matrix on
      ``text_token_data``, with a count of 1 for "pok" preceded by "wer".
    * The ``"after"`` matrix equals the transpose of the ``"before"`` matrix.
    * The ``"symmetric"`` matrix equals ``before + after``.
    """
    directional_model = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="directional"
    )
    directional = directional_model.fit_transform(text_token_data)
    assert directional.shape == (4, 8)

    # Check the pok preceded by wer value is 1
    pok_row = directional_model.token_label_dictionary_["pok"]
    pre_wer_col = directional_model.column_label_dictionary_["pre_wer"]
    assert directional[pok_row, pre_wer_col] == 1

    before_model = TokenCooccurrenceVectorizer(window_orientation="before")
    result_before = before_model.fit_transform(text_token_data)
    after_model = TokenCooccurrenceVectorizer(window_orientation="after")
    result_after = after_model.fit_transform(text_token_data)
    transposed_before = result_before.transpose()
    assert np.all(result_after.toarray() == transposed_before.toarray())

    symmetric_model = TokenCooccurrenceVectorizer(window_orientation="symmetric")
    result_symmetric = symmetric_model.fit_transform(text_token_data)
    summed = result_before + result_after
    assert np.all(result_symmetric.toarray() == summed.toarray())
Exemple #16
0
def test_token_cooccurrence_vectorizer_orientation():
    """Check how the window orientations relate to one another.

    All vectorizers are un-normalised with radius 1 and are fitted on
    ``text_token_data``.

    * ``"directional"`` yields a ``(4, 8)`` matrix, with a count of 1 for
      "pok" preceded by "wer".
    * The ``"after"`` matrix equals the transpose of the ``"before"`` matrix.
    * The ``"directional"`` matrix equals ``before`` and ``after`` stacked
      side by side.
    """

    def build(orientation):
        # Same configuration for every model; only the orientation varies.
        return TokenCooccurrenceVectorizer(
            window_radii=1,
            window_orientations=orientation,
            normalize_windows=False,
        )

    directional_model = build("directional")
    directional = directional_model.fit_transform(text_token_data)
    assert directional.shape == (4, 8)

    # Check the pok preceded by wer value is 1
    pok_row = directional_model.token_label_dictionary_["pok"]
    pre_wer_col = directional_model.column_label_dictionary_["pre_0_wer"]
    assert directional[pok_row, pre_wer_col] == 1

    result_before = build("before").fit_transform(text_token_data)
    result_after = build("after").fit_transform(text_token_data)
    transposed_before = result_before.transpose()
    assert np.all(result_after.toarray() == transposed_before.toarray())

    stacked = np.hstack([result_before.toarray(), result_after.toarray()])
    assert np.all(directional.toarray() == stacked)
Exemple #17
0
def test_token_cooccurrence_vectorizer_excessive_prune():
    """Fitting with ``min_frequency=1.0`` must raise ``ValueError``.

    NOTE(review): going by the test name, the error is presumably raised
    because this threshold prunes every token -- confirm against the library.
    """
    vectorizer = TokenCooccurrenceVectorizer(min_frequency=1.0)
    with pytest.raises(ValueError):
        # The return value is irrelevant: the call is expected to raise.
        vectorizer.fit_transform(token_data)
Exemple #18
0
def test_token_cooccurence_vectorizer_transform_new_vocab():
    """Transforming ``text_token_data_new_token`` must match the fitted result.

    The vectorizer is fitted on ``text_token_data_subset``; transforming
    ``text_token_data_new_token`` must give an identical sparse matrix.
    NOTE(review): presumably the second dataset only adds tokens unseen at
    fit time, which should be ignored -- confirm against the fixtures.
    """
    model = TokenCooccurrenceVectorizer()
    fitted = model.fit_transform(text_token_data_subset)
    transformed = model.transform(text_token_data_new_token)
    # No stored entries in the inequality matrix means the two are equal.
    mismatches = fitted != transformed
    assert mismatches.nnz == 0
Exemple #19
0
def test_token_cooccurrence_vectorizer_transform():
    """Transforming the full data keeps the shape learned from the subset.

    A ``"symmetric"`` vectorizer is fitted on ``text_token_data_subset`` and
    then applied to ``text_token_data``; the shapes must match and the
    top-left entry of the transformed matrix must be 34.
    """
    model = TokenCooccurrenceVectorizer(window_orientation='symmetric')
    fitted = model.fit_transform(text_token_data_subset)
    transformed = model.transform(text_token_data)
    fitted_shape, transformed_shape = fitted.shape, transformed.shape
    assert fitted_shape == transformed_shape
    top_left = transformed[0, 0]
    assert top_left == 34
Exemple #20
0
def test_token_cooccurrence_vectorizer_mixed():
    """Fitting a default vectorizer on ``mixed_token_data`` must raise ``ValueError``."""
    model = TokenCooccurrenceVectorizer()
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        model.fit_transform(mixed_token_data)
Exemple #21
0
def test_token_cooccurrence_vectorizer_transform():
    """Transforming the full data keeps the shape learned from the subset.

    A default vectorizer is fitted on ``text_token_data_subset`` and then
    applied to ``text_token_data``; the shapes must match and the top-left
    entry of the transformed matrix must be 34.
    """
    model = TokenCooccurrenceVectorizer()
    fitted = model.fit_transform(text_token_data_subset)
    transformed = model.transform(text_token_data)
    fitted_shape, transformed_shape = fitted.shape, transformed.shape
    assert fitted_shape == transformed_shape
    top_left = transformed[0, 0]
    assert top_left == 34