Beispiel #1
0
def test_token_cooccurrence_vectorizer_column_order():
    vectorizer = TokenCooccurrenceVectorizer().fit(text_token_data)
    vectorizer_permuted = TokenCooccurrenceVectorizer().fit(text_token_data_permutation)
    assert (
        vectorizer.token_label_dictionary_
        == vectorizer_permuted.token_label_dictionary_
    )
Beispiel #2
0
def test_equality_of_CooccurrenceVectorizers(
    n_iter,
    normalize_windows,
    kernel_function,
    n_threads,
):
    window_radius = [1, 3]
    window_function = ["fixed", "variable"]

    model1 = TokenCooccurrenceVectorizer(
        window_radii=window_radius,
        n_iter=n_iter,
        kernel_functions=kernel_function,
        window_functions=window_function,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    model2 = TimedTokenCooccurrenceVectorizer(
        window_radii=window_radius,
        kernel_functions=kernel_function,
        window_functions=window_function,
        n_iter=n_iter,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    model3 = MultiSetCooccurrenceVectorizer(
        window_radii=window_radius,
        kernel_functions=kernel_function,
        window_functions=window_function,
        n_iter=n_iter,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    base_result = model1.fit_transform(tiny_token_data).toarray()
    assert np.allclose(
        base_result,
        model2.fit_transform(timed_tiny_token_data).toarray(),
    )
    assert np.allclose(
        base_result,
        model3.fit_transform(tiny_multi_token_data).toarray(),
    )
    assert np.allclose(
        base_result,
        model1.transform(tiny_token_data).toarray(),
    )
    assert np.allclose(
        base_result,
        model2.transform(timed_tiny_token_data).toarray(),
    )
    assert np.allclose(
        base_result,
        model3.transform(tiny_multi_token_data).toarray(),
    )
Beispiel #3
0
def test_cooccurrence_vectorizer_epsilon():
    vectorizer_a = TokenCooccurrenceVectorizer(epsilon=0)
    vectorizer_b = TokenCooccurrenceVectorizer(epsilon=1e-11)
    vectorizer_c = TokenCooccurrenceVectorizer(epsilon=1)
    mat1 = normalize(
        vectorizer_a.fit_transform(token_data).toarray(), axis=0, norm="l1"
    )
    mat2 = vectorizer_b.fit_transform(token_data).toarray()
    assert np.allclose(mat1, mat2)
    assert vectorizer_c.fit_transform(token_data).nnz == 0
def test_token_cooccurrence_vectorizer_text():
    vectorizer = TokenCooccurrenceVectorizer()
    result = vectorizer.fit_transform(text_token_data)
    assert scipy.sparse.issparse(result)
    transform = vectorizer.transform(text_token_data)
    assert (result != transform).nnz == 0
    vectorizer = TokenCooccurrenceVectorizer(window_radius=1,
                                             window_orientation="after")
    result = vectorizer.fit_transform(text_token_data)
    transform = vectorizer.transform(text_token_data)
    assert (result != transform).nnz == 0
    assert result[1, 2] == 8
    assert result[0, 1] == 6
Beispiel #5
0
def test_equality_of_Tree_and_Token_CooccurrenceVectorizers(
    min_token_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
    nullify_mask,
):
    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_tree_frequency=max_document_frequency,
        mask_string=mask_string,
        nullify_mask=nullify_mask and not mask_string is None,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radii=window_radius,
        window_orientations=window_orientation,
        kernel_functions=kernel_function,
        min_occurrences=min_token_occurrences,
        max_document_frequency=max_document_frequency,
        mask_string=mask_string,
        normalize_windows=False,
        nullify_mask=nullify_mask and not mask_string is None,
    )
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        seq_model.fit_transform(text_token_data_permutation).toarray(),
    )
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        tree_model.transform(seq_tree_sequence).toarray(),
    )
    assert np.allclose(
        seq_model.fit_transform(text_token_data_permutation).toarray(),
        seq_model.transform(text_token_data_permutation).toarray(),
    )
Beispiel #6
0
def test_token_cooccurrence_vectorizer_offset(kernel_function):
    vectorizer_a = TokenCooccurrenceVectorizer(
        kernel_functions=kernel_function, window_radii=1, normalize_windows=False
    )
    vectorizer_b = TokenCooccurrenceVectorizer(
        kernel_functions=kernel_function, window_radii=2, normalize_windows=False
    )
    vectorizer_c = TokenCooccurrenceVectorizer(
        window_radii=2,
        kernel_functions=kernel_function,
        kernel_args={"offset": 1},
        normalize_windows=False,
    )
    mat1 = (
        vectorizer_a.fit_transform(token_data) + vectorizer_c.fit_transform(token_data)
    ).toarray()
    mat2 = vectorizer_b.fit_transform(token_data).toarray()
    assert np.allclose(mat1, mat2)
Beispiel #7
0
def test_token_cooccurrence_vectorizer_window_args():
    vectorizer_a = TokenCooccurrenceVectorizer(window_functions="variable")
    vectorizer_b = TokenCooccurrenceVectorizer(
        window_functions="variable", window_args={"power": 0.75}
    )
    assert (
        vectorizer_a.fit_transform(token_data) != vectorizer_b.fit_transform(token_data)
    ).nnz == 0
def test_token_cooccurrence_vectorizer_info_window():
    vectorizer = TokenCooccurrenceVectorizer(window_function="information")
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    vectorizer = TokenCooccurrenceVectorizer(window_radius=1,
                                             window_orientation="after")
    result = vectorizer.fit_transform(token_data)
    assert result[0, 2] == 8
    assert result[1, 0] == 6
def test_token_cooccurrence_vectorizer_max_freq():
    vectorizer = TokenCooccurrenceVectorizer(max_frequency=0.2)
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    vectorizer = TokenCooccurrenceVectorizer(window_radius=1,
                                             window_orientation="after")
    result = vectorizer.fit_transform(token_data)
    assert result[0, 2] == 8
    assert result[1, 0] == 6
Beispiel #10
0
def test_equality_of_CooccurrenceVectorizers(
    min_token_occurrences,
    max_token_occurrences,
    min_document_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
):
    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_occurrences=max_token_occurrences,
        max_tree_frequency=max_document_frequency,
        min_tree_occurrences=min_document_occurrences,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_occurrences=max_token_occurrences,
        max_document_frequency=max_document_frequency,
        min_document_occurrences=min_document_occurrences,
    )
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        seq_model.fit_transform(text_token_data_permutation).toarray(),
    )

    assert np.allclose(
        tree_model.transform(seq_tree_sequence).toarray(),
        seq_model.transform(text_token_data_permutation).toarray(),
    )
Beispiel #11
0
def test_token_cooccurrence_vectorizer_variable_window():
    vectorizer = TokenCooccurrenceVectorizer(window_functions="variable")
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    vectorizer = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="after", normalize_windows=False
    )
    result = vectorizer.fit_transform(token_data)
    assert result[0, 2] == 8
    assert result[1, 0] == 6
Beispiel #12
0
def test_token_cooccurrence_vectorizer_fixed_tokens():
    vectorizer = TokenCooccurrenceVectorizer(token_dictionary={1: 0, 2: 1, 3: 2})
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    vectorizer = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="after", normalize_windows=False
    )
    result = vectorizer.fit_transform(token_data)
    assert result[0, 2] == 8
    assert result[1, 0] == 6
Beispiel #13
0
def test_token_cooccurrence_vectorizer_kernel_args():
    vectorizer_a = TokenCooccurrenceVectorizer(
        kernel_functions="geometric",
        mask_string="MASK",
        kernel_args={"normalize": True},
    )
    vectorizer_b = TokenCooccurrenceVectorizer(
        kernel_functions="geometric",
        kernel_args={"normalize": True, "p": 0.9},
        mask_string="MASK",
    )
    assert (
        vectorizer_a.fit_transform(token_data) != vectorizer_b.fit_transform(token_data)
    ).nnz == 0
Beispiel #14
0
def test_cooccurrence_vectorizer_coo_mem_limit():
    vectorizer_a = TokenCooccurrenceVectorizer(
        window_functions="fixed",
        n_iter=0,
        coo_initial_memory="1k",
        normalize_windows=False,
    )
    vectorizer_b = TokenCooccurrenceVectorizer(
        window_functions="fixed",
        n_iter=0,
        normalize_windows=False,
    )
    np.random.seed(42)
    data = [[np.random.randint(0, 10) for i in range(100)]]
    mat1 = vectorizer_a.fit_transform(data).toarray()
    mat2 = vectorizer_b.fit_transform(data).toarray()
    assert np.allclose(mat1, mat2)
Beispiel #15
0
def test_reverse_cooccurrence_vectorizer():
    seq_model1 = TokenCooccurrenceVectorizer(
        window_radii=2,
        window_orientations="after",
        kernel_functions="harmonic",
        mask_string=None,
        normalize_windows=False,
    )
    seq_model2 = TokenCooccurrenceVectorizer(
        window_radii=2,
        window_orientations="before",
        kernel_functions="harmonic",
        mask_string=None,
        normalize_windows=False,
    )
    reversed_after = (seq_model1.fit_transform(text_token_data).toarray().T,)
    before = (seq_model2.fit_transform(text_token_data).toarray(),)
    assert np.allclose(reversed_after, before)
Beispiel #16
0
def test_token_cooccurrence_vectorizer_max_unique_tokens():
    vectorizer = TokenCooccurrenceVectorizer(max_unique_tokens=2)
    vectorizer.fit(token_data)
    assert vectorizer.token_label_dictionary_ == {1: 0, 2: 1}
Beispiel #17
0
def test_token_cooccurrence_vectorizer_excessive_prune():
    vectorizer = TokenCooccurrenceVectorizer(min_frequency=1.0)
    with pytest.raises(ValueError):
        result = vectorizer.fit_transform(token_data)
Beispiel #18
0
def test_token_cooccurrence_vectorizer_orientation():
    vectorizer = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="directional"
    )
    result = vectorizer.fit_transform(text_token_data)
    assert result.shape == (4, 8)
    # Check the pok preceded by wer value is 1
    row = vectorizer.token_label_dictionary_["pok"]
    col = vectorizer.column_label_dictionary_["pre_wer"]
    assert result[row, col] == 1
    result_before = TokenCooccurrenceVectorizer(
        window_orientation="before"
    ).fit_transform(text_token_data)
    result_after = TokenCooccurrenceVectorizer(
        window_orientation="after"
    ).fit_transform(text_token_data)
    assert np.all(result_after.toarray() == (result_before.transpose()).toarray())
    result_symmetric = TokenCooccurrenceVectorizer(
        window_orientation="symmetric"
    ).fit_transform(text_token_data)
    assert np.all(
        result_symmetric.toarray() == (result_before + result_after).toarray()
    )
Beispiel #19
0
def test_token_cooccurence_vectorizer_transform_new_vocab():
    vectorizer = TokenCooccurrenceVectorizer()
    result = vectorizer.fit_transform(text_token_data_subset)
    transform = vectorizer.transform(text_token_data_new_token)
    assert (result != transform).nnz == 0
Beispiel #20
0
def test_token_cooccurrence_vectorizer_transform():
    vectorizer = TokenCooccurrenceVectorizer(window_orientation='symmetric')
    result = vectorizer.fit_transform(text_token_data_subset)
    transform = vectorizer.transform(text_token_data)
    assert result.shape == transform.shape
    assert transform[0, 0] == 34
Beispiel #21
0
def test_token_cooccurrence_vectorizer_orientation():
    vectorizer = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="directional", normalize_windows=False
    )
    result = vectorizer.fit_transform(text_token_data)
    assert result.shape == (4, 8)
    # Check the pok preceded by wer value is 1
    row = vectorizer.token_label_dictionary_["pok"]
    col = vectorizer.column_label_dictionary_["pre_0_wer"]
    assert result[row, col] == 1
    result_before = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="before", normalize_windows=False
    ).fit_transform(text_token_data)
    result_after = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="after", normalize_windows=False
    ).fit_transform(text_token_data)
    assert np.all(result_after.toarray() == (result_before.transpose()).toarray())
    assert np.all(
        result.toarray() == np.hstack([result_before.toarray(), result_after.toarray()])
    )
Beispiel #22
0
def test_token_cooccurrence_vectorizer_mixed():
    vectorizer = TokenCooccurrenceVectorizer()
    with pytest.raises(ValueError):
        vectorizer.fit_transform(mixed_token_data)
Beispiel #23
0
def test_token_cooccurrence_vectorizer_transform():
    vectorizer = TokenCooccurrenceVectorizer()
    result = vectorizer.fit_transform(text_token_data_subset)
    transform = vectorizer.transform(text_token_data)
    assert result.shape == transform.shape
    assert transform[0, 0] == 34