def test_token_cooccurrence_vectorizer_window_args():
    vectorizer_a = TokenCooccurrenceVectorizer(window_functions="variable")
    vectorizer_b = TokenCooccurrenceVectorizer(
        window_functions="variable", window_args={"power": 0.75}
    )
    assert (
        vectorizer_a.fit_transform(token_data)
        != vectorizer_b.fit_transform(token_data)
    ).nnz == 0

def test_token_cooccurrence_vectorizer_info_window():
    vectorizer = TokenCooccurrenceVectorizer(window_function="information")
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    vectorizer = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="after"
    )
    result = vectorizer.fit_transform(token_data)
    assert result[0, 2] == 8
    assert result[1, 0] == 6

def test_token_cooccurrence_vectorizer_max_freq():
    vectorizer = TokenCooccurrenceVectorizer(max_frequency=0.2)
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    vectorizer = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="after"
    )
    result = vectorizer.fit_transform(token_data)
    assert result[0, 2] == 8
    assert result[1, 0] == 6

def test_token_cooccurrence_vectorizer_variable_window():
    vectorizer = TokenCooccurrenceVectorizer(window_functions="variable")
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    vectorizer = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="after", normalize_windows=False
    )
    result = vectorizer.fit_transform(token_data)
    assert result[0, 2] == 8
    assert result[1, 0] == 6

def test_cooccurrence_vectorizer_epsilon():
    vectorizer_a = TokenCooccurrenceVectorizer(epsilon=0)
    vectorizer_b = TokenCooccurrenceVectorizer(epsilon=1e-11)
    vectorizer_c = TokenCooccurrenceVectorizer(epsilon=1)
    mat1 = normalize(
        vectorizer_a.fit_transform(token_data).toarray(), axis=0, norm="l1"
    )
    mat2 = vectorizer_b.fit_transform(token_data).toarray()
    assert np.allclose(mat1, mat2)
    assert vectorizer_c.fit_transform(token_data).nnz == 0

def test_token_cooccurrence_vectorizer_fixed_tokens():
    vectorizer = TokenCooccurrenceVectorizer(token_dictionary={1: 0, 2: 1, 3: 2})
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    vectorizer = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="after", normalize_windows=False
    )
    result = vectorizer.fit_transform(token_data)
    assert result[0, 2] == 8
    assert result[1, 0] == 6

def test_token_cooccurrence_vectorizer_text():
    vectorizer = TokenCooccurrenceVectorizer()
    result = vectorizer.fit_transform(text_token_data)
    assert scipy.sparse.issparse(result)
    transform = vectorizer.transform(text_token_data)
    assert (result != transform).nnz == 0
    vectorizer = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="after"
    )
    result = vectorizer.fit_transform(text_token_data)
    transform = vectorizer.transform(text_token_data)
    assert (result != transform).nnz == 0
    assert result[1, 2] == 8
    assert result[0, 1] == 6

def test_token_cooccurrence_vectorizer_kernel_args():
    vectorizer_a = TokenCooccurrenceVectorizer(
        kernel_functions="geometric",
        mask_string="MASK",
        kernel_args={"normalize": True},
    )
    vectorizer_b = TokenCooccurrenceVectorizer(
        kernel_functions="geometric",
        kernel_args={"normalize": True, "p": 0.9},
        mask_string="MASK",
    )
    assert (
        vectorizer_a.fit_transform(token_data)
        != vectorizer_b.fit_transform(token_data)
    ).nnz == 0

def test_cooccurrence_vectorizer_coo_mem_limit():
    vectorizer_a = TokenCooccurrenceVectorizer(
        window_functions="fixed",
        n_iter=0,
        coo_initial_memory="1k",
        normalize_windows=False,
    )
    vectorizer_b = TokenCooccurrenceVectorizer(
        window_functions="fixed",
        n_iter=0,
        normalize_windows=False,
    )
    np.random.seed(42)
    data = [[np.random.randint(0, 10) for i in range(100)]]
    mat1 = vectorizer_a.fit_transform(data).toarray()
    mat2 = vectorizer_b.fit_transform(data).toarray()
    assert np.allclose(mat1, mat2)

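# NOTE (assumption): the test below takes pytest arguments, so it needs a stack of
# @pytest.mark.parametrize decorators that are missing from this excerpt. The stack
# here is a minimal reconstruction with illustrative values, not the project's
# actual parameter grid.
@pytest.mark.parametrize("min_token_occurrences", [None, 2])
@pytest.mark.parametrize("max_token_occurrences", [None, 3])
@pytest.mark.parametrize("min_document_occurrences", [None, 1])
@pytest.mark.parametrize("max_document_frequency", [None, 0.7])
@pytest.mark.parametrize("window_radius", [1, 2])
@pytest.mark.parametrize("window_orientation", ["before", "after"])
@pytest.mark.parametrize("kernel_function", ["flat", "harmonic"])
@pytest.mark.parametrize("mask_string", [None, "[MASK]"])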
def test_equality_of_CooccurrenceVectorizers(
    min_token_occurrences,
    max_token_occurrences,
    min_document_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
):
    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_occurrences=max_token_occurrences,
        max_tree_frequency=max_document_frequency,
        min_tree_occurrences=min_document_occurrences,
        mask_string=mask_string,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_occurrences=max_token_occurrences,
        max_document_frequency=max_document_frequency,
        min_document_occurrences=min_document_occurrences,
        mask_string=mask_string,
    )
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        seq_model.fit_transform(text_token_data_permutation).toarray(),
    )
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        tree_model.transform(seq_tree_sequence).toarray(),
    )
    assert np.allclose(
        seq_model.fit_transform(text_token_data_permutation).toarray(),
        seq_model.transform(text_token_data_permutation).toarray(),
    )
    assert np.allclose(
        tree_model.transform(seq_tree_sequence).toarray(),
        seq_model.transform(text_token_data_permutation).toarray(),
    )

def test_reverse_cooccurrence_vectorizer():
    seq_model1 = TokenCooccurrenceVectorizer(
        window_radii=2,
        window_orientations="after",
        kernel_functions="harmonic",
        mask_string=None,
        normalize_windows=False,
    )
    seq_model2 = TokenCooccurrenceVectorizer(
        window_radii=2,
        window_orientations="before",
        kernel_functions="harmonic",
        mask_string=None,
        normalize_windows=False,
    )
    # The transpose of the "after" matrix should equal the "before" matrix.
    reversed_after = seq_model1.fit_transform(text_token_data).toarray().T
    before = seq_model2.fit_transform(text_token_data).toarray()
    assert np.allclose(reversed_after, before)

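# NOTE (assumption): the test below takes a kernel_function argument; its missing
# @pytest.mark.parametrize decorator is reconstructed here with illustrative kernel
# names, since the original value list is not shown in this excerpt.
@pytest.mark.parametrize("kernel_function", ["flat", "harmonic", "geometric"])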
def test_token_cooccurrence_vectorizer_offset(kernel_function):
    vectorizer_a = TokenCooccurrenceVectorizer(
        kernel_functions=kernel_function, window_radii=1, normalize_windows=False
    )
    vectorizer_b = TokenCooccurrenceVectorizer(
        kernel_functions=kernel_function, window_radii=2, normalize_windows=False
    )
    vectorizer_c = TokenCooccurrenceVectorizer(
        window_radii=2,
        kernel_functions=kernel_function,
        kernel_args={"offset": 1},
        normalize_windows=False,
    )
    mat1 = (
        vectorizer_a.fit_transform(token_data) + vectorizer_c.fit_transform(token_data)
    ).toarray()
    mat2 = vectorizer_b.fit_transform(token_data).toarray()
    assert np.allclose(mat1, mat2)

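# NOTE (assumption): the test below takes pytest arguments; the missing parametrize
# decorators are reconstructed with illustrative values, not the project's actual grid.
@pytest.mark.parametrize("n_iter", [0, 2])
@pytest.mark.parametrize("normalize_windows", [False, True])
@pytest.mark.parametrize("kernel_function", ["flat", "harmonic"])
@pytest.mark.parametrize("n_threads", [1, 2])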
def test_equality_of_CooccurrenceVectorizers(
    n_iter,
    normalize_windows,
    kernel_function,
    n_threads,
):
    window_radius = [1, 3]
    window_function = ["fixed", "variable"]
    model1 = TokenCooccurrenceVectorizer(
        window_radii=window_radius,
        n_iter=n_iter,
        kernel_functions=kernel_function,
        window_functions=window_function,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    model2 = TimedTokenCooccurrenceVectorizer(
        window_radii=window_radius,
        kernel_functions=kernel_function,
        window_functions=window_function,
        n_iter=n_iter,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    model3 = MultiSetCooccurrenceVectorizer(
        window_radii=window_radius,
        kernel_functions=kernel_function,
        window_functions=window_function,
        n_iter=n_iter,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    base_result = model1.fit_transform(tiny_token_data).toarray()
    assert np.allclose(
        base_result, model2.fit_transform(timed_tiny_token_data).toarray()
    )
    assert np.allclose(
        base_result, model3.fit_transform(tiny_multi_token_data).toarray()
    )
    assert np.allclose(base_result, model1.transform(tiny_token_data).toarray())
    assert np.allclose(base_result, model2.transform(timed_tiny_token_data).toarray())
    assert np.allclose(base_result, model3.transform(tiny_multi_token_data).toarray())

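# NOTE (assumption): the test below takes pytest arguments; the missing parametrize
# decorators are reconstructed with illustrative values, not the project's actual grid.
@pytest.mark.parametrize("min_token_occurrences", [None, 2])
@pytest.mark.parametrize("max_document_frequency", [None, 0.7])
@pytest.mark.parametrize("window_radius", [1, 2])
@pytest.mark.parametrize("window_orientation", ["before", "after"])
@pytest.mark.parametrize("kernel_function", ["flat", "harmonic"])
@pytest.mark.parametrize("mask_string", [None, "[MASK]"])
@pytest.mark.parametrize("nullify_mask", [False, True])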
def test_equality_of_Tree_and_Token_CooccurrenceVectorizers(
    min_token_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
    nullify_mask,
):
    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_tree_frequency=max_document_frequency,
        mask_string=mask_string,
        nullify_mask=nullify_mask and mask_string is not None,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radii=window_radius,
        window_orientations=window_orientation,
        kernel_functions=kernel_function,
        min_occurrences=min_token_occurrences,
        max_document_frequency=max_document_frequency,
        mask_string=mask_string,
        normalize_windows=False,
        nullify_mask=nullify_mask and mask_string is not None,
    )
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        seq_model.fit_transform(text_token_data_permutation).toarray(),
    )
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        tree_model.transform(seq_tree_sequence).toarray(),
    )
    assert np.allclose(
        seq_model.fit_transform(text_token_data_permutation).toarray(),
        seq_model.transform(text_token_data_permutation).toarray(),
    )

def test_token_cooccurrence_vectorizer_orientation():
    vectorizer = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="directional"
    )
    result = vectorizer.fit_transform(text_token_data)
    assert result.shape == (4, 8)
    # Check that the count of "pok" preceded by "wer" is 1
    row = vectorizer.token_label_dictionary_["pok"]
    col = vectorizer.column_label_dictionary_["pre_wer"]
    assert result[row, col] == 1
    result_before = TokenCooccurrenceVectorizer(
        window_orientation="before"
    ).fit_transform(text_token_data)
    result_after = TokenCooccurrenceVectorizer(
        window_orientation="after"
    ).fit_transform(text_token_data)
    assert np.all(result_after.toarray() == result_before.transpose().toarray())
    result_symmetric = TokenCooccurrenceVectorizer(
        window_orientation="symmetric"
    ).fit_transform(text_token_data)
    assert np.all(
        result_symmetric.toarray() == (result_before + result_after).toarray()
    )

def test_token_cooccurrence_vectorizer_orientation():
    vectorizer = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="directional", normalize_windows=False
    )
    result = vectorizer.fit_transform(text_token_data)
    assert result.shape == (4, 8)
    # Check that the count of "pok" preceded by "wer" is 1
    row = vectorizer.token_label_dictionary_["pok"]
    col = vectorizer.column_label_dictionary_["pre_0_wer"]
    assert result[row, col] == 1
    result_before = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="before", normalize_windows=False
    ).fit_transform(text_token_data)
    result_after = TokenCooccurrenceVectorizer(
        window_radii=1, window_orientations="after", normalize_windows=False
    ).fit_transform(text_token_data)
    assert np.all(result_after.toarray() == result_before.transpose().toarray())
    assert np.all(
        result.toarray()
        == np.hstack([result_before.toarray(), result_after.toarray()])
    )

def test_token_cooccurrence_vectorizer_excessive_prune():
    vectorizer = TokenCooccurrenceVectorizer(min_frequency=1.0)
    with pytest.raises(ValueError):
        result = vectorizer.fit_transform(token_data)

def test_token_cooccurrence_vectorizer_transform_new_vocab():
    vectorizer = TokenCooccurrenceVectorizer()
    result = vectorizer.fit_transform(text_token_data_subset)
    transform = vectorizer.transform(text_token_data_new_token)
    assert (result != transform).nnz == 0

def test_token_cooccurrence_vectorizer_transform():
    vectorizer = TokenCooccurrenceVectorizer(window_orientation="symmetric")
    result = vectorizer.fit_transform(text_token_data_subset)
    transform = vectorizer.transform(text_token_data)
    assert result.shape == transform.shape
    assert transform[0, 0] == 34

def test_token_cooccurrence_vectorizer_mixed():
    vectorizer = TokenCooccurrenceVectorizer()
    with pytest.raises(ValueError):
        vectorizer.fit_transform(mixed_token_data)

def test_token_cooccurrence_vectorizer_transform():
    vectorizer = TokenCooccurrenceVectorizer()
    result = vectorizer.fit_transform(text_token_data_subset)
    transform = vectorizer.transform(text_token_data)
    assert result.shape == transform.shape
    assert transform[0, 0] == 34