def test_compute_token_relevance_matrix():
    """Check the Python relevance matrix against the Java reference implementation."""
    # Arrange: fixtures plus the hyper-parameters the relevance formula needs.
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()
    beta = config.getBeta(config_default.BETA_DEFAULT)
    vlambda = config.getLambda(config_default.LAMBDA_DEFAULT)
    type_topic_counts = pypclda.get_token_topic_matrix(sampler)
    n_types = len(type_topic_counts)
    n_topics = len(type_topic_counts[0])

    # Act: compute the matrix with the Python port.
    relevance_py = pypclda.compute_token_relevance_matrix(
        type_topic_counts, beta, vlambda)

    # Assert: the Java implementation must agree element-wise.
    relevance_java = __java_compute_token_relevance_matrix(
        n_types, n_topics, type_topic_counts, beta, vlambda)
    relevance_java = np.array([list(row) for row in relevance_java])

    assert np.allclose(relevance_java, relevance_py, rtol=1e-10)
def test_get_top_relevance_topic_tokens2():
    """Tests a "port" of cc.mallet.util.LDAUtils.getTopRelevanceWords"""
    # Arrange
    n_top_tokens = 20
    sampler = fixture.fixture_sampler()
    config = fixture.fixture_lda_config()

    # Act
    relevance_rows = pypclda.get_top_relevance_topic_tokens2(
        sampler, config, n_top_tokens)

    # Assert: compare shapes with the original Java routine.
    count_matrix = sampler.getTypeTopicMatrix()
    words_java = cc.mallet.util.LDAUtils().getTopRelevanceWords(
        n_top_tokens,
        len(count_matrix),
        len(count_matrix[0]),
        count_matrix,
        config.getBeta(config_default.BETA_DEFAULT),
        config.getLambda(config_default.LAMBDA_DEFAULT),
        sampler.getAlphabet())

    # Each Python row holds (token, relevance) pairs; keep only the tokens.
    words_py = [[pair[0] for pair in row] for row in relevance_rows]
    words_java = [list(row) for row in words_java]

    assert len(words_py) == len(words_java)
    assert len(words_py[0]) == len(words_java[0])
def test_get_alphabet():
    """The sampler's alphabet is retrievable and has the expected size."""
    sampler = fixture.fixture_sampler()

    alphabet = pypclda.get_alphabet(sampler)

    assert alphabet is not None
    # 982 distinct tokens in the fixture corpus.
    assert alphabet.size() == 982
def test_extract_id2token():
    """Extracted vocabulary maps each id to the alphabet's token at that index."""
    sampler = fixture.fixture_sampler()
    alphabet = sampler.getAlphabet()

    id2token = pypclda.extract_vocabulary(alphabet)

    for token_id, token in enumerate(alphabet.toArray()):
        assert id2token[token_id] == str(token)
def test_extract_vocabulary():
    """Vocabulary extraction yields 982 unique tokens."""
    sampler = fixture.fixture_sampler()
    alphabet = sampler.getAlphabet()

    vocabulary = pypclda.extract_vocabulary(alphabet)

    assert vocabulary is not None
    assert len(vocabulary) == 982
    # No duplicates: the set has the same cardinality as the list.
    assert len(set(vocabulary)) == 982
def test_extract_doc_lengths():
    """Document lengths match the fixture corpus (10 docs, longest 685 tokens)."""
    sampler = fixture.fixture_sampler()

    doc_lengths = pypclda.extract_doc_lengths(sampler.getDataset())

    assert doc_lengths is not None
    assert len(doc_lengths) == 10
    assert max(doc_lengths) == 685
def test_extract_token_counts():
    """Per-token corpus counts match the fixture (982 types, top count 157)."""
    sampler = fixture.fixture_sampler()

    token_counts = pypclda.extract_token_counts(sampler.getDataset())

    assert token_counts is not None
    assert len(token_counts) == 982
    assert max(token_counts) == 157
def test_get_document_topic_matrix():
    """Document-topic matrix has the fixture's shape and maximum count.

    Fix: ``expected_max_count`` was declared but never asserted; the sibling
    token-topic test asserts the analogous maximum, so the missing check is
    restored here.
    """
    expected_document_count = 10
    expected_topic_count = 20
    expected_max_count = 493

    sampler = fixture.fixture_sampler()

    document_topic_matrix = pypclda.get_document_topic_matrix(sampler)

    assert document_topic_matrix is not None
    assert expected_document_count == len(document_topic_matrix)
    assert expected_topic_count == len(document_topic_matrix[0])
    # Previously unchecked: largest single document-topic count in the matrix.
    assert expected_max_count == max(max(row) for row in document_topic_matrix)
def test_get_token_topic_matrix():
    """Token-topic matrix has the fixture's shape and maximum count."""
    sampler = fixture.fixture_sampler()

    token_topic_matrix = pypclda.get_token_topic_matrix(sampler)

    assert token_topic_matrix is not None
    # 982 token types x 20 topics.
    assert len(token_topic_matrix) == 982
    assert len(token_topic_matrix[0]) == 20
    # Largest single token-topic count in the fixture sampler state.
    assert max([max(x) for x in token_topic_matrix]) == 157
def test_get_top_topic_tokens():
    """Requesting the top 30 tokens yields 30 tokens for each of the 20 topics."""
    n_top_tokens = 30
    n_topics = 20
    sampler = fixture.fixture_sampler()

    top_topic_words = pypclda.get_top_topic_tokens(sampler, n_top_tokens)

    assert top_topic_words is not None
    assert len(top_topic_words) == n_topics
    assert all(len(topic_row) == n_top_tokens for topic_row in top_topic_words)
def test_get_top_relevance_topic_tokens():
    """Tests call to cc.mallet.util.LDAUtils.getTopRelevanceWords

    TODO: fix equality test of word (different sort order when value the same)

    Fix: the trailing duplicate ``assert relevances is not None`` (already
    asserted before the shape checks) has been removed.
    """
    n_top_words = 20
    sampler = fixture.fixture_sampler()
    config = fixture.fixture_lda_config()

    relevances = pypclda.get_top_topic_word_relevances(
        sampler, config, n_top_words=n_top_words)

    assert relevances is not None
    # One row per topic, n_top_words entries per row.
    assert int(sampler.getNoTopics()) == len(relevances)
    assert n_top_words == len(relevances[0])
def test_compute_token_probabilities_given_topic():
    """Python p(w|k) agrees with Java LDAUtils.calcWordProbGivenTopic.

    Fix: a stray unmatched ``\"\"\"`` at the end of the function (apparently
    left over from commenting code in or out) was removed — it opened a
    triple-quoted string that was never closed.
    """
    # Arrange
    lda_util = cc.mallet.util.LDAUtils()
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()
    beta = config.getBeta(config_default.BETA_DEFAULT)
    type_topic_counts = pypclda.get_token_topic_matrix(sampler)

    # Act
    word_probs_python = pypclda.compute_token_probabilities_given_topic(
        type_topic_counts, beta)

    # Assert
    word_probs_java = lda_util.calcWordProbGivenTopic(type_topic_counts, beta)
    assert np.allclose(word_probs_java, word_probs_python, rtol=1e-05)
def test_get_topic_token_phi_matrix():
    """Phi matrix has the fixture's shape and each topic row sums to one."""
    n_token_types = 982
    n_topics = 20

    sampler = fixture.fixture_sampler()

    phi = pypclda.get_topic_token_phi_matrix(sampler)

    assert phi is not None
    assert len(phi) == n_topics
    assert len(phi[0]) == n_token_types
    # Probabilities: no entry may exceed 1.0 ...
    assert max(max(row) for row in phi) <= 1.0
    # ... and every topic distribution must be normalized.
    for row in phi:
        assert math.isclose(1.0, sum(row), rel_tol=1e-5)
def test_compute_distinctiveness_matrix():
    """Python distinctiveness matrix matches Java calcWordDistinctiveness."""
    # Arrange: p(w|k) and p(w) inputs shared by both implementations.
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()
    counts = sampler.getTypeTopicMatrix()
    beta = config.getBeta(config_default.BETA_DEFAULT)
    p_w_k = pypclda.compute_token_probabilities_given_topic(counts, beta)
    p_w = pypclda.compute_token_probabilities(counts, beta)

    # Act
    matrix_py = pypclda.compute_distinctiveness_matrix(p_w_k, p_w)

    # Assert
    matrix_java = cc.mallet.util.LDAUtils.calcWordDistinctiveness(p_w_k, p_w)
    matrix_java = np.array([list(row) for row in matrix_java])
    assert np.allclose(matrix_java, matrix_py, rtol=1e-10)
def test_get_top_topic_tokens2():
    """Tests a "port" of cc.mallet.util.LDAUtils.getTopWords.

    Fix: ``type_topic_counts`` was assigned but never used while
    ``sampler.getTypeTopicMatrix()`` was called three more times; the stored
    matrix is now reused for all arguments.
    """
    # Arrange
    n_top_tokens = 20
    sampler = fixture.fixture_sampler()

    # Act
    top_tokens_python = pypclda.get_top_topic_tokens2(sampler, n_top_tokens)

    # Assert: compare shapes with the original Java routine.
    type_topic_counts = sampler.getTypeTopicMatrix()
    top_tokens_java = cc.mallet.util.LDAUtils().getTopWords(
        n_top_tokens,
        len(type_topic_counts),
        len(type_topic_counts[0]),
        type_topic_counts,
        sampler.getAlphabet())

    # Each Python row holds (token, score) pairs; keep only the tokens.
    python_words = [[w[0] for w in row] for row in top_tokens_python]
    java_words = [list(x) for x in top_tokens_java]

    assert len(python_words) == len(java_words)
    assert len(python_words[0]) == len(java_words[0])