def test_compute_token_relevance_matrix():
    """The Python relevance matrix matches the Java reference implementation."""
    # Arrange
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()

    beta = config.getBeta(config_default.BETA_DEFAULT)
    vlambda = config.getLambda(config_default.LAMBDA_DEFAULT)
    type_topic_counts = pypclda.get_token_topic_matrix(sampler)
    n_types = len(type_topic_counts)
    n_topics = len(type_topic_counts[0])

    # Act
    relevance_python = pypclda.compute_token_relevance_matrix(
        type_topic_counts, beta, vlambda)

    # Assert: compare element-wise against the Java "port" of the computation
    relevance_java = __java_compute_token_relevance_matrix(
        n_types, n_topics, type_topic_counts, beta, vlambda)
    relevance_java = np.array(list(map(list, relevance_java)))

    assert np.allclose(relevance_java, relevance_python, rtol=1e-10)
def test_get_top_relevance_topic_tokens2():
    """Tests a "port" of cc.mallet.util.LDAUtils.getTopRelevanceWords
    """
    # Arrange
    n_top_tokens = 20
    sampler = fixture.fixture_sampler()
    config = fixture.fixture_lda_config()

    # Act
    relevance_python = pypclda.get_top_relevance_topic_tokens2(
        sampler, config, n_top_tokens)

    # Assert: run the original Java implementation with identical inputs
    count_matrix = sampler.getTypeTopicMatrix()
    relevance_java = cc.mallet.util.LDAUtils().getTopRelevanceWords(
        n_top_tokens,
        len(count_matrix),
        len(count_matrix[0]),
        count_matrix,
        config.getBeta(config_default.BETA_DEFAULT),
        config.getLambda(config_default.LAMBDA_DEFAULT),
        sampler.getAlphabet())

    python_words = [[pair[0] for pair in row] for row in relevance_python]
    java_words = [list(row) for row in relevance_java]

    # NOTE(review): only the shape is compared here — token order can differ
    # between implementations when relevance values tie.
    assert len(python_words) == len(java_words)
    assert len(python_words[0]) == len(java_words[0])
def test_get_alphabet():
    """The alphabet extracted from the fixture sampler holds 982 tokens."""
    lda_sampler = fixture.fixture_sampler()

    alphabet = pypclda.get_alphabet(lda_sampler)

    assert alphabet is not None
    assert 982 == alphabet.size()
def test_extract_id2token():
    """extract_vocabulary maps each alphabet index to its token string."""
    sampler = fixture.fixture_sampler()
    alphabet = sampler.getAlphabet()

    id2token = pypclda.extract_vocabulary(alphabet)

    # every index must resolve to the stringified alphabet entry
    for position, token in enumerate(alphabet.toArray()):
        assert id2token[position] == str(token)
def test_extract_vocabulary():
    """The extracted vocabulary has 982 entries, all distinct."""
    sampler = fixture.fixture_sampler()
    alphabet = sampler.getAlphabet()

    vocabulary = pypclda.extract_vocabulary(alphabet)

    assert vocabulary is not None
    assert len(vocabulary) == 982
    # no duplicate tokens
    assert len(set(vocabulary)) == 982
def test_extract_doc_lengths():
    """Fixture dataset has 10 documents; the longest is 685 tokens."""
    expected_doc_count = 10
    expected_max_doc_length = 685

    sampler = fixture.fixture_sampler()

    doc_lengths = pypclda.extract_doc_lengths(sampler.getDataset())

    assert doc_lengths is not None
    assert len(doc_lengths) == expected_doc_count
    assert max(doc_lengths) == expected_max_doc_length
def test_extract_token_counts():
    """Fixture dataset yields 982 token counts with a maximum of 157."""
    expected_token_count = 982
    expected_max_count = 157

    sampler = fixture.fixture_sampler()

    token_counts = pypclda.extract_token_counts(sampler.getDataset())

    assert token_counts is not None
    assert len(token_counts) == expected_token_count
    assert max(token_counts) == expected_max_count
def test_get_document_topic_matrix():
    """The document/topic count matrix has the fixture's expected shape and max count."""
    expected_document_count = 10
    expected_topic_count = 20
    expected_max_count = 493

    sampler = fixture.fixture_sampler()

    document_topic_matrix = pypclda.get_document_topic_matrix(sampler)

    assert document_topic_matrix is not None
    assert expected_document_count == len(document_topic_matrix)
    assert expected_topic_count == len(document_topic_matrix[0])
    # expected_max_count was previously assigned but never asserted; the
    # sibling matrix tests all check the maximum cell value, so do it here too.
    assert expected_max_count == max(max(row) for row in document_topic_matrix)
def test_get_token_topic_matrix():
    """The token/topic count matrix is 982 x 20 with a maximum count of 157."""
    expected_token_count = 982
    expected_topic_count = 20
    expected_max_count = 157

    sampler = fixture.fixture_sampler()

    token_topic_matrix = pypclda.get_token_topic_matrix(sampler)

    assert token_topic_matrix is not None
    assert len(token_topic_matrix) == expected_token_count
    assert len(token_topic_matrix[0]) == expected_topic_count
    assert max(max(row) for row in token_topic_matrix) == expected_max_count
def test_get_top_topic_tokens():
    """Each of the 20 topics yields exactly the requested number of top tokens."""
    expected_token_count = 30
    expected_topic_count = 20

    sampler = fixture.fixture_sampler()

    top_topic_words = pypclda.get_top_topic_tokens(
        sampler, expected_token_count)

    assert top_topic_words is not None
    assert len(top_topic_words) == expected_topic_count
    # every topic row carries the full set of requested top tokens
    assert all(len(row) == expected_token_count for row in top_topic_words)
def test_get_top_relevance_topic_tokens():
    """Tests call to cc.mallet.util.LDAUtils.getTopRelevanceWords
    TODO: fix equality test of word (different sort order when value the same)
    """
    n_top_words = 20
    sampler = fixture.fixture_sampler()
    config = fixture.fixture_lda_config()

    relevances = pypclda.get_top_topic_word_relevances(sampler,
                                                       config,
                                                       n_top_words=n_top_words)

    # the not-None check was previously duplicated after the length checks;
    # keep a single occurrence, before the subscripting assertions.
    assert relevances is not None
    assert int(sampler.getNoTopics()) == len(relevances)
    assert n_top_words == len(relevances[0])
def test_compute_token_probabilities_given_topic():
    """Python p(w|k) matches Java LDAUtils.calcWordProbGivenTopic."""
    # Arrange
    lda_util = cc.mallet.util.LDAUtils()
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()
    beta = config.getBeta(config_default.BETA_DEFAULT)
    type_topic_counts = pypclda.get_token_topic_matrix(sampler)

    # Act
    word_probs_python = pypclda.compute_token_probabilities_given_topic(
        type_topic_counts, beta)

    # Assert
    word_probs_java = lda_util.calcWordProbGivenTopic(type_topic_counts, beta)

    assert np.allclose(word_probs_java, word_probs_python, rtol=1e-05)
def test_get_topic_token_phi_matrix():
    """Phi rows are per-topic token distributions: 20 x 982, each summing to 1."""
    expected_token_count = 982
    expected_topic_count = 20

    sampler = fixture.fixture_sampler()

    phi_matrix = pypclda.get_topic_token_phi_matrix(sampler)

    assert phi_matrix is not None
    assert len(phi_matrix) == expected_topic_count
    assert len(phi_matrix[0]) == expected_token_count
    # probabilities never exceed one
    assert max(max(row) for row in phi_matrix) <= 1.0

    # each topic's distribution over tokens must be (approximately) normalized
    for row in phi_matrix:
        assert math.isclose(1.0, sum(row), rel_tol=1e-5)
def test_compute_distinctiveness_matrix():
    """Python distinctiveness matrix agrees with Java calcWordDistinctiveness."""
    # Arrange
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()
    count_matrix = sampler.getTypeTopicMatrix()

    beta = config.getBeta(config_default.BETA_DEFAULT)
    p_w_k = pypclda.compute_token_probabilities_given_topic(count_matrix, beta)
    p_w = pypclda.compute_token_probabilities(count_matrix, beta)

    # Act
    matrix_python = pypclda.compute_distinctiveness_matrix(p_w_k, p_w)

    # Assert
    matrix_java = cc.mallet.util.LDAUtils.calcWordDistinctiveness(p_w_k, p_w)
    matrix_java = np.array(list(map(list, matrix_java)))

    assert np.allclose(matrix_java, matrix_python, rtol=1e-10)
def test_get_top_topic_tokens2():
    """Tests a "port" of cc.mallet.util.LDAUtils.getTopWords
    """
    # Arrange
    n_top_tokens = 20

    sampler = fixture.fixture_sampler()

    # Act
    top_tokens_python = pypclda.get_top_topic_tokens2(sampler, n_top_tokens)

    # Assert
    # Reuse the fetched matrix — it was previously assigned but unused while
    # getTypeTopicMatrix() was called three more times.
    type_topic_counts = sampler.getTypeTopicMatrix()
    top_tokens_java = cc.mallet.util.LDAUtils().getTopWords(
        n_top_tokens, len(type_topic_counts), len(type_topic_counts[0]),
        type_topic_counts, sampler.getAlphabet())

    python_words = [[w[0] for w in row] for row in top_tokens_python]
    java_words = [list(x) for x in top_tokens_java]

    # NOTE(review): only shape is compared; token order may differ on ties.
    assert len(python_words) == len(java_words)
    assert len(python_words[0]) == len(java_words[0])