def test_compute_token_relevance_matrix():
    """Check pypclda.compute_token_relevance_matrix against the result of
    the __java_compute_token_relevance_matrix helper.

    Both computations receive the same count matrix, beta and lambda; the
    resulting matrices must agree within a relative tolerance of 1e-10.
    """
    # Arrange
    lda_config = fixture.fixture_lda_config()
    lda_sampler = fixture.fixture_sampler()

    beta_value = lda_config.getBeta(config_default.BETA_DEFAULT)
    lambda_value = lda_config.getLambda(config_default.LAMBDA_DEFAULT)
    counts = pypclda.get_token_topic_matrix(lda_sampler)
    type_count, topic_count = len(counts), len(counts[0])

    # Act
    actual = pypclda.compute_token_relevance_matrix(
        counts, beta_value, lambda_value)

    # Assert
    java_rows = __java_compute_token_relevance_matrix(
        type_count, topic_count, counts, beta_value, lambda_value)
    # Copy each row into a plain list so numpy can build a regular array.
    expected = np.array([list(java_row) for java_row in java_rows])

    assert np.allclose(expected, actual, rtol=1e-10)
def test_get_top_relevance_topic_tokens2():
    """Tests a "port" of cc.mallet.util.LDAUtils.getTopRelevanceWords

    Only the dimensions of the Python and Java results are compared
    (number of rows, and length of the first row).
    """
    # Arrange
    top_n = 20
    lda_sampler = fixture.fixture_sampler()
    lda_config = fixture.fixture_lda_config()

    # Act
    python_result = pypclda.get_top_relevance_topic_tokens2(
        lda_sampler, lda_config, top_n)

    # Assert
    counts = lda_sampler.getTypeTopicMatrix()
    lda_utils = cc.mallet.util.LDAUtils()
    n_types = len(counts)
    n_topics = len(counts[0])
    beta_value = lda_config.getBeta(config_default.BETA_DEFAULT)
    lambda_value = lda_config.getLambda(config_default.LAMBDA_DEFAULT)
    java_result = lda_utils.getTopRelevanceWords(
        top_n, n_types, n_topics, counts,
        beta_value, lambda_value, lda_sampler.getAlphabet())

    # Keep only the first element of every entry in the Python result.
    python_words = [
        [entry[0] for entry in topic_row] for topic_row in python_result
    ]
    java_words = [list(java_row) for java_row in java_result]

    assert len(python_words) == len(java_words)
    assert len(python_words[0]) == len(java_words[0])
def test_load_lda_sampler():
    """Load a stored sampler through pypclda.load_lda_sampler and verify
    that the returned object reports the expected Java class name.
    """
    expected_sampler_type = "cc.mallet.topics.PolyaUrnSpaliasLDA"

    config = fixture.fixture_lda_config()
    # Resolve the saved-sampler directory once and pass it on unchanged;
    # the previous stringified copy was never used.
    stored_dir = config.getSavedSamplerDirectory("")

    sampler = pypclda.load_lda_sampler(config, stored_dir=stored_dir)

    assert sampler is not None
    assert expected_sampler_type == sampler.getClass().getName()
def test_sample_pclda():
    """Run pypclda.sample_pclda on the fixture dataset and check that a
    sampler object is returned.
    """
    polya_urn_sampler = "cc.mallet.topics.PolyaUrnSpaliasLDA"

    lda_config = fixture.fixture_lda_config()
    corpus = fixture.fixture_dataset(lda_config)

    # Keyword options forwarded verbatim to sample_pclda.
    sampling_options = {
        "iterations": 2000,
        "sampler_type": polya_urn_sampler,
        "testset": None,
        "save_sampler": True,
    }
    trained_sampler = pypclda.sample_pclda(
        lda_config, corpus, **sampling_options)

    assert trained_sampler is not None
def test_get_top_relevance_topic_tokens():
    """Tests pypclda.get_top_topic_word_relevances (noted originally as a
    call to cc.mallet.util.LDAUtils.getTopRelevanceWords).

    Checks that a result is returned, that it has one entry per sampler
    topic, and that the first entry holds n_top_words items.

    TODO: fix equality test of word (different sort order when value the same)
    """
    n_top_words = 20
    sampler = fixture.fixture_sampler()
    config = fixture.fixture_lda_config()

    relevances = pypclda.get_top_topic_word_relevances(
        sampler, config, n_top_words=n_top_words)

    # A single None check suffices; the result is not reassigned afterwards.
    assert relevances is not None
    assert int(sampler.getNoTopics()) == len(relevances)
    assert n_top_words == len(relevances[0])
def test_compute_token_probabilities_given_topic():
    """Compare pypclda.compute_token_probabilities_given_topic with
    LDAUtils.calcWordProbGivenTopic for the same counts and beta.

    The two results must agree within a relative tolerance of 1e-05.
    """
    # Arrange
    java_utils = cc.mallet.util.LDAUtils()
    lda_config = fixture.fixture_lda_config()
    lda_sampler = fixture.fixture_sampler()
    beta_value = lda_config.getBeta(config_default.BETA_DEFAULT)
    counts = pypclda.get_token_topic_matrix(lda_sampler)

    # Act
    actual = pypclda.compute_token_probabilities_given_topic(
        counts, beta_value)

    # Assert
    expected = java_utils.calcWordProbGivenTopic(counts, beta_value)

    assert np.allclose(expected, actual, rtol=1e-05)
    """
def test_compute_distinctiveness_matrix():

    #  Arrange
    config = fixture.fixture_lda_config()
    sampler = fixture.fixture_sampler()
    token_topic_count_matrix = sampler.getTypeTopicMatrix()

    beta = config.getBeta(config_default.BETA_DEFAULT)
    p_w_k = pypclda.compute_token_probabilities_given_topic(
        token_topic_count_matrix, beta)
    p_w = pypclda.compute_token_probabilities(token_topic_count_matrix, beta)

    # Act
    python_matrix = pypclda.compute_distinctiveness_matrix(p_w_k, p_w)

    # Assert
    java_matrix = cc.mallet.util.LDAUtils.calcWordDistinctiveness(p_w_k, p_w)
    java_matrix = np.array([list(x) for x in java_matrix])

    assert np.allclose(java_matrix, python_matrix, rtol=1e-10)