def test_generate_wordclouds_for_topic_words():
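    """Generate word clouds from the topic-word distribution and check that one cloud
    per topic is returned, keyed by topic label, both as PIL images and, with
    ``return_images=False``, as WordCloud objects of the requested size."""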
    try:
        import lda
        import PIL
        from wordcloud import WordCloud
    except ImportError:
        pytest.skip('at least one of lda, Pillow, wordcloud not installed')

    data = model_io.load_ldamodel_from_pickle('tests/data/tiny_model_reuters_5_topics.pickle')
    model = data['model']
    vocab = data['vocab']

    phi = model.topic_word_
    assert phi.shape == (5, len(vocab))

    topic_word_clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10)
    assert len(topic_word_clouds) == 5
    assert set(topic_word_clouds.keys()) == set('topic_%d' % i for i in range(1, 6))
    assert all(isinstance(wc, PIL.Image.Image) for wc in topic_word_clouds.values())

    topic_word_clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10,
                                                                      which_topics=('topic_1', 'topic_2'),
                                                                      return_images=False,
                                                                      width=640, height=480)
    assert set(topic_word_clouds.keys()) == {'topic_1', 'topic_2'}
    assert all(isinstance(wc, WordCloud) for wc in topic_word_clouds.values())
    assert all(wc.width == 640 and wc.height == 480 for wc in topic_word_clouds.values())


def test_generate_wordclouds_for_document_topics():
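    """Generate word clouds from the document-topic distribution and check that one
    cloud per document is returned, keyed by document label, both as PIL images and,
    with ``return_images=False``, as WordCloud objects of the requested size."""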
    try:
        import lda
        import PIL
        from wordcloud import WordCloud
    except ImportError:
        pytest.skip('at least one of lda, Pillow, wordcloud not installed')

    data = model_io.load_ldamodel_from_pickle('tests/data/tiny_model_reuters_5_topics.pickle')
    model = data['model']
    doc_labels = data['doc_labels']

    theta = model.doc_topic_
    assert theta.shape == (len(doc_labels), 5)

    doc_topic_clouds = visualize.generate_wordclouds_for_document_topics(theta, doc_labels, 3)
    assert len(doc_topic_clouds) == len(doc_labels)
    assert set(doc_topic_clouds.keys()) == set(doc_labels)
    assert all(isinstance(wc, PIL.Image.Image) for wc in doc_topic_clouds.values())

    which_docs = doc_labels[:2]
    assert len(which_docs) == 2
    doc_topic_clouds = visualize.generate_wordclouds_for_document_topics(theta, doc_labels, 3,
                                                                         which_documents=which_docs,
                                                                         return_images=False,
                                                                         width=640, height=480)
    assert set(doc_topic_clouds.keys()) == set(which_docs)
    assert all(isinstance(wc, WordCloud) for wc in doc_topic_clouds.values())
    assert all(wc.width == 640 and wc.height == 480 for wc in doc_topic_clouds.values())
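

# NOTE / ASSUMPTION: in the original test suite the following test is driven by a
# parameter decorator that is not shown in this excerpt; the parametrization below
# is only a minimal sketch with hypothetical values so the test is runnable as-is.
@pytest.mark.parametrize('exclude, pass_topic_word, renormalize, return_new_topic_mapping', [
    ([0], True, False, True),
    ([1, 3], True, True, False),
    ([2], False, False, True),
    ([0, 4], False, True, False),
])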
def test_exclude_topics(exclude, pass_topic_word, renormalize, return_new_topic_mapping):
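    """Exclude the given topics from a fitted model's doc-topic (and optionally
    topic-word) distribution and check the resulting shapes, the optional
    renormalization and the optional mapping from old to new topic indices."""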
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    data = model_io.load_ldamodel_from_pickle('tests/data/tiny_model_reuters_5_topics.pickle')
    model = data['model']

    exclude_ind = list(set(exclude))
    n_exclude = len(exclude_ind)
    res = model_stats.exclude_topics(exclude_ind,
                                     model.doc_topic_,
                                     model.topic_word_ if pass_topic_word else None,
                                     renormalize=renormalize,
                                     return_new_topic_mapping=return_new_topic_mapping)

    if pass_topic_word and return_new_topic_mapping:
        assert isinstance(res, tuple)
        assert len(res) == 3
        new_theta, new_phi, topic_mapping = res
    elif pass_topic_word and not return_new_topic_mapping:
        assert isinstance(res, tuple)
        assert len(res) == 2
        new_theta, new_phi = res
    elif not pass_topic_word and return_new_topic_mapping:
        assert isinstance(res, tuple)
        assert len(res) == 2
        new_theta, topic_mapping = res
    else:  # not pass_topic_word and not return_new_topic_mapping:
        assert not isinstance(res, tuple)
        new_theta = res

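    # one theta column (and, if phi was passed, one phi row) must be dropped per excluded topic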
    assert new_theta.shape == (model.doc_topic_.shape[0], model.doc_topic_.shape[1] - n_exclude)

    if pass_topic_word:
        assert new_phi.shape == (model.topic_word_.shape[0] - n_exclude, model.topic_word_.shape[1])

    if new_theta.shape[1] > 0:
        if renormalize:
            assert np.allclose(np.sum(new_theta, axis=1), 1)
        else:
            assert np.all(np.sum(new_theta, axis=1) <= 1 + 1e-5)

            if return_new_topic_mapping:
                old_indices = list(topic_mapping.keys())
                new_indices = list(topic_mapping.values())
                assert len(old_indices) == len(new_indices) == new_theta.shape[1]
                assert 0 <= min(old_indices) < model.doc_topic_.shape[1]
                assert 0 <= max(old_indices) < model.doc_topic_.shape[1]
                assert 0 <= min(new_indices) < new_theta.shape[1]
                assert 0 <= max(new_indices) < new_theta.shape[1]

                for old_ind, new_ind in topic_mapping.items():
                    old_t = model.doc_topic_[:, old_ind]
                    new_t = new_theta[:, new_ind]
                    assert np.allclose(old_t, new_t)

                    if pass_topic_word:
                        assert np.allclose(model.topic_word_[old_ind, :], new_phi[new_ind, :])


def test_save_load_ldamodel_pickle():
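    """Fit a tiny LDA model, save it to a pickle file via save_ldamodel_to_pickle()
    and check that load_ldamodel_from_pickle() restores the distributions, vocabulary
    and document labels unchanged."""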
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    pfile = 'tests/data/test_pickle_unpickle_ldamodel.pickle'

    dtm = np.array([[0, 1], [2, 3], [4, 5], [6, 0]])
    doc_labels = ['doc_' + str(i) for i in range(dtm.shape[0])]
    vocab = ['word_' + str(i) for i in range(dtm.shape[1])]

    model = lda.LDA(2, n_iter=1)
    model.fit(dtm)

    model_io.save_ldamodel_to_pickle(pfile, model, vocab, doc_labels)

    unpickled = model_io.load_ldamodel_from_pickle(pfile)

    assert np.array_equal(model.doc_topic_, unpickled['model'].doc_topic_)
    assert np.array_equal(model.topic_word_, unpickled['model'].topic_word_)
    assert vocab == unpickled['vocab']
    assert doc_labels == unpickled['doc_labels']


def test_write_wordclouds_to_folder(tmpdir):
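    """Generate one word cloud per topic and write them to a temporary folder,
    checking that a PNG file is created for every topic label."""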
    try:
        import lda
        import PIL
        from wordcloud import WordCloud
    except ImportError:
        pytest.skip('at least one of lda, Pillow, wordcloud not installed')

    path = tmpdir.mkdir('wordclouds').dirname
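    # tmpdir is pytest's built-in fixture providing a per-test temporary directory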

    data = model_io.load_ldamodel_from_pickle('tests/data/tiny_model_reuters_5_topics.pickle')
    model = data['model']
    vocab = data['vocab']

    phi = model.topic_word_
    assert phi.shape == (5, len(vocab))

    topic_word_clouds = visualize.generate_wordclouds_for_topic_words(phi, vocab, 10)

    visualize.write_wordclouds_to_folder(topic_word_clouds, path, 'cloud_{label}.png')

    for label in topic_word_clouds.keys():
        assert os.path.exists(os.path.join(path, 'cloud_{label}.png'.format(label=label)))