Beispiel #1
0
def ldamodel_top_doc_topics(doc_topic_distrib, doc_labels, top_n=3, val_fmt=None, col_labels=None, index_name=None):
    df_values = top_n_from_distribution(doc_topic_distrib, top_n=top_n,
                                        row_labels=doc_labels, val_labels=None)
    df_labels = top_n_from_distribution(doc_topic_distrib, top_n=top_n,
                                        row_labels=doc_labels, val_labels=DEFAULT_TOPIC_NAME_FMT)
    return _join_value_and_label_dfs(df_values, df_labels, top_n, row_labels=doc_labels,
                                     val_fmt=val_fmt, col_labels=col_labels, index_name=index_name)
Beispiel #2
0
def ldamodel_top_topic_words(topic_word_distrib, vocab, top_n=10, val_fmt=None, col_labels=None, index_name=None):
    df_values = top_n_from_distribution(topic_word_distrib, top_n=top_n,
                                        row_labels=DEFAULT_TOPIC_NAME_FMT, val_labels=None)
    df_labels = top_n_from_distribution(topic_word_distrib, top_n=top_n,
                                        row_labels=DEFAULT_TOPIC_NAME_FMT, val_labels=vocab)
    return _join_value_and_label_dfs(df_values, df_labels, top_n, row_labels=DEFAULT_TOPIC_NAME_FMT,
                                     val_fmt=val_fmt, col_labels=col_labels, index_name=index_name)
Beispiel #3
0
def generate_wordclouds_from_distribution(distrib,
                                          row_labels,
                                          val_labels,
                                          top_n,
                                          which_rows=None,
                                          return_images=True,
                                          **wordcloud_kwargs):
    prob = top_n_from_distribution(distrib,
                                   top_n=top_n,
                                   row_labels=row_labels,
                                   val_labels=None)
    words = top_n_from_distribution(distrib,
                                    top_n=top_n,
                                    row_labels=row_labels,
                                    val_labels=val_labels)

    if which_rows:
        prob = prob.loc[which_rows, :]
        words = words.loc[which_rows, :]

        assert prob.shape == words.shape

    wordclouds = {}
    for (p_row_name, p), (w_row_name, w) in zip(prob.iterrows(),
                                                words.iterrows()):
        assert p_row_name == w_row_name
        logger.info('generating wordcloud for `%s`' % p_row_name)
        wc = generate_wordcloud_from_probabilities_and_words(
            p, w, return_image=return_images, **wordcloud_kwargs)
        wordclouds[p_row_name] = wc

    return wordclouds
Beispiel #4
0
def generate_wordclouds_from_distribution(distrib,
                                          row_labels,
                                          val_labels,
                                          top_n,
                                          which_rows=None,
                                          return_images=True,
                                          **wordcloud_kwargs):
    """
    Generate wordclouds for each row in a given probability distribution `distrib`.

    .. note:: Use :func:`~tmtoolkit.topicmod.visualize.generate_wordclouds_for_topic_words` or
              :func:`~tmtoolkit.topicmod.visualize.generate_wordclouds_for_document_topics` as shortcuts for creating
              wordclouds for a topic-word or document-topic distribution.

    :param distrib: 2D (sparse) array/matrix probability distribution
    :param row_labels: labels for rows in probability distribution; these are used as keys in the return dict
    :param val_labels: labels for values in probability distribution (e.g. vocabulary)
    :param top_n: number of top values to take from each row of `distrib`
    :param which_rows: if not None, select only the rows from this sequence of indices from `distrib`
    :param return_images: if True, store image objects instead of :class:`wordcloud.WordCloud` objects in the result
                          dict
    :param wordcloud_kwargs: pass additional options to :class:`wordcloud.WordCloud`; updates options in
           :data:`~tmtoolkit.topicmod.visualize.DEFAULT_WORDCLOUD_KWARGS`
    :return: dict mapping row labels to wordcloud images or instances generated from each distribution row
    """

    prob = top_n_from_distribution(distrib,
                                   top_n=top_n,
                                   row_labels=row_labels,
                                   val_labels=None)
    words = top_n_from_distribution(distrib,
                                    top_n=top_n,
                                    row_labels=row_labels,
                                    val_labels=val_labels)

    if which_rows:
        prob = prob.loc[which_rows, :]
        words = words.loc[which_rows, :]

        assert prob.shape == words.shape

    wordclouds = {}
    for (p_row_name, p), (w_row_name, w) in zip(prob.iterrows(),
                                                words.iterrows()):
        assert p_row_name == w_row_name
        logger.info('generating wordcloud for `%s`' % p_row_name)
        wc = generate_wordcloud_from_probabilities_and_words(
            p, w, return_image=return_images, **wordcloud_kwargs)
        wordclouds[p_row_name] = wc

    return wordclouds
Beispiel #5
0
def print_ldamodel_distribution(distrib, row_labels, val_labels, top_n=10):
    """
    Print `n_top` top values from a LDA model's distribution `distrib`. Can be used for topic-word distributions and
    document-topic distributions.
    """

    df_values = top_n_from_distribution(distrib, top_n=top_n, row_labels=row_labels, val_labels=None)
    df_labels = top_n_from_distribution(distrib, top_n=top_n, row_labels=row_labels, val_labels=val_labels)

    for i, (ind, row) in enumerate(df_labels.iterrows()):
        print(ind)
        for j, label in enumerate(row):
            val = df_values.iloc[i, j]
            print('> #%d. %s (%f)' % (j + 1, label, val))
def test_top_n_from_distribution(n, distrib):
    if len(distrib) == 0:
        with pytest.raises(ValueError):
            model_stats.top_n_from_distribution(distrib, n)
    else:
        if n < 1 or n > distrib.shape[1]:
            with pytest.raises(ValueError):
                model_stats.top_n_from_distribution(distrib, n)
        else:
            df = model_stats.top_n_from_distribution(distrib, n)

            assert len(df) == len(distrib)

            for _, row in df.iterrows():
                assert len(row) == n
                assert list(sorted(row, reverse=True)) == list(row)
Beispiel #7
0
def save_ldamodel_summary_to_excel(excel_file, topic_word_distrib, doc_topic_distrib, doc_labels, vocab,
                                   top_n_topics=10, top_n_words=10, dtm=None,
                                   rank_label_fmt=None, topic_label_fmt=None):
    rank_label_fmt = rank_label_fmt or DEFAULT_RANK_NAME_FMT
    topic_label_fmt = topic_label_fmt or DEFAULT_TOPIC_NAME_FMT
    excel_writer = pd.ExcelWriter(excel_file)
    sheets = OrderedDict()

    # doc-topic distribution sheets
    sheets['top_doc_topics_vals'] = top_n_from_distribution(doc_topic_distrib, top_n=top_n_topics,
                                                            row_labels=doc_labels,
                                                            col_labels=rank_label_fmt)
    sheets['top_doc_topics_labels'] = top_n_from_distribution(doc_topic_distrib, top_n=top_n_topics,
                                                              row_labels=doc_labels,
                                                              col_labels=rank_label_fmt,
                                                              val_labels=topic_label_fmt)
    sheets['top_doc_topics_labelled_vals'] = ldamodel_top_doc_topics(doc_topic_distrib, doc_labels, top_n=top_n_topics)

    # topic-word distribution sheets
    sheets['top_topic_word_vals'] = top_n_from_distribution(topic_word_distrib, top_n=top_n_words,
                                                            row_labels=topic_label_fmt,
                                                            col_labels=rank_label_fmt)
    sheets['top_topic_word_labels'] = top_n_from_distribution(topic_word_distrib, top_n=top_n_words,
                                                              row_labels=topic_label_fmt,
                                                              col_labels=rank_label_fmt,
                                                              val_labels=vocab)
    sheets['top_topic_words_labelled_vals'] = ldamodel_top_topic_words(topic_word_distrib, vocab, top_n=top_n_words)

    if dtm is not None:
        doc_lengths = get_doc_lengths(dtm)
        marg_topic_distr = get_marginal_topic_distrib(doc_topic_distrib, doc_lengths)
        row_names = [DEFAULT_TOPIC_NAME_FMT.format(i0=i, i1=i + 1) for i in range(len(marg_topic_distr))]
        sheets['marginal_topic_distrib'] = pd.DataFrame(marg_topic_distr, columns=['marginal_topic_distrib'],
                                                        index=row_names)

    for sh_name, sh_data in sheets.items():
        sh_data.to_excel(excel_writer, sh_name)

    excel_writer.save()

    return sheets