Beispiel #1
0
def test_mat2d_window_from_indices(mat, n_row_indices, n_col_indices, copy):
    """
    Check `mat2d_window_from_indices` against a randomly drawn row/column selection.

    `mat` is converted to a 2D NumPy array. `n_row_indices` / `n_col_indices` give the number of distinct random
    indices to draw per axis (capped at the axis length); a value of 0 means that `None` is passed for that axis.
    `copy` is forwarded unchanged to the function under test.
    """
    mat = np.array(mat)
    n_rows, n_cols = mat.shape

    # draw distinct random indices per axis; `None` is passed when 0 indices are requested
    row_indices = None
    if n_row_indices != 0:
        row_indices = np.random.choice(np.arange(n_rows),
                                       size=min(n_rows, n_row_indices),
                                       replace=False)

    col_indices = None
    if n_col_indices != 0:
        col_indices = np.random.choice(np.arange(n_cols),
                                       size=min(n_cols, n_col_indices),
                                       replace=False)

    window = mat2d_window_from_indices(mat, row_indices, col_indices, copy)

    # with `None` for an axis, the test expects the full index range of that axis
    expected_rows = np.arange(n_rows) if row_indices is None else row_indices
    expected_cols = np.arange(n_cols) if col_indices is None else col_indices

    assert window.shape[0] == len(expected_rows)
    assert window.shape[1] == len(expected_cols)

    # every cell of the window must equal the cell of `mat` it was taken from
    for win_row, mat_row in enumerate(expected_rows):
        for win_col, mat_col in enumerate(expected_cols):
            assert window[win_row, win_col] == mat[mat_row, mat_col]
Beispiel #2
0
def plot_topic_word_heatmap(fig,
                            ax,
                            topic_word_distrib,
                            vocab,
                            which_topics=None,
                            which_topic_indices=None,
                            which_words=None,
                            which_word_indices=None,
                            xaxislabel=None,
                            yaxislabel=None,
                            **kwargs):
    """
    Plot a heatmap for a topic-word distribution `topic_word_distrib` to a matplotlib Figure `fig` and Axes `ax`
    using `vocab` as vocabulary on the x-axis and topic numbers from 1 to `n_topics=topic_word_distrib.shape[0]` on
    the y-axis.
    A subset of words can be specified either directly with a sequence of words `which_words` or with
    `which_word_indices`, a sequence of zero-based word indices into `vocab`. Words selected via `which_words`
    appear in the order in which they occur in `vocab`.
    A subset of topics can be specified either with a sequence `which_topics` of topic numbers in [1, n_topics]
    or with `which_topic_indices`, a sequence of zero-based topic indices in [0, n_topics-1].
    Only one parameter of each pair may be set, otherwise a `ValueError` is raised.
    The axis labels default to "vocab" (x) and "topic" (y) unless `xaxislabel` / `yaxislabel` are given.
    Additional arguments can be passed via `kwargs` to `plot_heatmap`, whose result is returned.

    Please note that it is almost always necessary to select a subset of your topic-word distribution with the
    `which_words` or `which_topics` parameters, as otherwise the amount of data to be plotted will be too high
    to give a reasonable picture.
    """
    topics_by_number = which_topics is not None
    words_by_name = which_words is not None

    if topics_by_number and which_topic_indices is not None:
        raise ValueError(
            'only `which_topics` or `which_topic_indices` can be set, not both'
        )

    if words_by_name and which_word_indices is not None:
        raise ValueError(
            'only `which_words` or `which_word_indices` can be set, not both')

    # normalize both selection styles to zero-based index arrays
    if topics_by_number:
        which_topic_indices = np.array(which_topics) - 1

    if words_by_name:
        which_word_indices = np.where(np.isin(vocab, which_words))[0]

    # default y tick labels are the one-based topic numbers
    topic_labels = np.array(range(1, topic_word_distrib.shape[0] + 1))

    subset_topics = which_topic_indices is not None
    subset_words = which_word_indices is not None

    if subset_topics:
        topic_labels = topic_labels[which_topic_indices]

    if subset_words:
        vocab = np.array(vocab)[which_word_indices]

    # cut the distribution down to the selected rows / columns
    if subset_topics or subset_words:
        topic_word_distrib = mat2d_window_from_indices(topic_word_distrib,
                                                       which_topic_indices,
                                                       which_word_indices)

    return plot_heatmap(fig,
                        ax,
                        topic_word_distrib,
                        xaxislabel=xaxislabel or 'vocab',
                        yaxislabel=yaxislabel or 'topic',
                        xticklabels=vocab,
                        yticklabels=topic_labels,
                        **kwargs)
Beispiel #3
0
def plot_doc_topic_heatmap(fig,
                           ax,
                           doc_topic_distrib,
                           doc_labels,
                           topic_labels=None,
                           which_documents=None,
                           which_document_indices=None,
                           which_topics=None,
                           which_topic_indices=None,
                           xaxislabel=None,
                           yaxislabel=None,
                           **kwargs):
    """
    Plot a heatmap for a document-topic distribution `doc_topic_distrib` to a matplotlib Figure `fig` and Axes `ax`
    using `doc_labels` as document labels on the y-axis and topics on the x-axis. Topics are labelled with the
    numbers 1 to `n_topics=doc_topic_distrib.shape[1]` unless custom labels are passed as `topic_labels`.
    A subset of documents can be specified either with a sequence `which_documents` of document labels from
    `doc_labels` or with `which_document_indices`, a sequence of zero-based document indices. Documents selected
    via `which_documents` appear in the order in which they occur in `doc_labels`.
    A subset of topics can be specified either with a sequence `which_topics` of topic numbers in [1, n_topics]
    or with `which_topic_indices`, a sequence of zero-based topic indices in [0, n_topics-1].
    Only one parameter of each pair may be set, otherwise a `ValueError` is raised.
    The axis labels default to "topic" (x) and "document" (y) unless `xaxislabel` / `yaxislabel` are given.
    Additional arguments can be passed via `kwargs` to `plot_heatmap`, whose result is returned.

    Please note that it is almost always necessary to select a subset of your document-topic distribution with the
    `which_documents` or `which_topics` parameters, as otherwise the amount of data to be plotted will be too high
    to give a reasonable picture.
    """
    docs_by_label = which_documents is not None
    topics_by_number = which_topics is not None

    if docs_by_label and which_document_indices is not None:
        raise ValueError(
            'only `which_documents` or `which_document_indices` can be set, not both'
        )

    if topics_by_number and which_topic_indices is not None:
        raise ValueError(
            'only `which_topics` or `which_topic_indices` can be set, not both'
        )

    # normalize both selection styles to zero-based index arrays
    if docs_by_label:
        which_document_indices = np.where(np.isin(doc_labels,
                                                  which_documents))[0]

    if topics_by_number:
        which_topic_indices = np.array(which_topics) - 1

    # topic labels must be an array so that they can be indexed with an index array below
    if topic_labels is None:
        topic_labels = np.array(range(1, doc_topic_distrib.shape[1] + 1))
    elif not isinstance(topic_labels, np.ndarray):
        topic_labels = np.array(topic_labels)

    subset_docs = which_document_indices is not None
    subset_topics = which_topic_indices is not None

    if subset_docs:
        doc_labels = np.array(doc_labels)[which_document_indices]

    if subset_topics:
        topic_labels = topic_labels[which_topic_indices]

    # cut the distribution down to the selected rows / columns
    if subset_docs or subset_topics:
        doc_topic_distrib = mat2d_window_from_indices(doc_topic_distrib,
                                                      which_document_indices,
                                                      which_topic_indices)

    return plot_heatmap(fig,
                        ax,
                        doc_topic_distrib,
                        xaxislabel=xaxislabel or 'topic',
                        yaxislabel=yaxislabel or 'document',
                        xticklabels=topic_labels,
                        yticklabels=doc_labels,
                        **kwargs)
Beispiel #4
0
def plot_topic_word_heatmap(fig,
                            ax,
                            topic_word_distrib,
                            vocab,
                            topic_labels=None,
                            which_topics=None,
                            which_topic_indices=None,
                            which_words=None,
                            which_word_indices=None,
                            xaxislabel=None,
                            yaxislabel=None,
                            **kwargs):
    """
    Plot a heatmap for a topic-word distribution `topic_word_distrib` to a matplotlib Figure `fig` and Axes `ax`
    using `vocab` as vocabulary on the x-axis and the topics on the y-axis. Topics are labelled with the numbers
    1 to K (number of topics) unless `topic_labels` is given.

    .. note:: It is almost always necessary to select a subset of your topic-word distribution with the
              `which_words` or `which_topics` parameters, as otherwise the amount of data to be plotted will be too high
              to give a reasonable picture.

    :param fig: matplotlib Figure object
    :param ax: matplotlib Axes object
    :param topic_word_distrib: topic-word distribution as 2D NumPy array; shape KxM, where K is number of topics,
                               M is vocabulary size
    :param vocab: vocabulary array of length M
    :param topic_labels: list/array with one label per topic (i.e. per row); if None, the one-based topic numbers
                         1 to K are used
    :param which_topics: select topics via topic label strings (when string array or list; matched against
                         `topic_labels`) or with one-based topic index in [1, K] (otherwise)
    :param which_topic_indices:  alternatively, select topics with zero-based topic index in [0, K-1]
    :param which_words: select words via word strings from `vocab`; the selected words appear in the order in
                        which they occur in `vocab`
    :param which_word_indices: alternatively, select words with zero-based word index in [0, M-1]
    :param xaxislabel: x axis label string; defaults to "vocab"
    :param yaxislabel: y axis label string; defaults to "topic"
    :param kwargs: additional arguments passed to :func:`~tmtoolkit.topicmod.visualize.plot_heatmap`
    :return: return value of :func:`~tmtoolkit.topicmod.visualize.plot_heatmap`
    :raises ValueError: if `topic_word_distrib` is not a non-empty 2D NumPy array or if both parameters of a
                        selection pair (`which_topics` / `which_topic_indices`, `which_words` /
                        `which_word_indices`) are set
    """
    if not isinstance(topic_word_distrib,
                      np.ndarray) or topic_word_distrib.ndim != 2:
        raise ValueError('`mat` must be a 2D NumPy array')

    if topic_word_distrib.shape[0] == 0 or topic_word_distrib.shape[1] == 0:
        raise ValueError('invalid shape for `mat`: %s' %
                         str(topic_word_distrib.shape))

    if which_topics is not None and which_topic_indices is not None:
        raise ValueError(
            'only `which_topics` or `which_topic_indices` can be set, not both'
        )

    if which_words is not None and which_word_indices is not None:
        raise ValueError(
            'only `which_words` or `which_word_indices` can be set, not both')

    if which_words is not None:
        which_word_indices = np.where(np.isin(vocab, which_words))[0]

    select_distrib_subset = False

    # topic labels must be an array so that they can be indexed with an index array below
    if topic_labels is None:
        topic_labels = np.array(range(1, topic_word_distrib.shape[0] + 1))
    elif not isinstance(topic_labels, np.ndarray):
        topic_labels = np.array(topic_labels)

    if which_topics is not None:
        which_topics = np.array(which_topics)
        # dtype kind "U" denotes unicode strings; the formerly used `np.str` alias was removed from NumPy
        if which_topics.dtype.kind == 'U':
            which_topic_indices = np.where(np.isin(topic_labels,
                                                   which_topics))[0]
        else:
            # one-based topic numbers -> zero-based indices
            which_topic_indices = which_topics - 1

    if which_topic_indices is not None:
        select_distrib_subset = True
        topic_labels = topic_labels[which_topic_indices]

    if which_word_indices is not None:
        select_distrib_subset = True
        vocab = np.array(vocab)[which_word_indices]

    # cut the distribution down to the selected rows / columns
    if select_distrib_subset:
        topic_word_distrib = mat2d_window_from_indices(topic_word_distrib,
                                                       which_topic_indices,
                                                       which_word_indices)

    return plot_heatmap(fig,
                        ax,
                        topic_word_distrib,
                        xaxislabel=xaxislabel or 'vocab',
                        yaxislabel=yaxislabel or 'topic',
                        xticklabels=vocab,
                        yticklabels=topic_labels,
                        **kwargs)
Beispiel #5
0
def plot_doc_topic_heatmap(fig,
                           ax,
                           doc_topic_distrib,
                           doc_labels,
                           topic_labels=None,
                           which_documents=None,
                           which_document_indices=None,
                           which_topics=None,
                           which_topic_indices=None,
                           xaxislabel=None,
                           yaxislabel=None,
                           **kwargs):
    """
    Plot a heatmap for a document-topic distribution `doc_topic_distrib` to a matplotlib Figure `fig` and Axes `ax`
    using `doc_labels` as document labels on the y-axis and the topics on the x-axis. Topics are labelled with the
    numbers 1 to K (number of topics) unless `topic_labels` is given.

    .. note:: It is almost always necessary to select a subset of your document-topic distribution with the
              `which_documents` or `which_topics` parameters, as otherwise the amount of data to be plotted will be too
              high to give a reasonable picture.

    :param fig: matplotlib Figure object
    :param ax: matplotlib Axes object
    :param doc_topic_distrib: document-topic distribution as 2D NumPy array; shape NxK, where N is the number of
                              documents, K is the number of topics
    :param doc_labels: list/array of length N with a string label for each document
    :param topic_labels: list/array with one label per topic (i.e. per column); if None, the one-based topic
                         numbers 1 to K are used
    :param which_documents: select documents via document label strings; the selected documents appear in the
                            order in which they occur in `doc_labels`
    :param which_document_indices: alternatively, select documents with zero-based document index in [0, N-1]
    :param which_topics: select topics via topic label strings (when string array or list; matched against
                         `topic_labels`) or with one-based topic index in [1, K] (otherwise)
    :param which_topic_indices:  alternatively, select topics with zero-based topic index in [0, K-1]
    :param xaxislabel: x axis label string; defaults to "topic"
    :param yaxislabel: y axis label string; defaults to "document"
    :param kwargs: additional arguments passed to :func:`~tmtoolkit.topicmod.visualize.plot_heatmap`
    :return: return value of :func:`~tmtoolkit.topicmod.visualize.plot_heatmap`
    :raises ValueError: if `doc_topic_distrib` is not a non-empty 2D NumPy array or if both parameters of a
                        selection pair (`which_documents` / `which_document_indices`, `which_topics` /
                        `which_topic_indices`) are set
    """
    is_2d_array = isinstance(doc_topic_distrib,
                             np.ndarray) and doc_topic_distrib.ndim == 2
    if not is_2d_array:
        raise ValueError('`mat` must be a 2D NumPy array')

    if 0 in doc_topic_distrib.shape:
        raise ValueError('invalid shape for `mat`: %s' %
                         str(doc_topic_distrib.shape))

    docs_by_label = which_documents is not None
    topics_given = which_topics is not None

    if docs_by_label and which_document_indices is not None:
        raise ValueError(
            'only `which_documents` or `which_document_indices` can be set, not both'
        )

    if topics_given and which_topic_indices is not None:
        raise ValueError(
            'only `which_topics` or `which_topic_indices` can be set, not both'
        )

    if docs_by_label:
        which_document_indices = np.where(np.isin(doc_labels,
                                                  which_documents))[0]

    # topic labels must be an array so that they can be indexed with an index array below
    if topic_labels is None:
        topic_labels = np.array(range(1, doc_topic_distrib.shape[1] + 1))
    elif not isinstance(topic_labels, np.ndarray):
        topic_labels = np.array(topic_labels)

    if topics_given:
        which_topics = np.array(which_topics)
        if which_topics.dtype.kind == 'U':
            # unicode strings: look the labels up in `topic_labels`
            which_topic_indices = np.where(np.isin(topic_labels,
                                                   which_topics))[0]
        else:
            # one-based topic numbers -> zero-based indices
            which_topic_indices = which_topics - 1

    subset_docs = which_document_indices is not None
    subset_topics = which_topic_indices is not None

    if subset_docs:
        doc_labels = np.array(doc_labels)[which_document_indices]

    if subset_topics:
        topic_labels = topic_labels[which_topic_indices]

    # cut the distribution down to the selected rows / columns
    if subset_docs or subset_topics:
        doc_topic_distrib = mat2d_window_from_indices(doc_topic_distrib,
                                                      which_document_indices,
                                                      which_topic_indices)

    return plot_heatmap(fig,
                        ax,
                        doc_topic_distrib,
                        xaxislabel=xaxislabel or 'topic',
                        yaxislabel=yaxislabel or 'document',
                        xticklabels=topic_labels,
                        yticklabels=doc_labels,
                        **kwargs)