コード例 #1
def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamma=1,
                      with_std=False, with_support=False):
    r"""Calculate the indirect cosine measure.

    Given context vectors u = V(W') and w = V(W*) for the
    word sets of a pair S_i = (W', W*) indirect cosine measure
    is computed as the cosine similarity between u and w.

    The formula used is::

        m_{sim}_{(m, \gamma)}(W', W*) =
            s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))

    where each vector::

        \vec{V}^{\,}_{m,\gamma}(W') =
            \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}

    Args:
        segmented_topics: Output from the segmentation module of the
            segmented topics. Is a list of list of tuples.
        accumulator: Output from the probability_estimation module. Is an
            accumulator of word occurrences (see text_analysis module).
        topics: Topics obtained from the trained topic model.
        measure (str): Direct confirmation measure to be used. Supported
            values are "nlr" (normalized log ratio).
        gamma: Gamma value for computing W', W* vectors; default is 1.
        with_std (bool): True to also include standard deviation across topic
            segment sets in addition to the mean coherence for each topic;
            default is False.
        with_support (bool): True to also include support across topic segments.
            The support is defined as the number of pairwise similarity
            comparisons that were used to compute the overall topic coherence.

    Returns:
        list: of indirect cosine similarity measure for each topic, one entry
        per (topic, segmentation) pair as produced by ``aggregate_segment_sims``.
    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)

    topic_coherences = []
    # Topics and their segmentations are consumed in lockstep.
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            # Both sides of the pair are looked up against the same topic.
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(
            aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences
コード例 #2
def cosine_similarity(
        segmented_topics, accumulator, topics, measure='nlr', gamma=1,
        with_std=False, with_support=False):
    r"""Calculate the indirect cosine measure.

    Given context vectors u = V(W') and w = V(W*) for the
    word sets of a pair S_i = (W', W*) indirect cosine measure
    is computed as the cosine similarity between u and w.

    The formula used is::

        m_{sim}_{(m, \gamma)}(W', W*) =
            s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))

    where each vector::

        \vec{V}^{\,}_{m,\gamma}(W') =
            \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}

    Args:
        segmented_topics: Output from the segmentation module of the
            segmented topics. Is a list of list of tuples.
        accumulator: Output from the probability_estimation module. Is an
            accumulator of word occurrences (see text_analysis module).
        topics: Topics obtained from the trained topic model.
        measure (str): Direct confirmation measure to be used. Supported
            values are "nlr" (normalized log ratio).
        gamma: Gamma value for computing W', W* vectors; default is 1.
        with_std (bool): True to also include standard deviation across topic
            segment sets in addition to the mean coherence for each topic;
            default is False.
        with_support (bool): True to also include support across topic segments.
            The support is defined as the number of pairwise similarity
            comparisons that were used to compute the overall topic coherence.

    Returns:
        list: of indirect cosine similarity measure for each topic, one entry
        per (topic, segmentation) pair as produced by ``aggregate_segment_sims``.
    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)

    topic_coherences = []
    # Topics and their segmentations are consumed in lockstep.
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            # Both sides of the pair are looked up against the same topic.
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences
コード例 #3
def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
    """For each topic segmentation, compute average cosine similarity using the
    accumulator's ``ids_similarity`` method.

    Args:
        segmented_topics (list): Output from the segmentation module of the segmented
            topics. Is a list of list of tuples.
        accumulator: word occurrence accumulator from probability_estimation.
            Must provide ``ids_similarity(w_prime, w_star)``.
        with_std (bool): True to also include standard deviation across topic segment
            sets in addition to the mean coherence for each topic; default is False.
        with_support (bool): True to also include support across topic segments. The
            support is defined as the number of pairwise similarity comparisons that
            were used to compute the overall topic coherence.

    Returns:
        list : of word2vec cosine similarities per topic.
    """
    topic_coherences = []
    total_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            # A segment side may be a single id or a collection of ids;
            # wrap scalars so ids_similarity always receives an iterable.
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(w_prime, w_star))
            except ZeroDivisionError:
                # Counted as out-of-vocabulary and left out of the aggregate.
                num_oov += 1

        if num_oov > 0:
            total_oov += 1
            logger.warning(
                "%d terms for topic %d are not in word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(
            aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        # NOTE(review): total_oov counts topics that had at least one failed
        # comparison, while the message speaks of "terms" -- confirm intent.
        logger.warning("%d terms for are not in word2vec model vocabulary", total_oov)
    return topic_coherences
コード例 #4
def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
    """For each topic segmentation, compute average cosine similarity using the
    accumulator's ``ids_similarity`` method.

    Args:
        segmented_topics (list): Output from the segmentation module of the segmented
            topics. Is a list of list of tuples.
        accumulator: word occurrence accumulator from probability_estimation.
            Must provide ``ids_similarity(w_prime, w_star)``.
        with_std (bool): True to also include standard deviation across topic segment
            sets in addition to the mean coherence for each topic; default is False.
        with_support (bool): True to also include support across topic segments. The
            support is defined as the number of pairwise similarity comparisons that
            were used to compute the overall topic coherence.

    Returns:
        list : of word2vec cosine similarities per topic.
    """
    topic_coherences = []
    total_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            # A segment side may be a single id or a collection of ids;
            # wrap scalars so ids_similarity always receives an iterable.
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(w_prime, w_star))
            except ZeroDivisionError:
                # Counted as out-of-vocabulary and left out of the aggregate.
                num_oov += 1

        if num_oov > 0:
            total_oov += 1
            logger.warning(
                "%d terms for topic %d are not in word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        # NOTE(review): total_oov counts topics that had at least one failed
        # comparison, while the message speaks of "terms" -- confirm intent.
        logger.warning("%d terms for are not in word2vec model vocabulary", total_oov)
    return topic_coherences
コード例 #5
def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
    """For each topic segmentation, compute average cosine similarity using the
    accumulator's ``ids_similarity`` method.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the :func:`~gensim.topic_coherence.segmentation.s_one_set`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`
        Word occurrence accumulator. Must provide ``ids_similarity(w_prime, w_star)``.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets
        in addition to the mean coherence for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as
        the number of pairwise similarity comparisons that were used to compute the
        overall topic coherence.

    Returns
    -------
    list of (float[, float[, int]])
        Cosine word2vec similarities per topic (with std/support if `with_std`, `with_support`).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import numpy as np
        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure
        >>> from gensim.topic_coherence import text_analysis
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>> # should be (0.726752426218 0.00695475919227)
        >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]

    """
    topic_coherences = []
    total_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            # A segment side may be a single id or a collection of ids;
            # wrap scalars so ids_similarity always receives an iterable.
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(w_prime, w_star))
            except ZeroDivisionError:
                # Counted as out-of-vocabulary and left out of the aggregate.
                num_oov += 1

        if num_oov > 0:
            total_oov += 1
            logger.warning(
                "%d terms for topic %d are not in word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(
            aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        # NOTE(review): total_oov counts topics that had at least one failed
        # comparison, while the message speaks of "terms" -- confirm intent.
        logger.warning("%d terms for are not in word2vec model vocabulary", total_oov)
    return topic_coherences
コード例 #6
def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr',
                      gamma=1, with_std=False, with_support=False):
    """Calculate the indirect cosine measure.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the segmentation module of the segmented topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Output from the probability_estimation module. Is an accumulator of word occurrences.
    topics : list of `numpy.ndarray`
        Topics obtained from the trained topic model.
    measure : str, optional
        Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
    gamma : float, optional
        Gamma value for computing :math:`W'` and :math:`W^{*}` vectors.
    with_std : bool
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic; default is False.
    with_support : bool
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons that were used to compute the overall topic coherence.

    Returns
    -------
    list
        List of indirect cosine similarity measure for each topic.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
        >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        >>> accumulator._num_docs = 5
        >>> # create topics
        >>> topics = [np.array([1, 2])]
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
        >>> print(obtained[0])

    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)

    topic_coherences = []
    # Topics and their segmentations are consumed in lockstep.
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            # Both sides of the pair are looked up against the same topic.
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(
            aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences
コード例 #7
def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
    """For each topic segmentation, compute average cosine similarity using the
    accumulator's ``ids_similarity`` method.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the :func:`~gensim.topic_coherence.segmentation.s_one_set`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`
        Word occurrence accumulator. Must provide ``ids_similarity(w_prime, w_star)``.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets
        in addition to the mean coherence for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as
        the number of pairwise similarity comparisons that were used to compute the
        overall topic coherence.

    Returns
    -------
    list of (float[, float[, int]])
        Cosine word2vec similarities per topic (with std/support if `with_std`, `with_support`).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import numpy as np
        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure
        >>> from gensim.topic_coherence import text_analysis
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>> # should be (0.726752426218 0.00695475919227)
        >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]

    """
    topic_coherences = []
    total_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            # A segment side may be a single id or a collection of ids;
            # wrap scalars so ids_similarity always receives an iterable.
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(w_prime, w_star))
            except ZeroDivisionError:
                # Counted as out-of-vocabulary and left out of the aggregate.
                num_oov += 1

        if num_oov > 0:
            total_oov += 1
            logger.warning(
                "%d terms for topic %d are not in word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        # NOTE(review): total_oov counts topics that had at least one failed
        # comparison, while the message speaks of "terms" -- confirm intent.
        logger.warning("%d terms for are not in word2vec model vocabulary", total_oov)
    return topic_coherences
コード例 #8
def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr',
                      gamma=1, with_std=False, with_support=False):
    """Calculate the indirect cosine measure.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the segmentation module of the segmented topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Output from the probability_estimation module. Is an accumulator of word occurrences.
    topics : list of `numpy.ndarray`
        Topics obtained from the trained topic model.
    measure : str, optional
        Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
    gamma : float, optional
        Gamma value for computing :math:`W'` and :math:`W^{*}` vectors.
    with_std : bool
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic; default is False.
    with_support : bool
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons that were used to compute the overall topic coherence.

    Returns
    -------
    list
        List of indirect cosine similarity measure for each topic.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
        >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        >>> accumulator._num_docs = 5
        >>> # create topics
        >>> topics = [np.array([1, 2])]
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
        >>> print(obtained[0])

    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)

    topic_coherences = []
    # Topics and their segmentations are consumed in lockstep.
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            # Both sides of the pair are looked up against the same topic.
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences