Example #1
def cosine_similarity(segmented_topics,
                      accumulator,
                      topics,
                      measure='nlr',
                      gamma=1,
                      with_std=False,
                      with_support=False):
    r"""
    This function calculates the indirect cosine measure.

    Given context vectors u = V(W') and w = V(W*) for the word sets of a
    pair S_i = (W', W*), the indirect cosine measure is computed as the
    cosine similarity between u and w.

    The formula used is

        m_{sim(m, \gamma)}(W', W^*) =
            s_{sim}(\vec{V}_{m,\gamma}(W'), \vec{V}_{m,\gamma}(W^*))

    where each context vector is defined as

        \vec{V}_{m,\gamma}(W') =
            \Big\{ \sum_{w_i \in W'} m(w_i, w_j)^{\gamma} \Big\}_{j = 1, \dots, |W|}

    Args:
        segmented_topics: Output from the segmentation module for the
            segmented topics: a list of lists of (W', W*) tuples.
        accumulator: Output from the probability_estimation module: an
            accumulator of word occurrences (see the text_analysis module).
        topics: Topics obtained from the trained topic model.
        measure (str): Direct confirmation measure to be used. Supported
            values are "nlr" (normalized log ratio).
        gamma: Gamma value for computing W', W* vectors; default is 1.
        with_std (bool): True to also include standard deviation across topic
            segment sets in addition to the mean coherence for each topic;
            default is False.
        with_support (bool): True to also include support across topic segments.
            The support is defined as the number of pairwise similarity
            comparisons used to compute the overall topic coherence.

    Returns:
        list: Indirect cosine similarity measure for each topic.
    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator,
                                            gamma)

    topic_coherences = []
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(
            aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences
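For intuition, here is a minimal, self-contained sketch of the same computation
on a toy example. The confirmation matrix m, the word sets, and gamma are
invented purely for illustration; gensim's ContextVectorComputer and _cossim
helpers are not used, only the formulas from the docstring above.

import numpy as np

# Hypothetical direct confirmation values m(w_i, w_j) over a topic W of
# three words; the numbers are made up for illustration.
m = np.array([
    [1.0, 0.3, 0.2],
    [0.3, 1.0, 0.5],
    [0.2, 0.5, 1.0],
])
gamma = 1

def context_vector(word_ids, m, gamma):
    # \vec{V}_{m,\gamma}(W') = { sum_{w_i in W'} m(w_i, w_j)^gamma }_{j=1..|W|}
    return (m[word_ids, :] ** gamma).sum(axis=0)

def cossim(u, v):
    # s_sim(u, v): cosine of the angle between the two context vectors
    return u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v))

# One segment S_i = (W', W*) with W' = {w_0} and W* = {w_1, w_2}.
u = context_vector([0], m, gamma)
w = context_vector([1, 2], m, gamma)
print(cossim(u, w))  # indirect cosine measure for this single segment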
Example #3
def word2vec_similarity(segmented_topics,
                        accumulator,
                        with_std=False,
                        with_support=False):
    """For each topic segmentation, compute average cosine similarity using a
    WordVectorsAccumulator.

    Args:
        segmented_topics (list): Output from the segmentation module for the
            segmented topics: a list of lists of (W', W*) tuples.
        accumulator: word occurrence accumulator from probability_estimation.
        with_std (bool): True to also include standard deviation across topic segment
            sets in addition to the mean coherence for each topic; default is False.
        with_support (bool): True to also include support across topic segments. The
            support is defined as the number of pairwise similarity comparisons
            used to compute the overall topic coherence.

    Returns:
        list: Word2vec cosine similarities per topic.
    """
    topic_coherences = []
    total_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(
                    w_prime, w_star))
            except ZeroDivisionError:
                num_oov += 1

        if num_oov > 0:
            total_oov += 1
            logger.warning(
                "%d terms for topic %d are not in word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(
            aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        logger.warning("%d terms for are not in word2vec model vocabulary",
                       total_oov)
    return topic_coherences
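Both measures hand their final step to aggregate_segment_sims, which is not
shown on this page. A minimal stand-in, assuming only the behavior documented
above (mean coherence, optional standard deviation, optional support = number
of comparisons), could look like this; it is a sketch, not gensim's
implementation:

import numpy as np

def aggregate_segment_sims_sketch(segment_sims, with_std=False, with_support=False):
    # Mean segment similarity, optionally with the standard deviation and the
    # support (the number of pairwise similarity comparisons aggregated).
    segment_sims = np.asarray(segment_sims)
    if not with_std and not with_support:
        return segment_sims.mean()
    stats = [segment_sims.mean()]
    if with_std:
        stats.append(segment_sims.std())
    if with_support:
        stats.append(len(segment_sims))
    return tuple(stats)

print(aggregate_segment_sims_sketch([0.2, 0.4, 0.9]))                 # 0.5
print(aggregate_segment_sims_sketch([0.2, 0.4, 0.9], with_std=True))  # (0.5, 0.294...)

This matches how the doctests below unpack each topic's entry, e.g.
mean, std = word2vec_similarity(..., with_std=True)[0].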
def word2vec_similarity(segmented_topics,
                        accumulator,
                        with_std=False,
                        with_support=False):
    """For each topic segmentation, compute average cosine similarity using a
    :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the :func:`~gensim.topic_coherence.segmentation.s_one_set`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
                  :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets
        in addition to the mean coherence for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as
        the number of pairwise similarity comparisons used to compute the overall topic coherence.

    Returns
    -------
    list of (float[, float[, int]])
        Cosine word2vec similarities per topic (with std/support if `with_std`, `with_support`).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import numpy as np
        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure
        >>> from gensim.topic_coherence import text_analysis
        >>>
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>>
        >>> # should be (0.726752426218 0.00695475919227)
        >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]

    """
    topic_coherences = []
    total_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(
                    w_prime, w_star))
            except ZeroDivisionError:
                num_oov += 1

        if num_oov > 0:
            total_oov += 1
            logger.warning(
                "%d terms for topic %d are not in word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(
            aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        logger.warning("%d terms for are not in word2vec model vocabulary",
                       total_oov)
    return topic_coherences
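A note on the except ZeroDivisionError branch above: that is where
out-of-vocabulary terms surface. When a similarity cannot be computed because
a segment's terms are missing from the word2vec vocabulary, the pair is
skipped rather than scored, so each topic's coherence is averaged only over
the comparisons that succeeded; the warnings report how many pairs per topic,
and how many topics in total, were affected.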
def cosine_similarity(segmented_topics,
                      accumulator,
                      topics,
                      measure='nlr',
                      gamma=1,
                      with_std=False,
                      with_support=False):
    """Calculate the indirect cosine measure.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the segmentation module of the segmented topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Output from the probability_estimation module: an accumulator of word
        occurrences (see the text_analysis module).
    topics : list of `numpy.ndarray`
        Topics obtained from the trained topic model.
    measure : str, optional
        Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
    gamma : float, optional
        Gamma value for computing :math:`W'` and :math:`W^{*}` vectors; default is 1.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic; default is False.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons used to compute the overall topic coherence.

    Returns
    -------
    list
        List of indirect cosine similarity measure for each topic.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
        >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        >>> accumulator._num_docs = 5
        >>>
        >>> # create topics
        >>> topics = [np.array([1, 2])]
        >>>
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
        >>> print(obtained[0])
        0.623018926945

    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator,
                                            gamma)

    topic_coherences = []
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(
            aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences
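A note on gamma: each direct-confirmation value is raised to the power gamma
before the sums that build the context vectors, so for confirmation values
between 0 and 1 a larger exponent shrinks weak confirmations faster than
strong ones. A two-line illustration (the values are made up):

import numpy as np

m_row = np.array([0.9, 0.5, 0.1])  # hypothetical confirmation values m(w_i, w_j)
print(m_row ** 1)  # gamma = 1 leaves them unchanged: [0.9 0.5 0.1]
print(m_row ** 2)  # gamma = 2 gives [0.81 0.25 0.01] -- the weakest shrinks most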