Example #1
    def label_mmr_score(self,
                        which_topic,
                        chosen_labels,
                        label_scores,
                        label_models):
        """
        Maximal Marginal Relevance score for labels.
        It is computed only when `apply_intra_topic_coverage` is True.

        Parameters:
        --------------
        which_topic: int
            the index of the topic
        
        chosen_labels: list<int>
           indices of labels that are already chosen
        
        label_scores: numpy.ndarray<#topic, #label>
           label scores for each topic

        label_models: numpy.ndarray<#label, #words>
            the language models for labels

        Returns:
        --------------
        numpy.ndarray: 1D of length #label - #chosen_labels
            the indices of the scored candidate labels

        numpy.ndarray: same length as above
            the scores
        """
        chosen_len = len(chosen_labels)
        if chosen_len == 0:
            # no label is chosen
            # return the raw scores
            return (np.arange(label_models.shape[0]),
                    label_scores[which_topic, :])
        else:
            kl_m = np.zeros((label_models.shape[0]-chosen_len,
                             chosen_len))
            
            # the unchosen label indices
            candidate_labels = list(set(range(label_models.shape[0])) -
                                    set(chosen_labels))
            candidate_labels = np.sort(np.asarray(candidate_labels))
            for i, l_p in enumerate(candidate_labels):
                for j, l in enumerate(chosen_labels):
                    kl_m[i, j] = kl_divergence(label_models[l_p],
                                               label_models[l])
            # for each candidate, the largest KL divergence to any chosen label
            sim_scores = kl_m.max(axis=1)
            # trade off topic relevance against the divergence term
            mmr_scores = (self._alpha *
                          label_scores[which_topic, candidate_labels]
                          - (1 - self._alpha) * sim_scores)
            return (candidate_labels, mmr_scores)
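A minimal usage sketch of the method above in a greedy selection loop; the `labeler` instance, the array shapes, and the `pick_labels_mmr` helper are assumptions for illustration, not part of the original source:

import numpy as np

def pick_labels_mmr(labeler, which_topic, label_scores, label_models, k=3):
    # Greedily pick k labels for one topic, re-scoring with MMR each round.
    chosen = []
    for _ in range(k):
        candidates, scores = labeler.label_mmr_score(
            which_topic, chosen, label_scores, label_models)
        # take the candidate with the highest MMR score
        chosen.append(int(candidates[np.argmax(scores)]))
    return chosen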
Example #2
def jsd_divergence(p: np.ndarray,
                   q: np.ndarray) -> float:
    """
    Calculate the Jensen-Shannon distance (the square root of the
    Jensen-Shannon divergence) between two PDFs

    Parameters
    -----------
    p: numpy.ndarray
        discrete probability distribution, p
    q: numpy.ndarray
        discrete probability distribution, q

    Returns
    --------
    float
        Jensen-Shannon distance
    """
    # mixture (average) distribution
    m = (p + q) / 2
    divergence = (kl_divergence(p, m) + kl_divergence(q, m)) / 2
    # the square root of the JS divergence is a true metric
    return np.sqrt(divergence)
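The `kl_divergence` helper called throughout this page is not shown; a minimal sketch consistent with these snippets (an assumption, not the original implementation) could be:

import numpy as np

def kl_divergence(p: np.ndarray, q: np.ndarray) -> float:
    # Discrete KL divergence D(p || q); assumes aligned, normalized
    # arrays with q > 0 wherever p > 0 (0 * log(0/q) contributes 0).
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    mask = p > 0
    return float(np.sum(p[mask] * np.log(p[mask] / q[mask])))

# e.g. jsd_divergence(np.array([0.5, 0.5]), np.array([0.9, 0.1]))

scipy.stats.entropy(p, q) computes the same quantity and may be what the original code wraps.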
Example #3
    def label_mmr_score(self, which_topic, chosen_labels, label_scores,
                        label_models):
        """
        Maximal Marginal Relevance score for labels.
        It is computed only when `apply_intra_topic_coverage` is True.

        Parameters:
        --------------
        which_topic: int
            the index of the topic
        
        chosen_labels: list<int>
           indices of labels that are already chosen
        
        label_scores: numpy.ndarray<#topic, #label>
           label scores for each topic

        label_models: numpy.ndarray<#label, #words>
            the language models for labels

        Returns:
        --------------
        numpy.ndarray: 1D of length #label - #chosen_labels
            the indices of the scored candidate labels

        numpy.ndarray: same length as above
            the scores
        """
        chosen_len = len(chosen_labels)
        if chosen_len == 0:
            # no label is chosen
            # return the raw scores
            return (np.arange(label_models.shape[0]),
                    label_scores[which_topic, :])
        else:
            kl_m = np.zeros((label_models.shape[0] - chosen_len, chosen_len))

            # the unchosen label indices
            candidate_labels = list(
                set(range(label_models.shape[0])) - set(chosen_labels))
            candidate_labels = np.sort(np.asarray(candidate_labels))
            for i, l_p in enumerate(candidate_labels):
                for j, l in enumerate(chosen_labels):
                    kl_m[i, j] = kl_divergence(label_models[l_p],
                                               label_models[l])
            # for each candidate, the largest KL divergence to any chosen label
            sim_scores = kl_m.max(axis=1)
            # trade off topic relevance against the divergence term
            mmr_scores = (
                self._alpha * label_scores[which_topic, candidate_labels] -
                (1 - self._alpha) * sim_scores)
            return (candidate_labels, mmr_scores)
Example #4
def kullback_leibler_divergence(d1, d2):
    """Computes the Kullback-Leibler dissimilarity between two probability distributions.

    Args:
        d1 (dict): First probability distribution.
        d2 (dict): Second probability distribution.

    Returns:
        float: The Kullback-Leibler divergence value.

    Note:
        * The KL divergence is not a true "distance" because it is asymmetric and does not satisfy the triangle inequality.

    """

    # validating distributions
    if not isinstance(d1, dict) or not isinstance(d2, dict):
        raise ValueError('Distributions must be dictionaries.')

    # small smoothing constant so every event has nonzero probability
    eps = 1e-6

    # declaring probability density functions
    pdf1 = []
    pdf2 = []

    # for each 'event' in distribution 1
    for k1 in d1:
        # adding p(k1) to pdf1
        pdf1.append(d1[k1])

        # if event 'k1' also occurs in d2
        if k1 in d2:
            pdf2.append(d2[k1])
        else:
            pdf2.append(0)

    # for events only occurring in d2
    for k2 in d2:
        # if event k2 only occurs in d2
        if k2 not in d1:
            pdf1.append(0)
            pdf2.append(d2[k2])

    # building array distributions; eps keeps the logarithm finite
    arr_pdf1 = np.array(pdf1) + eps
    arr_pdf2 = np.array(pdf2) + eps

    # computing the divergence between the two probability distributions
    return kl_divergence(arr_pdf1, arr_pdf2)
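A small usage sketch for the dictionary-based variant; the event keys and probabilities below are made up for illustration:

# Keys present in only one distribution are zero-filled internally,
# and eps keeps the result finite despite the missing events.
d1 = {'a': 0.5, 'b': 0.5}
d2 = {'a': 0.9, 'c': 0.1}
kullback_leibler_divergence(d1, d2)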
Example #5
    def dice_func_entropy_conditioned(dist: Dict[int, int], dpmf: DicePmf):
        """Calls scipy.stats.entropy, but avoids result being inf"""
        def condition_pk(pk_raw):
            length = pk_raw.shape[0]
            idx_zeros = [
                k for k in range(length) if dist.get(k + dpmf.low, 0) == 0
            ]
            min_val = min(
                [pk_raw[k] for k in range(length) if k not in idx_zeros])
            extra = min_val * len(idx_zeros)
            scale = 1 / (1 + extra)
            pk = np.array([
                min_val if k in idx_zeros else scale * pk_raw[k]
                for k in range(length)
            ])
            return pk

        pk_raw, pmf = DiceUtil.dice_comparable_arrays(dist, dpmf)
        pk = condition_pk(pk_raw)

        return kl_divergence(pk, pmf)
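For context, scipy.stats.entropy(pk, qk) returns inf whenever the reference distribution assigns zero mass where the other is positive; smoothing the zero entries, as the conditioning above does, is one way to guard against that. A standalone illustration (assumed setup, independent of the DicePmf/DiceUtil types used above):

import numpy as np
from scipy.stats import entropy

pk = np.array([0.4, 0.4, 0.2])
qk = np.array([0.5, 0.5, 0.0])   # an empty bin in the reference
entropy(pk, qk)                  # inf: pk puts mass where qk is zero
qk_smoothed = (qk + 1e-6) / (qk + 1e-6).sum()
entropy(pk, qk_smoothed)         # finite after smoothing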
Example #6
    def dice_func_entropy(dist: Dict[int, int], dpmf: DicePmf):
        """Calls scipy.stats.entropy (i.e., the Kullback-Leibler divergence)"""
        pk, pmf = DiceUtil.dice_comparable_arrays(dist, dpmf)
        return kl_divergence(pk, pmf)