def label_mmr_score(self, which_topic, chosen_labels,
                    label_scores, label_models):
    """
    Maximal Marginal Relevance (MMR) score for labels.
    It is computed only when `apply_intra_topic_coverage` is True.

    Parameters:
    --------------
    which_topic: int
        the index of the topic
    chosen_labels: list<int>
        indices of labels that are already chosen
    label_scores: numpy.ndarray<#topic, #label>
        label scores for each topic
    label_models: numpy.ndarray<#label, #words>
        the language models for labels

    Returns:
    --------------
    numpy.ndarray: 1D of length #label - #chosen_labels
        the scored label indices
    numpy.ndarray: same length as above
        the scores
    """
    chosen_len = len(chosen_labels)
    if chosen_len == 0:
        # no label is chosen yet: return the raw scores
        return (np.arange(label_models.shape[0]),
                label_scores[which_topic, :])
    else:
        # pairwise KL divergence between each unchosen (candidate) label
        # and each already-chosen label
        kl_m = np.zeros((label_models.shape[0] - chosen_len, chosen_len))

        # the unchosen label indices
        candidate_labels = list(
            set(range(label_models.shape[0])) - set(chosen_labels))
        candidate_labels = np.sort(np.asarray(candidate_labels))

        for i, l_p in enumerate(candidate_labels):
            for j, l in enumerate(chosen_labels):
                kl_m[i, j] = kl_divergence(label_models[l_p],
                                           label_models[l])

        # for each candidate, the largest divergence to any chosen label
        sim_scores = kl_m.max(axis=1)
        # MMR: trade off the relevance score against the divergence term
        mmr_scores = (self._alpha * label_scores[which_topic, candidate_labels]
                      - (1 - self._alpha) * sim_scores)
        return (candidate_labels, mmr_scores)

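
# Hedged usage sketch (not part of the original code): a greedy MMR selection
# loop for one topic. `_Ranker` is a hypothetical stand-in that only supplies
# the `_alpha` attribute expected by `label_mmr_score`, and the function above
# is called directly with that stand-in as `self`. Assumes `numpy` is imported
# as `np` and that `kl_divergence` (e.g. scipy.stats.entropy on two 1-D
# probability vectors) is available in this module.
def _demo_greedy_mmr_selection(n_labels_to_pick=3):
    class _Ranker:
        def __init__(self, alpha):
            self._alpha = alpha

    rng = np.random.default_rng(0)
    n_topics, n_labels, n_words = 2, 5, 10
    label_scores = rng.random((n_topics, n_labels))
    label_models = rng.random((n_labels, n_words))
    label_models /= label_models.sum(axis=1, keepdims=True)  # row-stochastic

    ranker = _Ranker(alpha=0.7)
    chosen = []
    for _ in range(n_labels_to_pick):
        # rescore the remaining candidates, then greedily take the best one
        candidates, scores = label_mmr_score(
            ranker, 0, chosen, label_scores, label_models)
        chosen.append(int(candidates[np.argmax(scores)]))
    return chosen
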
def jsd_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """
    Calculate the Jensen-Shannon divergence between two PDFs and
    return its square root (the Jensen-Shannon distance).

    Parameters
    -----------
    p: numpy.ndarray
        discrete probability distribution, p
    q: numpy.ndarray
        discrete probability distribution, q

    Returns
    --------
    float
        square root of the Jensen-Shannon divergence
    """
    # mixture (midpoint) distribution
    m = (p + q) / 2
    divergence = (kl_divergence(p, m) + kl_divergence(q, m)) / 2
    return np.sqrt(divergence)

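
# Hedged usage sketch (not part of the original code): Jensen-Shannon is
# symmetric in its arguments, unlike plain KL divergence. With a natural-log
# `kl_divergence` (scipy.stats.entropy's default), both calls below return
# roughly 0.142, and the result never exceeds sqrt(ln 2) ~= 0.833.
def _demo_jsd_symmetry():
    p = np.array([0.4, 0.6])
    q = np.array([0.6, 0.4])
    return jsd_divergence(p, q), jsd_divergence(q, p)  # equal values
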
def kullback_leibler_divergence(d1, d2):
    """Computes the Kullback-Leibler divergence between two probability
    distributions given as dictionaries.

    Args:
        d1 (dict): First probability distribution.
        d2 (dict): Second probability distribution.

    Returns:
        The Kullback-Leibler divergence value.

    Note:
        * The KL divergence is not a true "distance" because it is
          asymmetric and does not satisfy the triangle inequality.
    """
    # validate the distributions
    if not isinstance(d1, dict) or not isinstance(d2, dict):
        raise ValueError('Distributions must be dictionaries.')

    # small smoothing constant so zero probabilities do not blow up the log
    eps = 1e-6

    # probability density functions aligned over the union of events
    pdf1 = []
    pdf2 = []

    # events occurring in d1 (and possibly also in d2)
    for k1 in d1:
        pdf1.append(d1[k1])
        # probability of k1 under d2, or 0 if the event never occurs there
        pdf2.append(d2.get(k1, 0))

    # events occurring only in d2
    for k2 in d2:
        if k2 not in d1:
            pdf1.append(0)
            pdf2.append(d2[k2])

    # building smoothed array distributions
    arr_pdf1 = np.array(pdf1) + eps
    arr_pdf2 = np.array(pdf2) + eps

    # computing the divergence between the 2 probability density functions
    return kl_divergence(arr_pdf1, arr_pdf2)

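
# Hedged usage sketch (not part of the original code): two dictionary
# distributions over partially overlapping events. Missing events are treated
# as probability zero (then eps-smoothed), and swapping the arguments generally
# changes the result because KL divergence is asymmetric. Assumes
# `kl_divergence` accepts two 1-D numpy arrays, as the function above does.
def _demo_dict_kl():
    d1 = {'a': 0.5, 'b': 0.5}
    d2 = {'a': 0.9, 'c': 0.1}
    return (kullback_leibler_divergence(d1, d2),
            kullback_leibler_divergence(d2, d1))  # two different values
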
def dice_func_entropy_conditioned(dist: Dict[int, int], dpmf: DicePmf):
    """Calls scipy.stats.entropy, but conditions the observed distribution
    first so the result cannot come out as inf."""

    def condition_pk(pk_raw):
        length = pk_raw.shape[0]
        # bins whose face value never occurs in the observed counts
        idx_zeros = [
            k for k in range(length) if dist.get(k + dpmf.low, 0) == 0
        ]
        # smallest probability among the observed (non-empty) bins
        min_val = min(
            [pk_raw[k] for k in range(length) if k not in idx_zeros])
        # mass added by filling the empty bins with `min_val`;
        # scale the observed bins down to compensate
        extra = min_val * len(idx_zeros)
        scale = 1 / (1 + extra)
        pk = np.array([
            min_val if k in idx_zeros else scale * pk_raw[k]
            for k in range(length)
        ])
        return pk

    pk_raw, pmf = DiceUtil.dice_comparable_arrays(dist, dpmf)
    pk = condition_pk(pk_raw)
    return kl_divergence(pk, pmf)

def dice_func_entropy(dist: Dict[int, int], dpmf: DicePmf):
    """Calls scipy.stats.entropy (i.e., the Kullback-Leibler divergence)."""
    pk, pmf = DiceUtil.dice_comparable_arrays(dist, dpmf)
    return kl_divergence(pk, pmf)

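
# Hedged illustration (standalone, not using the project's DicePmf/DiceUtil
# types): it mirrors the conditioning step of `dice_func_entropy_conditioned`
# on plain arrays so it can be compared with the unconditioned behaviour of
# `dice_func_entropy`. Zero-count bins receive the smallest observed
# probability and the remaining bins are scaled down so the total stays close
# to one. Assumes `numpy` as `np` and a `kl_divergence` that behaves like
# scipy.stats.entropy(pk, qk).
def _demo_conditioned_vs_raw():
    counts = np.array([3.0, 0.0, 2.0, 5.0, 0.0, 4.0])  # two faces never rolled
    pk_raw = counts / counts.sum()
    pmf = np.full(6, 1 / 6)                            # fair six-sided die

    min_val = pk_raw[pk_raw > 0].min()                 # smallest observed bin
    extra = min_val * np.count_nonzero(pk_raw == 0)    # mass added to empty bins
    scale = 1 / (1 + extra)                            # shrink factor for the rest
    pk = np.where(pk_raw == 0, min_val, scale * pk_raw)

    # raw divergence vs. divergence after conditioning the observed side
    return kl_divergence(pk_raw, pmf), kl_divergence(pk, pmf)
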