import numpy as np
from scipy.special import rel_entr


def _get_JS(p, p_weights, q, q_weights, binning, base=2):
    if isinstance(binning, int):
        # in case have only specified the number of bins to use, generate an actual binning here
        binning = np.linspace(np.min([p, q]), np.max([p, q]), num=binning, endpoint=True)

    # first, need to bin p and q to get two "probability vectors" that can be easily compared
    p_binned, _ = np.histogram(np.clip(p, binning[0], binning[-1]), bins=binning,
                               weights=p_weights, density=True)
    q_binned, _ = np.histogram(np.clip(q, binning[0], binning[-1]), bins=binning,
                               weights=q_weights, density=True)

    # make sure they do not contain negative entries
    p_binned = np.maximum(p_binned, 0.0)
    q_binned = np.maximum(q_binned, 0.0)

    # renormalize
    p_binned /= np.sum(p_binned)
    q_binned /= np.sum(q_binned)

    # this code is taken (almost) verbatim from
    # https://github.com/scipy/scipy/blob/c42462a/scipy/spatial/distance.py#L1239-L1296
    m_binned = (p_binned + q_binned) / 2.0
    left = rel_entr(p_binned, m_binned)
    right = rel_entr(q_binned, m_binned)
    js = np.sum(left, axis=0) + np.sum(right, axis=0)
    if base is not None:
        js /= np.log(base)
    return js

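A minimal usage sketch for the helper above (hypothetical data; assumes the numpy/scipy imports listed with the function): two equal-length weighted samples are binned on a shared grid and then compared.

import numpy as np

rng = np.random.default_rng(0)
p = rng.normal(0.0, 1.0, size=1000)  # samples from a reference distribution
q = rng.normal(0.5, 1.0, size=1000)  # samples from a shifted distribution

# uniform weights here; any nonnegative per-sample weights would do
js = _get_JS(p, np.ones_like(p), q, np.ones_like(q), binning=50)
print(js)  # JS divergence in bits (base=2 by default), bounded by 1
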
def jensenshannon(p, q, base=None):
    """
    Compute the Jensen-Shannon distance (metric) between
    two 1-D probability arrays. This is the square root
    of the Jensen-Shannon divergence.

    The Jensen-Shannon distance between two probability
    vectors `p` and `q` is defined as,

    .. math::

       \\sqrt{\\frac{D(p \\parallel m) + D(q \\parallel m)}{2}}

    where :math:`m` is the pointwise mean of :math:`p` and :math:`q`
    and :math:`D` is the Kullback-Leibler divergence.

    This routine will normalize `p` and `q` if they don't sum to 1.0.

    Parameters
    ----------
    p : (N,) array_like
        left probability vector
    q : (N,) array_like
        right probability vector
    base : double, optional
        the base of the logarithm used to compute the output
        if not given, then the routine uses the default base of
        scipy.stats.entropy.

    Returns
    -------
    js : double
        The Jensen-Shannon distance between `p` and `q`

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy.spatial import distance
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [0.0, 1.0, 0.0], 2.0)
    1.0
    >>> distance.jensenshannon([1.0, 0.0], [0.5, 0.5])
    0.46450140402245893
    >>> distance.jensenshannon([1.0, 0.0, 0.0], [1.0, 0.0, 0.0])
    0.0

    """
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)
    m = (p + q) / 2.0
    left = rel_entr(p, m)
    right = rel_entr(q, m)
    js = np.sum(left, axis=0) + np.sum(right, axis=0)
    if base is not None:
        js /= np.log(base)
    return np.sqrt(js / 2.0)

def jensenshannon(p, q):
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)
    m = (p + q) / 2.0
    left = rel_entr(p, m)
    right = rel_entr(q, m)
    js = np.sum(left, axis=0) + np.sum(right, axis=0)
    return np.sqrt(js / 2.0)

def js(p, q):
    """Calculate Jensen-Shannon Distance between ground truth array p
    and privatized array q.
    """
    m = (p + q) / 2.0
    left = rel_entr(p, m)
    right = rel_entr(q, m)
    js = np.sum(left, axis=0) + np.sum(right, axis=0)
    js /= np.log(2)  # convert from nats to bits
    return np.sqrt(js / 2.0)

def js_div(px, py):
    """
    Jensen-Shannon Divergence, which is a smoothed version of KL divergence.

    px: Probability of x (float or array of floats)
    py: Probability of y (float or array of floats)
    """
    midpoint = (px + py) * 0.5
    js = rel_entr(px, midpoint) * 0.5 + rel_entr(py, midpoint) * 0.5
    return np.sum(js)

def jensenshannon(p, q, base=None):
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)
    m = (p + q) / 2.0
    left = rel_entr(p, m)
    right = rel_entr(q, m)
    js = np.sum(left, axis=0) + np.sum(right, axis=0)
    if base is not None:
        js /= np.log(base)
    return np.sqrt(js / 2.0)

def kl_divergence(p, q, axis=0):
    """Compute KL divergence (in bits) between p and q, DKL(P||Q)."""
    p = np.asarray(p)
    p = 1.0 * p / np.sum(p, axis=axis, keepdims=True)
    q = np.asarray(q)
    q = 1.0 * q / np.sum(q, axis=axis, keepdims=True)
    return np.sum(rel_entr(p, q), axis=axis) / np.log(2)

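As a quick sanity check, the bits-based helper above should agree with scipy.stats.entropy at base=2 (a sketch assuming kl_divergence is in scope):

from scipy.stats import entropy

p = [0.10, 0.40, 0.50]
q = [0.80, 0.15, 0.05]
print(kl_divergence(p, q))    # KL(P || Q) in bits
print(entropy(p, q, base=2))  # same value via scipy.stats
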
def jensenshannon(p, q, base=None):
    """
    Returns the JS divergence between two 1-dimensional probability vectors.
    Code taken from scipy and modified so that a tiny negative sum (floating-point
    round-off) is clamped to zero before the square root.
    """
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)
    m = (p + q) / 2.0
    left = rel_entr(p, m)
    right = rel_entr(q, m)
    js = max(0, np.sum(left, axis=0) + np.sum(right, axis=0))
    if base is not None:
        js /= np.log(base)
    return np.sqrt(js / 2.0)

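The max(0, ...) guard above matters because round-off can leave the summed divergence a hair below zero for (near-)identical inputs, and np.sqrt of a negative returns nan. A minimal illustration with a synthetic residue (not a value produced by rel_entr here):

import numpy as np

js = -1e-17                       # tiny negative round-off residue
print(np.sqrt(js / 2.0))          # nan (plus a RuntimeWarning)
print(np.sqrt(max(0, js) / 2.0))  # 0.0 with the guard in place
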
def calc_kl(df, pop, stat, col2):
    """Compare two prob distributions.

    https://machinelearningmastery.com/divergence-between-probability-distributions/

    Parameters
    ----------
    df : pandas.DataFrame
        long-format frame holding both observed ("obs") and simulated ("sim") rows.
    pop : str
        population label used to subset df["pops"].
    stat : str
        name of the column whose unique values are iterated over.
    col2 : str
        name of the column holding the probabilities to compare.

    Returns
    -------
    None.
    """
    dfpop = df[df["pops"] == pop]
    stats_list = dfpop[stat].unique()
    for i in stats_list:
        obs = dfpop[(dfpop[stat] == i) & (dfpop["df_id"] == "obs")]
        sim = dfpop[(dfpop[stat] == i) & (dfpop["df_id"] == "sim")]
        ent = sum(rel_entr(obs[col2].values, sim[col2].values))
        kl = sum(kl_div(obs[col2].values, sim[col2].values))
        js = jensenshannon(obs[col2].values, sim[col2].values, base=2)
        print(f"{stat} {i}: rel_entr {ent}, KL_div {kl}, js_dist {js} bits")

def objective_log_linear(weights):
    """General objective function for log-linear pooling (Abbas 2009 (9))

    Parameters
    ----------
    weights : numeric or array_like
        Pooling weights, one per view.

    Returns
    -------
    result : float
        Negated log-linear payoff, with a penalty term enforcing
        sum(weights) == 1 (suitable for a minimizer).
    """
    # Compute log-linear pooled prob with given weights
    pooling_pooled, pooling_reg_const = log_linear_pooling(P, weights)
    # Compute log-linear payoff (Abbas (9)) (here higher is worse)
    kls = np.zeros(nviews)
    pooling_pooled_p = 1.0 * pooling_pooled / np.sum(pooling_pooled)
    for i, qk in enumerate(P):
        qk = 1.0 * qk / np.sum(qk)
        vec = rel_entr(pooling_pooled_p, qk)
        kls[i] = np.sum(vec)
    payoff = np.sum(np.dot(kls, weights))
    # Introduce constraint sum(weights)=1 through a penalty
    penalty = abs(1 - np.sum(weights))
    goal = payoff + penalty
    return -goal

def cost_KL(relevance, freq_lists, ideal_proportions_lists, weight_list, doc_list):
    cost = weight_list[0] * relevance  # relevance is normalized between 0 and 1
    proportion_lists = []
    new_freq_lists = []
    for i in range(len(freq_lists)):
        new_freq_lists.append(freq_lists[i].copy())
        for j in range(len(new_freq_lists[i])):
            new_freq_lists[i][j] += doc_list[i][j]
    for fl in new_freq_lists:
        pl = []
        for freq in fl:
            if sum(fl) > 0:
                pl.append(freq / sum(fl))
            else:
                pl.append(freq)
        proportion_lists.append(pl)
    for i, pl in enumerate(proportion_lists):
        # +1 since relevance is weight[0]
        cost += sum(special.rel_entr(pl, ideal_proportions_lists[i])) * weight_list[i + 1]
    return cost

def get_kl_divergence(input1, input2):
    c1, c2 = collections.Counter(input1), collections.Counter(input2)
    d1, d2 = [], []
    # build aligned relative-frequency vectors over the keys of input1;
    # a key missing from input2 yields q = 0 and hence an infinite divergence
    for key in c1.keys():
        d1.append(c1[key] / len(input1))
        d2.append(c2[key] / len(input2))
    return sum(sp.rel_entr(d1, d2))

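Because the loop above only walks the keys of the first sample, any key absent from the second sample gets q = 0 and the divergence becomes infinite. A sketch assuming get_kl_divergence is in scope, with collections and sp = scipy.special imported in its module:

print(get_kl_divergence(list("aabbbcc"), list("abbccc")))  # finite: shared support
print(get_kl_divergence(list("aabbbcc"), list("abb")))     # inf: no 'c' in the second sample
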
def get_kld(dat, ref, dist_name, bins=75):
    """
    Find the Kullback-Leibler divergence from ~ref~ to ~dat~ using SciPy's
    rel_entr() function. A hypothesized distribution is required, as this
    function fits both the ref and the data to the same probability
    distribution (fitted separately): ~dist_name~ should be one of SciPy's
    probability distributions
    (https://docs.scipy.org/doc/scipy/reference/stats.html).
    """
    dist = getattr(stats, dist_name)
    y, x = np.histogram(ref, bins=bins)
    x = (x + np.roll(x, -1))[:-1] / 2.0  # bin centers

    d_params = dist.fit(dat)
    d_args = d_params[:-2]
    d_loc = d_params[-2]
    d_scale = d_params[-1]
    dpdf = dist.pdf(x, loc=d_loc, scale=d_scale, *d_args)

    r_params = dist.fit(ref)
    r_args = r_params[:-2]
    r_loc = r_params[-2]
    r_scale = r_params[-1]
    rpdf = dist.pdf(x, loc=r_loc, scale=r_scale, *r_args)

    # normalize the fitted densities over the evaluation grid
    dy = dpdf / np.sum(dpdf)
    ry = rpdf / np.sum(rpdf)
    return sum(rel_entr(dy, ry))

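A usage sketch under stated assumptions (get_kld in scope alongside its numpy, scipy.stats and rel_entr imports; synthetic data): fit both samples with a normal model and compare the fitted densities.

import numpy as np

rng = np.random.default_rng(1)
ref = rng.normal(0.0, 1.0, size=2000)
dat = rng.normal(0.3, 1.2, size=2000)
print(get_kld(dat, ref, "norm"))  # small positive KL between the two fitted normals
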
def metric(real, synthetic):
    """
    This approximates the KL divergence by binning the continuous values
    to turn them into categorical values and then computing the relative
    entropy.

    TODO:
        * Investigate a KDE-based approach.

    Arguments:
        real (np.ndarray): The values from the real database.
        synthetic (np.ndarray): The values from the synthetic database.

    Returns:
        (float, Goal, str, tuple): A tuple containing (value, goal, unit, domain)
        which corresponds to the fields in a Metric object.
    """
    real[np.isnan(real)] = 0.0
    synthetic[np.isnan(synthetic)] = 0.0
    real, xedges, yedges = np.histogram2d(real[:, 0], real[:, 1])
    synthetic, _, _ = np.histogram2d(synthetic[:, 0], synthetic[:, 1], bins=[xedges, yedges])
    # smooth with a small epsilon so empty bins do not produce infinite divergence
    f_obs, f_exp = synthetic.flatten() + 1e-5, real.flatten() + 1e-5
    f_obs, f_exp = f_obs / np.sum(f_obs), f_exp / np.sum(f_exp)
    value = np.sum(rel_entr(f_obs, f_exp))
    return value, Goal.MINIMIZE, "entropy", (0.0, float("inf"))

def violation(self, norm_ord=np.inf, rough=False):
    """
    Return a measure of violation for the constraint that ``self.v`` belongs to
    :math:`C_{\\mathrm{SAGE}}(\\alpha, X)^{\\dagger}`.

    Parameters
    ----------
    norm_ord : int
        The value of ``ord`` passed to numpy ``norm`` functions, when reducing
        vector-valued residuals into a scalar residual.
    rough : bool
        Setting ``rough=False`` computes violation by solving an optimization
        problem. Setting ``rough=True`` computes violation by taking norms of
        residuals of appropriate elementwise equations and inequalities
        involving ``self.v`` and auxiliary variables.

    Notes
    -----
    When ``rough=False``, the optimization-based violation is computed by
    projecting the vector ``self.v`` onto a new copy of a dual SAGE constraint,
    and then returning the L2-norm between ``self.v`` and that projection.
    This optimization step essentially re-solves for all auxiliary variables
    used by this constraint.
    """
    v = self.v.value
    viols = []
    for i in self.ech.U_I:
        selector = self.ech.expcovers[i]
        num_cover = self.ech.expcover_counts[i]
        if num_cover > 0:
            expr1 = np.tile(v[i], num_cover).ravel()
            expr2 = v[selector].ravel()
            lowerbounds = special_functions.rel_entr(expr1, expr2)
            mat = -(self.alpha[selector, :] - self.alpha[i, :])
            mu_i = self._lifted_mu_vars[i].value
            # compute rough violation for this dual AGE cone
            residual = mat @ mu_i[:self._n] - lowerbounds
            residual[residual >= 0] = 0
            curr_viol = np.linalg.norm(residual, ord=norm_ord)
            if (self.X is not None) and (not np.isnan(curr_viol)):
                AbK_val = self.X.A @ mu_i + v[i] * self.X.b
                AbK_viol = PrimalProductCone.project(AbK_val, self.X.K)
                curr_viol += AbK_viol
            # as applicable, solve an optimization problem to compute the violation.
            if (curr_viol > 0 or np.isnan(curr_viol)) and not rough:
                temp_var = Variable(shape=(self._lifted_n,), name='temp_var')
                cons = [mat @ temp_var[:self._n] >= lowerbounds]
                if self.X is not None:
                    con = PrimalProductCone(self.X.A @ temp_var + v[i] * self.X.b, self.X.K)
                    cons.append(con)
                prob = Problem(CL_MIN, Expression([0]), cons)
                status, value = prob.solve(verbose=False)
                if status in {CL_SOLVED, CL_INACCURATE} and abs(value) < 1e-7:
                    curr_viol = 0
            viols.append(curr_viol)
        else:
            viols.append(0)
    viol = max(viols)
    return viol

def metric(real, synthetic):
    assert real.shape[1] == 2, "Expected 2d data."
    assert synthetic.shape[1] == 2, "Expected 2d data."
    real = [(x[0], x[1]) for x in real]
    synthetic = [(x[0], x[1]) for x in synthetic]
    f_obs, f_exp = frequencies(real, synthetic)
    value = np.sum(rel_entr(f_obs, f_exp))
    return value, Goal.MINIMIZE, "entropy", (0.0, float("inf"))

def get_kl(pk, qk):
    pk = asarray(pk)
    pk = 1.0 * pk / np.sum(pk, axis=0)
    qk = asarray(qk)
    qk = 1.0 * qk / np.sum(qk, axis=0)
    vec = rel_entr(pk, qk)
    S = np.sum(vec, axis=0)
    return S

def jsd(p_distb, q_distb):  # , base=None):
    """Jensen Shannon Distance

    Args:
        p_distb (array): first vector (discrete distribution)
        q_distb (array): second vector

    Returns:
        Jensen Shannon Distance
    """
    p = np.asarray(p_distb)  # makes almost no difference to leave this out
    q = np.asarray(q_distb)
    m = (p + q) / 2.0
    left = rel_entr(p, m)
    right = rel_entr(q, m)
    js = np.sum(left, axis=0) + np.sum(right, axis=0)
    return np.sqrt(js / 2.0)

def np_jensenshannon_divergence(X, Y, base=None):
    """Compute Jensen-Shannon Divergence

    Parameters
    ----------
    X : array-like
        distribution; unlike scipy.spatial.distance.jensenshannon, this
        function does not renormalize its inputs.
    Y : array-like
        distribution. Must be of same shape as ``X``.

    Returns
    -------
    j : float

    See Also
    --------
    entropy : function
        Computes entropy and K-L divergence
    """
    X, Y = np.atleast_2d(X), np.atleast_2d(Y)
    m = .5 * (X + Y)
    js = np.sum(rel_entr(X, m) + rel_entr(Y, m), axis=1)
    if base is not None:
        js /= np.log(base)
    return .5 * js

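Since this variant promotes its inputs to 2-D and sums along axis=1, it returns one divergence per row. A small sketch assuming the function above is in scope with numpy and rel_entr imported:

import numpy as np

X = np.array([[0.5, 0.5], [0.9, 0.1]])
Y = np.array([[0.1, 0.9], [0.9, 0.1]])
print(np_jensenshannon_divergence(X, Y))  # one JS divergence (in nats) per row; the second is 0.0
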
def kldiv_neighbor_dists(data_matrix, query_matrix_batch):
    """Compute values of the KL-divergence for dense vectors.

    :param data_matrix: data matrix
    :param query_matrix_batch: query matrix
    :return: a list with one entry per query, each an array of divergences
             to every data point
    """
    dists_batch = []
    for k in range(len(query_matrix_batch)):
        v = rel_entr(data_matrix, query_matrix_batch[k])
        dists_batch.append(np.sum(v, axis=-1))
    return dists_batch

def test_ordinary_sage_primal_2(self):
    n, m = 2, 6
    np.random.seed(0)
    alpha = 1 * np.random.randn(m - 1, n)
    conv_comb = np.random.rand(m - 1)
    conv_comb /= np.sum(conv_comb)
    alpha_last = alpha.T @ conv_comb
    alpha = np.row_stack([alpha, alpha_last])
    c0 = np.array([1, 2, 3, 4, -0.5, -0.1])
    c = Variable(shape=(m,), name='projected_c0')
    t = Variable(shape=(1,), name='epigraph_var')
    sage_constraint = sage_cones.PrimalSageCone(c, alpha, X=None, name='test')
    epi_constraint = vector2norm(c - c0) <= t
    constraints = [sage_constraint, epi_constraint]
    prob = Problem(CL_MIN, t, constraints)
    prob.solve(solver='ECOS')
    # constraint violations
    v0 = sage_constraint.violation(norm_ord=1, rough=False)
    assert v0 < 1e-6
    v1 = sage_constraint.violation(norm_ord=np.inf, rough=True)
    assert v1 < 1e-6
    # certificates
    w4 = sage_constraint.age_witnesses[4].value
    c4 = sage_constraint.age_vectors[4].value
    drop4 = np.array([True, True, True, True, False, True])
    level4 = np.sum(rel_entr(w4[drop4], np.exp(1) * c4[drop4])) - c4[4]
    assert level4 < 1e-6
    w5 = sage_constraint.age_witnesses[5].value
    c5 = sage_constraint.age_vectors[5].value
    drop5 = np.array([True, True, True, True, True, False])
    level5 = np.sum(rel_entr(w5[drop5], np.exp(1) * c5[drop5])) - c5[5]
    assert level5 < 1e-6

def scipy_entropy(pk, qk=None, base=None):
    pk = np.asarray(pk)
    pk = 1.0 * pk / np.sum(pk, axis=0)
    if qk is None:
        vec = special.entr(pk)
    else:
        qk = np.asarray(qk)
        if len(qk) != len(pk):
            raise ValueError("qk and pk must have same length.")
        qk = 1.0 * qk / np.sum(qk, axis=0)
        vec = special.rel_entr(pk, qk)
    S = np.sum(vec, axis=0)
    if base is not None:
        S /= np.log(base)
    return S

def compute_kl_div_neighbors(data_matrix, query_matrix, K):
    """Compute neighbors for the KL-divergence.

    By default, in NMSLIB, queries are left, i.e., the data object is the
    first (left) argument.

    :param data_matrix: data matrix
    :param query_matrix: query matrix
    :param K: the number of neighbors
    :return: an output in the shape <#of queries> X min(K, <# of data points>)
    """
    dists = []
    for i in range(len(query_matrix)):
        v = rel_entr(data_matrix, query_matrix[i])
        dists.append(np.sum(v, axis=-1))
    return get_neighbors_from_dists(np.stack(dists, axis=0), K)

def kl(hists, p):
    # p: index of chosen image
    # h: index of most similar image to chosen image
    most_similar = {
        'KL': float('inf'),  # initialize KL
        'P': p,
        'Q1': 0,
        'Q2': 0
    }
    for h in range(len(hists)):
        if h == p:
            continue
        kl_now = sum(rel_entr(hists[p], hists[h]))
        if kl_now < most_similar['KL']:
            most_similar['KL'] = kl_now
            # demote the previous best match before recording the new one
            most_similar['Q2'] = most_similar['Q1']
            most_similar['Q1'] = h
    return most_similar

def test_relent_1(self):
    # compilation and evaluation
    x = Variable(shape=(2,), name='x')
    y = Variable(shape=(2,), name='y')
    re = relent(2 * x, np.exp(1) * y)
    con = [re <= 10, 3 <= x, x <= 5]
    # compilation
    A, b, K, _, _, _ = compile_constrained_system(con)
    A_expect = np.array([
        [0., 0., 0., 0., -1., -1.],  # linear inequality on epigraph for relent constr
        [1., 0., 0., 0., 0., 0.],    # bound constraints on x
        [0., 1., 0., 0., 0., 0.],    #
        [-1., 0., 0., 0., 0., 0.],   # more bound constraints on x
        [0., -1., 0., 0., 0., 0.],   #
        [0., 0., 0., 0., -1., 0.],   # first exponential cone
        [0., 0., 2.72, 0., 0., 0.],  #
        [2., 0., 0., 0., 0., 0.],    #
        [0., 0., 0., 0., 0., -1.],   # second exponential cone
        [0., 0., 0., 2.72, 0., 0.],  #
        [0., 2., 0., 0., 0., 0.]])   #
    A = np.round(A.toarray(), decimals=2)
    assert np.all(A == A_expect)
    assert np.all(b == np.array([10., -3., -3., 5., 5., 0., 0., 0., 0., 0., 0.]))
    assert K == [Cone('+', 1), Cone('+', 2), Cone('+', 2),
                 Cone('e', 3), Cone('e', 3)]
    # value propagation
    x0 = np.array([1, 2])
    x.value = x0
    y0 = np.array([3, 4])
    y.value = y0
    actual = re.value
    expect = np.sum(rel_entr(2 * x0, np.exp(1) * y0))
    assert abs(actual - expect) < 1e-7

def run_transform_operations(x: np.ndarray, y: np.ndarray) -> Dict:
    """
    :param x: input array
    :param y: input array of the same shape as ``x``
    :return: dict of pairwise transforms of ``x`` and ``y``
    """
    if isinstance(x, pd.Series):
        x = x.values
    if isinstance(y, pd.Series):
        y = y.values
    p_diff = percentage_difference(x, y)
    cross_corr = signal.correlate(x, y)
    conv_x_y = signal.convolve(x, y)
    relative_entropy = rel_entr(x, y)
    # impute NaNs before histogramming (np.histogram2d cannot handle them)
    if any(np.isnan(x)):
        x[np.isnan(x)] = np.nanmedian(x)
    if any(np.isnan(y)):
        y[np.isnan(y)] = np.nanmedian(y)
    xy_density, _, _ = np.histogram2d(x, y, density=True)
    marginal_density = np.apply_along_axis(np.nanmean, axis=1, arr=xy_density)
    xy_transformed = dict(pdiff=p_diff,
                          ccorr=cross_corr,
                          conv=conv_x_y,
                          density=marginal_density,
                          entropy=relative_entropy)
    return xy_transformed

def kl_divergence(self):
    """
    This metric is also defined at the variable level and examines whether the
    distributions of the attributes are identical and measures the potential
    level of discrepancy between them. The threshold limit for this metric is
    a value below 2.
    """
    target_columns = list(self.origdst.columns[11:-3])
    target_columns.append(self.origdst.columns[1])  # channel
    target_columns.append(self.origdst.columns[2])  # program_title
    target_columns.append(self.origdst.columns[3])  # genre

    kl_dict = {}
    for col in target_columns:
        try:
            col_counts_orig = self.origdst[col].value_counts(
                normalize=True).sort_index(ascending=True)
            col_counts_synth = self.synthdst[col].value_counts(
                normalize=True).sort_index(ascending=True)
            kl = sum(rel_entr(col_counts_orig.tolist(),
                              col_counts_synth.tolist()))
            kl_dict[col] = kl
        except ValueError:
            # rel_entr raises when the two value-count vectors differ in length
            print('For the column', col,
                  'you must generate the same unique values as the real dataset.')
            print('The number of unique values that you should generate for column',
                  col, 'is', len(self.origdst[col].unique()))
    return kl_dict

def KL_divergence(data_i=0):
    """
    Relative Entropy b/w P: True distribution = "motif.txt" and
    Q: Model Predicted/Approximate distribution = "predictedmotif.txt"

    Output:
        D_KL(P||Q)
    """
    motif = import_motif('' + fileprefix + 'results/dataset' + str(data_i) + '/motif.txt')  # list of lists
    predictedmotif = import_motif('' + fileprefix + 'results/dataset' + str(data_i) + '/predictedmotif.txt')
    rel_ent = 0
    for i in range(len(motif[1:-1])):
        # compare each row (ACGT) against each other in two matrices
        # row_diff = sum((motif[i][j] * log(motif[i][j]/predictedmotif[i][j]) for j in range(len(motif[i]))))
        row_diff = sum(rel_entr(motif[i], predictedmotif[i]))  # sum over the row to get a scalar, per the formula above
        rel_ent += row_diff
    return rel_ent

def get_KLD(data, probe_state, trial_num):
    probe_rep = state_reps[probe_state]
    KLD_array = np.zeros(env.shape)
    KLD_array[:] = np.nan
    entropy_array = np.zeros(env.shape)
    entropy_array[:] = np.nan
    ec_pol_grid = np.zeros((*env.shape, 4))  # one policy vector (4 actions) per grid position

    blank_mem = Memory(cache_limit=400, entry_size=4)
    blank_mem.cache_list = data['ec_dicts'][trial_num]
    probe_pol = blank_mem.recall_mem(probe_rep)

    for sr_rep in blank_mem.cache_list.keys():
        k = blank_mem.cache_list[sr_rep][2]
        pol = blank_mem.recall_mem(sr_rep)
        twoD = env.oneD2twoD(k)
        KLD_array[twoD] = sum(rel_entr(list(probe_pol), list(pol)))
        ec_pol_grid[twoD][:] = pol
        entropy_array[twoD] = entropy(pol, base=2)
    return KLD_array, ec_pol_grid, entropy_array

# example of calculating the kl divergence (relative entropy) with scipy
from scipy.special import rel_entr

# define distributions
p = [0.10, 0.40, 0.50]
q = [0.80, 0.15, 0.05]

# calculate (P || Q)
kl_pq = rel_entr(p, q)
print('KL(P || Q): %.3f nats' % sum(kl_pq))

# calculate (Q || P)
kl_qp = rel_entr(q, p)
print('KL(Q || P): %.3f nats' % sum(kl_qp))

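rel_entr works in nats; dividing by log(2) converts the same quantities to bits, continuing from the script above:

import numpy as np

print('KL(P || Q): %.3f bits' % (sum(kl_pq) / np.log(2)))
print('KL(Q || P): %.3f bits' % (sum(kl_qp) / np.log(2)))
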
def JSD(P, Q):
    M = 0.5 * (P + Q)
    return 0.5 * (sum(special.rel_entr(P, M)) + sum(special.rel_entr(Q, M)))

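One property worth noting: the midpoint construction makes JSD symmetric in its arguments, while plain relative entropy is not. A quick check assuming JSD above is in scope with numpy and scipy.special imported:

import numpy as np
from scipy import special

P = np.array([0.10, 0.40, 0.50])
Q = np.array([0.80, 0.15, 0.05])
print(JSD(P, Q), JSD(Q, P))                                      # equal
print(sum(special.rel_entr(P, Q)), sum(special.rel_entr(Q, P)))  # not equal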