def gethistograms(novice_array, intermediary_array, expert_array):
    """Estimate a Gaussian KDE per skill group on a shared grid and return
    the pairwise Jensen-Shannon distances between the three groups.

    Each argument is a 2-D sample array (rows = observations); arrays are
    transposed before ``gaussian_kde`` so columns become KDE dimensions.

    Returns
    -------
    tuple of float
        (novice-vs-expert, intermediary-vs-expert, novice-vs-intermediary)
        Jensen-Shannon distances.
    """
    # Removed unused locals (manip, span, kldiff, jsdiff) and debug prints.
    # NOTE(review): the evaluation grid comes from expert+intermediary data
    # only; the novice KDE is evaluated on the same grid so distances are
    # comparable — confirm findrange covers the novice range as well.
    common_range, grid = findrange(expert_array, intermediary_array)

    intermediary_values = ss.gaussian_kde(intermediary_array.T).evaluate(grid)
    expert_values = ss.gaussian_kde(expert_array.T).evaluate(grid)
    novice_values = ss.gaussian_kde(novice_array.T).evaluate(grid)

    js_diff_ne = distance.jensenshannon(novice_values, expert_values)
    js_diff_ie = distance.jensenshannon(intermediary_values, expert_values)
    js_diff_ni = distance.jensenshannon(novice_values, intermediary_values)
    return js_diff_ne, js_diff_ie, js_diff_ni
def check_constraints(sample, probability_matrix):
    """Compute deviation of s_in/s_out from expected values.

    Compares the empirical in/out strength sequences of ``sample`` against
    the row/column marginals of ``probability_matrix``. Returns a dict with
    MAE, MAPE, squared Jensen-Shannon divergence (base 2), and
    Pearson/Spearman correlations with their p-values.
    """
    # BUGFIX(doc): this docstring previously appeared *after* the first
    # statement, where it was a no-op string expression, not the docstring.
    s_in, s_out = empirical_strengths(sample.full.edges, sample.full.nodes,
                                      marginalized=True)
    _s_in = probability_matrix.sum(axis=1)   # expected in-strengths
    _s_out = probability_matrix.sum(axis=0)  # expected out-strengths
    corr_in, pv_in = pearsonr(s_in, _s_in)
    corr_out, pv_out = pearsonr(s_out, _s_out)
    spcorr_in, sp_pv_in = spearmanr(s_in, _s_in)
    spcorr_out, sp_pv_out = spearmanr(s_out, _s_out)
    return dict(
        s_in_mae=mean_absolute_error(s_in, _s_in),
        s_out_mae=mean_absolute_error(s_out, _s_out),
        s_in_mape=mean_absolute_percentage_error(s_in, _s_in),
        s_out_mape=mean_absolute_percentage_error(s_out, _s_out),
        # squared JS distance == JS divergence
        s_in_js=jensenshannon(s_in, _s_in, base=2) ** 2,
        s_out_js=jensenshannon(s_out, _s_out, base=2) ** 2,
        corr_in=corr_in,
        corr_out=corr_out,
        spcorr_in=spcorr_in,
        spcorr_out=spcorr_out,
        pv_in=pv_in,
        pv_out=pv_out,
        sp_pv_in=sp_pv_in,
        sp_pv_out=sp_pv_out,
    )
def getJS_mn(seqs, m, n, I, J, P_j, k, p, alphabet, inv=False):
    """Jensen-Shannon distance (base 2, in bits) between the k-mer
    distributions at sequence positions m and n.

    Parameters
    ----------
    seqs : list of lists of input sequences (unused here, kept for interface)
    m, n : the pair of position indices to compare
    I, J : number of sequences / sequence length (unused here)
    P_j : per-position k-mer frequency dicts, P_j[pos][kmer] -> freq
    k, p, alphabet : k-mer length, pseudocount mass, alphabet (unused here)
    inv : if True, return 1 - JS so that high values mean similarity

    Returns
    -------
    list
        [(m, n), JS, JS_mn, None] where JS_mn is an (empty) dict kept for
        interface compatibility.
    """
    JS_mn = {}  # per-kmer contributions; not populated by this variant
    # union of k-mers observed at either position
    shared_kmers = set(P_j[m]) | set(P_j[n])
    # align both distributions on the same k-mer order, missing k-mers -> 0
    probs_m = [P_j[m].get(kmer, 0.0) for kmer in shared_kmers]
    probs_n = [P_j[n].get(kmer, 0.0) for kmer in shared_kmers]
    score = jensenshannon(probs_m, probs_n, 2)  # base 2 for bits
    if inv:
        score = 1 - score  # inverse of JS distance (high means similar)
    return [(m, n), score, JS_mn, None]
def JSdistance(p, q, base=None):
    """Calculate the Jensen-Shannon distance between two distributions.

    Input:
        p: a list of floats or numpy array specifying a pdf or pmf
        q: a list of floats or numpy array specifying a pdf or pmf
        base: optional argument specifying the base of the logarithm
    """
    # scipy's own default for ``base`` is None, so it can be forwarded
    # directly; the previous if/else branching on base was redundant.
    return distance.jensenshannon(p, q, base)
def js_metric(df1, df2, numerical_columns, categorical_columns):
    """Per-column Jensen-Shannon distance between two dataframes.

    Categorical columns are compared via normalized value frequencies;
    numerical columns via Gaussian KDEs evaluated on a shared 100-point
    grid spanning both columns' ranges.

    Returns a dict {column: js_distance} sorted by distance, descending.
    """
    res = {}
    STEPS = 100

    for col in categorical_columns:
        base_part = df1[col].to_frame()
        samp_part = df2[col].to_frame()
        base_part["source"] = "baseline"
        samp_part["source"] = "sample"
        combined = pd.concat([base_part, samp_part], ignore_index=True)
        # category x source contingency table
        counts = (
            combined.groupby([col, "source"])
            .size()
            .to_frame()
            .reset_index()
            .pivot(index=col, columns="source")
            .droplevel(0, axis=1)
        )
        # normalize each source to a probability distribution
        freqs = counts.div(counts.sum(axis=0), axis=1)
        freqs.fillna(0, inplace=True)
        res[col] = jensenshannon(freqs["baseline"].to_numpy(),
                                 freqs["sample"].to_numpy())

    for col in numerical_columns:
        baseline_vals = df1[col]
        sample_vals = df2[col]
        kde_base = gaussian_kde(baseline_vals)
        kde_samp = gaussian_kde(sample_vals)
        # shared evaluation grid covering both columns' ranges
        lo = min(baseline_vals.min(), sample_vals.min())
        hi = max(baseline_vals.max(), sample_vals.max())
        grid = np.linspace(start=lo, stop=hi, num=STEPS)
        dens_base = kde_base(grid)
        dens_samp = kde_samp(grid)
        res[col] = jensenshannon(dens_base / np.sum(dens_base),
                                 dens_samp / np.sum(dens_samp))

    return dict(sorted(res.items(), key=lambda kv: kv[1], reverse=True))
def score(self, X, y=None, j_ways=1):
    """Return the mean per-column Jensen-Shannon distance between X and y.

    Parameters
    ----------
    X : pandas.DataFrame, shape (n_samples, n_features)
        Synthetic data.
    y : pandas.DataFrame
        Reference data; despite the sklearn-style ``None`` default it is
        required here (its value_counts are taken per column).
    j_ways : int
        Unused; kept for interface compatibility.

    Returns
    -------
    float
        Average per-column Jensen-Shannon distance. Per-column values are
        stored in ``self.feature_distances``.
    """
    self.feature_distances = {}
    # Plain list accumulator: the old np.empty_like(X.columns) produced an
    # object-dtype array sized off the column index, then np.sum over it.
    distances = []
    for c in X.columns:
        # align value counts on a shared category index; categories absent
        # from one side are filled with 0
        counts_X, counts_y = X[c].value_counts(dropna=False).align(
            y[c].value_counts(dropna=False),
            join='outer', axis=0, fill_value=0)
        # jensenshannon normalizes internally, so raw counts are fine
        js_distance = jensenshannon(counts_X, counts_y)
        distances.append(js_distance)
        self.feature_distances[c] = js_distance
    return sum(distances) / len(X.columns)
def degree_js(dataset, model, agg):
    """Jensen-Shannon distance between the generation-0 degree distribution
    and every later generation's, per trial.

    Parameters
    ----------
    dataset, model : labels copied into every output row.
    agg : mapping trial -> generation -> {degree: count}.

    Returns
    -------
    dict of lists, column-oriented (suitable for DataFrame construction).
    """
    rows = {'dataset': [], 'model': [], 'trial': [], 'gen': [], 'degree_js': []}
    for trial in agg:
        dist1 = agg[trial][0]
        for gen in agg[trial]:
            if gen == 0:
                continue
            dist2 = agg[trial][gen]
            # BUGFIX: the old code wrote dist2[key] = dist1.get(key, 0)
            # (copy-paste error overwriting dist2 with dist1's counts) and
            # then compared value lists taken in each dict's own insertion
            # order, i.e. in mismatched key orders. Align both distributions
            # on one shared key sequence instead; inputs are not mutated.
            union = list(set(dist1) | set(dist2))
            deg1 = np.asarray([dist1.get(key, 0) for key in union]) + 0.00001
            deg2 = np.asarray([dist2.get(key, 0) for key in union]) + 0.00001
            deg_js = distance.jensenshannon(deg1, deg2, base=2.0)
            rows['dataset'].append(dataset)
            rows['model'].append(model)
            rows['trial'].append(trial)
            rows['gen'].append(gen)
            rows['degree_js'].append(deg_js)
    return rows
def jensen_shannon(query, matrix):
    """Jensen-Shannon distance between one topic distribution and every
    document's topic distribution.

    Parameters
    ----------
    query : 1-D array, shape (n_topics,)
    matrix : 2-D array, shape (n_documents, n_topics)

    Returns
    -------
    1-D array of length n_documents with the JS distance to each document.
    """
    # Removed the debug print() calls of the intermediate shapes.
    p = query[:, None]  # (n_topics, 1): broadcasts against every column of q
    q = matrix.T        # (n_topics, n_documents)
    return jensenshannon(p, q)  # scipy reduces over axis 0 by default
def calc_jsd(args, X, G, dist):
    """Evaluate generator ``G`` against real data ``X`` via Jensen-Shannon
    distance of per-channel histograms.

    Repeats 10 times: draws args.num_samples generated samples and an
    equal-size random subset of X, histograms each of the 3 feature
    channels, and records the JS distance per channel.

    Returns (mean, std) over the 10 repeats, each an array of length 3.

    NOTE(review): assumes X and the generator output are 3-D tensors shaped
    (samples, particles, 3) with feature values roughly in [-1, 1] — the
    fixed bin edges below rely on that; confirm against the caller.
    """
    print("evaluating JSD")
    G.eval()  # inference mode for the torch generator
    # per-channel bin edges; the third channel is binned twice as finely
    bins = [np.arange(-1, 1, 0.02), np.arange(-1, 1, 0.02), np.arange(-1, 1, 0.01)]
    N = len(X)
    jsds = []
    for j in tqdm(range(10)):
        # generate in batches (one extra batch, then trim to num_samples)
        gen_out = utils.gen(args, G, dist=dist, num_samples=args.batch_size).cpu().detach().numpy()
        for i in range(int(args.num_samples / args.batch_size)):
            gen_out = np.concatenate((gen_out, utils.gen(args, G, dist=dist, num_samples=args.batch_size).cpu().detach().numpy()), 0)
        gen_out = gen_out[:args.num_samples]
        # random real-data subset of matching size (without replacement)
        sample = X[rng.choice(N, size=args.num_samples, replace=False)].cpu().detach().numpy()
        jsd = []
        for i in range(3):
            # flatten (samples, particles) into one population per channel
            hist1 = np.histogram(gen_out[:, :, i].reshape(-1), bins=bins[i], density=True)[0]
            hist2 = np.histogram(sample[:, :, i].reshape(-1), bins=bins[i], density=True)[0]
            jsd.append(jensenshannon(hist1, hist2))
        jsds.append(jsd)
    return np.mean(np.array(jsds), axis=0), np.std(np.array(jsds), axis=0)
def calculate_all_results(demos, approach):
    """Compare the final-salad outcome distribution of an approach against
    the demonstrations and print the summary statistics.

    Prints: the approach's 'invalid' probability, the Jensen-Shannon
    distance between the two normalized outcome distributions, and the
    average-length difference (approach minus demos).
    """
    final_salads_demo, avg_length_demo = calculate_results_for_demos(demos)
    final_salads, avg_length = calculate_results_for_approach(
        demos, approach, final_salads_demo)
    # ensure every approach outcome also has a (possibly zero) demo entry
    for k in final_salads.keys():
        if k not in final_salads_demo.keys():
            final_salads_demo[k] = 0
    demo_dist = []
    approach_dist = []
    demo_count = 0
    approach_count = 0
    # build both distributions in the same key order, keeping 'invalid' for
    # last so it always sits at the end of each list
    # NOTE(review): assumes every key of final_salads_demo (and 'invalid')
    # exists in final_salads — otherwise this raises KeyError; presumably
    # guaranteed by calculate_results_for_approach. Confirm.
    for k in final_salads_demo.keys():
        approach_count += final_salads[k]
        demo_count += final_salads_demo[k]
        if k != 'invalid':
            approach_dist.append(final_salads[k])
            demo_dist.append(final_salads_demo[k])
    approach_dist.append(final_salads['invalid'])
    demo_dist.append(final_salads_demo['invalid'])
    # normalize counts into probability distributions
    for i in range(len(approach_dist)):
        approach_dist[i] /= float(approach_count)
    for i in range(len(demo_dist)):
        demo_dist[i] /= float(demo_count)
    # print()
    # print(demo_dist)
    # print(approach_dist)
    print(approach_dist[len(approach_dist) - 1],
          distance.jensenshannon(demo_dist, approach_dist),
          avg_length - avg_length_demo)
def compute_jensen_shannon_divergence(vec1, vec2):
    """Squared Jensen-Shannon distance (i.e. the JS divergence) between the
    histograms of two value vectors.

    Values are rounded to 2 decimals, binned into shared 0.1-wide bins
    spanning both vectors' combined range, normalized to frequencies, and
    compared. Debug print() calls were removed.
    """
    vec1 = np.round(vec1, 2)
    vec2 = np.round(vec2, 2)
    # shared bin range covering both vectors (same result as the original
    # min/max comparison ladder)
    mini = min(min(vec1), min(vec2))
    maxi = max(max(vec1), max(vec2))
    # 0.1-wide bin edges; kept as an incremental loop to preserve the exact
    # original edge sequence (float accumulation included)
    b = []
    i = mini
    while i <= maxi + 0.1:
        b.append(i)
        i = i + 0.1
    # normalized histograms as probability vectors
    p = np.histogram(vec1, bins=b)[0] / len(vec1)
    q = np.histogram(vec2, bins=b)[0] / len(vec2)
    # square the scipy distance to obtain the divergence
    return distance.jensenshannon(p, q) ** 2
def _penalty_components(self, actual, predicted): """ Score one row of counts for a particular (neighborhood, year, month) """ if (actual == predicted).all(): return 0.0, 0.0, 0.0 # get the overall penalty for bias bias_mask = np.abs(actual.sum() - predicted.sum()) > self.allowable_raw_bias bias_penalty = self.bias_penalty if bias_mask.any() else 0 # zero out entries below the threshold gt = self._zero_below_threshold(actual).ravel() dp = self._zero_below_threshold(predicted).ravel() # get the base Jensen Shannon distance; add a tiny bit of weight to each bin in order # to avoid all-zero arrays (and thus NaNs) without unduly influencing the distribution # induced by normalizing the arrays (dividing by sums); use base 2 to get a proper # spread of scores between [0, 1] instead of default base e which is more like [0, 0.83] # # ref: https://docs.scipy.org/doc/scipy-1.5.2/reference/generated/scipy.spatial.distance.jensenshannon.html jsd = jensenshannon(gt + 1e-9, dp + 1e-9, base=2) # get the overall penalty for the presence of misleading counts misleading_presence_mask = (gt == 0) & (dp > 0) misleading_presence_penalty = (misleading_presence_mask.sum() * self.misleading_presence_penalty) return jsd, misleading_presence_penalty, bias_penalty
def js_bootstrap(key, set_1, set_2, nsamples, ntests):
    '''
    key: string posterior parameter
    set_1: first full posterior samples set
    set_2: second full posterior samples set
    nsamples: number for downsampling full sample set
    ntests: number of iterations over different nsamples realisations
    returns: 1 dim array (ntests)

    Each iteration downsamples both posteriors, fits a bounded 1-D KDE to
    each on a common 100-point grid, and records the squared
    Jensen-Shannon distance (i.e. the divergence) between the two KDEs.
    '''
    js_array = np.zeros(ntests)
    for j in tqdm(range(ntests)):
        # cap the draw at the smaller of the two posteriors
        nsamples = min([nsamples, len(set_1[key]), len(set_2[key])])
        lp = np.random.choice(set_1[key], size=nsamples, replace=False)
        bp = np.random.choice(set_2[key], size=nsamples, replace=False)
        # common evaluation grid spanning both subsamples
        x = np.atleast_2d(np.linspace(np.min([np.min(bp), np.min(lp)]), np.max([np.max(bp), np.max(lp)]), 100)).T
        xlow = np.min(x)
        xhigh = np.max(x)
        # override grid bounds with any parameter-specific prior bounds
        if key in default_bounds.keys():
            bounds = default_bounds[key]
            if "low" in bounds.keys():
                xlow = bounds["low"]
            if "high" in bounds.keys():
                # a string bound referencing mass_1 means "no fixed upper
                # bound here"; keep the data-driven maximum instead
                if isinstance(bounds["high"], str) and "mass_1" in bounds["high"]:
                    xhigh = np.max(x)
                else:
                    xhigh = bounds["high"]
        # NOTE(review): set_1_pdf is built from bp (set_2's draw) and
        # set_2_pdf from lp (set_1's draw) — the naming is swapped, but the
        # JS distance is symmetric so the result is unaffected.
        set_1_pdf = Bounded_1d_kde(bp, xlow=xlow, xhigh=xhigh)(x)
        set_2_pdf = Bounded_1d_kde(lp, xlow=xlow, xhigh=xhigh)(x)
        # squared distance = divergence; nan_to_num guards degenerate KDEs
        js_array[j] = np.nan_to_num(np.power(jensenshannon(set_1_pdf, set_2_pdf), 2))
    return js_array
def predict(self, data, embedding_epochs_labeled=None):
    """Assign GMM cluster labels to ``data`` and derive both task answers.

    Returns (task_1_answer, task_2_answer): task 1 is 1 when any sense has
    a zero frequency in some epoch (sense gained/lost), task 2 is the
    base-2 Jensen-Shannon distance between the two epochs' sense-frequency
    distributions.
    """
    cluster_labels = self.gmm.predict(data)
    epochs = set(embedding_epochs_labeled)
    sense_frequencies = self.compute_cluster_sense_frequency(
        cluster_labels, embedding_epochs_labeled, epochs)
    # task 1: did any sense disappear (frequency 0) in either epoch?
    gained_or_lost = any(0 in sense_frequencies[sd] for sd in sense_frequencies)
    task_1_answer = int(gained_or_lost)
    task_2_answer = distance.jensenshannon(
        sense_frequencies[0], sense_frequencies[1], 2.0)
    return task_1_answer, task_2_answer
def get_k_nearest_docs(doc_dist, k=5):
    """Indices of the ``k`` rows of the module-level ``topic_dist`` nearest
    to ``doc_dist`` by Jensen-Shannon distance.

    Exact matches (distance 0, i.e. the document itself) are excluded.
    """
    js_to_doc = topic_dist.apply(lambda row: jensenshannon(row, doc_dist), axis=1)
    nonzero = js_to_doc[js_to_doc != 0]
    return nonzero.nsmallest(n=k).index
def calc_kl(df, pop, stat, col2):
    """Compare observed vs simulated probability distributions.

    https://machinelearningmastery.com/divergence-between-probability-distributions/

    For each unique value of column ``stat`` within population ``pop``,
    prints the relative entropy, KL divergence, and base-2 Jensen-Shannon
    distance between the 'obs' and 'sim' rows' ``col2`` values.

    Parameters
    ----------
    df : pandas.DataFrame with 'pops' and 'df_id' columns
    pop : population label to select
    stat : column holding the statistic identifiers
    col2 : column holding the probability values to compare

    Returns
    -------
    None.
    """
    pop_rows = df[df["pops"] == pop]
    for stat_val in pop_rows[stat].unique():
        obs_vals = pop_rows[(pop_rows[stat] == stat_val) & (pop_rows["df_id"] == "obs")][col2].values
        sim_vals = pop_rows[(pop_rows[stat] == stat_val) & (pop_rows["df_id"] == "sim")][col2].values
        ent = sum(rel_entr(obs_vals, sim_vals))
        kl = sum(kl_div(obs_vals, sim_vals))
        js = jensenshannon(obs_vals, sim_vals, base=2)
        print(f"{stat} {stat_val}: rel_entr {ent}, KL_div {kl}, js_dist {js} bits")
def _recurse_spn_local_rules(node, ):
    """Generator: depth-first traversal of an SPN yielding local rules.

    NOTE(review): this is a closure — target_vars, value_dict, spn, self
    (min_local_p, min_target_js, prior_gen), p_from_scope and
    _leaves_target_allrules all come from the enclosing scope; confirm
    against the surrounding function before reuse.
    """
    #first bottom up:
    for c in node.children:
        if isinstance(c, Sum) or isinstance(c, Product):
            # yield from _recurse_spn_local_rules(c)
            for e in _recurse_spn_local_rules(c):
                yield e
    #then local rule (skipped for the root node, id == 0)
    if node.id != 0:
        # target variables that fall inside this node's scope
        local_targets = set(target_vars).intersection(set(node.scope))
        for target in local_targets:
            p = p_from_scope(node, target, value_dict)
            # if categorical TODO non-categorical data?
            # only consider confident local distributions
            if max(p) >= self.min_local_p:
                # prior = prior_dist[leaf.scope[0]]
                # prior = [1 - prior, prior]7
                prior = self.prior_gen.calculate_prior(
                    spn, target, value_dict)
                # how much this node's belief deviates from the prior
                js = jensenshannon(
                    p,
                    prior,
                )
                if js >= self.min_target_js:
                    # the remaining scope variables act as rule antecedents
                    other_vars = node.scope.copy()
                    other_vars.pop(other_vars.index(target))
                    # yield from _leaves_target_allrules(node, target, other_vars, value_dict, root=spn, targetp=p)
                    for res in _leaves_target_allrules(node, target,
                                                      other_vars, value_dict,
                                                      root=spn, targetp=p):
                        yield res
def calculate_label_distribution_similarity(x: List[Example], y: List[Example]) -> float:
    """Calculate the similarity of the label distribution for 2 datasets.

    e.g. This can help you understand how well your train set models your dev and test sets.
    Empircally you want a similarity over **0.8** when comparing your train set to each of
    your dev and test sets.

        calculate_label_distribution_similarity(corpus.train, corpus.dev)
        # 98.57

        calculate_label_distribution_similarity(corpus.train, corpus.test)
        # 73.29 - This is bad, let's investigate our test set more

    Args:
        x (List[Example]): Dataset
        y (List[Example]): Dataset to compare x to

    Returns:
        float: Similarity of label distributions
    """
    # Converts a dataset into a probability vector over entity types,
    # sorted so both datasets share the same type order.
    def pipeline(data: List[Example]) -> Sequence[float]:
        stats = cast(NERStats, get_ner_stats(data))
        sorted_type_counts = get_sorted_type_counts(stats)
        counts_to_probs = get_probs_from_counts(sorted_type_counts)
        return counts_to_probs

    # JS distance is in [0, 1]; map it to a percentage similarity.
    # (local name shadows nothing here — scipy's function is `jensenshannon`)
    distance = jensenshannon(pipeline(x), pipeline(y))
    return (1 - distance) * 100
def best_match_v3(input_title, precalc_vectors: np.ndarray, title_names_pool):
    """Rank titles against ``input_title`` by three embedding metrics.

    Encodes the (pre-processed) input with BERT and compares it to each
    precomputed vector by cosine distance, Wasserstein distance, and
    Jensen-Shannon distance, returning the top-3 titles per metric as a
    zipped list of (title, cosine, title, wasserstein, title, js) tuples.
    """
    input_title = pre_process(input_title)
    bb = BertEncoder()
    vec = bb.bert_encoder(input_title)
    # embeddings are truncated to the first 300 dimensions for all metrics
    trim_n = 300  #precalc_vectors.shape[1]
    scores = []
    wass_scores = []
    jsh_scores = []
    for i in range(precalc_vectors.shape[0]):
        s_d = cosine(vec[:trim_n], precalc_vectors[i][:trim_n])
        w_d = wasserstein_distance(vec[:trim_n], precalc_vectors[i][:trim_n])
        # abs() makes the vectors non-negative so they can be treated as
        # pseudo-distributions for the JS distance
        jsh_d = jensenshannon(np.abs(vec[:trim_n]), np.abs(precalc_vectors[i][:trim_n]))
        scores.append(s_d)
        wass_scores.append(w_d)
        jsh_scores.append(jsh_d)
    # take top N elements
    top_n = 3
    # NOTE(review): for `scores` the *largest* values are taken, while the
    # other two metrics take the smallest; if `cosine` is scipy's cosine
    # *distance* this looks inverted — confirm intent.
    idxs = np.argpartition(scores, -top_n)[-top_n:]
    idxs2 = np.argpartition(wass_scores, top_n)[:top_n]
    idxs3 = np.argpartition(jsh_scores, top_n)[:top_n]
    title_names_pool = np.array(title_names_pool)
    scores = np.array(scores)
    wass_scores = np.array(wass_scores)
    jsh_scores = np.array(jsh_scores)
    res = [x for x in zip(title_names_pool[idxs], scores[idxs],
                          title_names_pool[idxs2], wass_scores[idxs2],
                          title_names_pool[idxs3], jsh_scores[idxs3])]
    print(res)
    return res
def JSD(P, Q, idx, n_dims, return_dict, reshape=False):
    """Write Jensen-Shannon distances for rows idx[0]..idx[1]-1 of P vs Q
    into ``return_dict[idx]`` (out-parameter style, multiprocessing-friendly).

    With reshape=True, the trailing axis of P/Q is split into ``n_dims``
    feature dimensions and the per-dimension distances are averaged.
    """
    row_range = np.arange(idx[0], idx[1])
    if reshape:
        # (rows, cols) -> (rows, cols/n_dims, n_dims)
        P = np.reshape(P, (-1, int(P.shape[1] / n_dims), n_dims))
        Q = np.reshape(Q, (-1, int(Q.shape[1] / n_dims), n_dims))
        return_dict[idx] = [
            np.mean([
                distance.jensenshannon(P[n, :, m], Q[n, :, m])
                for m in np.arange(n_dims)
            ])
            for n in row_range
        ]
    else:
        return_dict[idx] = [
            distance.jensenshannon(P[n, :], Q[n, :]) for n in row_range
        ]
def score(self, original_data, synthetic_data, score_dict=False):
    """Calculate jensen_shannon distance between original and synthetic data.

    Look for more elaborate evaluation techniques in synthesis.evaluation.

    Parameters
    ----------
    original_data: pandas.DataFrame
        Original data that was seen in fit
    synthetic_data: pandas.DataFrame
        Synthetic data that was generated based original_data
    score_dict: bool
        If true, will return jensen_shannon scores of each column individually

    Returns
    -------
    average jensen_shannon distance: float
        Average jensen_shannon distance over all columns
    per-column jensen_shannon distance (if score_dict): dict
        Per-column jensen_shannon distance
    """
    column_distances = {}
    for col in original_data.columns:
        # align the two value_counts on a shared category index — values
        # present on only one side are counted as 0 on the other
        counts_original, counts_synthetic = original_data[col].value_counts(dropna=False).align(
            synthetic_data[col].value_counts(dropna=False),
            join='outer', axis=0, fill_value=0,
        )
        # jensenshannon normalizes internally, so raw counts are fine
        column_distances[col] = jensenshannon(counts_original, counts_synthetic)
    average_column_distance = sum(column_distances.values()) / len(original_data.columns)
    if score_dict:
        return average_column_distance, column_distances
    return average_column_distance
def show_following_distribution(filename, ids_to_plot):
    """Plot the following-count histogram for each artist's followers and
    annotate it with a divergence from a reference distribution.

    NOTE(review): `normal` is not defined anywhere in this function — it is
    presumably a module-level reference histogram (as produced by plt.hist);
    if it is missing, the NameError is silently swallowed by the bare
    `except` below. Also note `bins` is unbound if every iteration fails,
    making the final return raise. Confirm and tighten.
    """
    user_file = filename
    artist_db = Database_Instance(user_file)
    distributions = []
    for curr in ids_to_plot.keys():
        try:
            # followers' following counts, capped at 500 to bound the x-axis
            res = [a[1] for a in artist_db.query(''' SELECT id, followings_count FROM follower_data_{} where followings_count < 500 '''.format(curr))]
            n, bins, patches = plt.hist(res, bins=50)
            if len(normal) != 0:
                plt.title('Following Count Distribution for the Followers of Artist {}'.format(str(ids_to_plot[curr])))
                plt.ylabel('Frequency')
                plt.xlabel('Following Count')
                plt.savefig('./images/BadArtist{}.png'.format(str(ids_to_plot[curr])))
                # Calculate Jensen-Hannon Divergence
                # NOTE(review): jensenshannon already returns the square
                # root of the divergence; taking sqrt again yields the
                # fourth root — confirm this is intended.
                jh = math.sqrt(jensenshannon(normal[0], n))
                plt.annotate("JH: " + str(jh)[:8], xy=(0.7, 0.9), xycoords='axes fraction', fontsize=14, color='purple')
                plt.savefig('./images/JH_GoodArtist{}.png'.format(str(ids_to_plot[curr])))
                #plt.show()
                plt.clf()
            distributions.append(n)
        # NOTE(review): bare except hides real errors (NameError, IO, SQL);
        # narrow it to the expected database failure if possible
        except:
            print("ID " + str(ids_to_plot) + " given possibly not in DB")
    return bins, distributions
def jensen_shannon_divergence(p, q, base=2):
    """
    Return the Jensen-Shannon divergence between two 1-D arrays.

    NOTE(review): scipy's ``jensenshannon`` returns the JS *distance*
    (square root of the divergence); this function reports that value,
    rounded to 4 decimals — kept as-is for compatibility with callers.

    Parameters
    ---------
    p : array
        left probability array
    q : array
        right probability array
    base : numeric, default 2
        logarithm base

    Returns
    -------
    jsd : float
        divergence of p and q
    """
    from scipy.spatial.distance import jensenshannon

    # normalize each array to sum to 1 (scipy would also do this itself)
    p = np.asarray(p)
    q = np.asarray(q)
    p = p / np.sum(p, axis=0)
    q = q / np.sum(q, axis=0)
    return round(jensenshannon(p, q, base=base), 4)
def jsd(a, b):
    """Debug helper: for every paired row of the nested batches ``a`` and
    ``b``, print the rows (last element dropped) and their squared
    Jensen-Shannon distance. Returns None."""
    for batch_idx in range(len(a)):
        for row_idx in range(len(a[batch_idx])):
            # the final element of each row is excluded from the comparison
            left = a[batch_idx][row_idx][:-1]
            right = b[batch_idx][row_idx][:-1]
            print(left, right)
            row_jsd = distance.jensenshannon(left, right)
            # square the distance to report the divergence
            print(row_jsd ** 2)
def H_metric(H1, H2, mode='chi2'):
    '''Histogram metrics.

    H1, H2 -- 1d histograms with matched bins.
    mode -- Metric to use.
    Modes:
        l2        -- Euclidean distance
        l1        -- Manhattan distance
        vcos      -- Vector cosine distance
        intersect -- Histogram intersection (a similarity, not a distance)
        chi2      -- Chi square distance (default)
        jsd       -- Jensen-Shannon Divergence
        emd       -- Earth Mover's Distance
    Raises NotImplementedError for an unknown mode.
    '''
    def _chi2():
        a = 2 * ((H1 - H2) ** 2).astype(float)
        b = H1 + H2
        # guard 0/0 bins: empty bins contribute 0
        return np.sum(np.divide(a, b, out=np.zeros_like(a), where=b != 0))

    dispatch = {
        'l2': lambda: np.linalg.norm(H1 - H2, ord=2),
        'l1': lambda: np.linalg.norm(H1 - H2, ord=1),
        'vcos': lambda: vcos_dist(H1, H2),
        'intersect': lambda: np.sum(np.min(np.stack((H1, H2)), axis=0)),
        'chi2': _chi2,
        'jsd': lambda: jensenshannon(H1, H2),
        'emd': lambda: wasserstein_distance(H1, H2),
    }
    try:
        metric = dispatch[mode]
    except KeyError:
        raise NotImplementedError()
    return metric()
def KDE_estimates(positive, negative, njobs):
    """Implements the KDE estimation procedure separately for the driver
    (positive) and passenger (negative) data and calculates the
    Jensen-Shannon distance between the estimated densities.

    Arguments:
        positive = The feature matrix for driver mutations
        negative = The feature matrix for passenger mutations
        njobs    = Number of jobs to run in parallel for the Grid Search

    Returns:
        js_t = Jensen-Shannon distance between the estimated densities
    """
    np.random.seed(333)  # reproducible grid-search behaviour
    bandwidths = np.logspace(-1, 1, 30)

    def _fit_best_kde(data):
        # grid-search the bandwidth with 3-fold CV, then refit on all data
        search = GridSearchCV(KernelDensity(kernel='gaussian'),
                              {'bandwidth': bandwidths}, cv=3, n_jobs=njobs)
        search.fit(data)
        return KernelDensity(kernel='gaussian',
                             bandwidth=search.best_params_['bandwidth']).fit(data)

    kde_pos = _fit_best_kde(positive)
    kde_neg = _fit_best_kde(negative)
    # score_samples returns log-density; exponentiate before comparing
    js_t = distance.jensenshannon(np.exp(kde_pos.score_samples(positive)),
                                  np.exp(kde_neg.score_samples(negative)))
    return js_t
def obj_jensen_shannon(index, fitness, observation, new_pop, envs, args):
    """Negative base-2 Jensen-Shannon distance between the standardized
    fitness distribution of individual ``index`` across environments and a
    discretized standard normal.

    Returns 0 when there are fewer than two environments (no distribution
    to compare); otherwise a value <= 0, higher (closer to 0) meaning the
    fitness profile is more normal-like. observation/new_pop/args are
    unused, kept for the objective-function interface.
    """
    bins = list(range(-5, 5))
    fits = np.array([fitness[env_idx][index] for env_idx in range(len(envs))])
    if len(fits) <= 1:
        return 0
    # standardize so the comparison is shape-only (epsilon guards std == 0)
    standardized = (fits - fits.mean()) / (fits.std() + 1e-8)
    empirical = np.histogram(standardized, bins=bins, density=True)[0]
    # probability mass a standard normal assigns to each unit-width bin
    normal_mass = [norm.cdf(hi) - norm.cdf(lo)
                   for lo, hi in zip(bins[:-1], bins[1:])]
    return -jensenshannon(empirical, normal_mass, base=2)
def run_summary_operations(x: np.ndarray, y: np.ndarray) -> Dict:
    """Compute a dict of pairwise comparison features for two series.

    Features: KL divergence, base-2 Jensen-Shannon value, co-integration,
    covariance, and Pearson/Spearman/Kendall correlations. Metrics that
    fail on the given data are recorded as NaN.

    NOTE(review): `js_div` actually holds scipy's JS *distance* (the square
    root of the divergence) — the key name overstates it; confirm consumers.
    """
    kl_div = stats.entropy(x, y)
    js_div = jensenshannon(x, y, base=2)
    # co-integration can fail on short/degenerate series
    try:
        co_int = co_integration(x, y)
    except MissingDataError:
        co_int = np.nan
    covar = covariance(x, y)
    # Pearson raises on constant input; fall back to NaN
    try:
        corr_p, _ = correlation(x, y, 'pearson')
    except ValueError:
        corr_p = np.nan
    corr_s, _ = correlation(x, y, 'spearman')
    corr_k, _ = correlation(x, y, 'kendall')
    feature_set = \
        dict(kl_div=kl_div, js_div=js_div, co_int=co_int, covar=covar,
             corr_p=corr_p, corr_s=corr_s, corr_k=corr_k)
    return feature_set
def compute_with_metadata( self, labels: Sequence[Any], preds: Sequence[Any], label_spec: types.LitType, pred_spec: types.LitType, indices: Sequence[types.ExampleId], metas: Sequence[JsonDict], config: Optional[JsonDict] = None) -> Dict[Text, float]: del labels # Unused; we only care about preds. del label_spec # Unused; we only care about preds. ret = collections.OrderedDict() pairs = self.find_pairs(indices, metas) ret['num_pairs'] = len(pairs) if ret['num_pairs'] == 0: return {} pred_idxs = get_classifications(preds, pred_spec, config) # 'swapped' just means the prediction changed. is_swapped = [(pred_idxs[i] == pred_idxs[j]) for i, j in pairs] ret['swap_rate'] = 1 - np.mean(is_swapped) # Jensen-Shannon divergence, as a soft measure of prediction change. jsds = [ scipy_distance.jensenshannon(preds[i], preds[j])**2 for i, j in pairs ] ret['mean_jsd'] = np.mean(jsds) return ret
def jsdivergence(self, *arg, background=BLOSUM62_BG):
    """Per-column Jensen-Shannon divergence of this alignment.

    With no positional argument, each column's residue distribution is
    compared against the BLOSUM62 background (bidirectional equality holds
    since JS is symmetric). With one alignment argument, columns are
    compared pairwise between the two alignments; column counts must match.

    Returns a 1-D numpy array with one value per column.
    """
    assert all(self.is_position())
    if len(arg) > 1:
        raise Exception(
            'Provide only 1 argument to be compared against or none to compare against the BLOSUM62 background'
        )
    elif len(arg) == 1:
        # BUGFIX: the old `assert arg[0].shape[1], self.shape[1]` only
        # asserted the truthiness of arg[0].shape[1], with self.shape[1] as
        # the assertion *message*; it never compared the column counts.
        assert arg[0].shape[1] == self.shape[1], \
            'both alignments must have the same number of columns'
        b = self.probability(alphabet=background[0])[0].T
        q = arg[0].probability(alphabet=background[0])[0].T
        return np.array([jensenshannon(*i) for i in zip(b, q)])
    elif len(arg) == 0:
        p = self.probability(alphabet=background[0])[0].T
        return np.array([jensenshannon(i, background[1]) for i in p])