def MM_distMat(models):
    print("Calculating JS_distMat")
    start = time.time()
    distMat = np.zeros((models.shape[0], models.shape[0]))
    nrow = models[0].shape[0]
    ncol = models[0].shape[1] - 1  # the last column holds the marginal distribution
    for index_a, mat_a in enumerate(models):
        for index_b, mat_b in enumerate(models[0:index_a]):
            M = 0.5 * (mat_a + mat_b)
            Dist_a_M = 0
            Dist_b_M = 0
            for i in range(nrow):
                if mat_a[i, ncol] > 0:
                    Dist_a_M += mat_a[i, ncol] * stats.entropy(mat_a[i, 0:ncol], M[i, 0:ncol])  # conditioned KL
                if mat_b[i, ncol] > 0:
                    Dist_b_M += mat_b[i, ncol] * stats.entropy(mat_b[i, 0:ncol], M[i, 0:ncol])  # conditioned KL
            Dist_a_M += stats.entropy(mat_a[:, ncol], M[:, ncol])  # conditioning KL
            Dist_b_M += stats.entropy(mat_b[:, ncol], M[:, ncol])  # conditioning KL
            distMat[index_a, index_b] = np.sqrt(0.5 * (Dist_a_M + Dist_b_M))  # Jensen-Shannon distance
            distMat[index_b, index_a] = distMat[index_a, index_b]  # symmetric
        print("calculated " + str(int((index_a * (index_a + 1)) * 100 / (models.shape[0] * (models.shape[0] + 1)))) + "% in " + str(int((time.time() - start) / 60)) + " minutes")
    distMat = distMat / distMat.max()
    print("JS_distMat calculation time is %s minutes" % str(int((time.time() - start) / 60)))
    # print("normalization OFF")
    return distMat
def kullback_leibler(vec1, vec2, num_features=None):
    """
    A distance measure between two probability distributions. Returns a non-negative
    value where values closer to 0 mean less distance (and a higher similarity).
    Uses the scipy.stats.entropy method to compute the Kullback-Leibler divergence.
    If the distribution draws from a certain number of docs, that value must be passed.
    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray()  # convert both vectors to dense in case they were sparse matrices
    if isbow(vec1) and isbow(vec2):  # if they are in bag-of-words format we make them dense
        if num_features is not None:  # if not None, make as large as the documents drawing from
            dense1 = sparse2full(vec1, num_features)
            dense2 = sparse2full(vec2, num_features)
            return entropy(dense1, dense2)
        else:
            max_len = max(len(vec1), len(vec2))
            dense1 = sparse2full(vec1, max_len)
            dense2 = sparse2full(vec2, max_len)
            return entropy(dense1, dense2)
    else:
        # this conversion is made because if it is not in bow format, it might be a list within a list after conversion
        # the scipy implementation of Kullback-Leibler fails in such a case, so we pick up only the nested list.
        if len(vec1) == 1:
            vec1 = vec1[0]
        if len(vec2) == 1:
            vec2 = vec2[0]
        return scipy.stats.entropy(vec1, vec2)
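# Hedged usage sketch (not from the original source): once both inputs are dense,
# same-length probability vectors, the call above reduces to scipy.stats.entropy(p, q),
# which returns KL(p || q) in nats and is not symmetric in its arguments.
import numpy as np
from scipy.stats import entropy

p = np.array([0.5, 0.3, 0.2])
q = np.array([0.4, 0.4, 0.2])
print(entropy(p, q))  # KL(p || q)
print(entropy(q, p))  # generally a different value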
def average_prob(pk, qk=None, val=None):
    """Compute the average value of the distribution and the dissimilarity to the
    mean distribution. It can be used to measure how spontaneously an element
    generates bursts.

    pk: array_like, shape (N,) or shape (N, M)
        distributions to operate with.
    qk: array_like, shape (N,)
        representative distribution or mean distribution.
    val: list or array_like
        values of each bin of the distribution.
    """
    # Initial variables
    m = len(pk.shape)
    n = pk.shape[0] if m == 1 else pk.shape[1]
    val = np.arange(n) if val is None else val
    qk = 1. / n * np.ones(n) if qk is None else qk
    # Position value
    if m == 1:
        val_avg = np.mean(np.multiply(pk - qk, val))
    else:
        val_avg = np.mean(np.multiply(pk - qk, val), axis=1)
    # Relative entropy over the average
    if m == 1:
        rel_dis = entropy(pk, qk)
    else:
        rel_dis = np.array([entropy(pk[i], qk) for i in range(pk.shape[0])])
    return val_avg, rel_dis
def entropy_rate(weighted_adj_matrix, stat_dist=None, base=2, print_prefix=''):
    print(print_prefix + 'calc entropy rate')
    if stat_dist is None:
        stat_dist = stationary_dist(weighted_adj_matrix)
    assert not np.any(stat_dist < 0)
    assert np.isclose(stat_dist.sum(), 1.)
    # assert np.all(weighted_adj_matrix.sum(axis=0) > 0)
    if scipy.sparse.issparse(weighted_adj_matrix):
        if not isinstance(weighted_adj_matrix, csc_matrix):
            weighted_adj_matrix = weighted_adj_matrix.tocsc()
        get_col = weighted_adj_matrix.getcol
        col_entropy = (get_col(i).data for i in range(weighted_adj_matrix.shape[0]))
        col_entropy = np.array([stats.entropy(x, base=base) for x in col_entropy]).flatten()
    else:
        col_entropy = np.array(stats.entropy(weighted_adj_matrix, base=base)).flatten()
    stat_dist = np.array(stat_dist).flatten()
    assert stat_dist.shape == col_entropy.shape
    col_entropy *= stat_dist
    finite_elements = np.isfinite(col_entropy)
    if not all(finite_elements):
        print(print_prefix + 'WARN: entropy rate contains non-finite elements (inf, nan)')
    rate = np.sum(col_entropy[finite_elements])
    if not np.isfinite(rate):
        print(print_prefix + 'entropy rate not finite')
        exit()
    return rate
def are_imgs_natural(orig_image, image_bounds): """Determines natural images based on the entropy of hue and luminance.""" rgb_to_hsv = np.vectorize(colorsys.rgb_to_hsv) is_natural = [] for bound in image_bounds: image = orig_image.copy() width, height = image.size subimage = image.crop(bound) s_width, s_height = subimage.size arr = np.array(np.asarray(subimage).astype('float')) r, g, b = np.rollaxis(arr, axis=-1) h = rgb_to_hsv(r, g, b)[0] hist_h = np.histogram(h, bins=32, range=(0.0, 1.0)) hist_h = list(hist_h[0]) hist_h = [h/float(sum(hist_h)) for h in hist_h] entropy_h = stats.entropy(hist_h) subimage_l = image.crop(bound).convert('L') hist_l = subimage_l.histogram() hist_l = [h/float(sum(hist_l)) for h in hist_l] entropy_l = stats.entropy(hist_l) # Images larger than a certain size are assumed to be natural images. if (float(s_width)/width < ICON_MAX_SIZE and float(s_height)/height < ICON_MAX_SIZE): is_natural.append(entropy_l > LUMINANCE_THRESHOLD or entropy_h > HUE_THRESHOLD) else: is_natural.append(True) image.close() subimage.close() subimage_l.close() return is_natural
def eval_wrapper(self, seq):
    """Evaluates the current sequence for models based on different modes. Returns a boolean of success."""
    if self.eval_mode == "None":
        return True
    elif self.eval_mode == "Rank Offset":
        seq_len = len(seq)
        if seq_len < self.eval_param:
            return False
        for name, model in self.models.items():
            prob_seq = model.eval_seq(seq[:self.eval_param])
            for i in range(self.eval_param):
                self.eval_output[name][i] += [prob_seq[i]]
    elif self.eval_mode == "Kullback-Leibler":
        for name, model in self.models.items():
            # NOTE: KL doesn't deal well with 0s, so perturb by a very small number to avoid NaNs
            perturb_obs = [1e-15] * model.num_obs
            perturb_trans = [1e-15] * model.num_states
            # observation matrix
            out = 0
            for i in range(1, model.num_states):  # normalized across rows, skip start state since no obs
                out += stats.entropy(model.obs[i, :], self.eval_param['obs'][i, :] + perturb_obs) / model.num_states
            self.eval_output[name]['obs'].append(out)
            # transition matrix
            out = 0
            for i in range(model.num_states - 1):  # normalized across rows, skip end state since no transitions
                out += stats.entropy(model.trans[i, :], self.eval_param['trans'][i, :] + perturb_trans) / model.num_states
            self.eval_output[name]['trans'].append(out)
    return True
def Jensen_Shannon_divergence(list_p, list_w=None):
    """Compute the Jensen-Shannon divergence generically.

    Parameters
    ----------
    list_p: list, array_like
        the list of probability distributions.
    list_w: list
        the list of weights for each probability distribution.

    Returns
    -------
    div: float
        JS divergence value.
    """
    # Check and format inputs
    assert(len(list_p) > 1)
    list_w = [1. / len(list_p)] * len(list_p) if list_w is None else list_w
    w = np.array(list_w)[np.newaxis]
    probs = np.array(list_p) if type(list_p) == list else list_p
    # Compute measure: entropy of the weighted mixture minus the weighted entropies
    div = entropy(np.sum(np.multiply(w.T, probs), axis=0)) -\
        np.sum(np.multiply(w, entropy(probs.T)))
    return div
def dist(ft1, ft2, method='bh'):
    """
    Computes the distance between two sets of histogram features.

    Args:
        ft1, ft2: numpy.ndarray (vector)
            histograms as returned by compute()
        method: string
            the method used for computing the distance between the two sets of features:
            'kl' - Kullback-Leibler divergence (symmetrized: 0.5*(KL(p,q)+KL(q,p)))
            'js' - Jensen-Shannon divergence: 0.5*(KL(p,m)+KL(q,m)) where m=(p+q)/2
            'bh' - Bhattacharyya distance: -log(sqrt(sum_i (p_i*q_i)))
            'ma' - Matusita distance: sqrt(sum_i (sqrt(p_i)-sqrt(q_i))**2)
    """
    # distance methods
    dm = {'kl': lambda x_, y_: 0.5 * (entropy(x_, y_) + entropy(y_, x_)),
          'js': lambda x_, y_: 0.5 * (entropy(x_, 0.5 * (x_ + y_)) + entropy(y_, 0.5 * (x_ + y_))),
          'bh': lambda x_, y_: -np.log(np.sum(np.sqrt(x_ * y_))),
          'ma': lambda x_, y_: np.sqrt(np.sum((np.sqrt(x_) - np.sqrt(y_)) ** 2))
          }
    method = method.lower()
    if method not in dm.keys():
        raise ValueError('Unknown method')
    return dm[method](ft1, ft2)
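# Illustrative call (an assumption, not part of the original; it presumes numpy and
# scipy.stats.entropy are imported as in the snippet above): ft1 and ft2 are expected
# to be normalized histograms of equal length; all four methods return 0 for identical
# inputs and grow as the histograms diverge.
import numpy as np

h1 = np.array([0.1, 0.4, 0.5])
h2 = np.array([0.2, 0.3, 0.5])
for m in ('kl', 'js', 'bh', 'ma'):
    print(m, dist(h1, h2, method=m))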
def plot_KL(data): """Kullback-Leibler divergence, given a Dataset object. The 'true' distribution is the data one""" frequencies = data.frequencies Ncat = data.Ncat fiducial = data.generate_mc(100) sh, loc, sc = data.lognorm_par() freq_ln = [np.sort(stats.lognorm.rvs(sh, scale=sc, size=Ncat, random_state=s))[::-1] for s in range(1, 1001)] kl_ln = [stats.entropy(frequencies, r) for r in freq_ln] lengths = [min(Ncat, len(mc)) for mc in fiducial] # Cut to the minimum Ncat kl_data = [stats.entropy(frequencies[:lengths[i]], mc[:lengths[i]]) for i, mc in enumerate(fiducial)] # Plot KL divergence. Use kdeplot instead of histogram fig = plt.figure(figsize=[10, 6.18]) plt.title('Kullback-Leibler divergence') # plt.hist(kl_data, bins=10, normed=True, label='MC', alpha=0.5) # plt.hist(kl_ln, bins=10, normed=True, label='Lognormal', alpha=0.5, color='Blue') sns.kdeplot(np.array(kl_data), label='MC', alpha=0.6, color='Blue') sns.kdeplot(np.array(kl_ln), label='Lognormal', alpha=0.6, color='Orange') plt.xlim(xmin=0.) # plt.axvline(ks_tree[0], c='Purple', label = 'Tree model') # plt.axvline(kl_ln, c='Orange', label = 'Lognormal') plt.legend(loc='best') # plt.savefig(os.path.join('all_data', 'KL_'+data.name+'.png')) return
def jensen_shannon(vec1, vec2, num_features=None): """Calculate Jensen-Shannon distance between two probability distributions using `scipy.stats.entropy`. Parameters ---------- vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)} Distribution vector. vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)} Distribution vector. num_features : int, optional Number of features in vector. Returns ------- float Jensen-Shannon distance between `vec1` and `vec2`. Notes ----- This is symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`. """ vec1, vec2 = convert_vec(vec1, vec2, num_features=num_features) avg_vec = 0.5 * (vec1 + vec2) return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec))
def distFunc(ys,xs): ''' # Calculate distance between two empirical distributions. # input parameters for model. # output: distance between generated data and data xs. ''' if (np.sum(ys)==0): return np.inf else: if xs.ndim == 1: kernely = stats.gaussian_kde(ys) kernelx = stats.gaussian_kde(xs) xx = np.linspace(np.min(xs),np.max(xs)) #range over data. return stats.entropy(kernelx(xx),qk=kernely(xx)) #KL-divergence. else: #dimensions are (npoints,nparams) to keep consistent with sci-kit #learn kernely = stats.gaussian_kde(ys.T) kernelx = stats.gaussian_kde(xs.T) #range over n-dimensional data (npoints,nparams) mesh = [np.linspace(np.min(xs[:,i]),np.max(xs[:,i])) for i in range(xs.shape[1])] xx = np.meshgrid(*mesh) xx = np.array([x.ravel() for x in xx]).T return stats.entropy(kernelx(xx.T),qk=kernely(xx.T)) #KL-divergence.
def sj_divergence(p, q):
    """
    Compute the Shannon-Jensen divergence, a symmetric KL-divergence.

    :param p: a probability distribution
    :param q: another probability distribution
    :return: SJ divergence measure
    """
    return 0.5 * (entropy(p, q) + entropy(q, p))
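# Minimal sanity check (assumed usage, with scipy.stats.entropy in scope as above):
# unlike plain KL, this symmetrized form gives the same value regardless of argument order.
import numpy as np

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.1, 0.3, 0.6])
assert np.isclose(sj_divergence(p, q), sj_divergence(q, p))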
def jensen_shannon(vec1, vec2, num_features=None): """ A method of measuring the similarity between two probability distributions. It is a symmetrized and finite version of the Kullback–Leibler divergence. """ vec1, vec2 = convert_vec(vec1, vec2, num_features=num_features) avg_vec = 0.5 * (vec1 + vec2) return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec))
def graph_JSD(src,edge,dst): P = src['X1'] Q = dst['X1'] _P = P / norm(P, ord=1) _Q = Q / norm(Q, ord=1) _M = 0.5 * (_P + _Q) edge['distance'] = 0.5 * (entropy(_P, _M) + entropy(_Q, _M)) return (src, edge, dst)
def test_entropy_positive(self):
    """See ticket #497"""
    pk = [0.5, 0.2, 0.3]
    qk = [0.1, 0.25, 0.65]
    eself = stats.entropy(pk, pk)
    edouble = stats.entropy(pk, qk)
    assert 0.0 == eself
    assert edouble >= 0.0
def test_entropy_positive(self):
    # See ticket #497
    pk = [0.5, 0.2, 0.3]
    qk = [0.1, 0.25, 0.65]
    eself = stats.entropy(pk, pk)
    edouble = stats.entropy(pk, qk)
    assert_(0.0 == eself)
    assert_(edouble >= 0.0)
def entropy(X, axis=0):
    if axis == 1:
        Y = [stats.entropy(x) for x in X]
    elif axis == 0:
        Y = [stats.entropy(x) for x in X.T]
    else:
        raise ValueError('Error: Choose axis=0 or axis=1')
    return np.array(Y)
def norm_entropy(count_list):
    """Compute the normalized entropy (entropy divided by the uniform-distribution entropy) from coverage counts."""
    count_vector = np.array(count_list)
    prob_vector = count_vector / float(count_vector.sum())
    prob_uniform = np.array([1.0 / len(prob_vector)] * len(prob_vector))
    H_norm = entropy(prob_vector) / entropy(prob_uniform)
    return H_norm
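# Example values (assumed usage, coverage counts per position, with numpy and
# scipy.stats.entropy in scope as above): a uniform coverage vector gives
# H_norm == 1.0, while a heavily concentrated one is close to 0.
print(norm_entropy([10, 10, 10, 10]))  # -> 1.0
print(norm_entropy([97, 1, 1, 1]))     # -> well below 1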
def done(self): import numpy as np vals = np.array(self.vals) / np.sum(self.vals) from scipy.stats import entropy if args.pad is None or args.pad <= len(vals): e = entropy(vals, base = args.base) else: e = entropy(np.append(vals, [0.0] * (args.pad - len(vals))), base = args.base) args.outfile.write(self.tup + [e])
def calc_entropy(AT, sigma=None, weights=None, known_nodes=None, entropy_base=2):
    if weights is not None:
        weights = np.array(weights)
        if weights.sum() != 1.0:
            weights /= weights.sum()
    if sigma is None:
        entropy = stats.entropy(AT.T, base=entropy_base)
        if weights is not None:
            entropy *= weights
        return entropy.mean()
    selection_range = set(range(AT.shape[0]))
    # print('sigma:\n', sigma)
    if known_nodes is not None:
        ones_mat = np.ones(AT.shape, dtype=float)
        # sigma *= sigma_in.min()
        # print('known nodes:', known_nodes)
        # print('orig sigma\n', sigma)
        for v in map(int, known_nodes):
            ones_mat[v, :] = sigma[v, :]
            ones_mat[:, v] = sigma[:, v]
        sigma = ones_mat
    # sigma[0, 2] = 100
    # print('sigma:\n', sigma)
    # print('max:', sigma.max(axis=1).reshape(sigma.shape[1], 1))
    sigma /= sigma.mean(axis=1).reshape(sigma.shape[1], 1)
    # print('norm sigma:\n', sigma)
    # print('mean:', sigma.mean(axis=1))
    total_entropy = 0
    for v in selection_range:
        # exclude loop
        current_selection = list(selection_range - {v})
        # stack the katz row of the target vertex N-1 times
        row_sigma = sigma[v, :]
        # multiply katz with transposed AT -> only katz values on real links
        res = np.multiply(row_sigma, AT[current_selection, :])
        # calc entropy per row and add it to the overall entropy
        ent = stats.entropy(res.T, base=entropy_base)
        if weights is not None:
            ent *= weights[current_selection]
        total_entropy += ent.sum()
    num_v = AT.shape[0]
    total_entropy /= (num_v * (num_v - 1))
    print('total entropy:', total_entropy)
    if known_nodes is not None:
        return total_entropy, int(len(known_nodes) / AT.shape[0] * 100)
    return total_entropy
def KLDivergenceSim(a, b, topics):
    from scipy.stats import entropy
    import math
    a = fill_list_from_dict(a, topics)
    b = fill_list_from_dict(b, topics)
    entropyOf_A_to_B = entropy(a, b)
    entropyOf_B_to_A = entropy(b, a)
    minusSummedEntropy = -(entropyOf_A_to_B + entropyOf_B_to_A)
    return math.exp(minusSummedEntropy)
def _get_max_entropy(options): if 'max_entropy' not in Config.RESTRICTIONS['diversity']: pdf = DiversityMaintenance._pdf([0], options) other_pdf = DiversityMaintenance._pdf([2], options) e1 = stats.entropy(pdf, other_pdf) e2 = stats.entropy(other_pdf, pdf) total = e1+e2 Config.RESTRICTIONS['diversity']['max_entropy'] = total return Config.RESTRICTIONS['diversity']['max_entropy']
def make_nb_plot(self): self._get_nb_estimate() p = self.nb_prob n = self.nb_size x = np.arange(0, 100) pmf = nbinom.pmf(x, n, p) self.line_neg_binomial, = plt.plot(x, pmf, ls=":", linewidth=2) self.nb_real_kl = entropy(self.probs, pmf) self.nb_grammar_kl = entropy(self.b_prob[:100], pmf[:100])
def test_entropy_base(self):
    pk = np.ones(16, float)
    S = stats.entropy(pk, base=2.)
    assert_(abs(S - 4.) < 1.e-5)
    qk = np.ones(16, float)
    qk[:8] = 2.
    S = stats.entropy(pk, qk)
    S2 = stats.entropy(pk, qk, base=2.)
    assert_(abs(S / S2 - np.log(2.)) < 1.e-5)
def approximate_mixture_data(): num_loc_proposals = 2 num_imp_samp = 1000 n_comp = 2 p_comp = np.array([0.7, 0.3]) dim = 1 num_obs = 100 obs = None means = [] for i in range(n_comp): means.append([20*i]*dim) if obs is None: obs = dist.mvt(means[-1], np.eye(dim),30).rvs(np.int(np.round(num_obs*p_comp[i]))) else: obs = np.vstack([obs, dist.mvt(means[-1], np.eye(dim),30).rvs(np.int(np.round(num_obs*p_comp[i])))]) count = {"local_lpost" :0, "local_llhood" :0, "naive_lpost" :0 ,"naive_llhood" :0,"standard_lpost" :0 ,"standard_llhood" :0} print(means) #return def count_closure(name): def rval(): count[name] = count[name] + 1 return rval initial_samples = [] for _ in range(10): initial_samples.append(DirCatTMM(obs, [1]*n_comp, dist.mvt(np.mean(means,0), np.eye(dim)*5, dim), dist.invwishart(np.eye(dim) * 5, dim+1 ), stats.gamma(1,scale=1))) # (naive_samp, naive_lpost) = pmc.sample(num_imp_samp, initial_samples, # DirCatTMMProposal(naive_multi_proposals = num_loc_proposals, # lpost_count = count_closure("naive_lpost"), # llhood_count = count_closure("naive_llhood")), # population_size = 4) (infl_samp, infl_lpost) = pmc.sample(num_imp_samp, initial_samples, DirCatTMMProposal(num_local_proposals = num_loc_proposals, lpost_count = count_closure("local_lpost"), llhood_count = count_closure("local_llhood")), population_size = 4) (stand_samp, stand_lpost) = pmc.sample(num_imp_samp * num_loc_proposals, initial_samples, DirCatTMMProposal(lpost_count = count_closure("standard_lpost"), llhood_count = count_closure("standard_llhood")), population_size = 4) print("===============\n",p_comp, means, # "\n\n--NAIVE--\n", # naive_samp[-1].comp_indic.sum(0), stats.entropy(p_comp, naive_samp[-1].comp_indic.sum(0))+1, count["naive_llhood"], count["naive_lpost"], "\n\n--LOCAL--\n", infl_samp[-1].comp_indic.sum(0), stats.entropy(p_comp, infl_samp[-1].comp_indic.sum(0))+1, count["local_llhood"], count["local_lpost"], "\n\n--STANDARD--\n", stand_samp[-1].comp_indic.sum(0), stats.entropy(p_comp, stand_samp[-1].comp_indic.sum(0))+1, count["standard_llhood"], count["standard_lpost"],"\n\n") return {"infl":(infl_samp, infl_lpost), "standard":(stand_samp, stand_lpost)}
def JSD(P, Q):
    """
    Calculates the Jensen-Shannon divergence as a metric (square root of the divergence).
    See: http://www.researchgate.net/publication/3084774_A_new_metric_for_probability_distributions
    """
    _P = P / norm(P, ord=1)
    _Q = Q / norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return math.sqrt(0.5 * (entropy(_P, _M) + entropy(_Q, _M)))
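# Quick check (an assumption about intended use, with numpy, math, norm, and
# scipy.stats.entropy in scope as in the snippet above): inputs need not be
# pre-normalized, since JSD divides by the L1 norm; the result is symmetric and,
# in nats, bounded above by sqrt(ln 2) ~= 0.833.
import numpy as np

P = np.array([9.0, 1.0, 0.0])
Q = np.array([1.0, 1.0, 8.0])
print(JSD(P, Q), JSD(Q, P))  # identical values, both <= 0.833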
def kl_distance(pk, qk=None):
    '''
    Remember that KL divergence is defined for two probability distributions, not a raw
    data series, so first get the normalized histogram to feed into this. One can also
    first estimate the distribution and then feed that in.
    '''
    if qk is not None:
        min_len = min(len(pk), len(qk))
        return entropy(pk[:min_len], qk[:min_len], base=2)
    else:
        return entropy(pk, qk, base=2)
def make_lognorm_plot(self):
    # mu, sigma = self.m_log, self.s_log
    # sigma, _, mu = lognorm.fit(self.len_list, floc=0)
    mu, sigma = norm.fit(np.log(self.len_list))
    # mu, sigma = 3.45, 0.45
    print(mu, sigma)
    x = np.linspace(1, 100, 100)
    pdf = (np.exp(-(np.log(x) - mu)**2 / (2 * sigma**2))
           / (x * sigma * np.sqrt(2 * np.pi)))
    self.line_lognorm, = plt.plot(x, pdf, linewidth=2, color='r')
    self.lognorm_real_kl = entropy(self.probs, pdf)
    self.lognorm_grammar_kl = entropy(self.b_prob[:100], pdf[:100])
def MM_distMat(models):
    start = time.time()  # start the timer used by the progress messages below
    distMat = np.zeros((models.shape[0], models.shape[0]))
    nrow = models[0].shape[0]
    ncol = models[0].shape[1] - 1  # the last column is for the marginal distribution
    for index_a, mat_a in enumerate(models):
        print("calculating row " + str(index_a) + " out of " + str(models.shape[0]))
        print("time passed so far: " + str(int((time.time() - start) / 60)) + " minutes")
        for index_b, mat_b in enumerate(models):
            distMat[index_a, index_b] = sum(mat_a[i, ncol] * stats.entropy(mat_a[i, 0:ncol], mat_b[i, 0:ncol]) for i in range(nrow))  # conditioned KL
            distMat[index_a, index_b] += stats.entropy(mat_a[:, ncol], mat_b[:, ncol])  # conditioning KL
    distMat = distMat / distMat.max()
    return distMat
def get_information_gain(df):
    values = df["attr"].unique()
    entropy_values = []
    for value in values:
        target = df[df["attr"] == value]["target"]
        positive_cases = sum(target)
        negative_cases = len(target) - sum(target)
        entropy = stats.entropy([positive_cases, negative_cases], base=2)
        weighted_entropy = len(df[df["attr"] == value]) / len(df["attr"]) * entropy
        entropy_values.append(weighted_entropy)
    return stats.entropy([sum(df["target"]), len(df["target"]) - sum(df["target"])], base=2) - sum(entropy_values)
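# Toy usage (the "attr" and "target" column names and a binary 0/1 target are what
# the function above assumes; scipy.stats is assumed imported as stats): a split
# that separates the classes perfectly yields an information gain equal to the
# entropy of the target, here 1 bit.
import pandas as pd

df = pd.DataFrame({"attr":   ["a", "a", "b", "b"],
                   "target": [1,   1,   0,   0]})
print(get_information_gain(df))  # -> 1.0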
def empirical_entropy(dist, bins=10):
    # Histogram of counts across the bins
    hist_counts, _ = np.histogramdd(dist, bins=bins, range=[(0, 1) for _ in dist[0]])
    return entropy(hist_counts.flatten())  # SciPy will normalise
def pd_entropy(s): distribution = s.value_counts(normalize=True, dropna=self.dropna) return stats.entropy(distribution, base=self.base)
if opts.compute_CIS: cur_preds.append(pred) # path = os.path.join(opts.output_folder, 'input{:03d}_output{:03d}.jpg'.format(i, j)) basename = os.path.basename(names[1]) path = os.path.join(opts.output_folder + "_%02d" % j, basename) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) vutils.save_image(outputs.data, path, padding=0, normalize=True) if opts.compute_CIS: cur_preds = np.concatenate(cur_preds, 0) py = np.sum( cur_preds, axis=0 ) # prior is computed from outputs given a specific input for j in range(cur_preds.shape[0]): pyx = cur_preds[j, :] CIS.append(entropy(pyx, py)) if not opts.output_only: # also save input images vutils.save_image(images.data, os.path.join(opts.output_folder, 'input{:03d}.jpg'.format(i)), padding=0, normalize=True) if opts.compute_IS: all_preds = np.concatenate(all_preds, 0) py = np.sum(all_preds, axis=0) # prior is computed from all outputs for j in range(all_preds.shape[0]): pyx = all_preds[j, :] IS.append(entropy(pyx, py)) if opts.compute_IS:
ax.set_xlabel(r'$\alpha$')
ax.set_ylabel('test accuracy (%)')
ax.set_title('Effect of different Dirichlet priors on test accuracy for naive Bayes document classification')
## format for less visual clutter
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
plt.savefig('img/hw2_-_alpha,test_accuracy.png', format='png')
# plt.show()

## feature selection
size_vocab = len(df_vocab)
alpha = 1. / size_vocab
p_y, p_x_given_y_tab = train_naive_bayes(df_train_data, df_train_labels, df_vocab, df_ng_labels, alpha)
## calc p(x) = \sum_y p(x,y) = \sum_y p(x|y)p(y)
p_xy = np.power(10, p_x_given_y_tab).mul(p_y['p'])
p_x = p_xy.sum(axis=1)
i_x = -1. * np.log2(p_x)  # information content of a particular word (not an ensemble) (aka. self-information)
h_x_given_Y = np.power(10, p_x_given_y_tab).apply(lambda r: entropy(r, base=2), axis=1)  # note that H(X|Y=y) = \sum_{x \in X} p(x|y) \log_2 p(x|y), but I did \sum_{y \in Y} p(x|y) \log_2 p(x|y), summing over the condition (labels) rather than over possible outcomes (words)
## why does "*" work? see notes
tmp_df_res = (i_x * h_x_given_Y).to_frame(name='metric').join(df_vocab).sort_values('metric', ascending=True).join(np.log10(p_x).to_frame('log_p_x'))
## print out the top 100 words
pd.set_option('display.max_rows', None)
print(tmp_df_res.iloc[:100])
def get_uniform_entropy(self):
    uniform_probability = 1.0 / len(self.diagnoses)
    return entropy(list(map(lambda diag: uniform_probability, self.diagnoses)))
def distOf2Dist(dist1, dist2):
    p = list(dist1.values())
    q = list(dist2.values())
    d1 = entropy(p, q)
    d2 = entropy(q, p)
    return (d1 + d2) / 2
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs): # Calculate statistics on df, saving to alldf_dict # Deletion positions df = _lib.mh_del_subset(orig_df) df = _lib.indels_without_mismatches_subset(df) if sum(df['Count']) <= 1000: return df = orig_df # Get observed frameshift rates obs_fs = {'+0': 0, '+1': 0, '+2': 0} all_ins_lens = set(df[df['Category'].isin(['ins', 'ins_notatcut'])]['Length']) for ins_len in all_ins_lens: crit = (df['Category'].isin(['ins', 'ins_notatcut'])) & (df['Length'] == ins_len) fs = ins_len % 3 count = sum(df[crit]['Count']) key = '+%s' % (int(fs)) obs_fs[key] += count all_del_lens = set(df[df['Category'].isin(['del', 'del_notatcut'])]['Length']) for del_len in all_del_lens: crit = (df['Category'].isin(['del', 'del_notatcut'])) & (df['Length'] == del_len) fs = (-1 * del_len) % 3 count = sum(df[crit]['Count']) key = '+%s' % (int(fs)) obs_fs[key] += count tot = sum(obs_fs.values()) for key in obs_fs: obs_fs[key] /= tot # Predict _predict2.init_model() seq, cutsite = _lib.get_sequence_cutsite(orig_df) # Predict rate of 1 bp insertions # Featurize first del_score = _predict2.total_deletion_score(seq, cutsite) dlpred = _predict2.deletion_length_distribution(seq, cutsite) norm_entropy = entropy(dlpred) / np.log(len(dlpred)) ohmapper = { 'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1] } fivebase = seq[cutsite - 1] onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score] onebp_features = np.array(onebp_features).reshape(1, -1) rate_1bpins = float(rate_model.predict(onebp_features)) # Predict 1 bp frequency freq = rate_1bpins / (1 - rate_1bpins) pred = list(dlpred) pred.insert(0, freq) pred = np.array(pred) / sum(pred) pred_fs = {'+0': 0, '+1': 0, '+2': 0} pred_fs['+1'] += pred[0] for idx in range(1, len(pred)): del_len = idx fs = (-1 * del_len) % 3 key = '+%s' % (int(fs)) pred_fs[key] += pred[idx] # Bae predict bae_fs = {'+0': 0, '+1': 0, '+2': 0} bae_dlpred = bae_prediction(seq, cutsite) for idx in range(len(bae_dlpred)): del_len = idx + 1 fs = (-1 * del_len) % 3 key = '+%s' % (int(fs)) bae_fs[key] += bae_dlpred[idx] for fs in ['+0', '+1', '+2']: alldf_dict['Frame'].append(fs) alldf_dict['Bae'].append(bae_fs[fs]) alldf_dict['inDelphi'].append(pred_fs[fs]) alldf_dict['Obs'].append(obs_fs[fs]) alldf_dict['_Experiment'].append(exp) alldf_dict['rs'].append(rs) return alldf_dict
def get_inception_score(imgs, cuda=True, batch_size=32, resize=False, splits=1): """ Computes the inception score of the generated images imgs imgs -- Torch dataset of (3xHxW) numpy images normalized in the range [-1, 1] cuda -- whether or not to run on GPU batch_size -- batch size for feeding into Inception v3 splits -- number of splits """ N = len(imgs) assert batch_size > 0 assert N > batch_size # Set up dtype if cuda: dtype = torch.cuda.FloatTensor else: if torch.cuda.is_available(): print( "WARNING: You have a CUDA device, so you should probably set cuda=True" ) dtype = torch.FloatTensor # Set up dataloader dataloader = torch.utils.data.DataLoader(imgs, batch_size=batch_size) # Load inception model inception_model = inception_v3(pretrained=True, transform_input=False).type(dtype) inception_model.eval() up = nn.Upsample(size=(299, 299), mode='bilinear').type(dtype) def get_pred(x): if resize: x = up(x) x = inception_model(x) return F.softmax(x).data.cpu().numpy() # Get predictions preds = np.zeros((N, 1000)) for i, batch in enumerate(dataloader, 0): batch = batch.type(dtype) batchv = Variable(batch) batch_size_i = batch.size()[0] preds[i * batch_size:i * batch_size + batch_size_i] = get_pred(batchv) # Now compute the mean kl-div split_scores = [] for k in range(splits): part = preds[k * (N // splits):(k + 1) * (N // splits), :] py = np.mean(part, axis=0) scores = [] for i in range(part.shape[0]): pyx = part[i, :] scores.append(entropy(pyx, py)) split_scores.append(np.exp(np.mean(scores))) return np.mean(split_scores), np.std(split_scores)
def agreement(arr):
    return 1 - entropy(arr, base=2) / np.log2(len(arr))
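# Sketch of expected behavior (an assumption: `arr` is a probability vector over
# choices, with numpy and scipy.stats.entropy in scope): a uniform vector gives
# agreement 0, a one-hot vector gives agreement 1.
import numpy as np

print(agreement(np.array([0.25, 0.25, 0.25, 0.25])))  # -> 0.0
print(agreement(np.array([1.0, 0.0, 0.0, 0.0])))      # -> 1.0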
def get_llrs(X, y, features, label=1, binary=True, debug_word_set=None): ''' Computes Log-Likelihood Ratio (LLR) for the input features. Parameters: ----------- X: Numpy matrix (m x n) Consisting of training instances as rows and features as columns y: numpy matrix The labels of the instances label: int (Default 1) consider this label as positive label and compute LLR. This value should be one of the values in y binary: boolean (Default True) If true then features are counted as binary. ie either the feature occurs in that document or not. Term freq within the document is ignored. debug_word_set: set() Prints the debug information if the feature is in this set Returns: ----------- list of tuples of the form [(a,b), ...] where a: the feature, b: the llr score of the feature Usage: ----------- X = count_vect.transform(data.get_texts()) print('feat_X.shape %d,%d' % X.shape) y = data.target features = np.array(count_vect.get_feature_names()) llrs = get_llrs(X, y, features, label=1, binary=True) features = np.array([item[0] for item in llrs]) weights = np.array([item[1] for item in llrs]) ''' if binary: nc = (y == label).sum() nc_ = (y != label).sum() X = binarize(X) else: nc = X[y == label].sum() nc_ = X[y != label].sum() print('nc, nc_ = %d,%d' % (nc, nc_)) counts = X[y == label, :] counts_ = X[y != label, :] k1h = counts.sum(axis=0) + 1 k2h = counts_.sum(axis=0) + 1 k1t = nc - k1h + 1 k2t = nc_ - k2h + 1 llrs = [] for i, word in enumerate(features): if debug_word_set is not None and word not in words: continue mat = np.matrix([[k1h[0, i], k1t[0, i]], [k2h[0, i], k2t[0, i]]]) if mat[0, 0] == 1: continue # llr = 2 * mat.sum() * (-entropy(mat.A1) - -entropy(mat.sum(axis=1)) - -entropy(mat.T.sum(axis=1))) Hmat = entropy(mat.A1) # entropy of matrix Hrow = entropy(mat.sum(axis=1))[0] # entropy of row sums Hcol = entropy(mat.T.sum(axis=1))[0] # entropy of col sums llr = -2 * mat.sum() * (Hmat - Hrow - Hcol) llrs.append((word, llr)) if debug_word_set is not None and word in debug_word_set: print(word) print(mat) print(mat.sum(), Hmat, Hrow, Hcol) print(llr) return llrs
def add_computed_columns(self): # probs and entropy self.df['prob_final_tote_odds'] = self.df.groupby( 'race_id')['final_tote_odds'].transform(compute_probs_from_odds) self.df['entropy_final_tote_odds'] = self.df.groupby( 'race_id')['prob_final_tote_odds'].transform( lambda x: entropy(x, base=len(x))) self.df['entropy_final_tote_odds'] = self.df[ 'entropy_final_tote_odds'].map(lambda x: nan if isneginf(x) else x) self.df['prob_morning_line_odds'] = self.df.groupby( 'race_id')['morning_line'].transform(compute_probs_from_odds) self.df['rank_prob_morning_line_odds'] = self.df.groupby( 'race_id')['prob_morning_line_odds'].rank(ascending=False) self.df['entropy_morning_line_odds'] = self.df.groupby( 'race_id')['prob_morning_line_odds'].transform( lambda x: entropy(x, base=len(x))) self.df['entropy_morning_line_odds'] = self.df[ 'entropy_morning_line_odds'].map(lambda x: nan if isneginf(x) else x) self.df[ 'num_effective_starters_morning_line'] = self.df.entropy_morning_line_odds * self.df.num_starters_post self.df[ 'num_effective_starters_final_tote_odds'] = self.df.entropy_final_tote_odds * self.df.num_starters_post #self.df['drop_morning_line_odds'] = (self.df['num_starters_post'] - self.df['num_effective_starters_morning_line']).map(round) self.df['diff_logprob_final_tote_morning_line'] = self.df[ 'prob_final_tote_odds'].map(lambda x: math.log(x)) - self.df[ 'prob_morning_line_odds'].map(lambda x: math.log(x)) self.df['rank_prob_final_tote_odds'] = self.df.groupby( 'race_id')['prob_final_tote_odds'].rank(ascending=False) self.df['rank_diff_logprob_final_tote_morning_line'] = self.df.groupby( 'race_id')['diff_logprob_final_tote_morning_line'].transform( lambda x: x.rank(ascending=False)) # sprint if 1759 yards (1 mile) or less, route if more self.df['is_route'] = self.df['distance'].map(lambda x: int(x > 1759)) self.df['num_starters_post'] = self.df['num_starters_post'].map( lambda x: int(x)) self.df['cost_exacta_from_win_show'] = self.df[ 'num_starters_post'].map(lambda x: (x - 1) * 1) self.df['cost_trifecta_from_place_wc'] = self.df[ 'num_starters_post'].map(lambda x: (x - 1) * (x - 2) * 2) self.df['cost_superfecta_from_show_a1'] = self.df[ 'num_starters_post'].map(lambda x: (x - 1) * (x - 2) * (x - 3) * 3) self.df['cost_synth_place_tri'] = self.df['num_starters_post'].map( lambda x: (x - 1) * (x - 2) * 2) #self.df['cost_synth_'] = self.df['num_starters_post'].map(lambda x: (x - 1) * (x - 2) * 2) self.df['log_ratio_effectivestarters_morningline'] = -1.0 * log( self.df.num_effective_starters_morning_line / self.df.num_starters_post) self.df['max_prob_morning_line_odds'] = self.df.groupby( 'race_id')['prob_morning_line_odds'].transform(lambda x: x.max()) self.df['max_prob_final_tote_odds'] = self.df.groupby( 'race_id')['prob_final_tote_odds'].transform(lambda x: x.max()) self.df['underperformance_weighted'] = ( self.df['rank_prob_final_tote_odds'] - self.df['official_finish_position'] ) * self.df['prob_final_tote_odds']
def Entropy2(labels, base=2):
    data = labels.value_counts()
    en = stats.entropy(data, base=base)
    return en
def kmeans(self, s, winbids, losebids): if len(s) == 0: return 0 leafSize = self.LEAF_SIZE s1 = {} s2 = {} winbids1 = {} winbids2 = {} losebids1 = {} losebids2 = {} len1 = 0 len2 = 0 lenk = {} # random split s into s1 and s2, and calculate minPrice,maxPrice for i in range(int(len(s) / 2)): k = list(s.keys())[i] s1[k] = s[k] winbids1[k] = winbids[k] losebids1[k] = losebids[k] lenk[k] = sum(s[k]) len1 += lenk[k] for i in range(int(len(s) / 2), len(s)): k = list(s.keys())[i] s2[k] = s[k] winbids2[k] = winbids[k] losebids2[k] = losebids[k] lenk[k] = sum(s[k]) len2 += lenk[k] # EM-step KLD1 = 0.0 KLD2 = 0.0 KLD = 0.0 pr = [] count = 0 isBreak = 0 not_converged = 1 while not_converged: count += 1 # begin not_converged = 0 # E-step: q1 = self.calProbDistribution(winbids1, losebids1) q2 = self.calProbDistribution(winbids2, losebids2) KLD = entropy(q1, q2) if count > 8 and KLD < KLD1: isBreak = 1 if count > 3 and KLD < KLD1 and KLD == KLD2: isBreak = 1 KLD2 = KLD1 KLD1 = KLD # M-step: for k in s.keys(): mk = self.calProbDistribution({k: winbids[k]}, {k: losebids[k]}) k1 = entropy(mk, q1) k2 = entropy(mk, q2) if k1 < k2: if k in s1: continue if len2 - lenk[k] < leafSize: continue not_converged = 1 s1[k] = s[k] winbids1[k] = winbids[k] losebids1[k] = losebids[k] len1 += lenk[k] if k in s2: len2 -= lenk[k] s2.pop(k) winbids2.pop(k) losebids2.pop(k) elif k1 > k2: if k in s2: continue if len1 - lenk[k] < leafSize: continue not_converged = 1 s2[k] = s[k] winbids2[k] = winbids[k] losebids2[k] = losebids[k] len2 += lenk[k] if k in s1: len1 -= lenk[k] s1.pop(k) winbids1.pop(k) losebids1.pop(k) if isBreak == 1: break return s1, s2, winbids1, winbids2, losebids1, losebids2
def entropy1(labels, base=2):
    value, counts = np.unique(labels, return_counts=True)
    return entropy(counts, base=base)
def Entropy(input_vec):
    value, counts = np.unique(input_vec, return_counts=True)
    entropy_val = entropy(counts, base=2)
    return entropy_val
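# Example (assumed input: a vector of discrete labels, with numpy and
# scipy.stats.entropy in scope): four equally frequent classes give 2 bits of entropy.
import numpy as np

print(Entropy(np.array([0, 0, 1, 1, 2, 2, 3, 3])))  # -> 2.0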
for i in range(51): prob_node_send_SYN.append(node_send_SYN[i]/total_SYN) prob_node_recv_SYN.append(node_recv_SYN[i]/total_SYN) prob_node_send_ENC.append(node_send_ENC[i]/total_ENC) prob_node_recv_ENC.append(node_recv_ENC[i]/total_ENC) #prob_node_send_SYN --- probability of a node sending a SYN #prob_node_recv_SYN --- probability of a node receiving a SYN. #prob_node_send_ENC #prob_node_recv_ENC return prob_node_send_SYN, prob_node_recv_SYN, prob_node_send_ENC, prob_node_recv_ENC ## Now, to calculate KL divergence, calculate the same probabilities as above for all test traces. if __name__ == "__main__": f = open("normal.tr","r") p_send_SYN, p_recv_SYN, p_send_ENC, p_recv_ENC = probabilities(f) print("Entropies for normal trace file:\n") print("Send SYN:",entropy(p_send_SYN)/math.log(51)) print("Send ENC:",entropy(p_send_ENC)/math.log(51)) for i in range (1,4): fname = "trace"+str(i)+".tr" f1 = open(fname, "r") print("TEST trace: ",fname) t_send_SYN, t_recv_SYN, t_send_ENC, t_recv_ENC = probabilities(f1) print("KL divergence of send SYN:", entropy(p_send_SYN,t_send_SYN, base=10)) print("KL divergence of send ENC:", entropy(p_send_ENC, t_send_ENC, base=10))
def _get_bugs_scores(self):
    bugs = self.get_bugs()
    comps_prob = dict(self.get_components_probabilities())
    bugs_prob = list(map(lambda x: comps_prob.get(x, 0), bugs))
    return np.mean(bugs_prob), np.std(bugs_prob), entropy(bugs_prob)
def l_diversity(df_train, group_grid_list):
    grid_dtr = np.zeros(len(group_grid_list))
    for i in range(len(group_grid_list)):
        grid_dtr[i] = df_train.loc[df_train['grid'] == group_grid_list[i]].shape[0]
    grid_dtr_norm = grid_dtr / norm(grid_dtr, ord=1)
    return entropy(grid_dtr_norm)
def calc_component_entropy(self):
    return entropy(list(map(lambda x: x[1], self.get_components_probabilities())))
def correction_experiment(dataset_name=None, tweak_train=None, p_P=None, tweak_test=None, p_Q=None, num_train_samples=None, num_val_samples=None, num_test_samples=None, num_hidden=None, epochs=None, batch_size=None): # set the context for compute ctx = mx.gpu() # set the context for data data_ctx = mx.gpu() # load the dataset X, y, Xtest, ytest = load_data(dataset_name) n = X.shape[0] dfeat = np.prod(X.shape[1:]) # NOTE FOR IMPROVEMENT: eventually this should be returned by the data library num_labels = 10 ################################################ # Random permutation of the data ################################################ rand_idx = np.random.permutation(n) X = X[rand_idx,...] y = y[rand_idx] ################################################ # First split examples between train and validation ################################################ num = 2 Xtrain_source = X[:(n//num),:,:,:] ytrain_source = y[:(n//num)] Xval_source = X[(n//num):(2*n//num),:,:,:] yval_source = y[(n//num):(2*n//num):] ################################################ # Set the label distribution at train time ################################################ if tweak_train: # print("Sampling training and validation data from p_P") # print("Current p_P: ", p_P) Xtrain, ytrain = tweak_dist(Xtrain_source, ytrain_source, num_labels, num_train_samples, p_P) Xval, yval = tweak_dist(Xval_source, yval_source, num_labels, num_val_samples, p_P) else: Xtrain, ytrain = Xtrain_source, ytrain_source Xval, yval = Xval_source, yval_source ################################################ # Set the label distribution for test data ################################################ if tweak_test: # print("Sampling test data from p_Q") # print("Current p_Q: ", p_Q) Xtest, ytest = tweak_dist(Xtest, ytest, num_labels, num_test_samples, p_Q) #################################### # Train on p_P #################################### net = gluon.nn.HybridSequential() with net.name_scope(): net.add(gluon.nn.Dense(num_hidden, activation="relu")) net.add(gluon.nn.Dense(num_hidden, activation="relu")) net.add(gluon.nn.Dense(num_labels)) net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1}) net.hybridize() # Training weighted_train(net, softmax_cross_entropy, trainer, Xtrain, ytrain, Xval, yval, ctx, dfeat, epoch=epochs, weightfunc=None, data_ctx=data_ctx) # Prediction ypred_s, ypred_s_soft = predict_all(Xval, net, ctx, dfeat) ypred_t, ypred_t_soft = predict_all(Xtest, net, ctx, dfeat) # Converting to numpy array for later convenience ypred_s= ypred_s.asnumpy() ypred_s_soft = ypred_s_soft.asnumpy() ypred_t = ypred_t.asnumpy() ypred_t_soft = ypred_t_soft.asnumpy() #################################### # Estimate Wt and Py #################################### wt = estimate_labelshift_ratio(yval, ypred_s, ypred_t,num_labels) Py_est = estimate_target_dist(wt, yval,num_labels) Py_true = calculate_marginal(ytest,num_labels) Py_base = calculate_marginal(yval,num_labels) wt_true = Py_true/Py_base print(np.concatenate((wt,wt_true),axis=1)) print(np.concatenate((Py_est,Py_true),axis=1)) # print("||wt - wt_true||^2 = " + repr(np.sum((wt-wt_true)**2)/np.linalg.norm(wt_true)**2)) # print("KL(Py_est|| Py_true) = " + repr(stats.entropy(Py_est,Py_base))) #################################### # Solve weighted ERM and compare to previously trained models #################################### data_test = 
mx.io.NDArrayIter(Xtest, ytest, batch_size, shuffle=False) acc_unweighted = evaluate_accuracy(data_test, net, ctx, dfeat) # in fact, drawing confusion matrix maybe more informative print("Accuracy unweighted", acc_unweighted) training_weights=np.maximum(wt, 0) wt_ndarray = nd.array(training_weights,ctx=ctx) weightfunc = lambda x,y: wt_ndarray[y.asnumpy().astype(int)] # Train a model using the following! net2 = gluon.nn.HybridSequential() with net2.name_scope(): net2.add(gluon.nn.Dense(num_hidden, activation="relu")) net2.add(gluon.nn.Dense(num_hidden, activation="relu")) net2.add(gluon.nn.Dense(num_labels)) net2.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) trainer2 = gluon.Trainer(net2.collect_params(), 'sgd', {'learning_rate': .1}) net2.hybridize() # NOTE WE ASSUME SAME NUMBER OF EPOCHS IN PERIOD 1 and PERIOD 2 # Training weighted_train(net2, softmax_cross_entropy, trainer2, Xtrain, ytrain, Xval, yval, ctx, dfeat, epoch=epochs, weightfunc=weightfunc, data_ctx=data_ctx) data_test.reset() acc_weighted = evaluate_accuracy(data_test, net2, ctx, dfeat) print("Accuracy weighted", acc_weighted) return {"acc_unweighted": acc_unweighted, "acc_weighted": acc_weighted, "wt": wt, "wt_true": wt_true, "wt_l2": np.sum((wt-wt_true)**2)/np.linalg.norm(wt_true)**2, "kl_div": stats.entropy(Py_est,Py_base), "ypred_s": ypred_s, "ypred_s_soft": ypred_s_soft, "ypred_t:": ypred_t, "ypred_t_soft": ypred_t_soft, }
def calc_entropy(self):
    return entropy(list(map(lambda diag: diag.probability, self.diagnoses)))
def evaluate(model, z_dim, N=1000, cuda=True, batch_size=32, resize=True, splits=1): """ adapted from: https://github.com/sbarratt/inception-score-pytorch/blob/master/inception_score.py Computes the inception score of images generated by model model -- Pretrained Generator N -- Number of samples to test cuda -- whether or not to run on GPU batch_size -- batch size for feeding into Inception v3 splits -- number of splits """ assert batch_size > 0 assert N > batch_size # Set up dtype if cuda: dtype = torch.cuda.FloatTensor else: if torch.cuda.is_available(): print("WARNING: You have a CUDA device, so you should probably set cuda=True") dtype = torch.FloatTensor # Load inception model inception_model = inception_v3(pretrained=True, transform_input=False).type(dtype) inception_model.eval() up = nn.Upsample(size=(299, 299), mode='bilinear').type(dtype) def get_pred(N_s): z_ = torch.randn(N_s, z_dim).view(-1, z_dim, 1, 1) if cuda: z_ = z_.cuda() z_ = Variable(z_) x = model.forward(z_) if resize: x = up(x) x = inception_model(x) return F.softmax(x, dim=1).data.cpu().numpy() indexes = strided_app(np.arange(N), batch_size, batch_size) N = indexes[-1][-1] + 1 # Get predictions preds = np.zeros((N, 1000)) for i, idx in enumerate(indexes, 0): batch_size_i = idx.shape[0] preds[i * batch_size:i * batch_size + batch_size_i] = get_pred(batch_size_i) # Now compute the mean kl-div split_scores = [] for k in range(splits): part = preds[k * (N // splits): (k + 1) * (N // splits), :] py = np.mean(part, axis=0) scores = [] for i in range(part.shape[0]): pyx = part[i, :] scores.append(entropy(pyx, py)) split_scores.append(np.exp(np.mean(scores))) return np.mean(split_scores) #, np.std(split_scores)
def KL(p, q):
    KLD = entropy(p, q)
    return KLD
def KL_Classify(freq_dists):
    # A vs A
    AvsA_matrix = []
    for i in range(0, len(freq_dists[0])):
        AxVsAy = []
        for j in range(0, len(freq_dists[0])):
            d = entropy(freq_dists[0][i], freq_dists[0][j])
            AxVsAy.append(d)
        AvsA_matrix.append(AxVsAy)

    # A vs B
    AvsB_matrix = []
    for i in range(0, len(freq_dists[0])):
        AxVsBy = []
        for j in range(0, len(freq_dists[1])):
            d = entropy(freq_dists[0][i], freq_dists[1][j])
            AxVsBy.append(d)
        AvsB_matrix.append(AxVsBy)

    # B vs B
    BvsB_matrix = []
    for i in range(0, len(freq_dists[1])):
        BxVsBy = []
        for j in range(0, len(freq_dists[1])):
            d = entropy(freq_dists[1][i], freq_dists[1][j])
            BxVsBy.append(d)
        BvsB_matrix.append(BxVsBy)

    # B vs A
    BvsA_matrix = []
    for i in range(0, len(freq_dists[1])):
        BxVsAy = []
        for j in range(0, len(freq_dists[0])):
            d = entropy(freq_dists[1][i], freq_dists[0][j])
            BxVsAy.append(d)
        BvsA_matrix.append(BxVsAy)

    ##########################
    # Compute success metric
    # Set A - YouTube
    # Set B - CovertCast
    # TP = Correctly identify CovertCast
    # TN = Correctly identify YouTube
    ##########################
    total_KL_distances = 0
    success = 0
    TrueNegatives = 0
    TruePositives = 0

    # A - B
    for i in range(0, len(freq_dists[0])):
        for j in range(0, len(AvsA_matrix[i])):
            for k in range(0, len(AvsB_matrix[i])):
                total_KL_distances += 1
                if (AvsA_matrix[i][j] < AvsB_matrix[i][k]):
                    success += 1
                    TrueNegatives += 1

    # B - A
    for i in range(0, len(freq_dists[1])):
        for j in range(0, len(BvsB_matrix[i])):
            for k in range(0, len(BvsA_matrix[i])):
                total_KL_distances += 1
                if (BvsB_matrix[i][j] < BvsA_matrix[i][k]):
                    success += 1
                    TruePositives += 1

    print("Total Accuracy: " + str(success / float(total_KL_distances)))
    print("TruePositives: " + str(TruePositives / float(total_KL_distances / 2.0)))
    print("TrueNegatives: " + str(TrueNegatives / float(total_KL_distances / 2.0)))
def entropy(self, arr):
    '''Calculates the entropy of the normalized degree distribution.'''
    return entropy(arr)
def map_ref_sites(routed: xr.Dataset, gauge_reference: xr.Dataset, gauge_sites=None, route_var = 'IRFroutedRunoff', fill_method='kldiv'): """ Assigns segs within routed boolean 'is_gauge' "identifiers" and what each seg's upstream and downstream reference seg designations are. Parameters ---------- routed: xr.Dataset Contains the input flow timeseries data. gauge_reference: xr.Dataset Contains reference flow timeseries data for the same watershed as the routed dataset. gauge_sites: list, optional If None, gauge_sites will be taken as all those listed in gauge_reference. route_var: str Variable name of flows used for fill_method purposes within routed. This is defaulted as 'IRFroutedRunoff'. fill_method: str While finding some upstream/downstream reference segs may be simple, (segs with 'is_gauge' = True are their own reference segs, others may be easy to find looking directly up or downstream), some river networks may have multiple options to select gauge sites and may fail to have upstream/downstream reference segs designated. 'fill_method' specifies how segs should be assigned upstream/downstream reference segs for bias correction if they are missed walking upstream or downstream. Currently supported methods: 'leave_null' nothing is done to fill missing reference segs, np.nan values are replaced with a -1 seg designation and that's it 'forward_fill' xarray's ffill method is used to fill in any np.nan values 'r2' reference segs are selected based on which reference site that seg's flows has the greatest r2 value with 'kldiv' reference segs are selected based on which reference site that seg's flows has the smallest KL Divergence value with 'kge' reference segs are selected based on which reference site that seg's flows has the greatest KGE value with Returns ------- routed: xr.Dataset Routed timeseries with reference gauge site river segments assigned to each river segement in the original routed. 
""" if isinstance(gauge_sites, type(None)): gauge_sites = gauge_reference['site'].values else: # need to typecheck since we do a for loop later and don't # want to end up iterating through a string by accident assert isinstance(gauge_sites, list) gauge_segs = gauge_reference.sel(site=gauge_sites)['seg'].values routed['is_gauge'] = False * routed['seg'] routed['down_ref_seg'] = np.nan * routed['seg'] routed['up_ref_seg'] = np.nan * routed['seg'] routed['up_seg'] = 0 * routed['is_headwaters'] routed['up_seg'].values = [find_up(routed, s) for s in routed['seg'].values] for s in routed['seg']: if s in list(gauge_segs): routed['is_gauge'].loc[{'seg':s}] = True routed['down_ref_seg'].loc[{'seg': s}] = s routed['up_ref_seg'].loc[{'seg': s}] = s for seg in routed['seg']: cur_seg = seg.values[()] while cur_seg in routed['seg'].values and np.isnan(routed['down_ref_seg'].sel(seg=cur_seg)): cur_seg = routed['down_seg'].sel(seg=cur_seg).values[()] if cur_seg in routed['seg'].values: routed['down_ref_seg'].loc[{'seg':seg}] = routed['down_ref_seg'].sel(seg=cur_seg).values[()] for seg in routed['seg']: cur_seg = seg.values[()] while cur_seg in routed['seg'].values and np.isnan(routed['up_ref_seg'].sel(seg=cur_seg)): cur_seg = routed['up_seg'].sel(seg=cur_seg).values[()] if cur_seg in routed['seg'].values: routed['up_ref_seg'].loc[{'seg':seg}] = routed['up_ref_seg'].sel(seg=cur_seg).values[()] # Fill in any remaining nulls (head/tailwaters) if fill_method == 'leave_null': # since there should be no -1 segs from mizuroute, we can set nan's to -1 to acknowledge # that they have been addressed and still set them apart from the rest of the data routed['up_ref_seg'] = (routed['up_ref_seg'].where(~np.isnan(routed['up_ref_seg']), other=-1)) routed['down_ref_seg'] = (routed['down_ref_seg'].where(~np.isnan(routed['down_ref_seg']), other=-1)) elif fill_method == 'forward_fill': routed['up_ref_seg'] = (routed['up_ref_seg'].where( ~np.isnan(routed['up_ref_seg']), other=routed['down_ref_seg'])).ffill('seg') routed['down_ref_seg'] = (routed['down_ref_seg'].where( ~np.isnan(routed['down_ref_seg']), other=routed['up_ref_seg'])).ffill('seg') elif fill_method == 'r2': fill_up_isegs = np.where(np.isnan(routed['up_ref_seg'].values))[0] fill_down_isegs = np.where(np.isnan(routed['down_ref_seg'].values))[0] routed['r2_up_gauge'] = 0 * routed['is_gauge'] routed['r2_down_gauge'] = 0 * routed['is_gauge'] for curr_seg in routed['seg'].values: up_ref_seg = np.nan curr_seg_flow = routed[route_var].sel(seg=curr_seg).values if np.isnan(routed['up_ref_seg'].sel(seg=curr_seg).values): up_ref_r2, up_ref_seg = find_max_r2(routed[route_var].sel(seg=gauge_segs), curr_seg_flow) routed['r2_up_gauge'].loc[{'seg':curr_seg}] = up_ref_r2 routed['up_ref_seg'].loc[{'seg':curr_seg}] = up_ref_seg else: # this seg has already been filled in, but r2 still needs to be calculated ref_flow = routed[route_var].sel(seg=routed['up_ref_seg'].sel(seg=curr_seg)).values up_ref_r2 = np.corrcoef(curr_seg_flow, ref_flow)[0, 1]**2 routed['r2_up_gauge'].loc[{'seg':curr_seg}] = up_ref_r2 for curr_seg in routed['seg'].values: down_ref_seg = np.nan curr_seg_flow = routed[route_var].sel(seg=curr_seg).values if np.isnan(routed['down_ref_seg'].sel(seg=curr_seg).values): down_ref_r2, down_ref_seg = find_max_r2(routed[route_var].sel(seg=gauge_segs), curr_seg_flow) routed['r2_down_gauge'].loc[{'seg':curr_seg}] = down_ref_r2 routed['down_ref_seg'].loc[{'seg':curr_seg}] = down_ref_seg else: # this seg has already been filled in, but r2 still needs to be calculated ref_flow = 
routed[route_var].sel(seg=routed['down_ref_seg'].sel(seg=curr_seg)).values down_ref_r2 = np.corrcoef(curr_seg_flow, ref_flow)[0, 1]**2 routed['r2_down_gauge'].loc[{'seg':curr_seg}] = down_ref_r2 elif fill_method == 'kldiv': fill_up_isegs = np.where(np.isnan(routed['up_ref_seg'].values))[0] fill_down_isegs = np.where(np.isnan(routed['down_ref_seg'].values))[0] routed['kldiv_up_gauge'] = 0 * routed['is_gauge'] routed['kldiv_down_gauge'] = 0 * routed['is_gauge'] for curr_seg in routed['seg'].values: curr_seg_flow = routed[route_var].sel(seg=curr_seg).values if np.isnan(routed['up_ref_seg'].sel(seg=curr_seg).values): up_ref_kldiv, up_ref_seg = find_min_kldiv(routed[route_var].sel(seg=gauge_segs), curr_seg_flow) routed['kldiv_up_gauge'].loc[{'seg':curr_seg}] = up_ref_kldiv routed['up_ref_seg'].loc[{'seg':curr_seg}] = up_ref_seg else: # this seg has already been filled in, but kldiv still needs to be calculated # kldiv computation could probably be gutted in the furture ... TINY_VAL = 1e-6 total_bins = int(np.sqrt(len(curr_seg_flow))) curr_seg_flow_pdf, curr_seg_flow_edges = np.histogram( curr_seg_flow, bins=total_bins, density=True) curr_seg_flow_pdf[curr_seg_flow_pdf == 0] = TINY_VAL ref_flow = routed[route_var].sel(seg=routed['up_ref_seg'].sel(seg=curr_seg).values).values ref_flow_pdf = np.histogram(ref_flow, bins=curr_seg_flow_edges, density=True)[0] ref_flow_pdf[ref_flow_pdf == 0] = TINY_VAL up_ref_kldiv = entropy(pk=ref_flow_pdf, qk=curr_seg_flow_pdf) routed['kldiv_up_gauge'].loc[{'seg':curr_seg}] = up_ref_kldiv for curr_seg in routed['seg'].values: curr_seg_flow = routed[route_var].sel(seg=curr_seg).values if np.isnan(routed['down_ref_seg'].sel(seg=curr_seg).values): down_ref_kldiv, down_ref_seg = find_min_kldiv(routed[route_var].sel(seg=gauge_segs), curr_seg_flow) routed['kldiv_down_gauge'].loc[{'seg':curr_seg}] = down_ref_kldiv routed['down_ref_seg'].loc[{'seg':curr_seg}] = down_ref_seg else: # this seg has already been filled in, but kldiv still needs to be calculated # kldiv computation could probably be gutted in the furture ... 
TINY_VAL = 1e-6 total_bins = int(np.sqrt(len(curr_seg_flow))) curr_seg_flow_pdf, curr_seg_flow_edges = np.histogram( curr_seg_flow, bins=total_bins, density=True) curr_seg_flow_pdf[curr_seg_flow_pdf == 0] = TINY_VAL ref_flow = routed[route_var].sel(seg=routed['down_ref_seg'].sel(seg=curr_seg).values).values ref_flow_pdf = np.histogram(ref_flow, bins=curr_seg_flow_edges, density=True)[0] ref_flow_pdf[ref_flow_pdf == 0] = TINY_VAL down_ref_kldiv = entropy(pk=ref_flow_pdf, qk=curr_seg_flow_pdf) routed['kldiv_down_gauge'].loc[{'seg':curr_seg}] = down_ref_kldiv elif fill_method == 'kge': fill_up_isegs = np.where(np.isnan(routed['up_ref_seg'].values))[0] fill_down_isegs = np.where(np.isnan(routed['down_ref_seg'].values))[0] routed['kge_up_gauge'] = 0 * routed['is_gauge'] routed['kge_down_gauge'] = 0 * routed['is_gauge'] for curr_seg in routed['seg'].values: up_ref_seg = np.nan curr_seg_flow = routed[route_var].sel(seg=curr_seg).values if np.isnan(routed['up_ref_seg'].sel(seg=curr_seg).values): up_ref_kge, up_ref_seg = find_max_kge(routed[route_var].sel(seg=gauge_segs), curr_seg_flow) routed['kge_up_gauge'].loc[{'seg':curr_seg}] = up_ref_kge routed['up_ref_seg'].loc[{'seg':curr_seg}] = up_ref_seg else: # this seg has already been filled in, but kge still needs to be calculated ref_flow = routed[route_var].sel(seg=routed['up_ref_seg'].sel(seg=curr_seg)).values up_ref_kge = kling_gupta_efficiency(curr_seg_flow, ref_flow) routed['kge_up_gauge'].loc[{'seg':curr_seg}] = up_ref_kge for curr_seg in routed['seg'].values: down_ref_seg = np.nan curr_seg_flow = routed[route_var].sel(seg=curr_seg).values if np.isnan(routed['down_ref_seg'].sel(seg=curr_seg).values): down_ref_kge, down_ref_seg = find_max_kge(routed[route_var].sel(seg=gauge_segs), curr_seg_flow) routed['kge_down_gauge'].loc[{'seg':curr_seg}] = down_ref_kge routed['down_ref_seg'].loc[{'seg':curr_seg}] = down_ref_seg else: # this seg has already been filled in, but kge still needs to be calculated ref_flow = routed[route_var].sel(seg=routed['down_ref_seg'].sel(seg=curr_seg)).values down_ref_kge = kling_gupta_efficiency(curr_seg_flow, ref_flow) routed['kge_down_gauge'].loc[{'seg':curr_seg}] = down_ref_kge else: raise ValueError('Invalid method provided for "fill_method"') return routed
def empirical_entropy_finite_support(dist, decimals=2):
    # Histogram of counts across the bins
    _, counts = np.unique(np.around(dist, decimals=decimals), return_counts=True, axis=0)
    return entropy(counts)  # SciPy will normalise
# z_dimension = 301 # include z=0 campaign = sys.argv[1].split("/")[-1] win = b_train_origin > z_train_origin win_rate = win.sum() / record_size print("winning rate {0:.2f}%".format(win_rate * 100)) zs = list(range(z_dimension)) # calculate truth_pdf truth_pdf = [] (unique_z, counts_z) = np.unique(z_test_origin, return_counts=True) # the unique has been sorted unique_z = unique_z.tolist() for i in range(z_dimension): count = counts_z[unique_z.index( i)] if i in unique_z else 0 # in case of dividing 0 truth_pdf.append(count / test_size) # KMDT print("==========start to train KMDT==========") kmdt = KMDT(IFROOT=sys.argv[2], result_root=sys.argv[1], OFROOT=OFROOT, max_market_price=z_dimension) kmdt.train() mse_kmdt, anlp_kmdt, pdf_kmdt = kmdt.evaluate(x_test, z_test_origin) kl_pdf_kmdt = entropy(truth_pdf, pdf_kmdt) wd_pdf_kmdt = wasserstein_distance(truth_pdf, pdf_kmdt)
def Entropy(labels, base=2):
    probs = pd.Series(labels).value_counts() / len(labels)
    en = stats.entropy(probs, base=base)
    return en
def train(self): dataset = self.getTrainData() # priceSet = [int(data[self.PAY_PRICE_INDEX]) for data in dataset] iStack = [] iStack.append(1) dataStack = [] dataStack.append(dataset.copy()) while len(iStack) != 0: nodeIndex = iStack.pop() dataset = dataStack.pop() print("nodeIndex = " + str(nodeIndex)) if 2 * nodeIndex >= 2**self.TREE_DEPTH: self.nodeData[nodeIndex] = dataset.copy() continue maxKLD = -1.0 bestFeat = 0 count = 0 # detect if there's no feature to split for featIndex in range(len(dataset[0])): if featIndex not in self.FEATURE_LIST: continue s, winbids, losebids = self.dataset2s(dataset, featIndex) if len(s.keys()) <= 1: continue count += 1 tmpS1, tmpS2, winbids1, winbids2, losebids1, losebids2 = self.kmeans( s, winbids, losebids) q1 = self.calProbDistribution(winbids1, losebids1) q2 = self.calProbDistribution(winbids2, losebids2) KLD = entropy(q1, q2) if count == 1: maxKLD = KLD bestFeat = featIndex s1 = tmpS1.copy() s2 = tmpS2.copy() if maxKLD < KLD and len(tmpS1) != 0 and len(tmpS2) != 0: maxKLD = KLD bestFeat = featIndex s1 = tmpS1.copy() s2 = tmpS2.copy() if count == 0 or len(s1.keys()) == 0 or len( s2.keys()) == 0: # no feature can split self.nodeData[nodeIndex] = dataset.copy() continue dataset1 = self.s2dataset(s1, dataset, bestFeat) dataset2 = self.s2dataset(s2, dataset, bestFeat) if len(dataset1) < self.LEAF_SIZE or len( dataset2) < self.LEAF_SIZE: self.nodeData[nodeIndex] = dataset.copy() continue self.nodeInfos[nodeIndex] = NodeInfo(nodeIndex, bestFeat, maxKLD, list(s1.keys()).copy(), list(s2.keys()).copy()) if len(dataset2) > 2 * self.LEAF_SIZE: iStack.append(2 * nodeIndex + 1) dataStack.append(dataset2.copy()) else: self.nodeData[2 * nodeIndex + 1] = dataset2.copy() if len(dataset1) > 2 * self.LEAF_SIZE: iStack.append(2 * nodeIndex) dataStack.append(dataset1.copy()) else: self.nodeData[2 * nodeIndex] = dataset1.copy() return