def conditional_mutual_information2(data_x, data_y, data_z, bw_method=None):
    """
    Conditional mutual information estimator.

    :param data_x: first distribution data
    :param data_y: conditioned variables data
    :param data_z: second distribution data
    :param bw_method: parameter of gaussian_kde of scipy
    :return: estimated conditional mutual information
    """
    n_x, n_y, n_z = data_x.shape[1], data_y.shape[1], data_z.shape[1]
    assert n_x == 1
    if n_y == 0:  # no conditioning variables: fall back to plain mutual information
        return mutual_information(data_x, data_z)
    y_distr = gaussian_kde(np.transpose(data_y), bw_method=bw_method)
    xy_distr = gaussian_kde(np.transpose(np.hstack([data_x, data_y])), bw_method=bw_method)
    xyz_distr = gaussian_kde(np.transpose(np.hstack([data_x, data_y, data_z])), bw_method=bw_method)
    yz_distr = gaussian_kde(np.transpose(np.hstack([data_y, data_z])), bw_method=bw_method)

    def f(s):
        y_part = s[n_x:n_x + n_y]
        xy_part = s[:n_x + n_y]
        yz_part = s[n_x:n_x + n_y + n_z]
        return (log(xyz_distr(s)) + log(y_distr(y_part))
                - log(xy_distr(xy_part)) - log(yz_distr(yz_part)))

    return monte_carlo_integration(lambda size=1: np.transpose(xyz_distr.resample(size=size)), f)
def hl_distances_from_set(A_list, B, points=65, margin_factor=0.25, bw=None):
    """
    Calculates Hellinger distances of A_list sets from the set B using the
    continuous formula.
    """
    if bw is None:
        bw = B.shape[0]**(-1.0/5) * 0.5
    yBs = []
    xs = []
    for j in range(B.shape[1]):
        minx, maxx = B[:, j].min(), B[:, j].max()
        margin = (maxx - minx) * margin_factor
        minx -= margin
        maxx += margin
        xs.append(np.linspace(minx, maxx, points))
        try:
            yBs.append(gaussian_kde(B[:, j], bw_method=bw)(xs[-1]))
        except (np.linalg.LinAlgError, ValueError):
            print("Singular matrix -- unable to perform gaussian KDE.")
            yBs.append(np.zeros(xs[-1].shape))
    for A in A_list:
        if A.shape[0] < 2:
            yield 1.0
        else:
            integral = 1
            for j, yB, x in zip(range(len(yBs)), yBs, xs):
                try:
                    y = (np.sqrt(gaussian_kde(A[:, j], bw_method=bw)(x)) - np.sqrt(yB))**2
                    integral *= (1 - 0.5 * simps(y, dx=(x[1] - x[0])))
                except np.linalg.LinAlgError:
                    integral = 0.0
            yield 1 - integral
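# --- Usage sketch (not from the original source): Hellinger distances of two
# hypothetical sample sets from a reference set B, using the generator above.
# Array shapes are assumed to be (n_samples, n_features).
import numpy as np

rng = np.random.default_rng(0)
B = rng.normal(0.0, 1.0, size=(200, 2))    # reference set
A1 = rng.normal(0.0, 1.0, size=(100, 2))   # same distribution -> distance near 0
A2 = rng.normal(3.0, 1.0, size=(100, 2))   # shifted distribution -> distance near 1
for d in hl_distances_from_set([A1, A2], B):
    print(d)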
def distFunc(ys, xs):
    """
    Calculate the distance between two empirical distributions.

    Input: samples ys generated by the model and observed data xs.
    Output: KL divergence between the generated data and the data xs.
    """
    if np.sum(ys) == 0:
        return np.inf
    if xs.ndim == 1:
        kernely = stats.gaussian_kde(ys)
        kernelx = stats.gaussian_kde(xs)
        xx = np.linspace(np.min(xs), np.max(xs))  # range over the data
        return stats.entropy(kernelx(xx), qk=kernely(xx))  # KL divergence
    else:
        # dimensions are (npoints, nparams) to stay consistent with scikit-learn
        kernely = stats.gaussian_kde(ys.T)
        kernelx = stats.gaussian_kde(xs.T)
        # grid over the n-dimensional data (npoints, nparams)
        mesh = [np.linspace(np.min(xs[:, i]), np.max(xs[:, i])) for i in range(xs.shape[1])]
        xx = np.meshgrid(*mesh)
        xx = np.array([x.ravel() for x in xx]).T
        return stats.entropy(kernelx(xx.T), qk=kernely(xx.T))  # KL divergence
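# --- Hypothetical smoke test for distFunc (not from the original source):
# the KL divergence should be near zero for samples from the same
# distribution and grow as the distributions separate.
import numpy as np

rng = np.random.default_rng(1)
xs_obs = rng.normal(0.0, 1.0, 500)
print(distFunc(rng.normal(0.0, 1.0, 500), xs_obs))  # small
print(distFunc(rng.normal(2.0, 1.0, 500), xs_obs))  # larger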
def save_report(report_path, prefix, decoys, targets, top_decoys, top_targets,
                cutoffs, svalues, qvalues, pvalues, lambda_):
    if plt is None:
        raise ImportError("you need matplotlib package to create a report")

    plt.figure(figsize=(10, 20))
    plt.subplots_adjust(hspace=0.5)

    plt.subplot(511)
    plt.title(prefix + "\n\nROC")
    plt.xlabel("False Positive Rate (qvalue)")
    plt.ylabel("True Positive Rate (svalue)")
    plt.scatter(qvalues, svalues, s=3)
    plt.plot(qvalues, svalues)

    plt.subplot(512)
    plt.title("d_score Performance")
    plt.xlabel("dscore cutoff")
    plt.ylabel("rates")
    plt.scatter(cutoffs, svalues, color="g", s=3)
    plt.plot(cutoffs, svalues, color="g", label="TPR (svalue)")
    plt.scatter(cutoffs, qvalues, color="r", s=3)
    plt.plot(cutoffs, qvalues, color="r", label="FPR (qvalue)")

    plt.subplot(513)
    plt.title("Top Peak Groups' d_score Distributions")
    plt.xlabel("d_score")
    plt.ylabel("# of groups")
    plt.hist([top_targets, top_decoys], 20, color=["w", "r"],
             label=["target", "decoy"], histtype="bar")
    plt.legend(loc=2)

    plt.subplot(514)
    tdensity = gaussian_kde(top_targets)
    tdensity.covariance_factor = lambda: 0.25
    tdensity._compute_covariance()
    ddensity = gaussian_kde(top_decoys)
    ddensity.covariance_factor = lambda: 0.25
    ddensity._compute_covariance()
    xs = linspace(min(concatenate((top_targets, top_decoys))),
                  max(concatenate((top_targets, top_decoys))), 200)
    plt.title("Top Peak Groups' d_score Density")
    plt.xlabel("d_score")
    plt.ylabel("density")
    plt.plot(xs, tdensity(xs), color="g", label="target")
    plt.plot(xs, ddensity(xs), color="r", label="decoy")
    plt.legend(loc=2)

    plt.subplot(515)
    if pvalues is not None:
        counts, __, __ = plt.hist(pvalues, bins=40)
        y_max = max(counts)
        plt.plot([lambda_, lambda_], [0, y_max], "r")
        plt.title("histogram pvalues")

    plt.savefig(report_path)
    return cutoffs, svalues, qvalues, top_targets, top_decoys
def conditional_mutual_information(data, x, y, z, bw_method=None):
    """
    Conditional mutual information estimator.

    :param x: variables of the first distribution (list)
    :param y: conditioned variables (list)
    :param z: variables of the second distribution (list)
    :param bw_method: parameter of gaussian_kde of scipy
    :return: estimated conditional mutual information
    """
    n_x, n_y, n_z = len(x), len(y), len(z)
    assert n_x == 1
    if len(y) == 0:
        data_x = data[:, x]
        data_z = data[:, z]
        return mutual_information(data_x, data_z)
    y_distr = gaussian_kde(np.transpose(data[:, y]), bw_method=bw_method)
    xy_distr = gaussian_kde(np.transpose(data[:, x + y]), bw_method=bw_method)
    xyz_distr = gaussian_kde(np.transpose(data[:, x + y + z]), bw_method=bw_method)
    yz_distr = gaussian_kde(np.transpose(data[:, y + z]), bw_method=bw_method)

    def f(s):
        y_part = s[n_x:n_x + n_y]
        xy_part = s[:n_x + n_y]
        yz_part = s[n_x:n_x + n_y + n_z]
        return (log(xyz_distr(s)) + log(y_distr(y_part))
                - log(xy_distr(xy_part)) - log(yz_distr(yz_part)))

    return monte_carlo_integration(lambda size=1: np.transpose(xyz_distr.resample(size=size)), f)
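# --- Hedged usage sketch (assumes `mutual_information` and
# `monte_carlo_integration` from this module are in scope): X and Z depend on
# each other only through Y, so I(X; Z | Y) should be close to zero.
import numpy as np

rng = np.random.default_rng(2)
y_col = rng.normal(size=(1000, 1))
x_col = y_col + 0.1 * rng.normal(size=(1000, 1))
z_col = y_col + 0.1 * rng.normal(size=(1000, 1))
data = np.hstack([x_col, y_col, z_col])
print(conditional_mutual_information(data, [0], [1], [2]))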
def test_plot():
    import math
    from numpy.random import normal
    from scipy import stats
    global data

    def f(x):
        return 2*x + 1

    mean = 2
    var = 3
    std = math.sqrt(var)
    data = normal(loc=mean, scale=std, size=50000)
    d2 = f(data)
    n = stats.norm(mean, std)
    kde1 = stats.gaussian_kde(data, bw_method='silverman')
    kde2 = stats.gaussian_kde(d2, bw_method='silverman')
    xs = np.linspace(-10, 10, num=200)
    # plt.plot(data)
    plt.plot(xs, kde1(xs))
    plt.plot(xs, kde2(xs))
    plt.plot(xs, n.pdf(xs), color='k')
    num_bins = 100
    h = np.histogram(data, num_bins, density=True)
    plt.plot(h[1][1:], h[0], lw=4)
    h = np.histogram(d2, num_bins, density=True)
    plt.plot(h[1][1:], h[0], lw=4)
def kde_opt4(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"] + df["hour"] / 24.
        df_new["accuracy"] = df["accuracy"].apply(lambda x: np.log10(x))
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        return df_new

    logging.info("train kde_opt4 model")
    df_cell_train_feats_kde = prepare_feats(df_cell_train_feats)
    df_cell_test_feats_kde = prepare_feats(df_cell_test_feats)
    n_class = len(np.unique(y_train))
    y_test_pred = np.zeros((len(df_cell_test_feats_kde), n_class), "d")
    for i in range(n_class):
        X = df_cell_train_feats_kde[y_train == i]
        y_test_pred_i = np.ones(len(df_cell_test_feats_kde), "d")
        for feat in df_cell_train_feats_kde.columns.values:
            X_feat = X[feat].values
            BGK10_output = kdeBGK10(X_feat)
            if BGK10_output is None:
                kde = gaussian_kde(X_feat, "scott")
                kde = gaussian_kde(X_feat, kde.factor * 0.741379)
                y_test_pred_i *= kde.evaluate(df_cell_test_feats_kde[feat].values)
            else:
                bandwidth, mesh, density = BGK10_output
                kde = KernelDensity(kernel='gaussian', metric='manhattan',
                                    bandwidth=bandwidth)
                kde.fit(X_feat[:, np.newaxis])
                y_test_pred_i *= np.exp(kde.score_samples(
                    df_cell_test_feats_kde[feat].values[:, np.newaxis]))
        y_test_pred[:, i] += y_test_pred_i
    return y_test_pred
def generate_animation(train_filename, test_filename, encrypted_key, sequence):
    pairs = [chr1 + chr2 for chr1 in ALPHABET for chr2 in ALPHABET]
    freq1 = count_bigram_frequency(read_and_simply_text(train_filename))
    freq2 = count_bigram_frequency(
        encrypt_by_key_substitution(read_and_simply_text(test_filename), encrypted_key))
    # desired key
    dct = dict(zip(pairs, [chr1 + chr2 for chr1 in encrypted_key for chr2 in encrypted_key]))
    data = [math.log(freq1[pair] * freq2[dct[pair]]) for pair in pairs]
    density = gaussian_kde(data)
    xs = np.linspace(0, 8, 200)
    density.covariance_factor = lambda: .25
    density._compute_covariance()
    plt.ion()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    final = density(xs)
    line1, = ax.plot(xs, final, 'b')
    line2, = ax.plot(xs, final, 'r')
    for key in sequence:
        # inverting the key
        dct = dict(zip(key, ALPHABET))
        key = [dct[chr] for chr in ALPHABET]
        # finding similarity between keys
        dct = dict(zip(pairs, [chr1 + chr2 for chr1 in key for chr2 in key]))
        data1 = [math.log(freq1[pair] * freq2[dct[pair]]) for pair in pairs]
        density = gaussian_kde(data1)
        xs = np.linspace(0, 8, 200)
        density.covariance_factor = lambda: .25
        density._compute_covariance()
        line1.set_ydata(density(xs))
        fig.canvas.draw()
        time.sleep(0.5)
def plot_data_comb_2D(self, results_path, file_n, data, fit, timepoints):
    pp = PdfPages(results_path + '/' + file_n)
    cc = 0
    for tp in timepoints:
        xmin, xmax = -3, 3
        ymin, ymax = -3, 3
        xx, yy = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions = vstack([xx.ravel(), yy.ravel()])
        values = vstack([log10(1 + data[tp][:, 0]),
                         log10(1 + data[tp][:, 1])])
        kernel = st.gaussian_kde(values)
        f = reshape(kernel(positions).T, xx.shape)
        xxf, yyf = mgrid[xmin:xmax:100j, ymin:ymax:100j]
        positions_f = vstack([xxf.ravel(), yyf.ravel()])
        values_f = vstack([log10(1 + fit[tp][:, 0]),
                           log10(1 + fit[tp][:, 1])])
        kernel_f = st.gaussian_kde(values_f)
        ff = reshape(kernel_f(positions_f).T, xxf.shape)
        ax = plt.subplot(4, 5, cc + 1)
        ax.contourf(xx, yy, f, cmap='Blues')
        ax.contourf(xxf, yyf, ff, cmap='Reds')
        ax.set_xlim([-1, 3])
        ax.set_ylim([-1, 3])
        cc += 1
    pp.savefig()
    plt.close()
    pp.close()
def figures():
    data = np.genfromtxt('examscores.csv')
    sigma2 = 25.
    mu_0 = 80.
    sigma2_0 = 16.
    alpha = 3.
    beta = 50.
    mu_samples, sigma2_samples = gibbs(data, sigma2, mu_0, sigma2_0, alpha, beta)

    mukernel = gaussian_kde(mu_samples)
    x_min = min(mu_samples) - 1.
    x_max = max(mu_samples) + 1.
    x = np.arange(x_min, x_max, step=.1)
    plt.plot(x, mukernel(x))
    plt.savefig("mu_posterior.pdf")
    plt.clf()

    sigma2kernel = gaussian_kde(sigma2_samples)
    x_min = min(sigma2_samples) - 1.
    x_max = max(sigma2_samples) + 1.
    x = np.arange(x_min, x_max, step=1.)
    plt.plot(x, sigma2kernel(x))
    plt.savefig("sigma2_posterior.pdf")
    plt.clf()

    score_samples = np.array([norm.rvs(mu_sample, np.sqrt(sigma2_sample))
                              for mu_sample, sigma2_sample in zip(mu_samples, sigma2_samples)])
    score_kernel = gaussian_kde(score_samples)
    x_min = min(score_samples) - 1.
    x_max = max(score_samples) + 1.
    x = np.arange(x_min, x_max, step=.1)  # rebuild the grid for the new range
    plt.plot(x, score_kernel(x))
    plt.savefig("predictiveposterior.pdf")
def add_kernel_density_estimate(data, graph=None, filename=None):
    data = np.array(data)
    fig, axis_eruptions, axis_waiting = None, None, None
    if graph is None:
        fig, (axis_eruptions, axis_waiting) = pyplot.subplots(1, 2, sharex=False, sharey=False)
        _set_axis_properties(axis_eruptions, axis_waiting)
    else:
        fig, axis_eruptions, axis_waiting = graph
        axis_eruptions = axis_eruptions.twinx()
        axis_waiting = axis_waiting.twinx()
    fig.subplots_adjust(wspace=0.5)
    axis_eruptions.set_ylabel('Density')
    axis_waiting.set_ylabel('Density')
    density_eruptions = stats.gaussian_kde(data[:, 0])
    density_waiting = stats.gaussian_kde(data[:, 1])
    x_eruptions = np.arange(0., 6, 0.05)
    axis_eruptions.plot(x_eruptions, density_eruptions(x_eruptions), 'k-')
    x_waiting = np.arange(40., 100., 0.5)
    axis_waiting.plot(x_waiting, density_waiting(x_waiting), 'k-')
    if filename is not None:
        fig.savefig(filename)
def _make_kde(self, conf=0.95):
    self.durkde = gaussian_kde(self.durs)
    self.depthkde = gaussian_kde(self.deps)
    self.slopekde = gaussian_kde(self.slopes)
    self.logdepthkde = gaussian_kde(self.logdeps)
    if self.fit_converged:
        try:
            durconf = kdeconf(self.durkde, conf)
            depconf = kdeconf(self.depthkde, conf)
            logdepconf = kdeconf(self.logdepthkde, conf)
            slopeconf = kdeconf(self.slopekde, conf)
        except Exception:
            # (a bare `raise` here previously made this error unreachable)
            raise MCMCError("Error generating confidence intervals...fit must not have worked.")
        durmed = np.median(self.durs)
        depmed = np.median(self.deps)
        logdepmed = np.median(self.logdeps)
        slopemed = np.median(self.slopes)
        self.durfit = (durmed, np.array([durmed - durconf[0], durconf[1] - durmed]))
        self.depthfit = (depmed, np.array([depmed - depconf[0], depconf[1] - depmed]))
        self.logdepthfit = (logdepmed, np.array([logdepmed - logdepconf[0],
                                                 logdepconf[1] - logdepmed]))
        self.slopefit = (slopemed, np.array([slopemed - slopeconf[0], slopeconf[1] - slopemed]))
    else:
        self.durfit = (np.nan, (np.nan, np.nan))
        self.depthfit = (np.nan, (np.nan, np.nan))
        self.logdepthfit = (np.nan, (np.nan, np.nan))
        self.slopefit = (np.nan, (np.nan, np.nan))
    points = np.array([self.durs, self.logdeps, self.slopes])
    self.kde = gaussian_kde(points)
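# --- `kdeconf` is used above but not defined in this excerpt. A minimal
# sketch of one plausible implementation (a central `conf` interval read off
# the KDE's CDF); the real helper may differ, e.g. a highest-density interval.
import numpy as np
from scipy.stats import gaussian_kde

def kdeconf(kde, conf=0.95, nbins=1000):
    """Hypothetical stand-in: central `conf` interval of a 1-D gaussian_kde."""
    lo, hi = kde.dataset.min(), kde.dataset.max()
    pad = 0.5 * (hi - lo)                      # extend past the sample range
    grid = np.linspace(lo - pad, hi + pad, nbins)
    cdf = np.array([kde.integrate_box_1d(-np.inf, g) for g in grid])
    tail = (1.0 - conf) / 2.0
    return (grid[np.searchsorted(cdf, tail)],
            grid[np.searchsorted(cdf, 1.0 - tail)])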
def summary_stats(self, data):
    """Returns tuple containing summary statistics named in summary_stat_names
    """
    if data is None:
        return [np.nan] * len(self.summary_stat_names)
    N = len(data)
    min_logP, max_logP = np.log(self.min_period), np.log(self.max_period)
    logP_grid = np.linspace(min_logP, max_logP, 1000)
    if N > 1:
        k = gaussian_kde(np.log(data.period.values))
        logP_pdf = k(logP_grid)
    else:
        logP_pdf = np.ones(len(logP_grid)) * 1. / (max_logP - min_logP)
    logd_grid = np.linspace(-4, 0, 1000)
    if N > 1:
        k = gaussian_kde(data.logd_pri)
        logd_pdf = k(logd_grid)
    else:
        logd_pdf = np.ones(len(logd_grid)) * 1. / 4  # uniform over the [-4, 0] grid
    phase_sec = data.phase_sec.dropna().values
    return logP_pdf, N, phase_sec, logd_pdf
def update(self, mu, weight):
    assert 0 <= weight <= 1
    if weight == 1:
        self.definite_points.append(mu)
    else:
        self.possible_points.append((mu, weight))
    # just keep the PRIOR distribution
    if self.definite_points == []:
        return
    if self.possible_points != []:
        # turn into an array for numpy's purposes
        a = np.array(self.possible_points)
        # keep a point if its weight is greater than a uniform random number
        mask = a[:, 1] > np.random.rand(len(self.possible_points))
        sampled = a[:, 0][mask]
        if sampled.size > 0:  # was `sampled != []`, which compares an array to a list
            points = np.concatenate((np.array(self.definite_points), np.array(sampled)))
        else:
            points = np.array(self.definite_points)
    else:
        points = np.array(self.definite_points)
    if points.size > 1:
        # print("points: " + str(points))
        self.distribution = gaussian_kde(points)
    else:
        # THIS IS AN UGLY HACK, need 2 pts initially
        points = np.array([points[0] - OFFSET, points[0] + OFFSET])
        self.distribution = gaussian_kde(points)
    # (a trailing `self.distribution = None` discarded the fitted KDE; removed)
def lookatresults(data, modes, theta=None, vert=False, labels=None):
    P = data[-1][0]
    n = P.shape[0]
    if labels is None:
        labels = [""] * n
    if vert == True:
        subplots = range(n*100 + 11, n*100 + n + 11, 1)
        figsize = (6, 3*n)
    elif vert == 'four':
        subplots = [221, 222, 223, 224]
        figsize = (10, 10)
    else:
        subplots = range(100 + n*10 + 1, 100 + n*10 + 1 + n, 1)
        figsize = (5*n, 3)
    f = stats.gaussian_kde(data[-1][0])
    int_guess = np.mean(data[-1][0], axis=1)
    modes = minimize(neg, int_guess, args=(f,)).x
    thetas = []
    labelpad = 20
    for i in range(n):
        x = P[i]
        t = r'$\theta_{3:}$ {1:.2f} +{2:.2f}/-{0:.2f}'.format(
            modes[i] - stats.scoreatpercentile(x, 16),
            modes[i],
            stats.scoreatpercentile(x, 84) - modes[i],
            i + 1)
        thetas.append(t)
    if P.shape[1] > 10:
        bins = int(np.sqrt(P.shape[1]))
    else:
        bins = 10
    fig = plt.figure(figsize=figsize)
    for i in range(n):
        print(subplots[i])
        plt.subplot(int(subplots[i]))
        # plt.title(thetas[0])
        ker = stats.gaussian_kde(P[i])
        h = plt.hist(P[i], bins=bins, density=True, alpha=1)
        x = np.linspace(h[1][0], h[1][-1], 1000)
        plt.plot(x, ker(x))
        plt.xlabel(labels[i], labelpad=labelpad, fontsize=24)
        if theta is not None:
            plt.axvline(theta[0])
    for t in thetas:
        print(t)
    return fig
def test(ens, x_train, y_train, train_spread, x_test, y_test, test_spread, moneyline):
    # find training error
    plot = False  # Currently moneyline is set to false. Not sure how to set to true
    ens.predict(x_train, train=True)
    a = ens.blend()
    ens.validate(y_train, train_spread, False, moneyline)
    ens.predict(x_test, train=False)
    b = ens.blend()
    c = ens.validate(y_test, test_spread, True, moneyline)
    if plot:
        density = gaussian_kde(a)
        xs = np.linspace(-20, 20, 200)
        density.covariance_factor = lambda: 0.1
        density._compute_covariance()
        plt.plot(xs, density(xs))
        density = gaussian_kde(b)
        xs = np.linspace(-20, 20, 200)
        density.covariance_factor = lambda: 0.1
        density._compute_covariance()
        plt.plot(xs, density(xs))
    return c
def kde_minmode(data, x, max_num_mode, min_mode_pdf):
    kde = gaussian_kde(data)
    f = kde.factor
    f_list = np.linspace(f, (data.max() - data.min()), 100)
    s = UnivariateSpline(x, kde(x), s=0)
    s1 = UnivariateSpline(x, s(x, 1), s=0)
    s2 = UnivariateSpline(x, s1(x, 1), s=0)
    extrema = s1.roots()
    maxima = extrema[np.where((s2(extrema) < 0) * (s(extrema) >= min_mode_pdf))]
    if len(maxima) > max_num_mode:
        for q in range(1, len(f_list)):
            f = f_list[q]
            kde2 = gaussian_kde(data, bw_method=f)
            s = UnivariateSpline(x, kde2(x), s=0)
            s1 = UnivariateSpline(x, s(x, 1), s=0)
            s2 = UnivariateSpline(x, s1(x, 1), s=0)
            extrema = s1.roots()
            maxima = extrema[np.where((s2(extrema) < 0) * (s(extrema) >= min_mode_pdf))]
            if len(maxima) <= max_num_mode:
                # print('modes: ', maxima)
                break
        kde = gaussian_kde(data, bw_method=f)
    # else:
    #     print(maxima)
    return kde, maxima
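# --- Hedged example (synthetic data, not from the original source): widen
# the bandwidth of a clearly bimodal sample until at most two modes above
# min_mode_pdf remain. Assumes UnivariateSpline is imported in this module.
import numpy as np

rng = np.random.default_rng(3)
data = np.concatenate([rng.normal(-2, 0.5, 300), rng.normal(2, 0.5, 300)])
x = np.linspace(data.min(), data.max(), 512)
kde, maxima = kde_minmode(data, x, max_num_mode=2, min_mode_pdf=0.01)
print(maxima)  # expect maxima near -2 and 2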
def pdfcalcs(x_pred, x_hist, y_hist):
    """Calculates the PDFs required to calculate transfer entropy.

    Currently only supports k = 1; l = 1
    """
    # TODO: Generalize for k and l
    # Get dimensions of vectors
    # k = np.size(x_hist[:, 1])
    # l = np.size(y_hist[:, 1])

    # Calculate p(x_{i+h}, x_i, y_i)
    data_1 = np.vstack([x_pred, x_hist[0, :], y_hist[0, :]])
    pdf_1 = stats.gaussian_kde(data_1, 'silverman')

    # Calculate p(x_i, y_i)
    data_2 = np.vstack([x_hist[0, :], y_hist[0, :]])
    pdf_2 = stats.gaussian_kde(data_2, 'silverman')

    # Calculate p(x_{i+h}, x_i)
    data_3 = np.vstack([x_pred, x_hist[0, :]])
    pdf_3 = stats.gaussian_kde(data_3, 'silverman')

    # Calculate p(x_i)
    data_4 = x_hist[0, :]
    pdf_4 = stats.gaussian_kde(data_4, 'silverman')

    return pdf_1, pdf_2, pdf_3, pdf_4
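# --- Hedged sketch (not from the original source) of how the four PDFs would
# combine into a transfer entropy estimate, averaging the log-ratio over the
# observed samples for k = l = 1. Log base 2 (bits) is an assumption; the
# surrounding module may use a different base or a proper integral.
import numpy as np

def te_from_pdfs(pdf_1, pdf_2, pdf_3, pdf_4, x_pred, x_hist, y_hist):
    num = pdf_1(np.vstack([x_pred, x_hist[0, :], y_hist[0, :]])) * pdf_4(x_hist[0, :])
    den = pdf_2(np.vstack([x_hist[0, :], y_hist[0, :]])) * pdf_3(np.vstack([x_pred, x_hist[0, :]]))
    return np.mean(np.log2(num / den))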
def traceplot(traces, thin, burn):
    """
    Plot parameter estimates for different levels of the model into the same
    plots. Black lines are individual observers and red lines are mean
    estimates.
    """
    variables = ['Slope1', 'Slope2', 'Offset', 'Split']
    for i, var in enumerate(variables):
        plt.subplot(2, 2, i + 1)
        vals = get_values(traces, var, thin, burn)
        dim = (vals.min() - vals.std(), vals.max() + vals.std())
        x = np.linspace(*dim, num=1000)  # was plt.linspace, which pyplot does not provide
        for v in vals.T:
            a = gaussian_kde(v)
            y = a.evaluate(x)
            y = y / y.max()
            plt.plot(x, y, 'k', alpha=.5)
        try:
            vals = get_values(traces, 'Mean_' + var, thin, burn)
            a = gaussian_kde(vals)
            y = a.evaluate(x)
            y = y / y.max()
            plt.plot(x, y, 'r', alpha=.75)
        except KeyError:
            pass
        plt.ylim([0, 1.1])
        plt.yticks([0])
        sns.despine(offset=5, trim=True)
        plt.title(var)
def set_plx_kde(t, bandwidth=0.3, method='sklearn_kde'):
    """ Set the plx_kde

    Parameters
    ----------
    t : ndarray float
        Catalog of parallax measures (units: mas)
    bandwidth : float
        Bandwidth for gaussian_kde (optional, 0.01 recommended)
    method : string
        Method for density determination
        (options: scipy_kde, sklearn_kde, blocks)
    """
    global plx_kde

    if method == 'scipy_kde':  # was `is`, which compares identity, not equality
        if plx_kde is None:
            # We are only going to allow parallaxes above some minimum value
            if bandwidth is None:
                plx_kde = gaussian_kde(t['plx'][t['plx'] > 0.0])
            else:
                plx_kde = gaussian_kde(t['plx'][t['plx'] > 0.0], bw_method=bandwidth)
    elif method == 'sklearn_kde':
        if plx_kde is None:
            kwargs = {'kernel': 'tophat'}
            if bandwidth is None:
                plx_kde = KernelDensity(**kwargs)
            else:
                plx_kde = KernelDensity(bandwidth=bandwidth, **kwargs)
            if c.kde_subset:
                plx_ran = np.copy(t['plx'][t['plx'] > 0.0])
                np.random.shuffle(plx_ran)
                plx_kde.fit(plx_ran[0:5000, np.newaxis])
            else:
                plx_kde.fit(t['plx'][t['plx'] > 0.0][:, np.newaxis])
    elif method == 'blocks':
        global plx_bins_blocks
        global plx_hist_blocks
        # Set up Bayesian Blocks
        print("Calculating Bayesian Blocks...")
        nbins = np.min([len(t), 40000])
        bins = bayesian_blocks(t['plx'][t['plx'] > 0.0][0:nbins])
        hist, bins = np.histogram(t['plx'][t['plx'] > 0.0][0:nbins], bins=bins, density=True)
        # Pad with zeros
        plx_bins_blocks = np.append(-1.0e100, bins)
        hist_pad = np.append(0.0, hist)
        plx_hist_blocks = np.append(hist_pad, 0.0)
        print("Bayesian Blocks set.")
    else:
        print("You must include a valid method")
        print("Options: scipy_kde, sklearn_kde, or blocks")
        return
def lookatresults(data, name, modes):
    plots, thetas = [], []
    P = data[-1][0]
    for i in range(len(P)):
        x = P[i]
        theta = r'$\theta_{3:}$ {1:.2f} +{2:.2f}/-{0:.2f}'.format(
            modes[i] - stats.scoreatpercentile(x, 16),
            modes[i],
            stats.scoreatpercentile(x, 84) - modes[i],
            i + 1)
        thetas.append(r'$\theta_{}$'.format(i + 1))
        f = plt.figure()
        plt.suptitle(name)
        plt.subplot(111)
        plt.title(theta)
        ker = stats.gaussian_kde(x)
        plt.hist(x, density=True, alpha=0.2)
        X = np.linspace(0.0, max(x) + .1*max(x), 1000)
        plt.plot(X, ker(X))
        plt.xlabel(r"$\theta_{}$".format(i + 1))
        # plt.savefig('theta_{}.png'.format(i))
        plots.append(f)
    f = plt.figure()
    plt.subplot(211)
    plt.plot(data['epsilon'], 'o-')
    plt.title(r'$\epsilon$')
    plt.subplot(212)
    plt.plot(data['n total'], 'o-')
    plt.title('N Trials')
    plots.append(f)
    alphas = np.linspace(0, 1, data.size)
    for j in range(len(data[0][0])):
        f = plt.figure()
        for i, D in enumerate(data):
            F = stats.gaussian_kde(D[0][j])
            x = np.linspace(D[0][j].min(), D[0][j].max(), 300)
            plt.plot(x, F(x), alpha=alphas[i])
            plt.xlabel(r"$\theta_{}$".format(j + 1))
            if i == data.size - 1:
                plt.plot(x, F(x), c='m', ls='--', lw=2, zorder=1)
        plots.append(f)
    plt.figure()
    f = triangle.corner(P.T, labels=thetas)
    # plt.savefig('triangle.png'.format(i))
    plots.append(f)
    return plots
def bootStrap(lofvars, homvars, tlength, targetdir, prefix):
    samplings = []
    ind = np.linspace(0, 100, 512)
    kde = gaussian_kde(homvars[homvars.PSI != "-"].PSI.map(float).tolist())
    for boot in range(0, 1000):
        kdesub = gaussian_kde(kde.resample(tlength))
        kdedf = DataFrame({"subsetname": "Random%d" % boot,
                           "Density": kdesub.evaluate(ind),
                           "PSI": ind})
        samplings.append(kdedf)
    samplings = concat(samplings).reset_index(drop=True)
    quants = samplings.groupby(["PSI"])["Density"].quantile([.025, .5, .975]).reset_index()
    quants.rename(columns={'level_1': "Quantile", 0: "Density"}, inplace=True)
    quants["linetype"] = ["Mean" if x == .5 else "95% threshold" for x in quants.Quantile]
    if "vclass" not in lofvars.columns.tolist():
        lofvars["vclass"] = "LoF"  # was `==`, a comparison with no effect
    lofvars_sub = lofvars[lofvars.PSI != "-"].copy()
    lofvars_sub.PSI = lofvars_sub.PSI.astype(float)
    lofdf = []
    for vclass, lofclass in lofvars_sub.groupby("vclass"):
        kde = gaussian_kde(lofvars_sub[lofvars_sub.vclass == vclass].PSI.tolist())
        tmpdf = DataFrame({"vclass": vclass, "Density": kde.evaluate(ind), "PSI": ind})
        lofdf.append(tmpdf)
    lofdf = concat(lofdf).reset_index(drop=True)
    rsamplings = com.convert_to_r_dataframe(samplings)
    rlofvars = com.convert_to_r_dataframe(lofdf)
    rquants = com.convert_to_r_dataframe(quants)
    rquants = fixRLevels(rquants, "linetype", ["Mean", "95% threshold"])
    # r_pvals = com.convert_to_r_dataframe(pvals)
    p = (ggplot2.ggplot(rlofvars) +
         ggplot2.aes_string(x="PSI", y="Density") +  # ,group="vclass"
         ggplot2.geom_line(ggplot2.aes_string(x="PSI", y="Density",
                                              group="factor(subsetname)"),
                           color="grey", data=rsamplings) +
         ggplot2.geom_line(ggplot2.aes_string(x="PSI", y="Density",
                                              linetype="factor(linetype)",
                                              group="factor(Quantile)"),
                           color="black", data=rquants) +
         ggplot2.geom_line(ggplot2.aes_string(color="factor(vclass)")) +
         # ggplot2.geom_density(ggplot2.aes_string(colour="factor(vclass)"), size=1.5, color="blue") +
         ggplot2.scale_y_continuous("Density") +
         # ggplot2.scale_x_continuous("PSI") +
         ggplot2.scale_linetype("Confidence Interval") +
         ggplot2.scale_colour_brewer("Variant Type", palette="Set1") +
         # ggplot2.theme(**{'legend.position': "none"}) +
         # ggplot2.ggtitle("PSI distribution") +
         # ggplot2.scale_x_discrete("ME AF") +
         ggplot2.theme(**mytheme))
    # ggplot2.stat_smooth(method="lm", se=False) +
    figname = "%s/%s_psibootstrap2.pdf" % (targetdir, prefix)
    print("Writing file:", figname)
    grdevices.pdf(figname, width=5, height=4)
    p.plot()
    grdevices.dev_off()
def test_weights_intact():
    # regression test for gh-9709: weights are not modified
    np.random.seed(12345)
    vals = np.random.lognormal(size=100)
    weights = np.random.choice([1.0, 10.0, 100], size=vals.size)
    orig_weights = weights.copy()

    stats.gaussian_kde(np.log10(vals), weights=weights)
    assert_allclose(weights, orig_weights, atol=1e-14, rtol=1e-14)
def violin_plot(bl, dL, ax, clr='blue', alpha=.75, percentiles=[25, 75],
                corrections=False, **kwargs):
    """Creates a violin plot and sets properties."""
    hl, x, y = hist_axis(bl, dL, **kwargs)
    y2 = np.array([t['med'] for t in hl])
    x2 = np.array([s['sliceMed'] for s in hl])
    ind = ((y > 0) & (x > 0)) | ((y < 0) & (x < 0))
    y = y[ind]
    x = x[ind]
    lim = max(np.max(np.abs(x2)), np.max(np.abs(y2))) * 1.1
    dataset = [s['data'] for s in hl]
    p_data = [np.percentile(s, percentiles) for s in dataset]
    refined_dataset = [x[((x > p[0]) & (x < p[1]))] for x, p in zip(dataset, p_data)]
    violin_widths = .6 * (max(x2) - min(x2)) / len(y2)
    violins = ax.violinplot(refined_dataset, widths=violin_widths,
                            showmedians=True, showextrema=False, positions=x2)
    ax.set_xlim(-lim, lim)
    ax.set_ylim(-lim, lim)
    ax.set(adjustable='box-forced', aspect='equal')
    add_identity(ax, color='.3', ls='-', linewidth=2, zorder=1)
    for vio in violins['bodies']:
        vio.set(facecolor=clr, alpha=alpha)
    violins['cmedians'].set(edgecolor='red')
    violins['cmedians'].set_linewidth(2.5)
    if corrections:
        total_kernel = gaussian_kde(y)
        xspace = np.linspace(np.nanmin(x), np.nanmax(x), 1000)
        total_kernel_array = total_kernel.evaluate(xspace)
        gaussian_means = []
        gaussian_stds = []
        for bin in hl:
            bin_kernel = gaussian_kde(bin['data'])
            bin_kernel_array = bin_kernel.evaluate(xspace)
            divided_dist = bin_kernel_array / np.sqrt(total_kernel_array)
            max_ind = np.argmax(bin_kernel_array)
            ind_valid = (divided_dist < 1e10)
            # ind_valid = (divided_dist < divided_dist[max_ind])
            try:
                popt = scipy.optimize.least_squares(
                    gaussian_func,
                    [np.abs(bin['sliceMed']), np.abs(bin['sliceMed'] / 5), bin['sliceMed']],
                    args=(xspace[ind_valid], divided_dist[ind_valid]),
                    jac='3-point', x_scale='jac', loss='soft_l1', f_scale=.1).x
                # popt, pcov = curve_fit(gaussian, xspace[ind_valid], divided_dist[ind_valid],
                #                        p0=[np.abs(bin['sliceMed']), np.abs(bin['sliceMed']/5),
                #                            bin['sliceMed']], maxfev=10000)
            except Exception:
                popt = np.array([np.nan, np.nan, np.nan])
            gaussian_means.append(popt[2])
            gaussian_stds.append(popt[1])
        ax.plot(x2, gaussian_means, color='black', linestyle='None',
                marker='.', ms=15, alpha=.6, zorder=10)
        for std, mean, x_pos in zip(gaussian_stds, gaussian_means, x2):
            ax.plot([x_pos, x_pos], [mean + std, mean - std], 'k-', alpha=.6, zorder=10)
    return hl
def plot_accuracy(counts, labels, ntrain, cpu):
    oaa = sum(counts) / len(counts)
    cotr = counts[ntrain > 0]
    acc = sum(cotr) / len(cotr)
    cput = sum(cpu) / 60
    print('\nOverall accuracy: %.2f' % oaa)
    print('Accuracy given at least 1 training sample: %.2f' % acc)
    print('CPU time: %0.2f min' % cput)
    d = {}
    for lbl, cor, ntr in zip(labels, counts, ntrain):
        if lbl in d:
            d[lbl][0].append(cor)
            d[lbl][1].append(ntr)
        else:
            d[lbl] = ([cor], [ntr])
    print('')
    cte = np.array([sum(v[0]) for v in d.values()])
    num = np.array([len(v[0]) for v in d.values()])
    y = cte / num
    x = np.array([v[1][0] for v in d.values()])
    x2 = np.array([len(lbl) for lbl in d])  # word length of each label
    fig = plt.figure()
    ax = fig.add_subplot(121)
    # ax.plot(x, y, '.')
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    sc = ax.scatter(x, y, c=z, s=100, edgecolor='')
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlim([-5, max(x) + 5])
    plt.grid()
    plt.xlabel('# training samples (for given label)')
    plt.ylabel('accuracy')
    cbar = plt.colorbar(sc)
    cbar.ax.set_ylabel('label density')
    # cbar.set_ticks([0, 0.25, 0.5, 0.75, 1])
    # cbar.set_ticklabels(['0', '0.25', '0.5', '0.75', '1'], update_ticks=True)
    ax2 = fig.add_subplot(122)
    xy2 = np.vstack([x2, y])
    z2 = gaussian_kde(xy2)(xy2)
    sc2 = ax2.scatter(x2, y, c=z2, s=100, edgecolor='')
    ax2.set_ylim([-0.05, 1.05])
    ax2.set_xlim([-5, max(x2) + 5])
    plt.grid()
    plt.xlabel('word length')
    plt.ylabel('accuracy')
    cbar = plt.colorbar(sc2)
    cbar.ax.set_ylabel('label density')
    plt.show()
def fit(self, X, y):
    def jitter(x, rng):  # `rng` was named `range`, shadowing the builtin
        y = np.copy(x)
        scale_exp_min = np.abs(np.ceil(np.log10(rng[0])))
        scale_exp_max = np.abs(np.ceil(np.log10(rng[1])))
        scale_exp = (scale_exp_max + scale_exp_min) / 2.
        r = np.random.rand(y.size) / (10**scale_exp)
        y = y + r
        return y

    # Print msg. when going into gcp.fit
    strMessage = "rows in X = %d, r_minimum = %d" % (X.shape[0], self.r_minimum)
    logger.debug(strMessage)

    # Use X and y to train a Gaussian Copula Process.
    super(GCP, self).fit(X, y)

    # skip training the process if there aren't enough samples
    if X.shape[0] < self.r_minimum:
        return

    # -- Non-parametric model of 'y', estimated with kernel density
    kernel_pdf = st.gaussian_kde(y)
    kernel_cdf = make_cdf(kernel_pdf)
    kernel_ppf = make_ppf(kernel_pdf)
    y_kernel_model = {'pdf': kernel_pdf, 'cdf': kernel_cdf, 'ppf': kernel_ppf}
    self.y_kernel_model = y_kernel_model

    # - Transform y-->F-->vF-->norm.ppf-->v
    vF = y_kernel_model['cdf'](y)
    v = st.norm.ppf(vF)

    # -- Non-parametric model of each feature in 'X', estimated with kernel density
    X_kernel_model = []
    for ki in range(X.shape[1]):
        columnX = X[:, ki]
        if self.tunables[ki][1].is_integer:
            columnX = jitter(columnX, self.tunables[ki][1].range)
        kernel_pdf = st.gaussian_kde(columnX)
        kernel_cdf = make_cdf(kernel_pdf)
        kernel_ppf = make_ppf(kernel_pdf)
        kernel_model = {'pdf': kernel_pdf, 'cdf': kernel_cdf, 'ppf': kernel_ppf}
        X_kernel_model.append(kernel_model)
    self.X_kernel_model = X_kernel_model

    # -- Transform X-->F-->uF-->norm.ppf-->U
    U = np.empty_like(X)
    for ki in range(X.shape[1]):
        uF = X_kernel_model[ki]['cdf'](X[:, ki])
        U[:, ki] = st.norm.ppf(uF)

    # - Instantiate a GP and fit it with (U, v)
    self.gcp = GaussianProcessRegressor(normalize_y=True)
    self.gcp.fit(U, v)
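# --- `make_cdf` / `make_ppf` are referenced above but not shown in this
# excerpt. A minimal sketch of one plausible implementation (CDF by
# integrating the KDE from the left, PPF by interpolated inversion); the real
# helpers may differ.
import numpy as np

def make_cdf(kernel_pdf, nbins=1000):
    """Hypothetical stand-in: vectorised CDF of a 1-D gaussian_kde."""
    lo, hi = kernel_pdf.dataset.min(), kernel_pdf.dataset.max()
    pad = 0.5 * (hi - lo)
    grid = np.linspace(lo - pad, hi + pad, nbins)
    cdf_vals = np.array([kernel_pdf.integrate_box_1d(-np.inf, g) for g in grid])
    return lambda x: np.interp(x, grid, cdf_vals)

def make_ppf(kernel_pdf, nbins=1000):
    """Hypothetical stand-in: inverse of make_cdf via interpolation."""
    lo, hi = kernel_pdf.dataset.min(), kernel_pdf.dataset.max()
    pad = 0.5 * (hi - lo)
    grid = np.linspace(lo - pad, hi + pad, nbins)
    cdf_vals = np.array([kernel_pdf.integrate_box_1d(-np.inf, g) for g in grid])
    return lambda q: np.interp(q, cdf_vals, grid)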
def plot_rmse(working_directory):
    figure_dir = os.path.join(working_directory, 'Figures')
    if not os.path.exists(figure_dir):
        os.makedirs(figure_dir, exist_ok=True)
    x_grid = np.arange(0, 360, 10)
    correct = star.get_EAs_from_star(os.path.join(
        working_directory, 'exp_projections.star'))
    plt.figure(0)
    first = star.get_EAs_from_star(os.path.join(
        working_directory, 'it000', 'orientations.star'))
    correct_rmse = calc_rmse(correct, first)
    # plt.hist(correct_rmse)
    correct_kde = gaussian_kde(correct_rmse)
    plt.plot(x_grid, correct_kde.evaluate(x_grid))
    # plt.ylim([0, 1])
    plt.xlabel('RMSE for 3 Euler angles')
    plt.ylabel('Count')
    plt.title('Compare with correct angle distribution')
    plt.savefig(os.path.join(figure_dir, 'it000'), dpi=150)
    exp_folder = glob.glob(os.path.join(working_directory, 'it*'))
    last = star.get_EAs_from_star(os.path.join(exp_folder[0], 'orientations.star'))
    exp_folder.pop(0)
    for i, folder in enumerate(exp_folder, start=1):
        now = star.get_EAs_from_star(os.path.join(folder, 'orientations.star'))
        correct_rmse = calc_rmse(correct, now)
        last_rmse = calc_rmse(last, now)
        last = now
        fig = plt.figure(num=i, figsize=(16, 6))
        gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])
        plt.suptitle('Iteration: {0}'.format(i))
        plt.subplot(gs[0])
        # plt.hist(correct_rmse)
        correct_kde = gaussian_kde(correct_rmse)
        plt.plot(x_grid, correct_kde.evaluate(x_grid))
        plt.xlabel('RMSE for 3 Euler angles')
        plt.ylabel('Count')
        plt.title('Compare with correct angle distribution')
        plt.subplot(gs[1])
        # plt.hist(last_rmse)
        last_kde = gaussian_kde(last_rmse)
        plt.plot(x_grid, last_kde.evaluate(x_grid))
        plt.xlabel('RMSE for 3 Euler angles')
        plt.ylabel('Count')
        plt.title('Compare with angle distribution of last iteration')
        plt.savefig(os.path.join(figure_dir, 'it' + str(i).zfill(3)),
                    dpi=150, bbox_inches='tight')
        plt.close(fig)
def test_weights_integer():
    # integer weights are OK, cf gh-9709 (comment)
    np.random.seed(12345)
    values = [0.2, 13.5, 21.0, 75.0, 99.0]
    weights = [1, 2, 4, 8, 16]  # a list of integers
    pdf_i = stats.gaussian_kde(values, weights=weights)
    pdf_f = stats.gaussian_kde(values, weights=np.float64(weights))

    xn = [0.3, 11, 88]
    assert_allclose(pdf_i.evaluate(xn), pdf_f.evaluate(xn),
                    atol=1e-14, rtol=1e-14)
def plotAffVersusUnaff(intronret, affset, unaffset, figuredir, gfftype, dtype):
    print("Running plotAffVersusUnaff")
    inretmat = intronret.loc[:, affset + unaffset]
    maxx = intronret["logRII"].max()
    AvUsamplings = []  # List of density distributions for each comparison
    ind = np.linspace(intronret["logRII"].min(), maxx, 512)
    for aff in affset:
        for unaff in unaffset:
            print("A:", aff, "versus", "U:", unaff)
            newriivals = inretmat.apply(lambda row: calcRII(row, [aff], [unaff]), axis=1)
            kdesub = gaussian_kde(newriivals)
            kdedf = pd.DataFrame({"subsetname": aff + " vs " + unaff,
                                  "Affected": aff,
                                  "Unaffected": unaff,
                                  "Density": kdesub.evaluate(ind),
                                  "logRII": ind})
            AvUsamplings.append(kdedf)
    AvUsamplings = pd.concat(AvUsamplings).reset_index(drop=True)
    # Calculate an Observed density using the original data
    newriivals = inretmat.apply(lambda row: calcRII(row, affset, unaffset), axis=1)
    kde = gaussian_kde(newriivals)
    obsdf = pd.DataFrame({"vclass": "Observed", "Density": kde.evaluate(ind), "logRII": ind})
    rsamplings = com.convert_to_r_dataframe(AvUsamplings)
    # robsdf = com.convert_to_r_dataframe(obsdf)
    p = (ggplot2.ggplot(rsamplings) +
         ggplot2.aes_string(x="logRII", y="Density", group="factor(subsetname)") +
         ggplot2.geom_vline(xintercept=0, linetype="dashed") +
         ggplot2.geom_hline(yintercept=0, linetype="solid") +
         ggplot2.geom_line(ggplot2.aes_string(color="factor(Unaffected)")) +
         ggplot2.scale_y_continuous("Density") +
         ggplot2.scale_x_continuous("Log RII") +
         ggplot2.scale_colour_brewer("Unaffected", palette="Set1") +
         ggplot2.facet_wrap(robjects.Formula('~ Affected'), ncol=3) +
         ggplot2.theme(**sitefreqtheme) +
         ggplot2.theme(**{'legend.position': "right"}))
    # ggplot2.geom_line(ggplot2.aes_string(x="logRII", y="Density",
    #                                      group="factor(vclass)"),
    #                   linetype="dashed", color="black", data=robsdf) +
    # ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45)}) +
    figname = os.path.join(figuredir, gfftype + "_" + dtype + "_logirr_AvU.pdf")
    print("Writing file:", figname)
    grdevices.pdf(figname, width=10, height=8)
    p.plot()
    grdevices.dev_off()
def si_lam(overlaps):
    X_si_wEM = [(a.lam, a.fp) for a, b in overlaps] + [(b.lam, b.fp) for a, b in overlaps]
    X_si_lam = [(a.si, a.lam) for a, b in overlaps] + [(b.si, b.lam) for a, b in overlaps]
    X_lam_wEM = [(a.lam, a.wEM) for a, b in overlaps] + [(b.lam, b.wEM) for a, b in overlaps]
    X_pi_wEM = [(a.si, a.fp) for a, b in overlaps] + [(b.si, b.fp) for a, b in overlaps]
    F = plt.figure(figsize=(15, 10))

    ax1 = F.add_subplot(2, 2, 1)
    # x, y = [math.log(x, 10) for x, y in X_si_wEM], [y for x, y in X_si_wEM]  # abandoned log-scale variant
    x, y = [x for x, y in X_si_wEM], [y for x, y in X_si_wEM]
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    ax1.scatter(x, y, c=z, s=14, edgecolor='')
    ax1.set_xlabel("Variance in Loading")
    ax1.set_ylabel("Probability of Paused")
    ax1.grid()

    ax2 = F.add_subplot(2, 2, 2)
    # x, y = [math.log(x, 10) for x, y in X_si_lam], [math.log(y, 10) for x, y in X_si_lam]  # abandoned
    x, y = [x for x, y in X_si_lam], [y for x, y in X_si_lam]
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    ax2.set_xlabel("Variance in Loading")
    ax2.set_ylabel("Length of Initiation")
    ax2.scatter(x, y, c=z, s=14, edgecolor='')
    ax2.grid()

    ax3 = F.add_subplot(2, 2, 3)
    x, y = [math.log(x, 10) for x, y in X_lam_wEM], [y for x, y in X_lam_wEM]
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    ax3.set_xlabel("Length of Initiation")
    ax3.set_ylabel("Probability of Paused")
    ax3.scatter(x, y, c=z, s=14, edgecolor='')
    ax3.grid()

    ax4 = F.add_subplot(2, 2, 4)
    x, y = [x for x, y in X_pi_wEM], [y for x, y in X_pi_wEM]
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)
    ax4.set_xlabel("Strand Probability")
    ax4.set_ylabel("Probability of Paused")
    ax4.scatter(x, y, c=z, s=14, edgecolor='')
    ax4.grid()
    plt.show()
def plot_kmap(data, data_raw=True, as_partitions=None, data_label="", filename="",
              plot_annotation=True, annotation_params=None, title=None,
              title_loc="center", titlelabelsize=26, axlabelsize=22, textsize=16,
              annotationsize=13, tail_threshold=None, plot_legend=False,
              plot_scatter=True, scatter_ms=None, scatter_c='k', scatter_a=.5,
              scatter_m=r'.', plot_heatmap=True, colormap=plt.cm.Greys,
              plot_contour=False, plot_contour_lbls=False, max_val_exp=5):
    # Plot basic setup
    matplotlib.rcParams.update({'font.size': textsize})
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    if title is not None:
        plt.title(title, loc=title_loc, fontdict={'fontsize': titlelabelsize})
    ax.set_xlabel("Anonymity Set Size ($k$)")
    ax.set_ylabel("Num. Anonymity Sets at Size of $k$")
    plt.tick_params(axis='both', which='major', labelsize=axlabelsize)

    # Process data
    if data_raw:
        # Assumed that anonymity sets partition the dataset
        data_length = len(data)
        xy = Counter(Counter(data).values())
        x = [x_ for x_ in sorted(xy.keys())]
        y = [xy[ass] for ass in sorted(xy.keys())]
        if as_partitions is None or as_partitions is True:
            z = [ass * xy[ass] for ass in sorted(xy.keys())]
            w = [float(ass * xy[ass]) / data_length for ass in sorted(xy.keys())]
        else:
            z = [xy[ass] for ass in sorted(xy.keys())]
            w = [float(xy[ass]) / data_length for ass in sorted(xy.keys())]
    else:
        # Not assumed that anonymity sets partition the dataset
        # (e.g., they could be overlapping)
        data_length = data[0]
        xy = data[1]
        x = [x_ for x_ in sorted(xy.keys())]
        y = [xy[ass] for ass in sorted(xy.keys())]
        if as_partitions is None or as_partitions is False:
            z = [xy[ass] for ass in sorted(xy.keys())]
            w = [float(xy[ass]) / data_length for ass in sorted(xy.keys())]
        else:
            z = [ass * xy[ass] for ass in sorted(xy.keys())]
            w = [float(ass * xy[ass]) / data_length for ass in sorted(xy.keys())]

    if plot_heatmap or plot_contour:
        # Emphasize heavy spots for the contour (but visualize only one for each)
        x_ = []
        y_ = []
        z_ = []
        for ix in range(len(z)):
            for i in range(z[ix]):
                if i == 0:
                    if scatter_ms is None:
                        z_.append(float(10000 * z[ix]) / data_length)
                    else:
                        z_.append(scatter_ms)
                else:
                    z_.append(0.0)
                x_.append(math.log(x[ix], 10))
                y_.append(math.log(y[ix], 10))
        # Heatmap calculation
        X, Y = np.mgrid[-0.5:5:100j, -0.5:5:100j]
        positions = np.vstack([X.ravel(), Y.ravel()])
        values = np.vstack([x_, y_])
        kernel = gaussian_kde(values)
        Z = np.reshape(kernel(positions).T, X.shape)
        Z = np.sqrt(np.sqrt(Z))  # Strengthen low-weighted regions
        # Plot heatmap
        if plot_heatmap:
            plt.contourf(X, Y, Z, 10, cmap=colormap, alpha=.5)
        # Plot contour
        if plot_contour:
            cs = plt.contour(X, Y, Z, 10, cmap=colormap, alpha=.5)
            if plot_contour_lbls:
                plt.clabel(cs, inline=1, fontsize=int(textsize / 2))

    if plot_scatter:
        # Scatter points
        plt.scatter([math.log(_, 10) for _ in x],
                    [math.log(_, 10) for _ in y],
                    s=[10**4 * _ for _ in w],
                    alpha=scatter_a, c=scatter_c, marker=scatter_m,
                    label=data_label)

    if plot_legend:
        # Legend
        lgnd = plt.legend(loc="upper right", fontsize=textsize)  # numpoints=1
        lgnd.legendHandles[0]._sizes = [30]

    # Select groups of datapoints
    if isinstance(plot_annotation, list):
        grps = [[] for _ in range(len(plot_annotation))]
        weights = [0.0 for _ in range(len(plot_annotation))]
        for ix in range(len(x)):
            for gix in range(len(plot_annotation)):
                grp = plot_annotation[gix]
                if x[ix] >= min(grp) and x[ix] <= max(grp):
                    grps[gix].append([x[ix], y[ix]])
                    weights[gix] += w[ix]
        for gix, grp in enumerate(grps):
            if len(grp) == 0:
                continue
            annotation_radius = .1
            if isinstance(annotation_params, dict) and 'radius' in annotation_params:
                if isinstance(annotation_params['radius'], list):
                    annotation_radius = annotation_params['radius'][gix]
                else:
                    annotation_radius = annotation_params['radius']
            annotation_distance = 1.0
            if isinstance(annotation_params, dict) and 'distance' in annotation_params:
                if isinstance(annotation_params['radius'], list):
                    annotation_distance = annotation_params['distance'][gix]
                else:
                    annotation_distance = annotation_params['distance']
            annotation_linestyle = dict(color='r', width=2, style='-')
            if isinstance(annotation_params, dict) and 'linestyle' in annotation_params:
                annotation_linestyle = annotation_params['linestyle']
            pts = [[math.log(pt[0], 10), math.log(pt[1], 10)] for pt in grp]
            c, r = selectpoints(ax, pts, radius=annotation_radius,
                                ec=annotation_linestyle['color'],
                                lw=annotation_linestyle['width'],
                                ls=annotation_linestyle['style'], fill=False)
            annotation_shift_vector = [r * annotation_distance, 0.0]
            if isinstance(annotation_params, dict) and 'location' in annotation_params:
                if isinstance(annotation_params['location'], list):
                    if annotation_params['location'][gix] == 'left':
                        annotation_shift_vector = [-r * annotation_distance, 0.0]
                    elif annotation_params['location'][gix] == 'top':
                        annotation_shift_vector = [0.0, r * annotation_distance]
                    elif annotation_params['location'][gix] == 'bottom':
                        annotation_shift_vector = [0.0, -r * annotation_distance]
                else:
                    if annotation_params['location'] == 'left':
                        annotation_shift_vector = [-r * annotation_distance, 0.0]
                    elif annotation_params['location'] == 'top':
                        annotation_shift_vector = [0.0, r * annotation_distance]
                    elif annotation_params['location'] == 'bottom':
                        annotation_shift_vector = [0.0, -r * annotation_distance]
            plt.text(c[0] + annotation_shift_vector[0],
                     c[1] + annotation_shift_vector[1],
                     "%.2f %%" % (weights[gix] * 100))
    # Add annotations for minimum and maximum anonymity sets
    elif isinstance(plot_annotation, bool) and plot_annotation:
        add_annotations(ax, x, y, z, annotationsize, tail_threshold)

    # Setup XY axes
    maxval = max_val_exp
    plt.ylim(-0.5, maxval)
    plt.xlim(-0.5, maxval)
    ticks = range(maxval + 1)
    lbls = ["${10}^{%d}$" % v for v in range(maxval)]
    ax.set_xticks(ticks)
    ax.set_xticklabels(lbls)
    ax.set_yticks(ticks)
    ax.set_yticklabels(lbls)

    # Save file
    plt.tight_layout()
    if '.' in filename:
        plt.savefig(filename)
    else:
        plt.savefig(filename + '.pdf')
        plt.savefig(filename + '.png')
def bland_altman_plots(df, rep_stats=None, els=['Mg', 'Sr', 'Ba', 'Al', 'Mn']):
    # get corresponding analyte and ratio names
    As = []
    Rs = []
    analytes = [c for c in df.columns if ('_r' not in c) and ('_t' not in c)]
    ratios = [c for c in df.columns if ('_r' in c)]
    for e in els:
        if e == 'Sr':
            As.append('Sr88')
        elif e == 'Mg':
            As.append('Mg24')
        else:
            As.append([a for a in analytes if e in a][0])
        Rs.append([r for r in ratios if e in r][0][:-2])

    fig, axs = plt.subplots(len(els), 3, figsize=(6.5, len(els) * 2))

    for i, (e, a) in enumerate(zip(Rs, As)):
        if a == 'Ba138':
            m = 1e3
            u = r'$\mu$mol/mol'
        else:
            m = 1
            u = 'mmol/mol'

        tax, lax, hax = axs[i]
        c = element_colour(a)
        x = df.loc[:, e + '_r'].values * m
        yt = df.loc[:, e + '_t'].values * m
        yl = df.loc[:, a].values * m

        # draw Bland-Altman plots
        if rep_stats is None:
            CI = None
        else:
            CI = rep_stats[e][0]
        bland_altman(x, yt, interval=.75, indep_conf=CI, ax=tax, c=c)
        bland_altman(x, yl, interval=.75, indep_conf=CI, ax=lax, c=c)

        xlim = (min(tax.get_xlim()[0], lax.get_xlim()[0]),
                max(tax.get_xlim()[1], lax.get_xlim()[1]))
        tax.set_xlim(xlim)
        lax.set_xlim(xlim)
        ylim = rangecalc(tax.get_ylim(), lax.get_ylim())

        # draw residual PDFs
        # calculate residuals
        rt = yt - x
        rl = yl - x
        # remove NaNs
        rt = rt[~np.isnan(rt)]
        rl = rl[~np.isnan(rl)]
        # calculate bins
        bins = np.linspace(*ylim, 100)
        # calculate KDEs
        kdt = stats.gaussian_kde(rt, .4)
        kdl = stats.gaussian_kde(rl, .4)
        # draw KDEs
        hax.fill_betweenx(bins, kdl(bins), facecolor=element_colour(a), alpha=0.8,
                          edgecolor='k', lw=0.75, label='LAtools', zorder=-1)
        hax.fill_betweenx(bins, kdt(bins), facecolor=element_colour(a), alpha=0.4,
                          edgecolor='k', lw=0.75, label='Manual', zorder=1)
        # limits and horizontal line
        hax.set_xlim([0, hax.get_xlim()[-1]])
        hax.axhline(0, ls='dashed', c='k', alpha=0.6, zorder=-1)

        for ax in axs[i]:
            ax.set_ylim(ylim)
            if ax.is_first_col():
                ax.set_ylabel(e + ' (' + u + ')\nResidual')
            else:
                ax.set_ylabel('')
                ax.set_yticklabels([])
            if ax.is_last_row():
                tax.set_xlabel('Mean')
                lax.set_xlabel('Mean')
                hax.set_xlabel('Residual Density')
                hax.legend()
            else:
                ax.set_xlabel('')
            if ax.is_first_row():
                tax.set_title('Manual Test User', loc='left')
                lax.set_title('LAtools Test User', loc='left')
                hax.set_title('Residuals', loc='left')

    fig.tight_layout()
    return fig, axs
markers = ['v', '^', 'd', '_', '|', 's', '8', 's', 'p', '*']

filname = 'flow_samples_all.txt'
with open(filname, 'rb') as f:
    data = pickle.load(f)

x = []
y = []
# print(len(data))
for j in range(0, len(data[1:50000])):
    x.append(data[j][0][0])
    y.append(data[j][0][1])
    # print(j)
# print(data)
# len(y)

xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)
# x_m = sum(x) / float(len(x))
# y_m = sum(y) / float(len(x))
ax.scatter(x, y, c=z, s=10, edgecolor='')
print("main_done")

for i in range(0, 10):
    filname = 'flow_samples_' + str(i) + '.txt'
    with open(filname, 'rb') as f:  # pickle needs binary mode; was 'r' with pickle.loads(f.read())
        data = pickle.load(f)
    x = []
    y = []
    # print(len(data))
    for j in range(0, len(data[1:5000])):
        x.append(data[j][0][0])
        y.append(data[j][0][1])
def __init__(self, param_list, values, bw_method=None):
    self.param_list = param_list
    self.pdf_estimate = gaussian_kde(values, bw_method=bw_method)
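# --- Minimal usage sketch (the enclosing class isn't shown above; `ParamKDE`
# is a hypothetical stand-in with exactly this __init__):
import numpy as np
from scipy.stats import gaussian_kde

class ParamKDE:
    def __init__(self, param_list, values, bw_method=None):
        self.param_list = param_list
        self.pdf_estimate = gaussian_kde(values, bw_method=bw_method)

samples = np.random.default_rng(4).normal(size=(2, 500))  # 2 params x 500 draws
est = ParamKDE(['a', 'b'], samples)
print(est.pdf_estimate(np.zeros((2, 1))))  # joint density at the origin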
def comparison_plots(df, els=['Mg', 'Sr', 'Ba', 'Al', 'Mn']):
    """
    Function for plotting Test User and LAtools data comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        A dataframe containing reference ('X/Ca_r'), test user ('X/Ca_t')
        and LAtools ('X123') data.
    els : list
        list of elements (names only) to plot.
    """
    # get corresponding analyte and ratio names
    As = []
    Rs = []
    analytes = [c for c in df.columns if ('_r' not in c) and ('_t' not in c)]
    ratios = [c for c in df.columns if ('_r' in c)]
    for e in els:
        if e == 'Sr':
            As.append('Sr88')
        elif e == 'Mg':
            As.append('Mg24')
        else:
            As.append([a for a in analytes if e in a][0])
        Rs.append([r for r in ratios if e in r][0][:-2])

    fig, axs = plt.subplots(len(els), 3, figsize=(6.5, len(els) * 2))

    for i, (e, a) in enumerate(zip(Rs, As)):
        if a == 'Ba138':
            m = 1e3
            u = r'$\mu$mol/mol'
        else:
            m = 1
            u = 'mmol/mol'
        c = element_colour(a)
        tax, lax, hax = axs[i]
        x = df.loc[:, e + '_r'].values * m
        yt = df.loc[:, e + '_t'].values * m
        yl = df.loc[:, a].values * m

        # calculate residuals
        rt = yt - x
        rl = yl - x

        # plot residuals
        tax.scatter(x, yt, c=c, s=15, lw=0.5, edgecolor='k', alpha=0.5)
        lax.scatter(x, yl, c=c, s=15, lw=0.5, edgecolor='k', alpha=0.5)

        # plot PDFs
        rt = rt[~np.isnan(rt)]
        rl = rl[~np.isnan(rl)]
        lims = np.percentile(np.hstack([rt, rl]), [99, 1])
        lims += np.ptp(lims) * np.array((-1.25, 1.25))
        bins = np.linspace(*lims, 100)
        kdt = stats.gaussian_kde(rt, .4)
        kdl = stats.gaussian_kde(rl, .4)
        hax.fill_between(bins, kdl(bins), facecolor=c, alpha=0.7,
                         edgecolor='k', lw=0.5, label='LAtools')
        hax.fill_between(bins, kdt(bins), facecolor=c, alpha=0.4,
                         edgecolor='k', lw=0.5, label='Test User')
        hax.set_ylim([0, hax.get_ylim()[-1]])
        hax.set_xlim(lims)
        hax.axvline(0, c='k', ls='dashed', alpha=0.6)
        # hax.set_yticklabels([])
        hax.set_ylabel('Density')

        # axis labels, annotations and limits
        tax.set_ylabel(e + ' (' + u + ')')
        tax.text(.02, .98, fmt_RSS(rt), fontsize=8, ha='left', va='top',
                 transform=tax.transAxes)
        lax.text(.02, .98, fmt_RSS(rl), fontsize=8, ha='left', va='top',
                 transform=lax.transAxes)

        xlim = np.percentile(x[~np.isnan(x)], [0, 98])
        for ax in [tax, lax]:
            ax.set_xlim(xlim)
            ax.set_ylim(xlim)
            ax.plot(xlim, xlim, c='k', ls='dashed', alpha=0.6)

        for ax in axs[i]:
            if ax.is_last_row():
                hax.set_xlabel('Residual')
                tax.set_xlabel('Reference User')
                lax.set_xlabel('Reference User')
                hax.legend(fontsize=8)
            if ax.is_first_row():
                tax.set_title('Manual Test User', loc='left')
                lax.set_title('LAtools Test User', loc='left')

    fig.tight_layout()
    return fig, axs
StopCond = 0
Stop = 20
NormalIDX = 20
TargetData = TrainECG[list(TrainECG.keys())[NormalIDX]]  # keys() is not indexable in Python 3
for idx, key in enumerate(TestLabel):
    if TestLabel[key] == "V":
        TargetData2 = TestECG[key]
        StopCond += 1
        if StopCond == Stop:
            break

Density_V = gaussian_kde(TargetData2)
Domain_V = np.linspace(-max(TargetData2), max(TargetData2), 1000)
Density = gaussian_kde(TargetData)
Domain = np.linspace(-max(TargetData), max(TargetData), 1000)

plt.figure()
plt.title("V")
plt.plot(Domain_V, Density_V(Domain_V))
plt.grid()

plt.figure()
plt.title("N")
plt.plot(Domain, Density(Domain))
plt.grid()
plt.show()
print(avg_v)
print(avg_celldist * avg_v)

fig = plt.figure(figsize=(18, 6))
fig.suptitle("Material Project Dataset")

ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=4)
w = 0.2
xint = np.arange(7)
ax1.bar(xint - w, avg_lv[:, 0], width=w, color='b', align='center')
ax1.bar(xint, avg_lv[:, 1], width=w, color='r', align='center')
ax1.bar(xint + w, avg_lv[:, 2], width=w, color='g', align='center')
ax1.set_xticks(xint)
ax1.set_xticklabels(('Tri-', 'Mono-', "Ortho-", "Tetra-", "Trig-", "Hexa-", "Cubic"))
plt.title("Average min/median/max lattice vector")

ax2 = plt.subplot2grid((2, 4), (1, 0), colspan=4)
xmax = rmax_list.max()
for i in range(7):
    indices = np.where(type_list == i)
    ds = rmax_list[indices]
    density = gaussian_kde(ds)
    xs = np.linspace(0, xmax, 200)
    density.covariance_factor = lambda: .25
    density._compute_covariance()
    plt.plot(xs, density(xs))
plt.legend(('Tri-', 'Mono-', "Ortho-", "Tetra-", "Trig-", "Hexa-", "Cubic"))
plt.title("Gaussian density of rmax distribution")
plt.show()
tempData = yrData.loc[yrData.Month == m]
tempData = tempData.loc[tempData.Day == d]
col = 'yr' + str(yr)
print(col)
dfTempDay[col] = tempData.Temperature.values
dfGHIDay[col] = tempData.GHI.values

#%%
from scipy import stats as st
import numpy as np

sampleKDE_F = dfTempDay.iloc[0]
my_kde = st.gaussian_kde(sampleKDE_F)
sampleKDE_F = my_kde.resample(1)[0][0]

#%%
##%% Parse Date
#tempDayData = pd.DataFrame(0, index=np.arange(24), columns=col)
##yr = 2007; #month = 7; #day = 1; #i = 0;
#for key, yrData in data.items():
#    j = 'yr' + str(key)
spread.append(p[t] / r[t] * 100)
leverage.append(a[t] / (p[t] * np.average(n_equity)))
Spread.append(spread)
Leverage.append(leverage)

##############
x = []
for j in range(iterations - cut_time - 1):
    for i in range(sim):
        x.append(j)

y_spread = []
y_leverage = []
for i in range(sim):
    for j in range(iterations - cut_time - 1):
        y_spread.append(Spread[i][j])
        y_leverage.append(Leverage[i][j])

z_spread = gaussian_kde(y_spread)(y_spread)
z_leverage = gaussian_kde(y_leverage)(y_leverage)
idx_spread = z_spread.argsort()
idx_leverage = z_leverage.argsort()
x_spread = copy.deepcopy(x)
x_leverage = copy.deepcopy(x)
# x_spread, y_spread, z_spread = x_spread[idx_spread], y_spread[idx_spread], z_spread[idx_spread]
# x_leverage, y_leverage, z_leverage = x_leverage[idx_leverage], y_leverage[idx_leverage], z_leverage[idx_leverage]

##############
fig1, ax1 = plt.subplots()
cax1 = ax1.scatter(x_spread, y_spread, c=z_spread, s=30, edgecolor='')
ax1.set_title('Price Spread')
fig1.colorbar(cax1)

fig2, ax2 = plt.subplots()
cax2 = ax2.scatter(x_leverage, y_leverage, c=z_leverage, s=30, edgecolor='')
ax2.set_title('Leverage')
def summary_plot(
    shap_values,
    features=None,
    feature_names=None,
    max_display=None,
    plot_type=None,
    color=None,
    axis_color="#333333",
    title=None,
    alpha=1,
    show=True,
    get_png=False,
    sort=True,
    color_bar=True,
    plot_size="auto",
    layered_violin_max_num_bins=20,
    class_names=None,
    class_inds=None,
    color_bar_label=labels["FEATURE_VALUE"],
    # deprecated
    auto_size_plot=None,
):
    """Create a SHAP summary plot, colored by feature values when they are provided.

    Parameters
    ----------
    shap_values : numpy.array
        For single output explanations this is a matrix of SHAP values (# samples x # features).
        For multi-output explanations this is a list of such matrices of SHAP values.

    features : numpy.array or pandas.DataFrame or list
        Matrix of feature values (# samples x # features) or a feature_names list as shorthand

    feature_names : list
        Names of the features (length # features)

    max_display : int
        How many top features to include in the plot (default is 20, or 7 for interaction plots)

    plot_type : "dot" (default for single output), "bar" (default for multi-output), "violin",
        or "compact_dot". What type of summary plot to produce. Note that "compact_dot" is only
        used for SHAP interaction values.

    plot_size : "auto" (default), float, (float, float), or None
        What size to make the plot. By default the size is auto-scaled based on the number of
        features that are being displayed. Passing a single float will cause each row to be that
        many inches high. Passing a pair of floats will scale the plot by that number of inches.
        If None is passed then the size of the current figure will be left unchanged.
    """
    # deprecation warnings
    if auto_size_plot is not None:
        warnings.warn(
            "auto_size_plot=False is deprecated and is now ignored! Use plot_size=None instead."
        )

    multi_class = False
    if isinstance(shap_values, list):
        multi_class = True
        if plot_type is None:
            plot_type = "bar"  # default for multi-output explanations
        assert plot_type == "bar", "Only plot_type = 'bar' is supported for multi-output explanations!"
    else:
        if plot_type is None:
            plot_type = "dot"  # default for single output explanations
        assert len(shap_values.shape) != 1, "Summary plots need a matrix of shap_values, not a vector."

    # default color:
    if color is None:
        if plot_type == 'layered_violin':
            color = "coolwarm"
        elif multi_class:
            color = lambda i: colors.red_blue_circle(i / len(shap_values))
        else:
            color = colors.blue_rgb

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = features.columns
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif (features is not None) and len(features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    num_features = (shap_values[0].shape[1] if multi_class else shap_values.shape[1])

    if features is not None:
        shape_msg = "The shape of the shap_values matrix does not match the shape of the " \
                    "provided data matrix."
        if num_features - 1 == features.shape[1]:
            assert False, shape_msg + " Perhaps the extra column in the shap_values matrix is the " \
                                      "constant offset? If so just pass shap_values[:,:-1]."
        else:
            assert num_features == features.shape[1], shape_msg

    if feature_names is None:
        feature_names = np.array([labels['FEATURE'] % str(i) for i in range(num_features)])

    # plotting SHAP interaction values
    if not multi_class and len(shap_values.shape) == 3:
        if plot_type == "compact_dot":
            new_shap_values = shap_values.reshape(shap_values.shape[0], -1)
            new_features = np.tile(features, (1, 1, features.shape[1])).reshape(features.shape[0], -1)
            new_feature_names = []
            for c1 in feature_names:
                for c2 in feature_names:
                    if c1 == c2:
                        new_feature_names.append(c1)
                    else:
                        new_feature_names.append(c1 + "* - " + c2)
            return summary_plot(new_shap_values, new_features, new_feature_names,
                                max_display=max_display, plot_type="dot", color=color,
                                axis_color=axis_color, title=title, alpha=alpha, show=show,
                                sort=sort, color_bar=color_bar, plot_size=plot_size,
                                class_names=class_names,
                                color_bar_label="*" + color_bar_label)
        if max_display is None:
            max_display = 7
        else:
            max_display = min(len(feature_names), max_display)

        sort_inds = np.argsort(-np.abs(shap_values.sum(1)).sum(0))

        # get plotting limits
        delta = 1.0 / (shap_values.shape[1]**2)
        slow = np.nanpercentile(shap_values, delta)
        shigh = np.nanpercentile(shap_values, 100 - delta)
        v = max(abs(slow), abs(shigh))
        slow = -v
        shigh = v

        pl.figure(figsize=(1.5 * max_display + 1, 0.8 * max_display + 1))
        pl.subplot(1, max_display, 1)
        proj_shap_values = shap_values[:, sort_inds[0], sort_inds]
        proj_shap_values[:, 1:] *= 2  # because off diag effects are split in half
        summary_plot(proj_shap_values,
                     features[:, sort_inds] if features is not None else None,
                     feature_names=feature_names[sort_inds],
                     sort=False, show=False, get_png=get_png, color_bar=False,
                     plot_size=None, max_display=max_display)
        pl.xlim((slow, shigh))
        pl.xlabel("")
        title_length_limit = 11
        pl.title(shorten_text(feature_names[sort_inds[0]], title_length_limit))
        for i in range(1, min(len(sort_inds), max_display)):
            ind = sort_inds[i]
            pl.subplot(1, max_display, i + 1)
            proj_shap_values = shap_values[:, ind, sort_inds]
            proj_shap_values *= 2
            proj_shap_values[:, i] /= 2  # because only off diag effects are split in half
            summary_plot(proj_shap_values,
                         features[:, sort_inds] if features is not None else None,
                         sort=False, get_png=get_png,
                         feature_names=["" for i in range(len(feature_names))],
                         show=False, color_bar=False, plot_size=None,
                         max_display=max_display)
            pl.xlim((slow, shigh))
            pl.xlabel("")
            if i == min(len(sort_inds), max_display) // 2:
                pl.xlabel(labels['INTERACTION_VALUE'])
            pl.title(shorten_text(feature_names[ind], title_length_limit))
        pl.tight_layout(pad=0, w_pad=0, h_pad=0.0)
        pl.subplots_adjust(hspace=0, wspace=0.1)
        if show:
            pl.show()
        return

    if max_display is None:
        max_display = 20

    if sort:
        # order features by the sum of their effect magnitudes
        if multi_class:
            feature_order = np.argsort(np.sum(np.mean(np.abs(shap_values), axis=1), axis=0))
        else:
            feature_order = np.argsort(np.sum(np.abs(shap_values), axis=0))
        feature_order = feature_order[-min(max_display, len(feature_order)):]
    else:
        feature_order = np.flip(np.arange(min(max_display, num_features)), 0)

    row_height = 0.4
    if plot_size == "auto":
        pl.gcf().set_size_inches(8, len(feature_order) * row_height + 1.5)
    elif type(plot_size) in (list, tuple):
        pl.gcf().set_size_inches(plot_size[0], plot_size[1])
    elif plot_size is not None:
        pl.gcf().set_size_inches(8, len(feature_order) * plot_size + 1.5)
    pl.axvline(x=0, color="#999999", zorder=-1)

    if plot_type == "dot":
        for pos, i in enumerate(feature_order):
            pl.axhline(y=pos, color="#cccccc", lw=0.5, dashes=(1, 5), zorder=-1)
            shaps = shap_values[:, i]
            values = None if features is None else features[:, i]
            inds = np.arange(len(shaps))
            np.random.shuffle(inds)
            if values is not None:
                values = values[inds]
            shaps = shaps[inds]
            colored_feature = True
            try:
                values = np.array(values, dtype=np.float64)  # make sure this can be numeric
            except Exception:
                colored_feature = False
            N = len(shaps)
            # hspacing = (np.max(shaps) - np.min(shaps)) / 200
            # curr_bin = []
            nbins = 100
            quant = np.round(nbins * (shaps - np.min(shaps)) /
                             (np.max(shaps) - np.min(shaps) + 1e-8))
            inds = np.argsort(quant + np.random.randn(N) * 1e-6)
            layer = 0
            last_bin = -1
            ys = np.zeros(N)
            for ind in inds:
                if quant[ind] != last_bin:
                    layer = 0
                ys[ind] = np.ceil(layer / 2) * ((layer % 2) * 2 - 1)
                layer += 1
                last_bin = quant[ind]
            ys *= 0.9 * (row_height / np.max(ys + 1))

            if features is not None and colored_feature:
                # trim the color range, but prevent the color range from collapsing
                vmin = np.nanpercentile(values, 5)
                vmax = np.nanpercentile(values, 95)
                if vmin == vmax:
                    vmin = np.nanpercentile(values, 1)
                    vmax = np.nanpercentile(values, 99)
                    if vmin == vmax:
                        vmin = np.min(values)
                        vmax = np.max(values)
                if vmin > vmax:  # fixes rare numerical precision issues
                    vmin = vmax

                assert features.shape[0] == len(shaps), \
                    "Feature and SHAP matrices must have the same number of rows!"

                # plot the nan values in the interaction feature as grey
                nan_mask = np.isnan(values)
                pl.scatter(shaps[nan_mask], pos + ys[nan_mask], color="#777777",
                           vmin=vmin, vmax=vmax, s=16, alpha=alpha, linewidth=0,
                           zorder=3, rasterized=len(shaps) > 500)

                # plot the non-nan values colored by the trimmed feature value
                cvals = values[np.invert(nan_mask)].astype(np.float64)
                cvals_imp = cvals.copy()
                cvals_imp[np.isnan(cvals)] = (vmin + vmax) / 2.0
                cvals[cvals_imp > vmax] = vmax
                cvals[cvals_imp < vmin] = vmin
                pl.scatter(shaps[np.invert(nan_mask)], pos + ys[np.invert(nan_mask)],
                           cmap=colors.red_blue, vmin=vmin, vmax=vmax, s=16,
                           c=cvals, alpha=alpha, linewidth=0, zorder=3,
                           rasterized=len(shaps) > 500)
            else:
                pl.scatter(shaps, pos + ys, s=16, alpha=alpha, linewidth=0, zorder=3,
                           color=color if colored_feature else "#777777",
                           rasterized=len(shaps) > 500)

    elif plot_type == "violin":
        for pos, i in enumerate(feature_order):
            pl.axhline(y=pos, color="#cccccc", lw=0.5, dashes=(1, 5), zorder=-1)

        if features is not None:
            global_low = np.nanpercentile(shap_values[:, :len(feature_names)].flatten(), 1)
            global_high = np.nanpercentile(shap_values[:, :len(feature_names)].flatten(), 99)
            for pos, i in enumerate(feature_order):
                shaps = shap_values[:, i]
                shap_min, shap_max = np.min(shaps), np.max(shaps)
                rng = shap_max - shap_min
                xs = np.linspace(np.min(shaps) - rng * 0.2, np.max(shaps) + rng * 0.2, 100)
                if np.std(shaps) < (global_high - global_low) / 100:
                    ds = gaussian_kde(shaps + np.random.randn(len(shaps)) *
                                      (global_high - global_low) / 100)(xs)
                else:
                    ds = gaussian_kde(shaps)(xs)
                ds /= np.max(ds) * 3

                values = features[:, i]
                window_size = max(10, len(values) // 20)
                smooth_values = np.zeros(len(xs) - 1)
                sort_inds = np.argsort(shaps)
                trailing_pos = 0
                leading_pos = 0
                running_sum = 0
                back_fill = 0
                for j in range(len(xs) - 1):
                    while leading_pos < len(shaps) and xs[j] >= shaps[sort_inds[leading_pos]]:
                        running_sum += values[sort_inds[leading_pos]]
                        leading_pos += 1
                        if leading_pos - trailing_pos > 20:
                            running_sum -= values[sort_inds[trailing_pos]]
                            trailing_pos += 1
                    if leading_pos - trailing_pos > 0:
                        smooth_values[j] = running_sum / (leading_pos - trailing_pos)
                        for k in range(back_fill):
                            smooth_values[j - k - 1] = smooth_values[j]
                    else:
                        back_fill += 1

                vmin = np.nanpercentile(values, 5)
                vmax = np.nanpercentile(values, 95)
                if vmin == vmax:
                    vmin = np.nanpercentile(values, 1)
                    vmax = np.nanpercentile(values, 99)
                    if vmin == vmax:
                        vmin = np.min(values)
                        vmax = np.max(values)

                # plot the nan values in the interaction feature as grey
                nan_mask = np.isnan(values)
                pl.scatter(shaps[nan_mask], np.ones(shap_values[nan_mask].shape[0]) * pos,
                           color="#777777", vmin=vmin, vmax=vmax, s=9, alpha=alpha,
                           linewidth=0, zorder=1)
                # plot the non-nan values colored by the trimmed feature value
                cvals = values[np.invert(nan_mask)].astype(np.float64)
                cvals_imp = cvals.copy()
                cvals_imp[np.isnan(cvals)] = (vmin + vmax) / 2.0
                cvals[cvals_imp > vmax] = vmax
                cvals[cvals_imp < vmin] = vmin
                pl.scatter(shaps[np.invert(nan_mask)],
                           np.ones(shap_values[np.invert(nan_mask)].shape[0]) * pos,
                           cmap=colors.red_blue, vmin=vmin, vmax=vmax, s=9,
                           c=cvals, alpha=alpha, linewidth=0, zorder=1)
                # smooth_values -= np.nanpercentile(smooth_values, 5)
                # smooth_values /= np.nanpercentile(smooth_values, 95)
                smooth_values -= vmin
                if vmax - vmin > 0:
                    smooth_values /= vmax - vmin
                for i in range(len(xs) - 1):
                    if ds[i] > 0.05 or ds[i + 1] > 0.05:
                        pl.fill_between([xs[i], xs[i + 1]],
                                        [pos + ds[i], pos + ds[i + 1]],
                                        [pos - ds[i], pos - ds[i + 1]],
                                        color=colors.red_blue_no_bounds(smooth_values[i]),
                                        zorder=2)
        else:
            parts = pl.violinplot(shap_values[:, feature_order], range(len(feature_order)),
                                  points=200, vert=False, widths=0.7, showmeans=False,
                                  showextrema=False, showmedians=False)
            for pc in parts['bodies']:
                pc.set_facecolor(color)
                pc.set_edgecolor('none')
                pc.set_alpha(alpha)

    elif plot_type == "layered_violin":  # courtesy of @kodonnell
        num_x_points = 200
        # the indices of the feature data corresponding to each bin
        bins = np.linspace(0, features.shape[0],
                           layered_violin_max_num_bins + 1).round(0).astype('int')
        shap_min, shap_max = np.min(shap_values), np.max(shap_values)
        x_points = np.linspace(shap_min, shap_max, num_x_points)

        # loop through each feature and plot:
        for pos, ind in enumerate(feature_order):
            # decide how to handle: if #unique < layered_violin_max_num_bins then split by
            # unique value, otherwise use bins/percentiles. To keep the code simpler, in the
            # case of uniques we just adjust the bins to align with the unique counts.
            feature = features[:, ind]
            unique, counts = np.unique(feature, return_counts=True)
            if unique.shape[0] <= layered_violin_max_num_bins:
                order = np.argsort(unique)
                thesebins = np.cumsum(counts[order])
                thesebins = np.insert(thesebins, 0, 0)
            else:
                thesebins = bins
            nbins = thesebins.shape[0] - 1
            # order the feature data so we can apply percentiling
            order = np.argsort(feature)
            # x axis is located at y0 = pos, with pos being there for offset
            y0 = np.ones(num_x_points) * pos
            # calculate kdes:
            ys = np.zeros((nbins, num_x_points))
            for i in range(nbins):
                # get shap values in this bin:
                shaps = shap_values[order[thesebins[i]:thesebins[i + 1]], ind]
                # if there's only one element, then we can't
                if shaps.shape[0] == 1:
                    warnings.warn(
                        "not enough data in bin #%d for feature %s, so it'll be ignored."
                        " Try increasing the number of records to plot." % (i, feature_names[ind]))
                    # to ignore it, just set it to the previous y-values
                    # (so the area between them will be zero).
Note ys is already 0, so there's # nothing to do if i == 0 if i > 0: ys[i, :] = ys[i - 1, :] continue # save kde of them: note that we add a tiny bit of gaussian noise to avoid singular matrix errors ys[i, :] = gaussian_kde(shaps + np.random.normal( loc=0, scale=0.001, size=shaps.shape[0]))(x_points) # scale it up so that the 'size' of each y represents the size of the bin. For continuous data this will # do nothing, but when we've gone with the unique option, this will matter - e.g. if 99% are male and 1% # female, we want the 1% to appear a lot smaller. size = thesebins[i + 1] - thesebins[i] bin_size_if_even = features.shape[0] / nbins relative_bin_size = size / bin_size_if_even ys[i, :] *= relative_bin_size # now plot 'em. We don't plot the individual strips, as this can leave whitespace between them. # instead, we plot the full kde, then remove outer strip and plot over it, etc., to ensure no # whitespace ys = np.cumsum(ys, axis=0) width = 0.8 scale = ys.max( ) * 2 / width # 2 is here as we plot both sides of x axis for i in range(nbins - 1, -1, -1): y = ys[i, :] / scale c = pl.get_cmap(color)( i / (nbins - 1) ) if color in pl.cm.datad else color # if color is a cmap, use it, otherwise use a color pl.fill_between(x_points, pos - y, pos + y, facecolor=c) pl.xlim(shap_min, shap_max) elif not multi_class and plot_type == "bar": feature_inds = feature_order[:max_display] y_pos = np.arange(len(feature_inds)) global_shap_values = np.abs(shap_values).mean(0) pl.barh(y_pos, global_shap_values[feature_inds], 0.7, align='center', color=color) pl.yticks(y_pos, fontsize=13) pl.gca().set_yticklabels([feature_names[i] for i in feature_inds]) elif multi_class and plot_type == "bar": if class_names is None: class_names = ["Class " + str(i) for i in range(len(shap_values))] feature_inds = feature_order[:max_display] y_pos = np.arange(len(feature_inds)) left_pos = np.zeros(len(feature_inds)) if class_inds is None: class_inds = np.argsort([ -np.abs(shap_values[i]).mean() for i in range(len(shap_values)) ]) elif class_inds == "original": class_inds = range(len(shap_values)) for i, ind in enumerate(class_inds): global_shap_values = np.abs(shap_values[ind]).mean(0) pl.barh(y_pos, global_shap_values[feature_inds], 0.7, left=left_pos, align='center', color=color(i), label=class_names[ind]) left_pos += global_shap_values[feature_inds] pl.yticks(y_pos, fontsize=13) pl.gca().set_yticklabels([feature_names[i] for i in feature_inds]) pl.legend(frameon=False, fontsize=12) # draw the color bar if color_bar and features is not None and plot_type != "bar" and \ (plot_type != "layered_violin" or color in pl.cm.datad): import matplotlib.cm as cm m = cm.ScalarMappable( cmap=colors.red_blue if plot_type != "layered_violin" else pl.
get_cmap(color)) m.set_array([0, 1]) cb = pl.colorbar(m, ticks=[0, 1], aspect=1000) cb.set_ticklabels( [labels['FEATURE_VALUE_LOW'], labels['FEATURE_VALUE_HIGH']]) cb.set_label(color_bar_label, size=12, labelpad=0) cb.ax.tick_params(labelsize=11, length=0) cb.set_alpha(1) cb.outline.set_visible(False) bbox = cb.ax.get_window_extent().transformed( pl.gcf().dpi_scale_trans.inverted()) cb.ax.set_aspect((bbox.height - 0.9) * 20) # cb.draw_all() pl.gca().xaxis.set_ticks_position('bottom') pl.gca().yaxis.set_ticks_position('none') pl.gca().spines['right'].set_visible(False) pl.gca().spines['top'].set_visible(False) pl.gca().spines['left'].set_visible(False) pl.gca().tick_params(color=axis_color, labelcolor=axis_color) pl.yticks(range(len(feature_order)), [feature_names[i] for i in feature_order], fontsize=13) if plot_type != "bar": pl.gca().tick_params('y', length=20, width=0.5, which='major') pl.gca().tick_params('x', labelsize=11) pl.ylim(-1, len(feature_order)) if plot_type == "bar": pl.xlabel(labels['GLOBAL_VALUE'], fontsize=13) else: pl.xlabel(labels['VALUE'], fontsize=13) if show: pl.show() if get_png: file = BytesIO() pl.savefig(file, format='png', bbox_inches="tight") return file
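# A minimal, hedged usage sketch for the summary_plot above, assuming the
# published shap package together with scikit-learn; the model, dataset,
# and class index below are illustrative, not from the source.
import shap
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

data = load_breast_cancer()
model = RandomForestClassifier(n_estimators=50).fit(data.data, data.target)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(data.data)
# older shap versions return a list with one array per class; newer ones
# may return a single 3-D array, hence the hedge below
sv = shap_values[1] if isinstance(shap_values, list) else shap_values[:, :, 1]
shap.summary_plot(sv, data.data, feature_names=data.feature_names)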
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
x_eval = np.linspace(-10, 10, num=200)
kde1 = stats.gaussian_kde(x1)
kde2 = stats.gaussian_kde(x1, bw_method='silverman')

def my_kde_bandwidth(obj, fac=1. / 5):
    """We use Scott's Rule, multiplied by a constant factor."""
    return np.power(obj.n, -1. / (obj.d + 4)) * fac

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(x1, np.zeros(x1.shape), 'b+', ms=20)  # rug plot
ax.plot(x_eval, kde1(x_eval), 'k-', label="Scott's Rule")
ax.plot(x_eval, kde2(x_eval), 'r-', label="Silverman's Rule")
kde3 = stats.gaussian_kde(x1, bw_method=my_kde_bandwidth)
ax.plot(x_eval, kde3(x_eval), 'g-', label="With smaller BW")
ax.legend()
plt.show()
def twod_kde(x, y):
    """2-D Gaussian KDE of (x, y), evaluated on a 100x100 grid that pads
    each axis by 10% of its data range (a multiplicative pad such as
    min*0.9/max*1.1 misbehaves when the data are negative)."""
    dx = 0.1 * (x.max() - x.min())
    dy = 0.1 * (y.max() - y.min())
    X, Y = np.mgrid[x.min() - dx:x.max() + dx:100j,
                    y.min() - dy:y.max() + dy:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([x, y])
    kernel = gaussian_kde(values)
    return X, Y, np.reshape(kernel(positions).T, X.shape)
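# Hedged usage sketch for twod_kde on synthetic, correlated data: the KDE
# surface is drawn as filled contours with the raw points on top.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
x = rng.normal(loc=3.0, size=500)
y = 0.5 * x + rng.normal(scale=0.5, size=500)
X, Y, Z = twod_kde(x, y)
plt.contourf(X, Y, Z, levels=20)
plt.scatter(x, y, s=2, c='k')
plt.show()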
def plot_errors(model, X_test, y_test, scaler_l): # plot MAPEs predicted_labels = _back_scaling(model.predict(X_test), scaler_l.data_min_, scaler_l.data_max_) true_labels = _back_scaling(y_test, scaler_l.data_min_, scaler_l.data_max_) # compute mape for each value res = {} means = {} stdvs = {} for index, value in enumerate(predicted_labels[:, opt.window - 1, :]): if int(true_labels[index, opt.window - 1, 0]) in res: res[int(true_labels[index, opt.window - 1, 0])].append( np.abs(predicted_labels[index, opt.window - 1, 0] - true_labels[index, opt.window - 1, 0]) / true_labels[index, opt.window - 1, 0]) else: res[int(true_labels[index, opt.window - 1, 0])] = [ np.abs(predicted_labels[index, opt.window - 1, 0] - true_labels[index, opt.window - 1, 0]) / true_labels[index, opt.window - 1, 0] ] for i, j in res.items(): means[i] = np.sum(j) / len(j) * 100 stdvs[i] = np.std(j) * 100 lists_means = sorted(means.items()) lists_stdvs = sorted(stdvs.items()) x1, y1 = zip(*lists_means) x2, y2 = zip(*lists_stdvs) density = stats.gaussian_kde(true_labels[:, opt.window - 1, :].flatten()) s = np.sum(true_labels[:, opt.window - 1, :].flatten()) fig = plt.figure(figsize=(3, 2)) ax = fig.add_subplot(111) ax.set_xlabel('# Flows / Sequence') lns1 = ax.plot(x1, density(x1) * s, label='Counts') ax2 = ax.twinx() lns2 = ax2.plot(x1, y1, color='tab:red', label='MAPE (%)', linestyle='--', linewidth=2, alpha=0.7) lns2b = ax2.fill_between(x1, np.array(y1) - np.array(y2), np.array(y1) + np.array(y2), color='r', alpha=0.2) lns = lns1 + lns2 labs = [l.get_label() for l in lns] ax.legend(lns, labs, loc=0) ax.grid(alpha=0.4) ax.set_xlabel('# Flows / Sequence') ax.set_ylabel('Counts') ax2.set_ylabel('MAPE (%)') ax2.set_ylim(0, 100) plt.savefig("errors_{}_{}.pdf".format( opt.window, datetime.now().strftime("%Y%m%d-%H%M%S")), bbox_inches='tight')
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j] t1 = time.time() print("Grid mesh: ", t1-t0) t2 = time.time() positions = np.vstack([xx.ravel(), yy.ravel()]) t3 = time.time() print("Positions: ", t3-t2) t4 = time.time() values = np.vstack([x, y]) t5 = time.time() print("Values: ", t5-t4) t6 = time.time() kernel = st.gaussian_kde(values) t7 = time.time() print("Kernel: ", t7-t6) t8 = time.time() f = np.reshape(kernel(positions).T, xx.shape) t9 = time.time() print("Reshape: ", t9-t8) # print(f.shape) cdict1 = {'blue': ((0.00, 1.0, 1.0), (0.10, 1.0, 1.0), (0.20, 1.0, 1.0), (0.40, 1.0, 1.0), (0.60, 1.0, 1.0),
#True Peak location pt = np.array([np.round(rib / 2), np.round(rib / 2)]) #Setting the background and the consequent point source flux bkg = 100 bkg_arr = np.random.poisson(np.ones_like(X) * bkg, X.shape) flux = 100 * (1 / 0.001) sig = 1.5 #Draw PSF from peaks with 1.5 pixel falloff, and add background d = flux * np.exp((-(pt[0] - Y)**2 - (pt[1] - X)**2) / sig**2) d += bkg_arr #Adding poissoninan noise to bkg '''Estimate background instead using the KDE method''' dd = d[d < np.nanpercentile(d, [75])] kernel = stats.gaussian_kde(dd.flatten(), bw_method='scott') alpha = np.linspace(dd.min(), dd.max(), 10000) bkg_est[idx] = alpha[np.argmax(kernel(alpha))] # bkg_est[idx] = np.median(d[d < np.nanpercentile(d,[20])]) if plots_on: if idx == 0 or idx == 3: fig, ax = plt.subplots() c = ax.imshow(d) fig.colorbar(c, label='Flux (arbitrary units)') ax.set_xlabel('Pixel #') ax.set_ylabel('Pixel #') ax.set_title('Total pixels: ' + str(numpix[idx])) plt.show()
def ALvsNLcomp19(): fig = plt.figure() ax1 = fig.add_axes([0, 0, 0.5, 0.9], xlim=(0.35, 0.51)) ax2 = fig.add_axes([0.6, 0, 0.5, 0.9], xlim=(3, 6)) fig.suptitle('American vs National League Comparisons, 2019 Season') ax1.set_xlabel('AL vs NL Slugging Percentage (%)') ax2.set_xlabel('AL vs NL Earned Run Average') ax1.hist([AL19['SLG'], NL19['SLG']], bins=8, label=['AL', 'NL'], linewidth=1, density=True, alpha=0.4, edgecolor='black', align='right') ax2.hist([AL19['ERA'], NL19['ERA']], bins=8, label=['AL', 'NL'], linewidth=1, density=True, alpha=0.4, edgecolor='black', align='right') En, Ex = min(AL19['ERA']) - 0.2 * (np.mean(AL19['ERA'])), max( AL19['ERA']) + 0.1 * (np.mean(AL19['ERA'])) Bn, Bx = min(AL19['SLG']) - 0.2 * (np.mean(AL19['SLG'])), max( AL19['SLG']) + 0.2 * (np.mean(AL19['SLG'])) kde_BA = np.linspace(Bn, Bx, 301) kde_ERA = np.linspace(En, Ex, 301) AL_BA_kde19 = st.gaussian_kde(AL19['SLG'].dropna()) NL_BA_kde19 = st.gaussian_kde(NL19['SLG'].dropna()) AL_ERA_kde19 = st.gaussian_kde(AL19['ERA'].dropna()) NL_ERA_kde19 = st.gaussian_kde(NL19['ERA'].dropna()) ax1.plot(kde_BA, AL_BA_kde19.pdf(kde_BA), color='blue', linewidth=2, alpha=0.8) ax1.plot(kde_BA, NL_BA_kde19.pdf(kde_BA), color='orange', linewidth=2.5, alpha=0.95) ax2.plot(kde_ERA, AL_ERA_kde19.pdf(kde_ERA), color='blue', linewidth=2, alpha=0.8) ax2.plot(kde_ERA, NL_ERA_kde19.pdf(kde_ERA), color='orange', linewidth=2, alpha=0.8) ax1.xaxis.set_ticks(np.arange(0.36, 0.50, 0.02)) ax2.xaxis.set_ticks(np.arange(3.0, 6.0, 0.5)) ax1.legend(loc='upper left') ax2.legend(loc='upper left') plt.show() plt.close()
def main(args): # matplotlib settings plt.rc('font', family='serif') if args.two_col: plt.rc('xtick', labelsize=11) plt.rc('ytick', labelsize=11) plt.rc('axes', labelsize=11) plt.rc('axes', titlesize=11) plt.rc('legend', fontsize=11) plt.rc('legend', title_fontsize=11) plt.rc('lines', linewidth=1) plt.rc('lines', markersize=3) width = 3.25 # Two column style width, height = set_size(width=width * 2, fraction=1, subplots=(2, 2)) fig, axs = plt.subplots(2, 2, figsize=(width, height * 1.25)) else: # matplotlib settings plt.rc('font', family='serif') plt.rc('xtick', labelsize=18) plt.rc('ytick', labelsize=18) plt.rc('axes', labelsize=21) plt.rc('axes', titlesize=21) plt.rc('legend', fontsize=19) plt.rc('legend', title_fontsize=11) plt.rc('lines', linewidth=1) plt.rc('lines', markersize=6) width = 5.5 # Neurips 2020 width, height = set_size(width=width * 3, fraction=1, subplots=(1, 4)) fig, axs = plt.subplots(1, 4, figsize=(width, height * 1.25)) axs = axs.flatten() results = get_results(args) train_feature_vals = results['train_feature_vals'] train_feature_bins = results['train_feature_bins'] train_pos_ndx = results['train_pos_ndx'] train_neg_ndx = results['train_neg_ndx'] train_weight = results['train_weight'] train_sim = results['train_sim'] feature_name = results['target_feature'] test_val = results['test_val'] train_sim_weight = train_weight * train_sim # gamma vs alpha print('plotting gamma vs alpha...') ax = axs[0] xy = np.vstack([train_weight, train_sim]) z = gaussian_kde(xy)(xy) ax.scatter(train_weight, train_sim, c=z, s=20, edgecolor='', rasterized=args.rasterize) ax.axhline(0, color='k') ax.axvline(0, color='k') ax.set_ylabel(r'$\gamma$') ax.set_xlabel(r'$\alpha \hat{y}$') # unweighted print('plotting unweighted...') ax = axs[1] ax.hist(train_feature_vals[train_pos_ndx], bins=train_feature_bins, color='g', hatch='.', alpha=args.alpha, label='positive instances') ax.hist(train_feature_vals[train_neg_ndx], bins=train_feature_bins, color='r', hatch='\\', alpha=args.alpha, label='negative instances') ax.axvline(test_val, color='k', linestyle='--') ax.set_xlabel(feature_name.capitalize()) ax.set_ylabel('Density') ax.set_title('Unweighted') ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) ax.tick_params(axis='both', which='major') # weighted by TREX's global weights print('plotting weighted by global weights...') ax = axs[2] ax.hist(train_feature_vals[train_pos_ndx], bins=train_feature_bins, color='g', hatch='.', alpha=args.alpha, weights=train_weight[train_pos_ndx]) ax.hist(train_feature_vals[train_neg_ndx], bins=train_feature_bins, color='r', hatch='\\', alpha=args.alpha, weights=train_weight[train_neg_ndx]) ax.axvline(test_val, color='k', linestyle='--') ax.set_ylabel('Density') ax.set_xlabel(feature_name.capitalize()) ax.set_title(r'Weighted by $\alpha \hat{y}$') ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) ax.tick_params(axis='both', which='major') # weighted by TREX's global weights * similarity to the test instance print('plotting weighted by weight * similarity...') train_sim_weight = train_weight * train_sim ax = axs[3] ax.hist(train_feature_vals[train_pos_ndx], bins=train_feature_bins, color='g', hatch='.', alpha=args.alpha, weights=train_sim_weight[train_pos_ndx], label='pos samples') ax.hist(train_feature_vals[train_neg_ndx], bins=train_feature_bins, color='r', hatch='\\', alpha=args.alpha, weights=train_sim_weight[train_neg_ndx], label='neg samples') ax.axvline(test_val, color='k', linestyle='--') ax.legend(frameon=False) ax.set_ylabel('Density') 
ax.set_xlabel(feature_name.capitalize()) ax.set_title(r'Weighted by $\alpha \hat{y} \gamma$') ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0)) ax.tick_params(axis='both', which='major') # save plot out_dir = os.path.join(args.out_dir, args.tree_kernel) os.makedirs(out_dir, exist_ok=True) plt.tight_layout() if not args.two_col: fig.subplots_adjust(wspace=0.25, hspace=0.05) plt.savefig(os.path.join(out_dir, 'misclassification.{}'.format(args.ext)))
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stat

df = pd.DataFrame({
    'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
    'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000]
})
salary = df['Salary']
salary.plot.hist(title='Salary Distribution', color='lightblue', bins=25)
plt.axvline(salary.mean(), color='magenta', linestyle='dashed', linewidth=2)
plt.axvline(salary.median(), color='green', linestyle='dashed', linewidth=2)
#plt.show()

df = pd.DataFrame({
    'Test': [172, 174, 176, 172, 172, 173, 176, 172, 177, 174, 176, 175, 176,
             169, 175, 174, 174, 174, 175, 173, 171, 171, 175, 175, 173, 175, 175]
})
test = df["Test"]
test.plot.hist(title='Test')
density = stat.gaussian_kde(test)
n, x, _ = plt.hist(test, histtype='step', density=True, bins=10)
plt.plot(x, density(x) * 5)
plt.show()
print(df["Test"].std())
print(df['Test'].median())
print(df.describe())
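# The same smoothed curve can come straight from pandas, which wraps
# scipy's gaussian_kde behind Series.plot.kde (a sketch, not from the source).
test.plot.kde(title='Test (KDE)')
plt.show()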
def kde_estimation(data, est_x, n_data, bin_interval): estimator = stats.gaussian_kde(data) kde = estimator(est_x) kde = kde * n_data * bin_interval return kde
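# Sketch of how kde_estimation lines up with a count histogram: scaling the
# density by n_data * bin_interval converts it to expected counts per bin
# (synthetic data, not from the source).
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

data = np.random.default_rng(1).normal(size=300)
bin_interval = 0.5
bins = np.arange(-4.0, 4.0 + bin_interval, bin_interval)
est_x = np.linspace(-4.0, 4.0, 200)
plt.hist(data, bins=bins)
plt.plot(est_x, kde_estimation(data, est_x, len(data), bin_interval))
plt.show()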
###############
#
# Translate R to Python Copyright (c) 2016 Masahiro Imai Released under the MIT license
#
###############
import pandas
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import numpy as np
import seaborn

fish = pandas.read_csv('2-2-1-fish.csv')
print(fish.head())

kde = gaussian_kde(fish['length'])
fig = plt.figure(figsize=(10, 5))
x_grid = np.linspace(0, max(fish['length']), num=100)
weights = np.ones_like(fish['length']) / float(len(fish['length']))

ax1 = fig.add_subplot(1, 2, 1)
ax1.hist(fish['length'], weights=weights)
ax1.set_xlabel('length')
ax1.set_ylabel('count')

ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(x_grid, kde(x_grid))
ax2.set_xlabel('length')
ax2.set_ylabel('density')
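# seaborn (imported above) can draw the same histogram-plus-KDE in a single
# call; this assumes seaborn >= 0.11, where histplot replaced distplot.
plt.figure()
seaborn.histplot(fish['length'], kde=True)
plt.show()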
def test_kde_integer_input(): """Regression test for #1181.""" x1 = np.arange(5) kde = stats.gaussian_kde(x1) y_expected = [0.13480721, 0.18222869, 0.19514935, 0.18222869, 0.13480721] assert_array_almost_equal(kde(x1), y_expected, decimal=6)
def getPz(args): ''' Compute the P(z) from nearest neighbors P(z)'s given a reference (training) file. INPUT - fileInNames: fits file names: inputFile^trainingFile - keys: column names describing dimensions, example; "MAG_I,MAG_G-MAG_R,MAG_R-MAG_I" - selection: selection strings inputSelect^trainingSelect NOTES: - both the photoz_file and reference_file must contain the same column names as defined by "keys" OUTPUT - P(z)'s of input file objects + input columns if merge_with_input is set ''' """ options """ # for large dataset sys.setrecursionlimit(100000) from sklearn.neighbors import KernelDensity verbose = True fileInName = args.input.split("^") if args.select is not None: fileInSelect = args.select.split("^") else: fileInSelect = [None for f in fileInName] if verbose: sys.stderr.write("Reading input files...") """ input file """ sample, sampleSelect = getCols(fileInName[0], args.keys.split(","), selection=fileInSelect[0], array=True) if args.keys_err is not None: sample_err, _ = getCols(fileInName[0], args.keys_err.split(","), selection=fileInSelect[0], array=True) (Nsample, Ndim) = sample.shape sample[np.logical_not(np.isfinite(sample))] = 0.0 # set NaNs and inf to 0 if Nsample == 0: raise ValueError("The input file is empty, exiting...") """ training file """ ref, select = getCols(fileInName[1], args.keys_ref.split(","), selection=fileInSelect[1], array=True) if args.keys_err is not None: ref_err, select = getCols(fileInName[1], args.keys_err.split(","), selection=fileInSelect[1], array=True) (Nref, Ndim) = ref.shape ref[np.logical_not(np.isfinite(ref))] = 0.0 # set NaNs and inf to 0 # for reference: histo or sum of PDFs if args.PDF_histo: # if histogram, set bins bins = np.linspace(0.0, 6.0, num=601, endpoint=True) PDF_ref_bins = 0.5 * (bins[1:] + bins[:-1]) # get reference redshifts and weights # set with -keys_histo redshift,weight keys_histo = args.keys_histo.split(",") if len(keys_histo) > 1: #[z_ref, weight, source], select = getCols(fileInName[1], keys_histo, selection=fileInSelect[1]) [z_ref, weight], select = getCols(fileInName[1], keys_histo, selection=fileInSelect[1]) else: [z_ref], select = getCols(fileInName[1], keys_histo, selection=fileInSelect[1]) weight = np.ones(len(z_ref)) else: # if PDF, recover PDF from reference PDF_ref, PDF_ref_bins = getPDF(fileInName[1], normalise=args.no_norm, select=select, PDF_key="PDF_L15") weight = np.ones(len(PDF_ref)) if verbose: sys.stderr.write("done\n") """ Build tree of colors for reference """ if verbose: sys.stderr.write("Building reference tree...") tree = spatial.KDTree(ref) if verbose: sys.stderr.write("done\n") # does not work: # import pickle # pickle.dump(tree, open("ref.pickle", 'w+')) # tree = pickle.load(open("ref.pickle", 'rb')) # return """ main loop """ # tests Nsample = 1000 Nnei = 50 kde = False # see https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/ # output arrays pofz = np.zeros((Nsample, len(PDF_ref_bins)), dtype=np.float32) if args.PDF_histo: zTrain = np.zeros((Nsample, Nnei), dtype=np.float32) if len(keys_histo) > 1: wTrain = np.zeros((Nsample, Nnei), dtype=np.float32) # sTrain = np.zeros((Nsample, Nnei), dtype=np.float32) est = collections.OrderedDict() for name in [ 'zmean', 'zmode', 'zmedian', 'z_std', 'zl95', 'zl68', 'zh68', 'zh95', 'z_mc', 'zconf' ]: est[name] = np.zeros(Nsample) - 99.0 zmin = PDF_ref_bins[0] zmax = PDF_ref_bins[-1] for i in range(Nsample): # for i in range(1): # photometry failure if abs(np.sum(sample[i, :])) < EPS: continue # find nearest neighbors (d, indices) = 
tree.query(sample[i, :], Nnei) # associated errors derr = np.ones(Nnei) if args.keys_err is not None: for n, j in enumerate(indices): err = np.sum(sample_err[i, :] + ref_err[j, :]) if err > EPS: derr[n] = err # weight: 1/distance * 1/err * input_weight w = np.ones(len(indices)) for n, j in enumerate(indices): if d[n] > EPS: w[n] = 1.0 / d[n] * 1.0 / derr[n] * weight[j] if args.PDF_histo: zTrain[i, :] = z_ref[indices] if len(keys_histo) > 1: wTrain[i, :] = w # sTrain[i, :] = source[indices] if kde: z_weighted = weightedSample(z_ref[indices], w) std = np.std(z_weighted) if (std < 1.e-4) | (len(z_weighted) < 2): pofz[i, :], _ = np.histogram(z_ref[indices], bins=bins, density=True, weights=w) else: density = gaussian_kde(z_weighted, bw_method=0.03 / std) pofz[i, :] = density.pdf(PDF_ref_bins) # density = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(z_weighted[:, np.newaxis]) # pofz[i, :] = density.score_samples(PDF_ref_bins[:, np.newaxis]) else: pofz[i, :], _ = np.histogram(z_ref[indices], bins=bins, density=True, weights=w) else: for n, j in enumerate(indices): if (d[n] > EPS) & (np.sum(PDF_ref[j, :]) > EPS): pofz[i, :] += PDF_ref[j, :] * w[n] # PDF_sample_inter = np.interp(PDF_ref_bins, PDF_sample_bins, PDF_sample[i,:]) # pofz[i,:] *= PDF_sample_inter # normalise PDF norm = int_trapz(PDF_ref_bins, pofz[i, :], PDF_ref_bins[0], PDF_ref_bins[-1]) if norm > EPS: pofz[i, :] /= norm est['zmean'][i] = int_trapz(PDF_ref_bins, pofz[i, :] * PDF_ref_bins, zmin, zmax) est['zmode'][i] = max_pos_PDF(PDF_ref_bins, pofz[i, :]) est['zmedian'][i] = medianfromDist(PDF_ref_bins, pofz[i, :]) est['z_std'][i] = np.sqrt( int_trapz( PDF_ref_bins, pofz[i, :] * pow(PDF_ref_bins - est['zmean'][i], 2.0), zmin, zmax)) est['zl95'][i] = sampleFromDist(PDF_ref_bins, pofz[i, :], q=0.05 / 2.0) est['zl68'][i] = sampleFromDist(PDF_ref_bins, pofz[i, :], q=0.32 / 2.0) est['zh68'][i] = sampleFromDist(PDF_ref_bins, pofz[i, :], q=1.0 - 0.32 / 2.0) est['zh95'][i] = sampleFromDist(PDF_ref_bins, pofz[i, :], q=1.0 - 0.05 / 2.0) est['z_mc'][i] = sampleFromDist(PDF_ref_bins, pofz[i, :]) est['zconf'][i] = int_trapz( PDF_ref_bins, pofz[i, :], est['zmedian'][i] - 0.03 * (1.0 + est['zmedian'][i]), est['zmedian'][i] + 0.03 * (1.0 + est['zmedian'][i])) # for name in est.keys(): # print "{0:s}:{1}".format(name,est[name][i]) # test #z_weighted = weightedSample(PDF_ref_bins[pofz[i,:]>EPS], pofz[i,:][pofz[i,:]>EPS]) #density = gaussian_kde(z_weighted) #pofz_KDE = density.pdf(PDF_ref_bins) #z_median[i] = medianfromDist(PDF_ref_bins, pofz_KDE) if verbose: if (i + 1) % 1000 == 0: sys.stderr.write("\r" + "P(z): computed {0:d} objects".format(i + 1)) sys.stderr.flush() if verbose: sys.stderr.write("\r" + "P(z): computed {0:d} objects\n".format(i + 1)) """ write output file """ if verbose: sys.stderr.write("Writing output file...") cols = [] if args.key_id is not None: [ID], _ = getCols(fileInName[0], [args.key_id], select=fileInSelect[0]) cols.append(fits.Column(name="ID", format='K', array=ID)) for name in est.keys(): cols.append(fits.Column(name=name, format='E', array=est[name])) #cols.append(fits.Column(name='zmedian', format='E', array=zmedian)) cols.append( fits.Column(name='PDF', format=str(len(PDF_ref_bins)) + 'E', array=pofz)) if args.PDF_histo: cols.append( fits.Column(name='zTrain', format=str(Nnei) + 'E', array=zTrain)) if len(keys_histo) > 1: cols.append( fits.Column(name='wTrain', format=str(Nnei) + 'E', array=wTrain)) # cols.append(fits.Column(name='sTrain', format=str(Nnei)+'I', array=sTrain)) cols_bins = [] # 
cols_bins.append(fits.Column(name='PDF', format=str(len(PDF_ref_bins))+'E', array=[PDF_ref_bins])) cols_bins.append(fits.Column(name='BINS', format='E', array=PDF_ref_bins)) # cols_bins.append(fits.Column(name='Z_MIN', format='E', array=[PDF_ref_bins[0]])) # cols_bins.append(fits.Column(name='Z_MAX', format='E', array=[PDF_ref_bins[-1]])) # cols_bins.append(fits.Column(name='DELTA_Z', format='E', array=[PDF_ref_bins[1]-PDF_ref_bins[0]])) if args.merge_with_input: fileIn = fits.open(fileInName[0]) hdu_0 = fileIn[0] if sampleSelect is not None: for c in fileIn[1].columns: cols.append( fits.Column(name=c.name, format=c.format, array=fileIn[1].data[c.name][sampleSelect])) hdu_1 = fits.BinTableHDU.from_columns(fits.ColDefs(cols)) else: hdu_1 = fits.BinTableHDU.from_columns(fileIn[1].columns + fits.ColDefs(cols)) if len(fileIn) > 2: hdu_2 = fits.BinTableHDU.from_columns(fileIn[2].columns + fits.ColDefs(cols_bins)) else: hdu_2 = fits.BinTableHDU.from_columns(fits.ColDefs(cols_bins)) else: hdu_0 = fits.PrimaryHDU() hdu_1 = fits.BinTableHDU.from_columns(fits.ColDefs(cols)) hdu_2 = fits.BinTableHDU.from_columns(fits.ColDefs(cols_bins)) hdu_1.header["EXTNAME"] = "DATA" hdu_2.header["EXTNAME"] = "BINS" if args.PDF_histo: hdu_1.header["z_min"] = bins[0] hdu_1.header["z_max"] = bins[-1] hdu_1.header["delta_z"] = bins[1] - bins[0] else: hdu_1.header["z_min"] = PDF_ref_bins[0] hdu_1.header["z_max"] = PDF_ref_bins[-1] hdu_1.header["delta_z"] = PDF_ref_bins[1] - PDF_ref_bins[0] tbhdu = fits.HDUList([hdu_0, hdu_1, hdu_2]) tbhdu.writeto(args.output, overwrite=True) if args.merge_with_input: fileIn.close() if verbose: sys.stderr.write("done\n") if args.plot: [zs], _ = getCols(fileInName[0], ["redshift"], select=fileInSelect[0]) print("Stats (scatter, eta, bias, eta_2sig, N) =", stats(est['zmedian'], zs, [0.0, 6.0])) # PDF_sample, PDF_sample_bins = getPDF(fileInName[0], normalise=args.no_norm, PDF_key="PDF_lephare") # plotPDF(pofz, PDF_ref_bins, Nsample, "PDF.pdf", zs=zs, zp=est['zmedian'], PDF2=[PDF_sample, PDF_sample_bins, "lephare"]) plotPDF(pofz, PDF_ref_bins, Nsample, "PDF.pdf", zs=zs, zp=est['zmedian']) return
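# Isolated sketch of the `kde` branch inside the loop above: neighbours are
# resampled according to their weights, then smoothed with a roughly
# absolute bandwidth of 0.03 in redshift. `weightedSample` is approximated
# here with np.random.choice; the numbers are synthetic.
import numpy as np
from scipy.stats import gaussian_kde

z_nei = np.array([0.31, 0.33, 0.35, 0.40])
w = np.array([4.0, 2.0, 1.0, 1.0])
z_weighted = np.random.choice(z_nei, size=200, p=w / w.sum())
density = gaussian_kde(z_weighted, bw_method=0.03 / np.std(z_weighted))
pofz_bins = np.linspace(0.0, 6.0, 601)
pofz = density.pdf(pofz_bins)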
import matplotlib
matplotlib.rc('font', family='Arial')
import matplotlib.pyplot as plt
import pickle
import numpy as np
from scipy.stats import gaussian_kde

with open('flu_preds.pkl', 'rb') as f:
    antigen, predicted, Y = pickle.load(f)

# with open('ebv_preds.pkl','rb') as f:
#     antigen,predicted,Y = pickle.load(f)
#
# with open('mart1_preds.pkl','rb') as f:
#     antigen,predicted,Y = pickle.load(f)

x = predicted
y = Y
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)
r = np.argsort(z)
x, y, z = x[r], y[r], z[r]

plt.figure(figsize=(6, 5))
plt.scatter(x, y, s=15, c=z, cmap=plt.cm.jet)
plt.title(antigen, fontsize=18)
plt.xlim([0, 10])
plt.ylim([0, 10])
plt.xlabel('Predicted', fontsize=24)
plt.ylabel('Log2(counts+1)', fontsize=24)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.subplots_adjust(bottom=0.15)
plt.savefig(antigen + '.png', dpi=1200)
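# Sketch of the density-coloured scatter trick used above, on synthetic data:
# evaluate a 2-D KDE at every point, then sort so the densest points are
# drawn last (on top).
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

rng = np.random.default_rng(4)
x = rng.normal(size=1000)
y = x + rng.normal(scale=0.5, size=1000)
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)
order = z.argsort()
plt.scatter(x[order], y[order], c=z[order], s=10, cmap=plt.cm.jet)
plt.show()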
def kde(x, x_grid, bandwidth=0.2): "Kernel-Density Estimate using Gaussian Kernels." kde = gaussian_kde(x, bw_method=bandwidth / x.std(ddof=1)) return kde.evaluate(x_grid)
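# Why the division by std above works: gaussian_kde treats a scalar
# bw_method as a factor multiplying the sample standard deviation, so
# bandwidth / x.std(ddof=1) makes `bandwidth` behave as an absolute kernel
# width. A quick check on synthetic data (names here are illustrative):
import numpy as np
from scipy.stats import gaussian_kde

x = np.random.default_rng(2).normal(loc=5.0, scale=2.0, size=1000)
x_grid = np.linspace(-2.0, 12.0, 400)
pdf_narrow = kde(x, x_grid, bandwidth=0.1)  # visibly wiggly
pdf_wide = kde(x, x_grid, bandwidth=1.0)    # oversmoothed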
else: # We had to give up looking for valid points near refpt, so remove it # from the list of "active" points. active.remove(idx) #print(samples) def column(matrix, i): return [row[i] for row in matrix] datax = column(samples, 0) datay = column(samples, 1) data = np.array(samples) kde_uni = stats.gaussian_kde(uniform_noise.sample().T) density_uni = kde_uni(uniform_noise.sample().T) #normalize_density = density/max(density) kde = stats.gaussian_kde(data.T) density = kde(data.T) normalize_density = density / max(density) cmap = cm.jet #cm.hot #'Blues' counts, xedges, yedges = np.histogram2d(datax, datay, bins=(60, 45)) #print(counts.shape) #print(np.amax(counts)) #print(counts) xidx = np.clip(np.digitize(datax, xedges), 0, counts.shape[0] - 1) yidx = np.clip(np.digitize(datay, yedges), 0, counts.shape[1] - 1)
def plot_densities(x, nbins=25, tit=''):
    # overlay a Gaussian KDE curve for each 1-D sample in x
    for xx in x:
        density = stats.gaussian_kde(xx)
        # get_bins supplies bin edges; np.histogram returns (counts, edges),
        # so h holds the edges the density is evaluated on
        h = np.histogram(xx, bins=get_bins(xx, nbins)[1:-1])[1]
        plt.plot(h, density(h))
    plt.title(tit)
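# plot_densities depends on a get_bins helper that is not defined in this
# excerpt; a hedged stand-in returning histogram bin edges lets the function
# run (synthetic data below).
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

def get_bins(xx, nbins):
    return np.histogram_bin_edges(xx, bins=nbins)

plot_densities([np.random.default_rng(5).normal(size=200)], tit='demo')
plt.show()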
def KDE(data): kernel = gaussian_kde(dataset=data, bw_method='silverman') return kernel
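# Sketch: the kernel returned by KDE can be evaluated as a pdf or resampled
# (synthetic data; resample returns an array of shape (d, n)).
import numpy as np

samples = np.random.default_rng(3).exponential(size=200)
kernel = KDE(samples)
grid = np.linspace(0.0, samples.max(), 100)
density = kernel(grid)            # pdf values on the grid
new_draws = kernel.resample(50)   # 50 new points drawn from the KDE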
def _get_kdes(train_ats, train_pred, class_matrix, args): """Kernel density estimation Args: train_ats (list): List of activation traces in training set. train_pred (list): List of prediction of train set. class_matrix (list): List of index of classes. args: Keyword args. Returns: kdes (list): List of kdes per label if classification task. removed_cols (list): List of removed columns by variance threshold. """ removed_cols = [] if args.is_classification: for label in range(args.num_classes): col_vectors = np.transpose(train_ats[class_matrix[label]]) if args.d == 'imagenet' and (args.model == 'densenet201' or args.model == 'efficientnetb7'): continue else: for i in range(col_vectors.shape[0]): if (np.var(col_vectors[i]) < args.var_threshold and i not in removed_cols): removed_cols.append(i) kdes = {} for label in tqdm(range(args.num_classes), desc="kde"): refined_ats = np.transpose(train_ats[class_matrix[label]]) if args.d == 'imagenet' and (args.model == 'densenet201' or args.model == 'efficientnetb7'): pass else: refined_ats = np.delete(refined_ats, removed_cols, axis=0) if refined_ats.shape[0] == 0: print( warn("ats were removed by threshold {}".format( args.var_threshold))) break kdes[label] = gaussian_kde(refined_ats) else: col_vectors = np.transpose(train_ats) for i in range(col_vectors.shape[0]): if np.var(col_vectors[i]) < args.var_threshold: removed_cols.append(i) refined_ats = np.transpose(train_ats) refined_ats = np.delete(refined_ats, removed_cols, axis=0) if refined_ats.shape[0] == 0: print( warn("ats were removed by threshold {}".format( args.var_threshold))) kdes = [gaussian_kde(refined_ats)] print(infog("The number of removed columns: {}".format(len(removed_cols)))) return kdes, removed_cols
sig = dyRet.std()[0]
yRet = np.array(dyRet)

plt.subplot(221)
plt.hist(yRet, density=True, bins=100, color='grey')
distance = np.linspace(min(yRet), max(yRet))
plt.plot(distance, norm.pdf(distance, mu, sig), label='Normal', c='r')
plt.xlabel('log return')
plt.ylabel('density')
plt.legend(loc="upper right", fontsize=5)

plt.subplot(222)
yNRet = (yRet - mu) / sig  # standardization
distanceN = np.squeeze(np.linspace(min(yNRet), max(yNRet)))
kernel = gaussian_kde(np.squeeze(yNRet))
plt.plot(distanceN, norm.pdf(distanceN, 0, 1), label='Normal', c='r')
plt.plot(distanceN, kernel(distanceN), label='empirical', c='grey')
plt.legend(loc="upper right", fontsize=5)

plt.subplot(223)
plt.plot(distanceN, norm.pdf(distanceN, 0, 1), label='Normal', c='r')
plt.plot(distanceN, t.pdf(distanceN, df=2), label='t-dist, df=2', c='g')
plt.plot(distanceN, kernel(distanceN), label='empirical', c='grey')
plt.legend(loc="upper right", fontsize=5)

plt.subplot(224)
plt.plot(distanceN, norm.pdf(distanceN, 0, 1), label='Normal', c='r')
plt.plot(distanceN, kernel(distanceN), label='empirical', c='grey')
plt.plot(distanceN, t.pdf(distanceN, df=2), label='t-dist, df=2', c='g')
plt.plot(distanceN, laplace.pdf(distanceN), label='laplace-dist', c='y')
MPL_plateifu = Table_mpl5['PLATEIFU'].astype('object')
MPL_ID = [ID.strip() for ID in MPL_plateifu]
Table_weights = pd.DataFrame({'plateifu': MPL_ID, 'weight': Weights})
data_SF_w = pd.merge(Table_weights, data_SF)
P0_w = data_SF_w[data_SF_w.mode_flag != 0]
P0_w = P0_w[P0_w.re_arc > 2.5]
P1_w, P2_w = P0_w[P0_w.mode_flag == 1], P0_w[P0_w.mode_flag == -1]
#--------Merging Measurements Over--------#

# Use 80% data to fit
W = data_SF[data_SF.mode_flag != 0]
xx, yy = np.mgrid[5:11:60j, -5:1:60j]
positions = np.vstack([xx.ravel(), yy.ravel()])
W0 = W.sample(frac=0.1)
kernel = gaussian_kde(np.vstack([W0.Sigma_Mass, W0.Sigma_SFR]))
print("Use KDE to derive PDF... %d points used." % len(W0))
pdf = pd.Series(kernel.pdf(np.vstack([W.Sigma_Mass, W.Sigma_SFR])))
use_80 = (pdf > pdf.quantile(0.2)).values
W_80 = W[use_80]
print("Select 80% of data... Finish!")

# plot Fig.2
mp, sfrp, cof = median_fitting(P0.Sigma_Mass, P0.Sigma_SFR, q=0.01, d=7)
m_i, sfr_i, cof_i = median_fitting(P1.Sigma_Mass, P1.Sigma_SFR, q=0.01, d=7)
m_o, sfr_o, cof_o = median_fitting(P2.Sigma_Mass, P2.Sigma_SFR, q=0.01, d=7)

plt.figure(figsize=(15, 5.))
for i, (W, c, cmap, lab, p) in enumerate(zip([P0, P1, P2], ['k', 'r', 'b'],
                                             ['Greys', 'Reds', 'Blues'],
                                             ['Total', 'Inside-out', 'Outside-in'],
                                             ['a', 'b', 'c'])):
    ax = plt.subplot(1, 3, i + 1)
    plt.text(0.018, 0.05, "%s)" % p, fontsize='large', transform=ax.transAxes)