def plot_quantiles(data, err=None, quantiles=None, axes=None, colors=None,
                   labels=None, kde=None, bw=.008):
    """Plot model-predicted quantile-probabilities over empirical estimates.

    Two panels are drawn. Element ``[1]`` of each data container goes on the
    first axis and element ``[2]`` on the second. Empirical values are filled
    markers with horizontal error bars; predicted values are open circles.

    Parameters
    ----------
    data : pair ``(y_data, yhat_data)``
        Empirical and model-predicted containers, each indexable at 1 and 2.
    err : pair or None
        ``(err_first, err_second)`` x-errors for the empirical points.
        Zeros are used when omitted.
    quantiles : array-like or None
        Y positions of the points. Defaults to ``qc.size`` evenly spaced
        values from 0.1 to 0.9.
    axes : pair of matplotlib axes or None
        Axes to draw into; a new 1x2 figure is created when omitted.
    colors : pair or None
        ``(empirical_color, predicted_color)``. When omitted, the first two
        colors of the active seaborn palette are used.
    labels : object
        Passed as ``label`` to every plotted artist.
    kde : indexable or None
        When given, ``kde[1]`` and ``kde[2]`` are drawn as cumulative KDEs.
    bw : float
        Bandwidth forwarded to ``sns.kdeplot``.
    """
    y_data, yhat_data = data
    if colors is None:
        # The signature advertises ``colors=None``; unpacking None would raise,
        # so fall back to the current palette.
        colors = sns.color_palette(n_colors=2)
    c, c_hat = colors
    if axes is not None:
        axc, axe = axes
    else:
        _, (axc, axe) = plt.subplots(1, 2, figsize=(10, 4))
    qc, qc_hat = y_data[1], yhat_data[1]
    qe, qe_hat = y_data[2], yhat_data[2]
    if quantiles is None:
        quantiles = np.linspace(.1, .9, qc.size)
    if err is not None:
        qc_err, qe_err = err
    else:
        qc_err, qe_err = [np.zeros(len(qc))]*2
    if kde is not None:
        qc_kde, qe_kde = kde[1], kde[2]
        sns.kdeplot(qc_kde, cumulative=1, color=c, ax=axc, linewidth=2,
                    linestyle='-', bw=bw)
        sns.kdeplot(qe_kde, cumulative=1, color=c, ax=axe, linewidth=2,
                    linestyle='-', bw=bw)
    # Empirical estimates: filled markers, error bars only (no connecting line).
    axc.errorbar(qc, quantiles, xerr=qc_err, color=c, linewidth=0,
                 elinewidth=1.5, marker='o', ms=5, label=labels)
    axe.errorbar(qe, quantiles, xerr=qe_err, color=c, linewidth=0,
                 elinewidth=1.5, marker='o', ms=5, label=labels)
    # Model predictions: larger open circles drawn around the empirical points.
    axc.plot(qc_hat, quantiles, mec=c_hat, linewidth=0, marker='o', ms=10,
             mfc='none', mew=1.7, label=labels)
    axe.plot(qe_hat, quantiles, mec=c_hat, linewidth=0, marker='o', ms=10,
             mfc='none', mew=1.7, label=labels)
def dist_small_multiples(df, figsize=(20, 20)):
    """Draw small-multiples KDE + rug plots, one per DataFrame column.

    The subplots are laid out on an ``n x n`` grid where ``n`` is the ceiling
    of the square root of the number of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One subplot is produced for each column.
    figsize : tuple
        Size passed to ``plt.figure``.

    Returns
    -------
    (fig, axes) : the figure and the list of axes, in column order.
    """
    import math

    sns.set_style("white")
    num_plots = len(df.columns)
    n = int(math.ceil(math.sqrt(num_plots)))
    fig = plt.figure(figsize=figsize)
    axes = [plt.subplot(n, n, i) for i in range(1, num_plots + 1)]
    rug_color = sns.color_palette("husl", 3)[0]
    # ``DataFrame.items`` replaces ``iteritems``, which pandas 2.0 removed.
    for ax, (k, v) in zip(axes, df.items()):
        sns.kdeplot(v, shade=True, ax=ax, legend=False)
        sns.rugplot(v, ax=ax, c=rug_color)
        for label in ax.get_yticklabels():
            label.set_visible(False)
        # Only mark the extremes of the data on the x axis.
        ax.xaxis.set_ticks([v.min(), v.max()])
        ax.set_title(k)
    sns.despine(left=True, trim=True, fig=fig)
    plt.tight_layout()
    return fig, axes
def plot_retest_data(retest_data, size=4.6, save_dir=None):
    """Box plot of ``icc3.k`` per measure category with KDEs stacked above.

    The lower axes hold a horizontal box plot grouped by ``'Measure Category'``;
    the upper axes (frame hidden) hold one shaded KDE per category on the same
    x range. The figure is saved to ``save_dir`` when it is truthy.

    Relies on the module-level names ``Task_N``, ``Survey_N`` and ``dpi``.
    """
    survey_color = sns.color_palette('Reds_d', 3)[0]
    task_color = sns.color_palette('Blues_d', 3)[0]
    colors = [survey_color, task_color]
    f = plt.figure(figsize=(size, size*.75))

    # Lower panel: box plot.
    with sns.axes_style('white'):
        box_ax = f.add_axes([.15, .1, .8, .5])
        sns.boxplot(
            x='icc3.k',
            y='Measure Category',
            ax=box_ax,
            data=retest_data,
            palette={'Survey': colors[0], 'Task': colors[1]},
            saturation=1,
            width=.5,
            linewidth=size/4,
        )
    box_ax.text(0, 1, '%s Task measures' % Task_N,
                color=colors[1], fontsize=size*2)
    box_ax.text(0, 1.2, '%s Survey measures' % Survey_N,
                color=colors[0], fontsize=size*2)
    box_ax.set_ylabel('Measure category', fontsize=size*2, labelpad=size)
    box_ax.set_xlabel('Intraclass correlation coefficient',
                      fontsize=size*2, labelpad=size)
    box_ax.tick_params(labelsize=size*1.5, pad=size, length=2)
    for spine in box_ax.spines.values():
        spine.set_linewidth(size/5)

    # Upper panel: per-category densities sharing the box plot's x limits.
    dist_ax = f.add_axes([.15, .6, .8, .4])
    dist_ax.set_xlim(*box_ax.get_xlim())
    dist_ax.set_xticklabels('')
    dist_ax.tick_params(length=0)
    grouped = retest_data.groupby('Measure Category')
    for i, (name, g) in enumerate(grouped):
        sns.kdeplot(g['icc3.k'], color=colors[i], ax=dist_ax,
                    linewidth=size/3, shade=True, legend=False)
    upper = dist_ax.get_ylim()[1]
    dist_ax.set_ylim((0, upper))
    dist_ax.axis('off')

    if save_dir:
        plt.savefig(save_dir, dpi=dpi, bbox_inches='tight')
def kde_tissue(tissue, q, genes, x, y, dfplot, dfindex, ax, label, col= 'b'):
    """Plot the distribution of ``x`` for significant genes expressed in a tissue.

    Genes are taken from ``dfindex`` rows with ``expressed == 1`` and
    ``tissue == tissue``. (A filter restricting this to genes found in a single
    tissue is commented out in the original code and is not applied.) Rows of
    ``dfplot`` whose gene is in that set and whose ``y`` value is below ``q``
    are plotted.

    tissue -- tissue to plot
    q -- threshold applied to column ``y``
    genes -- name of the ``dfplot`` column holding gene identifiers
    x -- name of the ``dfplot`` column to plot
    y -- name of the ``dfplot`` column compared against ``q``
    dfplot -- dataframe containing columns ``x``, ``y`` and ``genes``
    dfindex -- dataframe with ``expressed``, ``tissue`` and ``gene`` columns
    ax -- axis to plot in
    label -- legend label; the number of unique genes is appended
    col -- color

    A KDE is drawn when more than 15 values are selected and a rug plot when
    20 or fewer are selected, so both appear for 16-20 values.
    """
    in_tissue = (dfindex.expressed == 1) & (dfindex.tissue == tissue)
    tissue_genes = dfindex[in_tissue].gene

    keep = (dfplot[genes].isin(tissue_genes)) & (dfplot[y] < q)
    selected = dfplot[keep]
    values = selected[x]
    n_unique = len(selected[genes].unique())

    n_values = len(values)
    if n_values > 15:
        sns.kdeplot(values, color=col, label=label + ' n= {0}'.format(n_unique),
                    ax=ax, lw=5, cut=0.5)
    if n_values <= 20:
        sns.rugplot(values, color=col, ax=ax, height=.07, lw=2)
def kde_target(var_name, df):
    """Plot KDEs of one feature split by ``TARGET`` and print summary numbers.

    Parameters
    ----------
    var_name : str
        Name of the feature column in ``df``.
    df : pandas.DataFrame
        Must contain ``var_name`` and a ``TARGET`` column with values 0 and 1.

    Prints
    ------
    * the correlation between the feature and ``TARGET`` (``Series.corr``),
    * the feature median where ``TARGET == 1`` ("not repaid"),
    * the feature median where ``TARGET == 0`` ("repaid").
    """
    # Correlation coefficient between the feature and the target.
    corr = df['TARGET'].corr(df[var_name])

    # ``.loc`` replaces the ``.ix`` indexer, which pandas has removed.
    repaid = df.loc[df['TARGET'] == 0, var_name]
    not_repaid = df.loc[df['TARGET'] == 1, var_name]
    avg_repaid = repaid.median()
    avg_not_repaid = not_repaid.median()

    plt.figure(figsize=(12, 6))

    # One density curve per target value.
    sns.kdeplot(repaid, label='TARGET == 0')
    sns.kdeplot(not_repaid, label='TARGET == 1')

    plt.xlabel(var_name)
    plt.ylabel('Density')
    plt.title('%s Distribution' % var_name)
    plt.legend()
    plt.show()

    print('%s与标签相关度 %0.4f' % (var_name, corr))
    print('not repaid = %0.4f' % avg_not_repaid)
    print('repaid = %0.4f' % avg_repaid)
def plot_marker_distribution(datalist, namelist, labels, grid_size, fig_path=None, letter_size=16):
    """Draw per-marker KDEs for several datasets on a grid of subplots.

    Parameters
    ----------
    datalist : sequence of 2-D arrays
        Column ``k`` of every array is plotted in the subplot for marker ``k``.
    namelist : sequence
        Legend names, one per entry of ``datalist``.
    labels : sequence
        Marker names; its length sets how many subplots are filled.
    grid_size : pair ``(rows, cols)``
        Shape of the subplot grid, filled row by row.
    fig_path : str or None
        When given the figure is saved there as EPS and closed; otherwise it
        is shown.
    letter_size : int
        Font size for the marker names and the legend.

    Each curve is clipped to the 0.5th-99.5th percentile of its data. Only the
    last marker's curves carry labels, so the legend lists each dataset once.
    """
    nmark = len(labels)
    assert len(datalist) == len(namelist)
    n_rows, n_cols = grid_size
    colors = sns.color_palette("Set1", n_colors=len(datalist), desat=.5)
    fig = plt.figure()
    grid = gridspec.GridSpec(n_rows, n_cols, wspace=0.1, hspace=0.05)

    for row in range(n_rows):
        for col in range(n_cols):
            seq_index = n_cols * row + col
            if seq_index >= nmark:
                continue
            ax = fig.add_subplot(grid[row, col])
            start = .5
            ax.text(start, .85, labels[seq_index],
                    horizontalalignment='center',
                    transform=ax.transAxes, size=letter_size)
            is_last_marker = seq_index == nmark - 1
            for i_name, (name, x) in enumerate(zip(namelist, datalist)):
                lower = np.percentile(x[:, seq_index], 0.5)
                upper = np.percentile(x[:, seq_index], 99.5)
                kde_kwargs = {'color': colors[i_name], 'clip': (lower, upper)}
                if is_last_marker:
                    kde_kwargs['label'] = name
                sns.kdeplot(x[:, seq_index], **kde_kwargs)
            clean_axis(ax)

    plt.legend(loc="upper right", prop={'size': letter_size})
    if fig_path is None:
        plt.show()
    else:
        plt.savefig(fig_path, format='eps')
        plt.close()
def create1Ddensityplot(data, outputfilename):
    """Save a shaded 1-D KDE of ``data`` to ``outputfilename``.

    The current figure is cleared first, then the density is drawn on a new
    8x6 figure, clipped to the interval [0, 1], with bandwidth 0.5.
    """
    plt.clf()
    _, density_ax = plt.subplots(1, 1, sharex=True, figsize=(8, 6))
    sns.kdeplot(
        data,
        shade=True,
        ax=density_ax,
        clip=(0, 1),
        bw=0.5,
    )
    plt.savefig(outputfilename)
def plot_KL(data):
    """Plot KDEs of Kullback-Leibler divergences for a Dataset object.

    ``data.frequencies`` is the reference distribution. It is compared with
    (a) each sample returned by ``data.generate_mc(100)``, truncated to the
    shorter of the two lengths, and (b) 1000 lognormal draws of size
    ``data.Ncat`` (seeds 1..1000), each sorted in descending order, using the
    shape and scale from ``data.lognorm_par()``.
    """
    frequencies = data.frequencies
    Ncat = data.Ncat
    fiducial = data.generate_mc(100)
    sh, loc, sc = data.lognorm_par()

    # Divergence from seeded lognormal draws, ordered largest-first.
    kl_ln = []
    for seed in range(1, 1001):
        draw = stats.lognorm.rvs(sh, scale=sc, size=Ncat, random_state=seed)
        kl_ln.append(stats.entropy(frequencies, np.sort(draw)[::-1]))

    # Divergence from the Monte Carlo samples, cut to the common length.
    kl_data = []
    for mc in fiducial:
        cut = min(Ncat, len(mc))
        kl_data.append(stats.entropy(frequencies[:cut], mc[:cut]))

    plt.figure(figsize=[10, 6.18])
    plt.title('Kullback-Leibler divergence')
    for values, name, shade in ((kl_data, 'MC', 'Blue'),
                                (kl_ln, 'Lognormal', 'Orange')):
        sns.kdeplot(np.array(values), label=name, alpha=0.6, color=shade)
    plt.xlim(xmin=0.)
    plt.legend(loc='best')
def _plot_continuous(df, xlabel, ylabel, ax, plottype="kde", n_levels=10,
                     cmap="YlGnBu", shade=True):
    """
    Plot two continuous variables against each other as a kernel density
    estimate or a scatter plot. Rows where either value is non-finite are
    dropped first.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    ylabel : str
        The column name for the variable on the y-axis

    ax : matplotlib.Axes object
        The axes to draw into

    plottype : {"kde" | "scatter"}
        The type of plot to produce. Any other value draws nothing.

    n_levels : int
        Number of contour levels for the kernel density estimate. Default 10

    cmap : str or colormap
        Colormap for the KDE; for the scatter plot the middle color of a
        five-color palette built from it is used.

    shade : bool
        If True, fill the KDE contours; if False, draw outlines only.

    Returns
    -------
    ax : matplotlib.Axes object
        The same axes, for further manipulation
    """
    xcolumn, ycolumn = df[xlabel], df[ylabel]
    both_finite = np.isfinite(xcolumn) & np.isfinite(ycolumn)
    x_clean = xcolumn[both_finite]
    y_clean = ycolumn[both_finite]

    if plottype == "kde":
        sns.kdeplot(x_clean, y_clean, n_levels=n_levels, shade=shade,
                    ax=ax, cmap=cmap)
        return ax

    if plottype == "scatter":
        point_color = sns.color_palette(cmap, 5)[2]
        ax.scatter(x_clean, y_clean, color=point_color, s=10, lw=0,
                   edgecolor="none", alpha=0.8)

    return ax
def do_kdeplot(x, y, ax, n_levels=None, bw='scott'):
    """Draw a shaded bivariate KDE of ``x`` against ``y`` on ``ax``, best-effort.

    Parameters
    ----------
    x, y : array-like
        Data passed positionally to ``sns.kdeplot``.
    ax : matplotlib axes
        Axes to draw into.
    n_levels : int or None
        Number of contour levels, forwarded unchanged.
    bw : str or float
        Bandwidth specification, forwarded unchanged.

    Any failure while fitting or drawing is logged as a warning and the
    function returns normally, leaving the axes without a KDE.
    """
    try:
        sns.kdeplot(x, y, ax=ax, cut=0, cmap='Purples_d', shade=True,
                    shade_lowest=False, n_levels=n_levels, bw=bw,
                    rasterized=True)
    except Exception:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate; the plot itself stays optional.
        logger.warning('Unable to do a KDE fit to AUGUSTUS improvement.')
def plot_density(exp_res, title, xlim=(0.7, 1.0), ylim=(0.8, 1.0), cmap='Reds', saveto=None):
    """Plot a shaded 2-D KDE of ``Rec`` vs ``Prec`` over all experiment results.

    Parameters
    ----------
    exp_res : iterable of 4-tuples
        The second element of each tuple is a DataFrame with ``Rec`` and
        ``Prec`` columns; all of them are concatenated before plotting.
    title : str
        Axes title.
    xlim, ylim : tuple
        Axis limits.
    cmap : str
        Colormap for the density.
    saveto : str or None
        When given, the figure is saved to this path.
    """
    sns.set_context("notebook", font_scale=2.0, rc={"lines.linewidth": 2.5})
    sns.set_style("whitegrid")

    # Each result is (training_data, training_df, best_training_row, match_res).
    training_dfs = [training_df for _, training_df, _, _ in exp_res]
    combined = pd.concat(training_dfs, axis=0).reset_index(drop=True)

    f, ax = plt.subplots(figsize=(6, 6))
    sns.kdeplot(combined.Rec, combined.Prec, ax=ax, cmap=cmap,
                shade=True, shade_lowest=False)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_title(title, fontsize=36)

    if saveto is None:
        return
    plt.savefig(saveto)
def make_kde_plot(df, spot, runid, title=None, cmap='Greens', plotclass=None, logfile=None, debug=False):
    """Draw and save a shaded 2-D KDE for one spot of a run.

    The x/y data come from ``stack_rows(df, spot)``. Progress messages go
    through ``ptf(message, logfile)``. The plot title is
    ``"<runid>-<spot> - KDE trigger vs t"`` followed by ``title`` and
    ``plotclass`` when they are truthy; the figure is saved at 200 dpi to
    ``"<runid>/<that title>"``. The figure is shown when ``debug`` is true and
    closed otherwise.
    """
    plt.figure()
    ptf('Plot KDE %s - %s' % (title, spot), logfile)
    x, y = stack_rows(df, spot)
    ptf('%s, %s' % (x.shape, y.shape), logfile)
    ptf('Check for nans', logfile)
    ptf('%s, %s' % (np.sum(np.isnan(x)), np.sum(np.isnan(y))), logfile)
    ptf('computing kde...', logfile)
    sns.kdeplot(x, y, shade=True, cmap=cmap)

    # Title and file name share the same optional suffixes.
    name_parts = [runid + '-' + spot + ' - KDE trigger vs t']
    if title:
        name_parts.append(title)
    if plotclass:
        name_parts.append(plotclass)
    plottitle = ' - '.join(name_parts)

    plt.title(plottitle)
    plt.xlabel('t (hrs)')
    plt.ylabel(title if title else 'trigger metric')

    filename = runid + '/' + plottitle
    ptf('Saving plot %s' % filename, logfile)
    plt.savefig(filename, dpi=200)
    if debug:
        plt.show()
    else:
        plt.close()
def build_reads_per_cluster(self, ax_nreads, reads_per_cluster=None):
    """Draw a KDE of the number of reads per cluster.

    ax_nreads - the axis to draw on
    reads_per_cluster - list, the number of reads in each cluster; defaults
        to ``self.reads_per_cluster`` when omitted

    Returns the axis. Raises NotImplementedError when neither the argument
    nor ``self.reads_per_cluster`` provides data.
    """
    if reads_per_cluster is None:
        reads_per_cluster = self.reads_per_cluster

    if reads_per_cluster is None:
        raise NotImplementedError("Pickle file doesn't have data to generate this figure")

    # Plot the resolved data so an explicitly passed list is honoured
    # (previously the attribute was always plotted).
    sns.kdeplot(np.array(reads_per_cluster), ax=ax_nreads)

    for tick in ax_nreads.get_xticklabels():
        tick.set_rotation(90)
    ax_nreads.set_xlim(0,)
    ax_nreads.set_xlabel("N reads)")
    ax_nreads.set_ylabel("Frequency")
    return ax_nreads
def tsne_map(z, c, fig_path, colors=None, s=2, suffix='png'):
    """Save a scatter of 2-D embedded points over light-gray density contours.

    Parameters
    ----------
    z : 2-D array
        Columns 0 and 1 are the plotted coordinates.
    c : array
        Class value per point (squeezed before use). Only classes greater
        than 0 are drawn as scatter points; each is colored ``colors[class]``.
    fig_path : str
        Output path without extension.
    colors : sequence or None
        Defaults to a "Set2" palette with one color per unique class.
    s : number
        Marker size.
    suffix : str
        File extension, also used as the save format.
    """
    c = np.squeeze(c)
    classes = np.unique(c)
    if colors is None:
        colors = sns.color_palette("Set2", len(classes))
    sns.set_style('white')

    fig, ax = plt.subplots(figsize=(5, 5))
    # Background: density contours of all points.
    sns.kdeplot(z[:, 0], z[:, 1], colors='lightgray', cmap=None, linewidths=0.5)

    for cls in classes:
        if not cls > 0:
            continue
        members = c == cls
        plt.scatter(z[members, 0], z[members, 1], s=s, marker='o',
                    c=colors[cls], edgecolors='face')

    clean_axis(ax)
    ax.grid(False)
    sns.despine()
    plt.savefig(fig_path + '.%s' % suffix, format=suffix)
    plt.clf()
    plt.close()
def estimate_bivariate_mle_jr():
    """Fit ``SkStJR`` by MLE to simulated 2-D data and plot the marginals.

    10000 two-dimensional draws are taken from ``SkewStudent(eta=4, lam=-.9)``,
    the model is fitted, the optimizer result is printed, and for each
    dimension the data KDE is shown together with the fitted marginal
    evaluated on the KDE's own x grid.
    """
    ndim = 2
    size = (10000, ndim)
    # This draw is overwritten below; it is kept because it still consumes
    # numbers from NumPy's global random generator.
    data = np.random.normal(size=size)

    eta, lam = 4, -.9
    data = SkewStudent(eta=eta, lam=lam).rvs(size=size)

    model = SkStJR(ndim=ndim, data=data)
    out = model.fit_mle()
    print(out)
    model.from_theta(out.x)

    fig, axes = plt.subplots(nrows=size[1], ncols=1)
    for innov, axis in zip(data.T, axes):
        sns.kdeplot(innov, ax=axis, label='data')

    # Reuse the x grid of each drawn KDE line to evaluate the marginals.
    grids = np.vstack([axis.get_lines()[0].get_xdata() for axis in axes]).T
    marginals = model.marginals(grids)

    for grid, margin, axis in zip(grids.T, marginals.T, axes):
        axis.plot(grid, margin, label='fitted')
        axis.legend()

    plt.show()
def build_cluster_lengths(self, ax_lengths, cluster_lengths=None):
    """Draw a KDE of cluster lengths.

    ax_lengths - the axis to draw on
    cluster_lengths - list, the length of each cluster; defaults to
        ``self.cluster_lengths`` when omitted

    Returns the axis. Raises NotImplementedError when neither the argument
    nor ``self.cluster_lengths`` provides data.
    """
    if cluster_lengths is None:
        cluster_lengths = self.cluster_lengths

    if cluster_lengths is None:
        raise NotImplementedError("Pickle file doesn't have data to generate this figure")

    # Plot the resolved data so an explicitly passed list is honoured
    # (previously the attribute was always plotted).
    sns.kdeplot(np.array(cluster_lengths), ax=ax_lengths)

    for tick in ax_lengths.get_xticklabels():
        tick.set_rotation(90)
    ax_lengths.set_xlim(0,)
    ax_lengths.set_ylabel("Frequency")
    ax_lengths.set_xlabel("Length (bp)")
    return ax_lengths
def plot(df2, df3):
    """Overlay the BMI densities of two waves and highlight the region above 30.

    Parameters
    ----------
    df2, df3 : objects with a ``bmi`` attribute
        Data for the two waves; each is drawn as a shaded KDE clipped to
        15-45, the second with a dashed line.

    The area under each curve for BMI > 30 is re-filled (red for the second
    curve, white for the first), a vertical line marks BMI = 30, and two
    fixed percentage annotations are added before the figure is shown.
    """
    sns.set(style="white", color_codes=True)
    # ``sns.plt`` is not available in current seaborn; use matplotlib's pyplot,
    # which the rest of this function already relies on.
    f, ax = plt.subplots()
    sns.kdeplot(df2.bmi, ax=ax, shade=True, color='k', gridsize=10000,
                clip=(15, 45))
    sns.kdeplot(df3.bmi, ax=ax, shade=True, color='k', ls='dashed',
                gridsize=10000, clip=(15, 45))
    plt.legend(['Wave 2 (1996; ages 13-20 y)', 'Wave 3 (2001; ages 19-26 y)'],
               fontsize=15)
    plt.xlim(15, 45)
    ax.annotate('10.9%', xy=(25, .02), xytext=(30.5, .005), color='k')
    ax.annotate('22.1%', xy=(25, .04), xytext=(30.5, .02), color='k')

    # Curve data of the two KDE lines, masked wherever x <= 30.
    y1 = ax.lines[0].get_ydata()
    x1 = ax.lines[0].get_xdata()
    x_mask1 = np.ma.masked_less_equal(x1, 30).mask
    y_masked1 = np.ma.masked_array(y1, x_mask1)
    y2 = ax.lines[1].get_ydata()
    x2 = ax.lines[1].get_xdata()
    x_mask2 = np.ma.masked_less_equal(x2, 30).mask
    y_masked2 = np.ma.masked_array(y2, x_mask2)

    # Layered fills: red under curve 2, white under curve 1, then a lighter
    # red over both.
    ax.fill_between(x2, np.zeros_like(y2), y_masked2, facecolor='red',
                    interpolate=True, alpha=0.5)
    ax.fill_between(x1, np.zeros_like(y1), y_masked1, facecolor='white',
                    interpolate=True)
    ax.fill_between(x2, np.zeros_like(y2), y_masked2, facecolor='red',
                    interpolate=True, alpha=0.25)
    plt.vlines(x=30, ymin=0, ymax=0.0398, color='k', linewidth=2, alpha=1)

    plt.xticks(size=15)
    plt.yticks(size=15)
    plt.ylabel('Frequency', fontsize=15)
    plt.xlabel('BMI (kg/m$^2$)', fontsize=15)
    plt.tight_layout()
    plt.show()
def plot_level(dist_points, dist_expert, level_id, is_hist=True, bw=None,
               num_bins=NUM_BINS, col_points=COL_POINTS, col_expert=COL_EXPERT,
               lw_expert=LW_EXPERT, title=None, xlabel=None, ylabel=None,
               save_to=None):
    """Plot the distribution of ``dist_points`` with a line at ``dist_expert``.

    Parameters
    ----------
    dist_points : pandas.Series
        Values to plot, as a histogram or a KDE.
    dist_expert : number
        X position of the vertical reference line.
    level_id : object
        Used in the default title and in the output file name.
    is_hist : bool
        Histogram when true, KDE otherwise.
    bw : number or None
        KDE bandwidth; seaborn's default is used when falsy.
    num_bins, col_points : histogram bin count and color of the distribution.
    col_expert, lw_expert : color and line width of the reference line.
    title, xlabel, ylabel : str or None
        Override the default texts.
    save_to : str or None
        Directory in which ``level<level_id>.png`` is saved (created if needed).

    Returns
    -------
    The axes returned by the plotting call.
    """
    if is_hist:
        ax = dist_points.plot.hist(bins=num_bins, color=col_points)
    else:
        if bw:
            ax = sns.kdeplot(dist_points, bw=bw, color=col_points)
        else:
            ax = sns.kdeplot(dist_points, color=col_points)

    # Use the caller-supplied line width (the module constant was hard-coded
    # here before, so the parameter had no effect).
    plt.axvline(dist_expert, 0, len(dist_points), color=col_expert,
                lw=lw_expert)
    plt.title(title or "Distribusi jarak pemain - Level {}".format(level_id))
    plt.xlabel(xlabel or XLABEL)
    if ylabel:
        plt.ylabel(ylabel)
    else:
        plt.ylabel("Jumlah pemain" if is_hist else "Distribusi")

    if save_to:
        try:
            os.makedirs(save_to)
        except OSError:
            # Usually the directory already exists; any other problem will
            # surface when savefig tries to write below.
            pass
        plt.savefig(os.path.join(save_to, 'level{}.png'.format(level_id)))
    return ax
def plot_galaxy_and_stars(galaxy, stars):
    """Plot a 2-D KDE of galaxy particles with the stars scattered on top.

    Galaxy particles are restricted to -60 pc < x < 60 pc and
    -60 pc < y < 60 pc. Star marker areas are ``100 * sqrt(mass / max mass)``.
    The figure is saved as "Fujii_Comparison_Figure".
    """
    colors = get_distinct(3)
    single_frame('X [pc]', 'Y [pc]')
    half_width = 60
    pyplot.xlim(-half_width, half_width)
    pyplot.ylim(-half_width, half_width)
    ax = pyplot.gca()

    import numpy as np
    import pandas as pd
    from scipy import stats, integrate
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(color_codes=True)

    # Keep only particles inside the plotted square, one bound at a time.
    bound = 60 | units.parsec
    p = galaxy
    for coordinate in ("x", "y"):
        p = p.select(lambda value: value < bound, [coordinate])
        p = p.select(lambda value: value > -bound, [coordinate])

    x = p.x.value_in(units.parsec)
    y = p.y.value_in(units.parsec)
    sns.kdeplot(x, y, ax=ax)

    marker_sizes = 100*numpy.sqrt(stars.mass/stars.mass.max())
    star_x = stars.x.value_in(units.parsec)
    star_y = stars.y.value_in(units.parsec)
    pyplot.scatter(star_x, star_y, c=colors[0], s=marker_sizes, lw=0)
    pyplot.savefig("Fujii_Comparison_Figure")
def seaborn_kde():
    """Show shaded KDEs of both coordinates of a correlated 2-D normal sample.

    2000 points are drawn with mean (0, 0) and covariance [[5, 2], [2, 2]].
    """
    mean = [0, 0]
    cov = [[5, 2], [2, 2]]
    sample = np.random.multivariate_normal(mean, cov, size=2000)
    data = pd.DataFrame(sample, columns=['x', 'y'])
    for column in ('x', 'y'):
        sns.kdeplot(data[column], shade=True)
    plt.show()
def dists(self):
    """Show a grid of KDE plots, one per parameter column of ``self._df``.

    Columns whose name contains an underscore are skipped. Columns whose name
    starts with "n" or "mu" are plotted as log10 of their values and labelled
    accordingly. Plots are arranged four per row.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    print("Plotting distributions for all parameters...")

    keys = [k for k in self._df.keys() if "_" not in k]
    n_plots = len(keys)
    if n_plots == 0:
        # Nothing to plot; a zero-row subplot grid cannot be created.
        return

    n_cols = 4
    # Ceiling division: always an int, and a partially filled last row is
    # kept instead of being dropped.
    n_rows = -(-n_plots // n_cols)
    fig, axs = plt.subplots(n_rows, n_cols,
                            figsize=(n_cols*3.5, n_rows*2))
    flat_axes = axs.ravel()

    for i, (key, ax) in enumerate(zip(keys, flat_axes)):
        kde_args = dict(ax=ax, shade=True)
        if i > 0:
            kde_args['legend'] = False
        data = self._df[key]
        if key.startswith("n") or key.startswith("mu"):
            data = np.log10(data)
            key = "log10(%s)" % key
        sns.kdeplot(data, color='k', **kde_args)
        ax.set_xlabel(key)
        plt.setp(ax.get_xticklabels(), rotation=20)
        if (i % n_cols) == 0:
            ax.set_ylabel("density function")

    # Hide the unused cells of the last row.
    for ax in flat_axes[n_plots:]:
        ax.set_visible(False)

    sns.despine(fig=fig)
    plt.tight_layout()
    plt.show()
def componentDensityPlot():
    '''
    Save a density plot comparing the distribution of component counts across
    three model datasets. For each dataset the first value returned by
    ``componentAnalysis(<dataset>, 0.1)`` is plotted as a shaded KDE. Output
    goes to "componentDensity2.pdf".
    '''
    directory = [
        ('bngTest', 'BNG control set'),
        ('curated', 'BioModels curated'),
        ('non_curated', 'BioModels non\n curated'),
    ]
    base_palette = sns.color_palette("Set1", 3)
    colors = [base_palette[1], base_palette[2], base_palette[0]]

    f, (ax1) = plt.subplots(1, 1, sharex=True, figsize=(6, 3.45))
    f.tight_layout()

    for color, (dataset, legend_name) in zip(colors, directory):
        # Only the total count is plotted; the other three results are unused.
        totalCount, _, _, _ = componentAnalysis(dataset, 0.1)
        sns.kdeplot(totalCount, color=color, label=legend_name, shade=True,
                    ax=ax1, clip=(0.4, 100), bw=0.2)

    plt.xlabel('Number of components', fontsize=22, fontweight='bold')
    ax1.set_title('Components/molecule')
    ax1.set_ylabel('Fraction', fontsize=22, fontweight='bold')
    plt.tight_layout()
    ax1.set(xlim=(0, 10))
    sns.despine()
    plt.savefig('componentDensity2.pdf', bbox_inches='tight')
def plot_galaxy_and_stars(galaxy, stars):
    """Plot a shaded 2-D KDE of galaxy particles with the stars on top.

    Galaxy particles are restricted to -10 kpc < x < 10 kpc and
    -10 kpc < y < 10 kpc, and additionally to those whose position vector is
    longer than 5 kpc. Star marker areas are ``100 * sqrt(mass / max mass)``.
    The figure is saved as "SolarSiblings_life_galaxy".
    """
    colors = get_distinct(3)
    single_frame('X [kpc]', 'Y [kpc]')
    half_width = 10
    pyplot.xlim(-half_width, half_width)
    pyplot.ylim(-half_width, half_width)
    ax = pyplot.gca()

    import numpy as np
    import pandas as pd
    from scipy import stats, integrate
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(color_codes=True)

    # Keep only particles inside the plotted square, one bound at a time.
    lim = 10 | units.kpc
    p = galaxy
    for coordinate in ("x", "y"):
        p = p.select(lambda value: value < lim, [coordinate])
        p = p.select(lambda value: value > -lim, [coordinate])
    # Drop the inner region.
    p = p.select(lambda r: r.length() > 5 | units.kpc, ["position"])

    x = p.x.value_in(units.kpc)
    y = p.y.value_in(units.kpc)
    sns.kdeplot(x, y, ax=ax, shade=True, n_levels=20, shade_lowest=False)

    marker_sizes = 100*numpy.sqrt(stars.mass/stars.mass.max())
    star_x = stars.x.value_in(units.kpc)
    star_y = stars.y.value_in(units.kpc)
    pyplot.scatter(star_x, star_y, c=colors[0], s=marker_sizes, lw=0)
    pyplot.savefig("SolarSiblings_life_galaxy")
def mag_vs_length():
    """Draw five shaded 2-D KDEs of ``length_scaled`` vs ``Mr``, one per kind.

    Uses the module-level ``bar`` table, ``kind`` sequence (entries 1..5) and
    ``flatui`` palette. Panels fill positions 1-5 of a 2x3 grid; the y axis
    runs from -18 to -23.
    """
    for panel in range(1, 6):
        plt.subplot(2, 3, panel)
        kind_name = kind[panel]
        sample = bar[bar.kind == kind_name]
        shading = sns.light_palette(color=flatui[1], as_cmap=True)
        kde_axes = sns.kdeplot(sample.length_scaled, sample.Mr, cmap=shading,
                               shade=True, shade_lowest=True)
        kde_axes.set(xlim=(0, 1.1), ylim=(-18, -23), title=kind[panel])
def plot_averageAsOfLastMonth(df_train):
    """Show and save a shaded KDE of the 'Past One Month' column.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain a 'Past One Month' column.

    The figure is displayed and then written to 'graphs/past_one_month.png'.
    """
    sns.kdeplot(df_train['Past One Month'], shade=True, color='r')
    plt.title('Estimate of average days overdue as of past one month')
    # Label spelling fixed ("ovedue") and aligned with the sibling plots.
    plt.xlabel('Average of days overdue')
    plt.ylabel('Probability Distribution')
    # Grab the figure before show() so it can still be saved afterwards.
    fig = plt.gcf()
    plt.show()
    fig.savefig('graphs/past_one_month.png')
def plot_averageAsOfPastThreeMonths(df_train):
    """Show and save a shaded KDE of the 'Past Three Months' column.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain a 'Past Three Months' column.

    The figure is displayed and then written to
    'graphs/past_three_months.png'.
    """
    sns.kdeplot(df_train['Past Three Months'], shade=True, color='purple')
    plt.title('Estimate of average days overdue as of past three months')
    plt.xlabel('Average of days overdue')
    # Label spelling fixed ("Probabiity") and aligned with the sibling plots.
    plt.ylabel('Probability Distribution')
    # Grab the figure before show() so it can still be saved afterwards.
    fig = plt.gcf()
    plt.show()
    fig.savefig('graphs/past_three_months.png')
def plot_averageDaysOverdue(df_train):
    """Show and save a shaded KDE of the 'Average Over Due Days' column.

    The figure is displayed and then written to
    'graphs/average_overdue_days.png'.
    """
    overdue_days = df_train['Average Over Due Days']
    sns.kdeplot(overdue_days, shade=True, color='g')

    plt.title('Estimate of average days overdue as of December')
    plt.xlabel('Average of days overdue')
    plt.ylabel('Probability Distribution')

    # Keep a handle on the figure so it can be saved after it is shown.
    fig = plt.gcf()
    plt.show()
    fig.savefig('graphs/average_overdue_days.png')
def createHist(self, event):
    """Ask for histogram options in a dialog, then plot the chosen data.

    The dialog offers "Bars" and "Density" check boxes, a bin-count spinner
    (initial value: square root of ``len(self.parent.data)``) and a
    density-bandwidth spinner. On OK, each selected data set is drawn as a
    histogram, a shaded KDE, or both, depending on the two check boxes.

    ``event`` is not used by the body.
    """
    # NOTE(review): the statement nesting below was reconstructed from a
    # flattened source line; confirm the extent of the ``if`` bodies.
    # TODO avoid hardcoding sizes. Find smart way to decide on sizes
    dlg = GraphDialog(self.parent, "Histogram Input", ("Select Data",), size=(500,200))
    # options
    hsize1 = wx.BoxSizer(wx.HORIZONTAL)
    bars = wx.CheckBox(dlg, label="Bars")
    density = wx.CheckBox(dlg, label="Density")
    bars.SetValue(True)
    density.SetValue(True)
    hsize1.Add(bars)
    hsize1.Add(density)
    dlg.Add(hsize1)
    numBins = dlg.AddSpinCtrl("# of Bins", 1, 999, np.sqrt(len(self.parent.data)), size=(50, -1))
    bandwidth = dlg.AddSpinCtrl("Density Bandwidth", -99, 99, 0, size=(50, -1))
    # Each spinner is only enabled while its matching check box is ticked.
    bars.Bind(wx.EVT_CHECKBOX, lambda e: numBins.Enable(bars.GetValue()))
    density.Bind(wx.EVT_CHECKBOX, lambda e: bandwidth.Enable(density.GetValue()))
    if dlg.ShowModal() == wx.ID_OK:
        ds = [d[0] for d in dlg.GetName()]
        # account for grouping
        groups, datas = dlg.GetValue(self.parent.data)
        # From here on ``bars``/``density`` are booleans and ``bandwidth`` is a
        # multiplier (the spinner value 0 maps to 1.0), no longer widgets.
        bars, density = bars.GetValue(), density.GetValue()
        bandwidth = np.exp(-0.2 * bandwidth.GetValue())
        if groups:
            ds = self._groupLabels(ds, groups)
            # One label per (data set, group) combination.
            newDs = []
            for d in ds:
                newDs += [d + "-" + str(g) for g in groups]
            ds = newDs
        dlg.Destroy()
        # d.min() gets minimum for each column. d.min.min() gets global min
        a, b = min(d.min().min() for d in datas), max(d.max().max() for d in datas)
        # Common bin edges across all data sets, spanning the global range.
        bins = np.arange(a, b, float(b-a) / numBins.GetValue())
        for d, data in zip(ds, datas):
            data = data[data.columns[0]]
            d, data = d, data.astype(float) # astype float b/c of bug in seaborn.
            if bars and not density:
                plt.hist(data, bins=bins, alpha=1.0/len(ds), label=d)
            else:
                data = data[np.isfinite(data)]
                # Scale SciPy's default KDE factor by the user's multiplier.
                bw = stats.gaussian_kde(data).factor * bandwidth
                if density and not bars:
                    sns.kdeplot(data, shade=True, label=d, bw=bw)
                else:
                    sns.distplot(data, bins=bins, kde_kws={"bw":bw, "label":d})
        plt.legend(loc='best')
        plt.show()
def FacetGrid():
    """Show a 2x2 dashboard of plots for the module-level ``movies`` table.

    The top row is filled by ``Kde(0, axes)`` and ``Kde(1, axes)``. The
    bottom-left panel is a violin plot of ``BudgetMillions`` by ``Year`` and
    the bottom-right panel is a shaded 2-D KDE of ``CriticRating`` vs
    ``AudienceRating`` with contour outlines on top.
    """
    sns.set_style("dark", {"axes.facecolor": "black"})
    f, axes = plt.subplots(2, 2, figsize=(12, 8))
    for i in range(0, 2):
        Kde(i, axes)
    sns.violinplot(data=movies, x='Year', y='BudgetMillions', ax=axes[1, 0],
                   palette="YlOrRd")
    sns.kdeplot(movies.CriticRating, movies.AudienceRating, shade=True,
                shade_lowest=False, cmap='Blues_r', ax=axes[1, 1])
    sns.kdeplot(movies.CriticRating, movies.AudienceRating,
                cmap='gist_gray_r', ax=axes[1, 1])
    # The window title is set through the figure manager; the canvas-level
    # method has been removed from Matplotlib.
    plt.gcf().canvas.manager.set_window_title('Facet Grid')
    plt.show()
def vis_data(param_sets):
    """Visualize the joint distribution of the first two parameters.

    Parameters
    ----------
    param_sets : array-like, 2-D
        One row per parameter set; only columns 0 and 1 are used.

    Prints the number of rows, then shows a shaded 2-D KDE of column 0 ("X")
    against column 1 ("Y").
    """
    data = np.array(param_sets)
    first_two = data[:, [0, 1]]
    # Function-call form of print works on both Python 2 and Python 3.
    print(len(first_two))
    data = pd.DataFrame(first_two, columns=["X", "Y"])
    sns.kdeplot(data.X, data.Y, shade=True)
    mpl.pyplot.show()
# another way to do the above #train_df['Age'].value_counts().sort_index().head(25) # In[ ]: # convert ages to ints age = train_df[['Age','Survived']].dropna() # returns a copy with blanks removed age['Age'] = age['Age'].astype(int) # floors floats # count passengers by age (smoothed via gaussian kernels) plt.subplots(figsize=(18,6)) plt.subplot(311) sns.kdeplot(age['Age'], shade=True, cut=0) # count passengers by age (no smoothing) plt.subplot(312) sns.countplot(x='Age', data=age, palette='GnBu_d') # survival rates by age plt.subplot(313) sns.barplot(x='Age', y='Survived', data=age, ci=None, palette='Oranges_d') # takes mean by default # Observations: # # - Under 16s tend to have the highest survival rates # - Very high survival rates at 53, 63 and 80 # - Survival of over 16s is fairly noisy. Possible that survival might increase with age.
# Save the figure built by the preceding section.
f.savefig(os.path.join(impath, fname))

""" RT distribution """
# One row of axes per task, with shared x and y scales.
f, axes = plt.subplots(
    n_tasks, 1, figsize=(10, 8), sharex=True, sharey=True,
)
# First row: one KDE per condition from RTs_cn, with NaN values removed.
for i, condition in enumerate(CONDITIONS):
    temp = sns.kdeplot(RTs_cn[condition][~np.isnan(RTs_cn[condition])], shade=True, ax=axes[0])
# Second row: the same for RTs_wr.
for i, condition in enumerate(CONDITIONS):
    sns.kdeplot(RTs_wr[condition][~np.isnan(RTs_wr[condition])], shade=True, ax=axes[1])
# A single legend on the first row names the conditions in plotting order.
axes[0].legend(CONDITIONS, frameon=False)
for i, ax in enumerate(axes):
    ax.set_ylabel('Probability, KDE')
    ax.set_title(f'RT distribution, {TASKS[i]}')
axes[1].set_xlabel('Reaction time')
sns.despine()
f.tight_layout()
# save fig
# Discard the first 1000 samples of the earlier trace and inspect 'p'.
chain_kg = trace_kg[1000:]
varnames_kg = ['p']
pm.traceplot(chain_kg, varnames_kg)
plt.show()

# Mixture model: every observation gets a categorical component index, and
# the component means and a common standard deviation are estimated.
with pm.Model() as model_ug:
    # Mixture weights with a flat Dirichlet prior.
    p = pm.Dirichlet('p', a=np.ones(clusters))
    # One component index per observation.
    category = pm.Categorical('category', p=p, shape=n_total)
    means = pm.Normal('means', mu=[10, 20, 35], sd=2, shape=clusters)
    sd = pm.HalfCauchy('sd', 5)
    y = pm.Normal('y', mu=means[category], sd=sd, observed=mix)
    # Dedicated step methods: one for the component indices, one for 'p'.
    step1 = pm.ElemwiseCategorical(vars=[category], values=range(clusters))
    step2 = pm.Metropolis(vars=[p])
    trace_ug = pm.sample(10000, step=[step1, step2])

# Discard the first 1000 samples before plotting the traces.
chain_ug = trace_ug[1000:]
varnames_ug = ['means', 'sd', 'p']
pm.traceplot(chain_ug, varnames_ug)
plt.show()

# Posterior predictive check: 100 simulated data sets (faint blue) against
# the observed data (black).
ppc = pm.sample_ppc(chain_ug, 100, model_ug)
for i in ppc['y']:
    sns.kdeplot(i, alpha=0.1, color='b')
sns.kdeplot(np.array(mix), lw=2, color='k')
plt.xlabel('$x$', fontsize=14)
plt.show()
with open('output.json', 'r') as json_file: data = json.load(json_file) # Data Visulasation new_df = pd.DataFrame(data) corr = new_df.corr() plt.figure(figsize=(10, 7)) sns.heatmap(corr, annot=True) # Scatter plot between Hour and interactions fig, ax = plt.subplots(1, figsize=(12, 8)) sns.kdeplot(new_df.Hour, new_df.TotalInteractions, cmap='Blues', shade=True, thresh=0.05, clip=(-1, 300)) # Findeing the most interactions in the month of December new_df['day'] = pd.DatetimeIndex(new_df['date']).day daysforplot = new_df.groupby('day', as_index=False).agg({'TotalInteractions': 'sum'}) fig = px.scatter(daysforplot, x='day', y='TotalInteractions', color_continuous_scale='Rainbow', color='TotalInteractions', size='TotalInteractions', title='Most engaging days')
def plot_search(self, method, xy, ax):
    """Redraw the current samples, then run the search algorithm named by ``method``.

    Args:
        method: one of ``self.options[1]`` .. ``self.options[6]``. Odd
            entries run a whole search (random, adaptive MH, Hamiltonian MC)
            and store the result in ``self.x_list`` / ``self.y_list``; even
            entries advance the matching search by a single step from
            ``(self.x, self.y)``.
        xy: optional start point ``(x, y)``. The single-step searches use it
            only when no current point exists (otherwise
            ``self.init_search`` is the fallback); the full random search
            receives it directly.
        ax: matplotlib axes used for drawing and passed to the step methods.

    Returns:
        True when ``method`` matched one of the options, False otherwise.
    """
    # NOTE(review): the statement nesting was reconstructed from a flattened
    # source line; confirm the extent of the ``if`` bodies.
    if self.plot_contour_xy:
        # Remove previously drawn contours. Artists are removed one by one
        # because current Matplotlib does not allow assigning to
        # ``ax.collections``.
        for artist in list(ax.collections):
            artist.remove()
        ax = sns.kdeplot(self.x_list, self.y_list, ax=ax, color="red")
        self.trigger()
    if self.plot_xy:
        ax.plot(self.x_list, self.y_list, color=self.point_color,
                marker=self.marker, markersize=self.marker_size,
                linestyle=self.linestyle)

    if method == self.options[1]:
        # self.activate_frame_capture = True
        self.x_list = []
        self.y_list = []
        self.x_list, self.y_list = self.mcmc_random(xy, self.mesh)
    elif method == self.options[2]:
        # self.activate_frame_capture = False
        if self.x is None and self.y is None:
            # First step: reset the history and choose a start point.
            self.x_list = []
            self.y_list = []
            if xy is not None:
                self.x, self.y = xy[0], xy[1]
            else:
                self.x, self.y = self.init_search
        self.x, self.y = self.mcmc_random_step(self.mesh, self.x, self.y, ax)
    elif method == self.options[3]:
        # self.activate_frame_capture = True
        self.x_list = []
        self.y_list = []
        self.x_list, self.y_list = self.mcmc_adaptiveMH(self.mesh)
    elif method == self.options[4]:
        # self.activate_frame_capture = False
        if self.x is None and self.y is None:
            self.x_list = []
            self.y_list = []
            if xy is not None:
                self.x, self.y = xy[0], xy[1]
            else:
                self.x, self.y = self.init_search
        self.x, self.y = self.mcmc_adaptiveMH_step(self.mesh, self.x, self.y, ax)
    elif method == self.options[5]:
        # self.activate_frame_capture = True
        self.x_list = []
        self.y_list = []
        self.x_list, self.y_list = self.mcmc_hamiltonianMC(
            self.mesh_hm, self.mesh_dx_hm, self.mesh_dy_hm)
    elif method == self.options[6]:
        # self.activate_frame_capture = False
        if self.x is None and self.y is None:
            self.x_list = []
            self.y_list = []
            if xy is not None:
                self.x, self.y = xy[0], xy[1]
            else:
                self.x, self.y = self.init_search
        self.x, self.y = self.mcmc_hamiltonianMC_step(
            self.mesh_hm, self.mesh_dx_hm, self.mesh_dy_hm,
            self.x, self.y, ax)
    else:
        return False
    return True
def run(output="output/"):
    """Fit three GP models to the data from ``make_data()`` and save the plots.

    Steps, each writing PNG files into ``output``:

    1. Plot the raw input data.
    2. Evaluate the hand-crafted model (``evalHandcrafted``) and plot it.
    3. Fit by MLE (``gp.gp_gpflow.evalMLE``), plot the fit and ten posterior
       function samples.
    4. Fit by MCMC (``gp.gp_gpflow.evalMCMC``), plot the fit, the parameter
       traces, the pairwise joint distributions of the three parameters and
       posterior function samples drawn from every tenth trace row.

    Parameters
    ----------
    output : str
        Directory in which the figures are saved.
    """
    X, Y, x, f, _ = make_data()
    Y = np.atleast_2d(Y).T

    plt.plot(X, Y, 'kx', mew=2)
    plt.savefig(os.path.join(output, "gpflow_input_data.png"))
    plt.show()
    plt.close()

    m1 = evalHandcrafted(X, Y)
    gp.gp_gpflow.plot(X, Y, x, m1, 'handcrafted GP model', f,
                      output=os.path.join(output, "gpflow_handcrafted_model.png"))
    print(m1.as_pandas_table())
    m1.clear()

    _, m2 = gp.gp_gpflow.evalMLE(X, Y)
    gp.gp_gpflow.plot(X, Y, x, m2, 'MLE-fitted model', f,
                      output=os.path.join(output, "gpflow_mle.png"))
    print(m2.as_pandas_table())

    # plot the function posterior
    plt.figure(figsize=(12, 6))
    num_samples = 10
    ff = m2.predict_f_samples(x, num_samples, initialize=False)
    plt.plot(np.stack([x[:, 0]] * num_samples).T, ff[:, :, 0].T, 'C0', lw=2, alpha=0.1)
    plt.plot(X, Y, 'kx', mew=2)
    _ = plt.xlim(x.min(), x.max())
    plt.title('Posterior samples - MLE')
    plt.savefig(os.path.join(output, "gpflow_mle_posterior_samples.png"))
    plt.show()
    plt.close()
    m2.clear()

    traces, m3 = gp.gp_gpflow.evalMCMC(X, Y)
    gp.gp_gpflow.plot(X, Y, x, m3, 'MCMC-fitted model', f,
                      output=os.path.join(output, "gpflow_mcmc.png"))
    print(m3.as_pandas_table())

    # Trace figure: narrow panel with vertical KDEs on the left, the traces
    # themselves on the right, one color per trace column.
    fig = plt.figure(figsize=(8, 4))
    cmap = matplotlib.cm.hot
    norm = matplotlib.colors.Normalize(vmin=0, vmax=traces.shape[1])
    axs0 = plt.subplot2grid((1, 5), (0, 0), rowspan=1, colspan=1, fig=fig)
    # ``items`` replaces ``iteritems``, which pandas 2.0 removed.
    for j, (_name, col) in enumerate(traces.items()):
        sns.kdeplot(col, ax=axs0, label=col.name, shade=True, vertical=True,
                    color=cmap(norm(j)))
    axs1 = plt.subplot2grid((1, 5), (0, 1), rowspan=1, colspan=4, fig=fig)
    for j, (_name, col) in enumerate(traces.items()):
        axs1.plot(col, label=col.name, color=cmap(norm(j)))
    axs0.get_legend().remove()
    axs1.legend(loc=0)
    axs1.set_xlabel('HMC iteration')
    axs1.set_ylabel('parameter value')
    # Align the KDE panel with the value axis of the trace panel.
    axs0.set_ylim(axs1.get_ylim())
    axs0.set_xticks([])
    plt.suptitle('HMC traces')
    plt.tight_layout()
    plt.savefig(os.path.join(output, "gpflow_mcmc_traces.png"))
    plt.show()
    plt.close()

    ###################################
    # Joint-distribution figure: a scatter of each parameter pair (top two
    # rows) with the marginal of its x variable underneath.
    fig = plt.figure(figsize=(12, 4))
    axs0 = plt.subplot2grid((3, 3), (0, 0), rowspan=2, colspan=1, fig=fig)
    axs0.plot(traces['GPR/likelihood/variance'], traces['GPR/kern/variance'], 'k.', alpha=0.15)
    axs0.set_xlabel('noise_variance')
    axs0.set_ylabel('signal_variance')
    axs01 = plt.subplot2grid((3, 3), (2, 0), rowspan=1, colspan=1, fig=fig)
    sns.distplot(traces['GPR/likelihood/variance'], color='m', ax=axs01)
    axs01.set_xlim(axs0.get_xlim())
    plt.setp(axs01, yticks=[])

    axs1 = plt.subplot2grid((3, 3), (0, 1), rowspan=2, colspan=1, fig=fig)
    axs1.plot(traces['GPR/kern/lengthscales'], traces['GPR/likelihood/variance'], 'k.', alpha=0.15)
    axs1.set_xlabel('lengthscale')
    axs1.set_ylabel('noise_variance')
    axs11 = plt.subplot2grid((3, 3), (2, 1), rowspan=1, colspan=1, fig=fig)
    sns.distplot(traces['GPR/kern/lengthscales'], color='m', ax=axs11)
    axs11.set_xlim(axs1.get_xlim())
    plt.setp(axs11, yticks=[])

    axs2 = plt.subplot2grid((3, 3), (0, 2), rowspan=2, colspan=1, fig=fig)
    axs2.plot(traces['GPR/kern/variance'], traces['GPR/kern/lengthscales'], 'k.', alpha=0.1)
    axs2.set_xlabel('signal_variance')
    axs2.set_ylabel('lengthscale')
    axs21 = plt.subplot2grid((3, 3), (2, 2), rowspan=1, colspan=1, fig=fig)
    sns.distplot(traces['GPR/kern/variance'], color='m', ax=axs21)
    axs21.set_xlim(axs2.get_xlim())
    plt.setp(axs21, yticks=[])

    fig.suptitle('HMC (joint) distribution')
    plt.tight_layout()
    plt.savefig(os.path.join(output, "gpflow_mcmc_joint_distribution.png"))
    plt.show()
    plt.close()

    # plot the function posterior
    plt.figure(figsize=(12, 6))
    f_samples = []
    nn = 1
    # One function sample for every tenth row of the trace table.
    for i, s in traces.iloc[::10].iterrows():
        f_draw = m3.predict_f_samples(x, nn, initialize=False,
                                      feed_dict=m3.sample_feed_dict(s))
        f_samples.append(f_draw)
        plt.plot(np.stack([x[:, 0]] * nn).T, f_draw[:, :, 0].T, 'C0', lw=2, alpha=0.02)
    f_samples = np.array(f_samples)

    # Mean over all samples plus a 5th-95th percentile band.
    line, = plt.plot(x, np.mean(f_samples, axis=(0, 1)), lw=2)
    plt.fill_between(x[:, 0],
                     np.percentile(f_samples, 5, axis=(0, 1, 3)),
                     np.percentile(f_samples, 95, axis=(0, 1, 3)),
                     color=line.get_color(), alpha=0.1)
    plt.plot(X, Y, 'kx', mew=2)
    _ = plt.xlim(x.min(), x.max())
    # _ = plt.ylim(0, 6)
    plt.title('Posterior samples - MCMC')
    plt.savefig(os.path.join(output, "gpflow_mcmc_posterior_samples.png"))
    plt.show()
    plt.close()
    m3.clear()
import matplotlib.pyplot as plt
import seaborn as sns

# Load the students' exam-performance table.
student_table = pd.read_csv('StudentsPerformance (1).csv')

# List the columns and the distinct values of each categorical column.
print("COLUMNS: ")
print(student_table.columns.tolist())
print(student_table.gender.unique())
print(student_table['race/ethnicity'].unique())
print(student_table['parental level of education'].unique())
print(student_table['lunch'].unique())
print(student_table['test preparation course'].unique())

# DataFrame.info() writes its report itself and returns None, so it is not
# wrapped in print() (that only appended a stray "None" line).
student_table.info()

#print(sns.distplot(student_table[['math score','reading score','writing score']]))
#sns.distplot(student_table['math score'],bins=11,hist_kws=dict(edgecolor='yellow',linewidth=3,color='green'))
#sns.distplot(student_table['reading score'],bins=11,hist_kws=dict(edgecolor='yellow',linewidth=3,color='green'))

# Overlay the density estimates of the three scores on the current axes.
# The Axes objects returned by kdeplot are not printed: that only emitted
# their repr.
sns.kdeplot(student_table['math score'], shade=True)
sns.kdeplot(student_table['reading score'], shade=True)
sns.kdeplot(student_table['writing score'], shade=True)

# **OBSERVATIONS**
#
# * The table has 8 columns and 1000 rows with three int64, five object data types and no null values.
# * There are 3 numerical and 5 categorical columns.
# * The seaborn kdeplot gives a view of the three distributions, i.e. math score, reading score and writing score.

# In[6]:

print(student_table.describe())

plt.rcParams['figure.figsize'] = (30, 20)
# Pass the series as `x` by keyword: the first positional parameter of
# countplot is `data` in newer seaborn releases and `x` in older ones.
sns.countplot(x=student_table['math score'], palette='dark')
plt.title('Math Score', fontsize=25)
array = np.delete(array, list(array).index(i)) titles = [r'$h = \frac{h_n}{2}$', r'$h = h_n$', r'$h = 2 * h_n$'] l = 0 fig, ax = plt.subplots(1, 3) plt.subplots_adjust(wspace=0.5) for bandwidth in [0.5, 1, 2]: kde = stats.gaussian_kde(array, bw_method='silverman') h_n = kde.factor fig.suptitle('Normal, n = ' + str(quan_of_numbers[k - 1])) ax[l].plot(array_global, stats.norm.pdf(array_global, 0, 1), color='blue', alpha=0.5, label='density') ax[l].set_title(titles[l]) sns.kdeplot(array, ax=ax[l], bw=h_n * bandwidth, label='kde') ax[l].set_xlabel('x') ax[l].set_ylabel('f(x)') ax[l].set_ylim([0, 1]) ax[l].set_xlim([-4, 4]) ax[l].legend() l += 1 plt.show() k += 1 array_20 = np.random.standard_cauchy(20) array_60 = np.random.standard_cauchy(60) array_100 = np.random.standard_cauchy(100) arrays = [array_20, array_60, array_100] j = 1 array_global = np.arange(-4, 4, 0.01)
# Unweighted path length (hop count) from every node to `sub`, for the
# synthetic network (index 0) and the tree (index 1).
Hops = [np.array([nx.shortest_path_length(synth_net, n, sub)
                  for n in synth_net.nodes()]),
        np.array([nx.shortest_path_length(tree, n, sub)
                  for n in tree.nodes()])]
# Same, weighted by the 'geo_length' edge attribute and scaled by 1e-3
# (presumably metres -> kilometres; TODO confirm the unit of 'geo_length').
Dist = [np.array([nx.shortest_path_length(synth_net, n, sub, weight='geo_length')
                  for n in synth_net.nodes()]) * 1e-3,
        np.array([nx.shortest_path_length(tree, n, sub, weight='geo_length')
                  for n in tree.nodes()]) * 1e-3]

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

col = ['r', 'g', 'b']

# --- Hop distribution ---------------------------------------------------
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
# Draw on `ax` explicitly instead of relying on pyplot's current axes.
sns.kdeplot(Hops[0], shade=False, color='r', label='Optimal network', ax=ax)
sns.kdeplot(Hops[1], shade=False, color='g', label='Random network', ax=ax)
ax.set_ylabel('Percentage of nodes', fontsize=20)
ax.set_xlabel('Hops from root node', fontsize=20)
ax.set_title("Hop distribution", fontsize=20)
ax.legend(loc='best', ncol=1, prop={'size': 20})
# Show the density axis as a percentage. A formatter derives each label from
# the tick's own value, so labels cannot drift from the ticks the way fixed
# label strings assigned to auto-located ticks can.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: "{:.1f}".format(100.0 * value)))
fig.savefig("{}{}.png".format(figpath + suffix, 'hopcomp'), bbox_inches='tight')

# --- Distance distribution ----------------------------------------------
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
sns.kdeplot(Dist[0], shade=False, color='r', label='Optimal network', ax=ax)
sns.kdeplot(Dist[1], shade=False, color='g', label='Random network', ax=ax)
ax.set_ylabel('Percentage of nodes', fontsize=20)
# v2 is a noisy linear function of v1.
v2 = pd.Series(2 * v1 + np.random.normal(60, 15, 1000), name='v2')

# In[ ]:

# Overlaid, semi-transparent histograms of both series on common bins.
plt.figure()
plt.hist(v1, alpha=0.7, bins=np.arange(-50, 150, 5), label='v1')
plt.hist(v2, alpha=0.7, bins=np.arange(-50, 150, 5), label='v2')
plt.legend()

# In[ ]:

# plot a kernel density estimation over a stacked barchart
plt.figure()
# `density=True` replaces the `normed` keyword, which was removed from
# matplotlib's hist().
plt.hist([v1, v2], histtype='barstacked', density=True)
v3 = np.concatenate((v1, v2))
sns.kdeplot(v3)

# In[ ]:

plt.figure()
# we can pass keyword arguments for each individual component of the plot
sns.distplot(v3, hist_kws={'color': 'Teal'}, kde_kws={'color': 'Navy'})

# In[ ]:

# x/y are passed by keyword: newer seaborn releases no longer accept them
# positionally, and the keyword form works on older releases as well.
sns.jointplot(x=v1, y=v2, alpha=0.4)

# In[ ]:

grid = sns.jointplot(x=v1, y=v2, alpha=0.4)
grid.ax_joint.set_aspect('equal')
for wi, w in enumerate(which_plots): plt.close('all') fig, axes = plt.subplots(1, 2, figsize=(FIGURE_WIDTH / 2, FIGURE_HEIGHT), sharex=True, sharey=True) for task, taskname, ax in zip(['traini', 'biased'], ['Basic task', 'Full task'], axes): # bivariate KDE sns.kdeplot(data=history_shift[(history_shift.task == task)].dropna( subset=[w[0], w[1]])[w[0]], data2=history_shift[(history_shift.task == task)].dropna( subset=[w[0], w[1]])[w[1]], shade=True, shade_lowest=False, cmap='Greys', ax=ax) # individual points sns.lineplot(x=w[0], y=w[1], units='subject_nickname', estimator=None, color='black', alpha=0.3, data=history_shift[(history_shift.task == task)], marker='o', ax=ax, legend=False,
# Configure the test data set with anomalous employment dates
app_test['DAYS_EMPLOYED_ANOM'] = app_test['DAYS_EMPLOYED'] == 365243
# Assign the result back instead of calling replace(inplace=True) on the
# column selection: that chained in-place call acts on an intermediate object
# and does not update the frame when pandas Copy-on-Write is active.
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})

print('There are %d anomalies in the test data out of %d entries'
      % (app_test['DAYS_EMPLOYED_ANOM'].sum(), len(app_test)))

# Find correlations with the target and sort
correlations = app_train.corr()['TARGET'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

# Find the correlation of the positive days since birth and target
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])

plt.style.use('fivethirtyeight')

# Plot the distribution of ages in years
plt.hist(app_train['DAYS_BIRTH'] / 365, edgecolor='k', bins=25)
plt.title('Age of Client')
plt.xlabel('Age (years)')
plt.ylabel('Count')

plt.figure(figsize=(10, 8))

# KDE plot of loans that were repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label='target ==0')
plt.hist2d(x, y, bins=(15, 15), cmap=plt.cm.jet) plt.xlabel("amygdala") plt.ylabel("acc") plt.title("2-D Histogram") plt.colorbar() plt.show() ##################################################### # KDE Contour Plot col = math.ceil(np.sqrt(len(ranges))) row = math.ceil(np.sqrt(len(ranges))) fig, ax = plt.subplots(figsize=(10, 10), ncols=col, nrows=row) for i in range(len(ranges)): ax[int(i / col)][i % col].set_title("Orientation " + str(ranges[i])) sns.kdeplot(dataset[i]["amygdala"], dataset[i]["acc"], ax=ax[int(i / col)][i % col]) plt.show() ### alternative method to get the plot ### use the equations provided in the problem rather than packages def gaussian_kernel(x, y): return math.exp(-((x**2) + (y**2)) / 2) / (math.sqrt(2 * math.pi)) def kernel_density_estimate(x_i, y_i, x, y, h): prob = 0 m = len(x) for i in range(m): prob += gaussian_kernel((x_i - x[i]) / h, (y_i - y[i]) / h)
ax = kc_tax0.plot.hexbin(x='SqFtTotLiving', y='TaxAssessedValue', gridsize=30, sharex=False, figsize=(5, 4)) ax.set_xlabel('Finished Square Feet') ax.set_ylabel('Tax Assessed Value') plt.tight_layout() plt.show() # The _seaborn_ kdeplot is a two-dimensional extension of the density plot. fig, ax = plt.subplots(figsize=(4, 4)) ax = sns.kdeplot(kc_tax0.SqFtTotLiving, kc_tax0.TaxAssessedValue, ax=ax) ax.set_xlabel('Finished Square Feet') ax.set_ylabel('Tax Assessed Value') plt.tight_layout() plt.show() ### Two Categorical Variables # Load the `lc_loans` dataset lc_loans = pd.read_csv(LC_LOANS_CSV) # Table 1-8(1) crosstab = lc_loans.pivot_table(index='grade', columns='status', aggfunc=lambda x: len(x),
params, train_data, 150, #early_stopping_rounds= 40, verbose_eval=4) #Predict on test set predictions_lgbm_prob = lgbm.predict(valid_early_x) auc_lgb = roc_auc_score(valid_early_y, predictions_lgbm_prob) print('AUC LGBM: {}'.format(auc_lgb)) #Ensemble and predict on test set print('Predict on test set...') test_pred_rf = clf_rf.predict_proba(test_df)[:, 1] test_pred_ridge = clf_ridge.predict(test_df) test_pred_gbm = lgbm.predict(test_df) submission = pd.read_csv('sample_submission.csv') submission['loan_default'] = (test_pred_ridge + test_pred_gbm + test_pred_rf) / 3 submission.to_csv('submission_oof_ensemble_1.csv', index=False) #plots import matplotlib.pyplot as plt import seaborn as sns sns.kdeplot(test_pred_ridge, label='ridge') #sns.kdeplot(test_pred_rf, label = 'rf') sns.kdeplot(test_pred_gbm, label='gbm') #sns.kdeplot(submission['loan_default'].values, label = 'ensemble')
else: return 1 data["Sts_Val"] = data.apply(sts_val, axis=1) data # <<< BMI Report generation >>> data["Gender"].value_counts() data["Status"].value_counts() sns.jointplot(x='', y="Weight", data=data, kind="kde") sns.kdeplot(data=data['Sts_Val'], data2=data["Weight"]) sns.barplot(x='Sts_Val', y='Weight', data=data, hue="Gender") sns.countplot(x='Gender', data=data, hue='Sts_Val') sns.boxplot(x='Sts_Val', y='Weight', data=data, hue='Gender') sns.violinplot(x='Sts_Val', y='Weight', data=data, hue='Gender') sns.stripplot(x='Sts_Val', y='Height', data=data, hue='Gender', dodge=True) sns.catplot(x='Sts_Val', y='Height', data=data, hue='Gender', col='Gender') sns.set_style('whitegrid') sns.lmplot(
import pandas as pd
import numpy as np
# The pyplot interface lives in matplotlib.pyplot; the top-level matplotlib
# package has no show(), so the plt.show() calls below need this import.
import matplotlib.pyplot as plt
import seaborn as sns

# Every backslash is escaped (the original "\D" was an invalid escape
# sequence); the resulting path string is unchanged.
df = pd.read_csv(
    "C:\\Users\\Arun\\Documents\\shanu\\kaggle\\googleplaystore.csv")
df.describe()
df.info()

# Count missing values per column, drop every row that has any, then count
# again.
df.isna().sum().sort_values(ascending=False)
df.dropna(how="any", inplace=True)
df.isna().sum().sort_values(ascending=False)

sns.kdeplot(df["Rating"], legend=True)
plt.show()

# Observation: most ratings fall between 4 and 5.
df["Rating"].mean()

sns.kdeplot(df["Rating"], legend=True)
plt.show()
a = po.get_action(s) # env.render() s_, r, done, _ = env.step(a * high[0]) s_list.append(s) s = s_ if done: game_num += 1 break if game_num >= 500: for state_index in range(obs_dim): this_state = np.array([state[state_index] for state in s_list]) ax.hist(this_state, bins=100, histtype="stepfilled", normed=True, alpha=0.6) sns.kdeplot(this_state, shade=True) plt.savefig("./{}_distribute/state[{}].jpg".format(policy_type, state_index)) plt.close() break
sb32.py Ref: https://seaborn.pydata.org/examples/index.html https://seaborn.pydata.org/examples/cubehelix_palette.html """ import numpy as np import seaborn as sns import matplotlib.pyplot as plt sns.set(style="dark") rs = np.random.RandomState(50) # Set up the matplotlib figure f, axes = plt.subplots(3, 3, figsize=(9, 9), sharex=True, sharey=True) # Rotate the starting point around the cubehelix hue circle for ax, s in zip(axes.flat, np.linspace(0, 3, 10)): # Create a cubehelix colormap to use with kdeplot cmap = sns.cubehelix_palette(start=s, light=1, as_cmap=True) # Generate and plot a random bivariate dataset x, y = rs.randn(2, 50) sns.kdeplot(x, y, cmap=cmap, shade=True, cut=5, ax=ax) ax.set(xlim=(-3, 3), ylim=(-3, 3)) f.tight_layout() plt.show()
'promotion_last_5years': 'promotion', 'sales': 'department', 'left': 'turnover' }) front = df['turnover'] df.drop(labels=['turnover'], axis=1, inplace=True) df.insert(0, 'turnover', front) corr = df.corr() sns.heatmap(data=corr, yticklabels=corr.columns.values, xticklabels=corr.index.values) plt.show() fig = plt.figure(figsize=(15, 4)) sns.kdeplot(x='satisfaction', data=df, hue='turnover', shade=True) plt.show() from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix, \ precision_recall_curve, roc_auc_score df['department'] = df['department'].astype('category').cat.codes df['salary'] = df['salary'].astype('category').cat.codes target_name = 'turnover' x = df.drop('turnover', axis=1) y = df[target_name] x_train, x_test, y_train, y_test = train_test_split(x,
def __getPlots2(self, fig, axes, color2):
    """Draw a KDE of a combined variability/cadence statistic on ``axes``.

    For each of the ``self.numLC`` light curves the continuum series
    (``self.t`` / ``self.y`` / ``self.greska2``) and the emission-line series
    (``self.tp`` / ``self.response`` / ``self.greska2e``) are passed through
    ``LC_opsim`` together with ``self.mjds[j]``. Per-curve statistics are
    derived from the results, and a KDE of a fixed linear combination of them
    is drawn with colour ``color2`` and label ``'filter ' + self.fil``.

    ``fig`` is not used by this part of the method.
    """
    # --- sample continuum and emission-line curves per light curve --------
    ttc = []
    yyc = []
    erc = []
    yyem = []
    ttem = []
    erem = []
    for j in range(self.numLC):
        rr = self.mjds[j] - self.mjds[j].min()
        # Built-in int() replaces np.int, an alias of the same type that has
        # been removed from NumPy.
        mx = int(np.ceil(rr.max())) + 1
        # The slices skip the first 150 rows (presumably a burn-in segment of
        # the simulated series -- TODO confirm).
        t, y, e = LC_opsim(self.mjds[j], self.t[150:mx + 150],
                           self.y[150:mx + 150, j],
                           self.greska2[150:mx + 150, j])
        ttc.append(t)
        yyc.append(y)
        erc.append(e)
        t, y, e = LC_opsim(self.mjds[j], self.tp[150:mx + 150],
                           self.response[150:mx + 150, j],
                           self.greska2e[150:mx + 150, j])
        ttem.append(t)
        yyem.append(y)
        erem.append(e)

    import statistics
    # https://www.aanda.org/articles/aa/full_html/2013/11/aa21781-13/aa21781-13.html

    # --- per-curve variability and relative-error statistics -------------
    fvarc = []
    fvarem = []
    meanerc = []
    meanerm = []
    for j in range(self.numLC):
        tc = ttc[j]
        c = yyc[j]
        erc1 = erc[j]
        stdc2 = np.std(c**2)
        erc2m = np.mean(erc1)
        ercm = 100 * np.mean(erc1) / np.mean(c)
        te = ttem[j]
        em = yyem[j]
        erm = erem[j]
        erm2m = np.mean(erm)
        ermm = 100 * np.mean(erm) / np.mean(em)
        stdem2 = np.std(em**2)
        # Mean relative error, in percent.
        meanerc.append(100 * np.mean(erc1 / c))
        meanerm.append(100 * np.mean(erm / em))
        # NOTE(review): this uses std(flux**2) minus the mean error, which
        # does not obviously match the usual fractional-variability
        # definition (variance minus mean squared error) -- confirm against
        # the reference linked above.
        fvarc.append(np.sqrt(np.std(c**2) - erc2m) / (np.mean(c)))
        fvarem.append(np.sqrt(np.std(em**2) - erm2m) / (np.mean(em)))

    # --- mean sampling interval and number of epochs per curve -----------
    caden = []
    brojposm = []
    for j in range(self.numLC):
        tc = self.mjds[j]
        caden.append(np.mean(np.diff(tc)))
        brojposm.append(len(self.mjds[j]))

    zz = 0.05
    lags = np.asarray(self.lags)
    fvarc = np.asarray(fvarc)
    meanerc = np.asarray(meanerc)
    caden = np.asarray(caden)
    xx = np.array(fvarc) / np.array(meanerc)
    yy = lags / ((1 + zz) * caden)
    # Two fixed linear combinations of xx and yy; only the second is plotted.
    zzcrt = -3.356 * xx - 0.2638 * yy
    zzzcrtred = (-0.002415) * xx - 3.97756 * yy

    sns.kdeplot(zzzcrtred, shade=None, ax=axes, alpha=0.3,
                label='filter ' + self.fil, color=color2)
    # First line artist on the axes and its sampled coordinates.
    kdeline1 = axes.lines[0]
    xs1 = kdeline1.get_xdata()
    ys1 = kdeline1.get_ydata()

    # Evaluate the plotted combination on a 50 x 50 grid spanning xx and yy.
    xp = np.linspace(xx.min(), xx.max(), 50)
    yp = np.linspace(yy.min(), yy.max(), 50)
    xxx, yyy = np.meshgrid(xp, yp)
    zzz = (-0.002415) * xxx - 3.97756 * yyy
label='ground truth', bins=150) plt.xlabel('Partial charge', fontsize=fontsize_label_legend, **hfont) plt.ylabel('No of atoms', fontsize=fontsize_label_legend, **hfont) plt.legend(frameon=False, prop={"family": "Times New Roman", 'size': fontsize_label_legend}) plt.tick_params(axis='both', which='major', labelsize=17) plt.savefig('results/graphs/ground_distplot.png', format='png', dpi=300, bbox_inches="tight") plt.show() # ------------------------------------------------------------------- # histogram of prediction and ground truth colors = ['green', 'dodgerblue', 'deeppink'] plt.figure(figsize=(8, 8), dpi=80) sns.kdeplot(label.cpu().numpy(), shade=True, color="orange", label="ground truth", alpha=.7) sns.kdeplot(pred.cpu().numpy(), shade=True, color=colors[index], label=system, alpha=.7) plt.xlabel('Partial charge', fontsize=fontsize_label_legend, **hfont) plt.ylabel('No of atoms', fontsize=fontsize_label_legend, **hfont) plt.legend(frameon=False, prop={"family": "Times New Roman", 'size': fontsize_label_legend}) plt.tick_params(axis='both', which='major', labelsize=17) plt.savefig('results/graphs/ground_{}_histogram.png'.format(system), format='png', dpi=300, bbox_inches="tight") plt.show() # ------------------------------------------------------------------- # # saving mean sigmas of elements # element_types_labels = np.zeros(len(label)) # for element_index in range(elements_number):
def _plot_distance_density(neg_distances, pos_distances):
    """Show KDE curves of the first column of the negative and positive pair distances."""
    plt.figure(figsize=(4,4))
    plt.xlabel('Distance')
    plt.ylabel('Frequency')
    sns.kdeplot(neg_distances[:,0], shade=True, color='red', label='Distant pairs')
    sns.kdeplot(pos_distances[:,0], shade=True, color='green', label='Close pairs')
    plt.legend(loc=1)
    plt.show()


def train_and_predict(csv_file, build_new=True):
    '''
    Build and train a new model or continue training a saved model.

    Includes density plots of distances between the images of positive and
    negative pairs before and after training for a first sanity and
    consistency check.

    Parameters
    ----------
    csv_file
        Passed unchanged to ``split_pairdata``, which supplies the
        train/validation/test pair arrays and their labels.
    build_new : bool
        If true, build and compile a fresh ``siam_cnn`` model; otherwise load
        ``models/modelxx.h5``.

    Returns
    -------
    tuple
        ``(y_test, y_pred)``: the test labels and the model's predictions for
        the test pairs.
    '''
    X_train, X_val, X_test, y_train, y_val, y_test = split_pairdata(csv_file)

    # Pool the pairs of all three splits by label (1 = positive, 0 = negative)
    # for the before/after density plots.
    pos_pairs = np.concatenate((X_train[y_train==1], X_val[y_val==1], X_test[y_test==1]))
    neg_pairs = np.concatenate((X_train[y_train==0], X_val[y_val==0], X_test[y_test==0]))

    if build_new:
        model = siam_cnn()
        optimizer = RMSprop()
        model.compile(loss=contrastive_loss, optimizer=optimizer)
        print("Model compiled.")
    else:
        model = load_model('models/modelxx.h5', custom_objects={'contrastive_loss': contrastive_loss})
        print('Model loaded.')

    untrained_pred_pos = model.predict([pos_pairs[:,0], pos_pairs[:,1]])
    untrained_pred_neg = model.predict([neg_pairs[:,0], neg_pairs[:,1]])

    # Density plot of distances before training
    print('Plotting density of distances.. (please exit plot window to continue.)')
    _plot_distance_density(untrained_pred_neg, untrained_pred_pos)

    print('Begin training...')
    # NOTE(review): `nb_epoch` is the legacy Keras spelling of `epochs`; left
    # unchanged because the installed Keras version is not visible here.
    model.fit([X_train[:,0], X_train[:,1]], y_train,
              validation_data=([X_val[:,0], X_val[:,1]], y_val),
              batch_size=128, nb_epoch=10)
    # Short pause before reporting completion (presumably lets the training
    # progress output finish -- TODO confirm it is still needed).
    time.sleep(3)
    print('Training finished.')

    #print('Saving model..')
    #model.save('models/best_model.h5')
    #print('Model saved.')

    trained_pred_pos = model.predict([pos_pairs[:,0], pos_pairs[:,1]])
    trained_pred_neg = model.predict([neg_pairs[:,0], neg_pairs[:,1]])

    # Density plot of distances after training
    print('Plotting density of distances.. (please exit plot window to continue.)')
    _plot_distance_density(trained_pred_neg, trained_pred_pos)

    y_pred = model.predict([X_test[:,0], X_test[:,1]])
    return y_test, y_pred
Multiple bivariate KDE plots ============================ _thumb: .6, .4 """ import seaborn as sns import matplotlib.pyplot as plt sns.set(style="darkgrid") iris = sns.load_dataset("iris") # Subset the iris dataset by species setosa = iris.query("species == 'setosa'") virginica = iris.query("species == 'virginica'") # Set up the figure f, ax = plt.subplots(figsize=(8, 8)) ax.set_aspect("equal") # Draw the two density plots ax = sns.kdeplot(setosa.sepal_width, setosa.sepal_length, cmap="Reds", shade=True, shade_lowest=False) ax = sns.kdeplot(virginica.sepal_width, virginica.sepal_length, cmap="Blues", shade=True, shade_lowest=False) # Add labels to the plot red = sns.color_palette("Reds")[-2] blue = sns.color_palette("Blues")[-2] ax.text(2.5, 8.2, "virginica", size=16, color=blue) ax.text(3.8, 4.5, "setosa", size=16, color=red)
def plot_kde(train, test_A, test_B, test_C, col):
    """Compare the density of column ``col`` across the train set and three test sets.

    A row of five panels is drawn: one KDE per data set in the first four
    panels, and all four curves overlaid in the fifth. The figure is titled
    through pyplot and shown.
    """
    fig, ax = plt.subplots(1, 5)
    datasets = (
        (train, 'g'),
        (test_A, 'r'),
        (test_B, 'y'),
        (test_C, 'm'),
    )

    # One panel per data set (zip stops after the four data sets).
    for panel, (frame, colour) in zip(ax, datasets):
        sns.kdeplot(frame[col], color=colour, ax=panel)

    # Final panel: every data set overlaid for direct comparison.
    overlay = ax[4]
    for frame, colour in datasets:
        sns.kdeplot(frame[col], color=colour, ax=overlay)

    plt.title('Distribution_' + col)
    plt.show()
g = g.set_ylabels("survival probability")

# %% Parch
# Bar plot of "Survived" against "Parch".
g = sns.factorplot(
    x="Parch",
    y="Survived",
    data=train,
    kind="bar",
    size=6,
    palette="muted",
)
g.despine(left=True)
g = g.set_ylabels("survival probability")

# %% Age
# One Age histogram per value of "Survived".
g = sns.FacetGrid(train, col='Survived').map(sns.distplot, "Age")

# %% Explore Age distribution
# Age densities for Survived == 0 (red) and Survived == 1 (blue), restricted
# to rows with a known age and drawn on the same axes.
g = sns.kdeplot(
    train.loc[(train["Survived"] == 0) & train["Age"].notnull(), "Age"],
    color="Red",
    shade=True,
)
g = sns.kdeplot(
    train.loc[(train["Survived"] == 1) & train["Age"].notnull(), "Age"],
    ax=g,
    color="Blue",
    shade=True,
)
g.set_xlabel("Age")
g.set_ylabel("Frequency")
g = g.legend(["Not Survived", "Survived"])

# %% Fare
dataset['Fare'].isnull().sum()

# %%
# Fill missing fares with the median fare.
dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].median())

# Explore Fare distribution
g = sns.distplot(
    dataset["Fare"],
    color="m",
    label="Skewness : %.2f" % (dataset["Fare"].skew()),
)
#
# However, I will try to fix *orientation_X* and *orientation_Y* as I explained before, by scaling and normalizing the data.
#
# ---
#
# ### Now with a new scale (more precision)

# In[ ]:

# One panel per feature in aux.columns[3:13]; within each panel, one density
# curve per surface class.
plt.figure(figsize=(26, 16))
for i, col in enumerate(aux.columns[3:13]):
    ax = plt.subplot(3, 4, i + 1)
    # Title the panel through its Axes; `ax` keeps referring to the Axes
    # instead of being rebound to the Text object returned by plt.title().
    ax.set_title(col)
    for surface in classes:
        surface_feature = aux[aux['surface'] == surface]
        sns.kdeplot(surface_feature[col], label=surface, ax=ax)

# ### Histogram for main features

# In[ ]:

# Train and test histograms overlaid for every feature from column 3 onwards.
plt.figure(figsize=(26, 16))
for i, col in enumerate(data.columns[3:]):
    ax = plt.subplot(3, 4, i + 1)
    sns.distplot(data[col], bins=100, label='train')
    sns.distplot(test[col], bins=100, label='test')
    ax.legend()

# ## Step 0 : quaternions
# Orientation - quaternion coordinates
import seaborn as sns
import matplotlib.pyplot as plt

# Earlier experiment kept for reference: joint scatter with rug-plot marginals.
# sns.set(style="white", color_codes=True)
# grid = sns.JointGrid(X_embedded[:,0], X_embedded[:,1], space=0, size=6, ratio=50)
# grid.plot_joint(plt.scatter, color="g")
# grid.plot_marginals(sns.rugplot, height=1, color="g")
# sns.set(style="darkgrid")

# Square 8x8-inch figure with an equal aspect ratio on the axes.
f, ax = plt.subplots(figsize=(8, 8))
ax.set_aspect("equal")

# Draw the two density plots
# Shaded bivariate KDEs of columns 0 and 1 of item_embedded (red colormap) and
# user_embedded (blue colormap). No `ax=` is passed, so seaborn draws on the
# current axes.
ax = sns.kdeplot(item_embedded[:,0], item_embedded[:,1], cmap="Reds", shade=True, shade_lowest=False)
ax = sns.kdeplot(user_embedded[:,0],user_embedded[:,1], cmap="Blues", shade=True, shade_lowest=False)

# Imports for the code that follows the plotting section.
import gc
import numpy as np
import pandas as pd
import os
from sltools import load_pickle
from scipy.sparse import vstack
from scipy.sparse import csr_matrix
from scipy.sparse import diags
from scipy.sparse import coo_matrix
# In[72]:

# Check the distribution and correlation of each dataset
# Rows of data_: per-feature means of the training, validation and test sets.
data_ = np.vstack([
    feature_train.mean(axis=0),
    feature_validation.mean(axis=0),
    feature_test.mean(axis=0)
])
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: correlation between the three per-feature mean vectors.
sns.heatmap(np.corrcoef(data_), annot=True, ax=axs[0])
axs[0].axis('equal')
# Tick labels spell "Training" like the legend below (was "Trainning").
axs[0].set_xticklabels(
    ["Training dataset", "Validation dataset", "Test dataset"], rotation=45)
axs[0].set_yticklabels(
    ["Training dataset", "Validation dataset", "Test dataset"], rotation=0)

# Right panel: density of the per-feature means of each dataset.
sns.kdeplot(data_[0, :], ax=axs[1])
sns.kdeplot(data_[1, :], ax=axs[1])
sns.kdeplot(data_[2, :], ax=axs[1])
plt.title("Distribution", fontsize=20)
plt.legend(["Training dataset", "Validation dataset", "Test dataset"])
plt.subplots_adjust(wspace=1)
plt.show()

# In[73]:

# Standardize the data
scaler = StandardScaler()
feature_train_ = scaler.fit_transform(feature_train)
feature_validation_ = scaler.transform(
    feature_validation)  # the validation and test sets must be scaled with the parameters fitted on the training set
return 'virginica' #terget_df = target.apply(rename,axis= 1) ##iris = pd.concat([iris,tar],axis=1) sns.set_style('whitegrid') #setosa = iris[iris['target']==0] #sns.pairplot(iris_sns,hue= 'species', palette='Dark2') sns.plt.show() setosa = iris_sns[iris_sns['species'] == 'setosa'] sns.kdeplot(setosa['sepal_width'], setosa['sepal_length'], cmap="plasma", shade=True, shade_lowest=False) #sns.plt.show() #sns.kdeplot(iris_df['sepal width (cm)'],iris_df['sepal length (cm)'], cmap='Blues',shade=True, shade_lowest=False) #sns.kdeplot(setosa.sepal_width, setosa.sepal_length,cmap="Reds", shade=True, shade_lowest=False) #sns.plt.show() X = iris_sns.drop('species', axis=1) y = iris_sns['species'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) svc_model = SVC() svc_model.fit(X_train, y_train)