def get_samples(depends_on, includes, burn):
    """Load the posterior samples from all runs, plot them, and return them.

    Every file in the model directory whose name contains ``'samples_'`` is
    read, the burn-in rows are dropped, and the chains are concatenated.
    The sign of every ``v_*`` column is flipped, and one violin plot per
    regressor is drawn in a 2x2 grid and shown.

    Parameters
    ----------
    depends_on, includes
        Forwarded to ``make_paths`` to locate the model directory.
        ``depends_on`` also supplies the parameter prefixes that are plotted
        for every regressor except the intercept.
    burn : int
        Index label from which samples are kept in each chain.  Previously
        this argument was ignored and ``1000`` was hard-coded.

    Returns
    -------
    pandas.DataFrame
        The concatenated, sign-corrected samples.
    """
    paths = make_paths(depends_on, includes, 0)
    model_dir = paths['model_dir']
    sample_files = [f for f in ld(model_dir) if 'samples_' in f]

    # `.ix` was removed from pandas; `.loc` is its label-based equivalent.
    # NOTE(review): assumes the sample files carry an integer index in
    # column 0 (the case in which `.ix[1000:]` was label-based) - confirm.
    dfs = [
        pd.read_csv(pj(model_dir, f), index_col=0).loc[burn:]
        for f in sample_files
    ]
    df = pd.concat(dfs, axis=0)

    # Flip the sign of every drift-rate ("v_") column.
    for vcol in [c for c in df.columns if 'v_' in c]:
        df[vcol] = -df[vcol]

    regressors = ['age', 'group', 'group:age', 'Intercept']
    for i, iv in enumerate(regressors, 1):
        plt.subplot(2, 2, i)
        # The intercept is always shown for a, t and v; the other
        # regressors only for the parameters that depend on them.
        params = 'atv' if iv == 'Intercept' else depends_on
        panel = df[['%s_%s' % (p, iv) for p in params]]
        sns.violinplot(data=panel)
        plt.hlines(0, 0, 3)  # zero reference line

    plt.show()
    return df
def figure2_1():
    """Draw a violin plot of ten randomly offset reward distributions.

    Opens a new figure numbered by the module-level ``figureIndex`` counter
    and then advances that counter.
    """
    global figureIndex
    plt.figure(figureIndex)
    figureIndex += 1

    # 200 unit-normal draws for each of 10 actions, shifted per action.
    draws = np.random.randn(200, 10)
    offsets = np.random.randn(10)
    sns.violinplot(data=draws + offsets)

    plt.xlabel("Action")
    plt.ylabel("Reward distribution")
def variance_plot(self, df, item):
    """Plot the data stored under ``item`` as overlaid violin and box plots.

    Returns a dict with the figure (``'data'``), a title (``'title'``) and a
    table mapping the numeric x tick labels back to the original column
    labels (``'text'``).
    """
    # The whole frame is passed in because the column names are needed below.
    df = df.xs(item)
    pyplot = sns.plt

    # Set up the figure.
    fig = pyplot.figure()

    # Draw the data.
    sns.violinplot(df, bw=.15, cut=0)
    # TODO: fix
    sns.boxplot(df, linewidth=2)

    # Axis cosmetics.
    pyplot.yticks(fontsize=12)
    pyplot.ylabel(self.y_LUT[item], fontsize=16)

    # Columns are labelled 0..n-1 on the x axis.
    x_labels = range(len(df.columns))
    pyplot.subplot().set_xticklabels(x_labels)

    # Information for the results web page.
    column_table = pd.DataFrame(
        df.columns.tolist(),
        columns=df.columns.names,
        index=x_labels,
    ).transpose()
    return {
        'data': fig,
        'title': item + ' Variance after Convergence',
        'text': column_table,
    }
def plot_distances(distance_data, filename, title, plot_variable='distance'):
    """Save one violin + swarm panel per region to ``filename``.

    Each distinct value of the ``'region'`` column gets its own row of the
    figure; within a row the data are grouped by the ``'cutoff_n'`` label
    produced by ``format_cutoff``.
    """
    seeds = sorted(set(distance_data['region']))
    distance_data = distance_data.sort_values(['region', 'cutoff'])
    sns.set()

    figure, axes_grid = plt.subplots(nrows=len(seeds), ncols=1, squeeze=False)
    # Flatten the 2-d axes array into a plain list.
    axes_sets = list(chain.from_iterable(axes_grid))

    def samples_per_cutoff(frame):
        # Number of observations sharing each row's cutoff.
        counts = frame.groupby(by=['cutoff'])[plot_variable].count()
        return frame['cutoff'].map(counts)

    for ax, seed in zip(axes_sets, seeds):
        in_region = distance_data['region'] == seed
        seed_data = distance_data[in_region].assign(count=samples_per_cutoff)
        seed_data['cutoff_n'] = seed_data.apply(format_cutoff, 'columns')

        sns.violinplot(x='cutoff_n', y=plot_variable, data=seed_data,
                       cut=0, alpha=0.7, ax=ax)
        # Raise the violin artists before the swarm is drawn on the same axes.
        plt.setp(ax.lines, zorder=100)
        plt.setp(ax.collections, zorder=100)
        sns.swarmplot(x='cutoff_n', y=plot_variable, data=seed_data,
                      color='k', ax=ax)
        ax.set_ylabel(seed + '\n' + plot_variable)

    axes_sets[0].set_title(title)
    plt.savefig(filename)
def show_results(self):
    """Save a split violin plot of both distance distributions.

    The distances in ``self.dist1_distribution`` and
    ``self.dist2_distribution`` are scaled by ``self.dim_pix`` and plotted
    per slice, with image 1 and image 2 on the two halves of each violin.
    For 2-D data (``self.dim_im == 2``) everything is placed in slice 0; for
    3-D data there is one violin per entry of ``self.distances``.  The plot
    is written to ``violin_plot.png`` in the working directory.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    import pandas as pd

    # `plt.hold(True)` used to be called here.  `hold` was removed from
    # matplotlib (overplotting is always on), so the call only raised
    # AttributeError on current versions.
    sns.set(style="whitegrid", palette="pastel", color_codes=True)
    plt.figure(figsize=(35, 20))

    data_dist = {"distances": [], "image": [], "slice": []}

    def add_group(distribution, image_id, slice_id):
        # Append one long-form row per distance in `distribution`.
        data_dist["distances"].extend(
            dist * self.dim_pix for dist in distribution)
        data_dist["image"].extend([image_id] * len(distribution))
        data_dist["slice"].extend([slice_id] * len(distribution))

    if self.dim_im == 2:
        add_group(self.dist1_distribution, 1, 0)
        add_group(self.dist2_distribution, 2, 0)
    if self.dim_im == 3:
        for i in range(len(self.distances)):
            add_group(self.dist1_distribution[i], 1, i)
            add_group(self.dist2_distribution[i], 2, i)

    data_dist = pd.DataFrame(data_dist)
    sns.violinplot(x="slice", y="distances", hue="image", data=data_dist,
                   split=True, inner="point", cut=0)
    plt.savefig('violin_plot.png')
def biplot() :
    """Show catalogue price against age, split by sex, as a violin plot.

    Reads the coupon CSV files from ``./Data``, joins the purchase details
    with the user list and the training coupon list, shows the plot and then
    waits for a line of console input.
    """
    # Read in all the input data.
    detail_train = pd.read_csv("./Data/coupon_detail_train.csv")
    list_train = pd.read_csv("./Data/coupon_list_train.csv")
    list_test = pd.read_csv("./Data/coupon_list_test.csv")
    users = pd.read_csv("./Data/user_list.csv")

    # Join purchase details with the users, then with the coupons.
    user_key = "USER_ID_hash"
    coupon_key = "COUPON_ID_hash"
    purchases = pd.merge(detail_train, users,
                         left_on=user_key, right_on=user_key)
    purchases = pd.merge(purchases, list_train,
                         left_on=coupon_key, right_on=coupon_key)

    import seaborn as sns
    sns.violinplot(x="AGE", y="CATALOG_PRICE", hue="SEX_ID", data=purchases)

    plt.show()
    raw_input()
def plot_hist_algo(wave_hist_algor, pulse_hist_algor, multi_wave_hist_algor):
    """Draw the PSD-based fish-sorting overview figure.

    The left panel shows violin plots of the three PSD-proportion samples
    (wave, multi-wave, pulse).  The middle and right panels show histograms
    of the PSD amplitudes loaded from ``wave_psd_data.npy`` and
    ``pulse_psd_data.npy`` in the working directory, restricted to the
    leading entries whose first-row value is below 1500.

    Parameters
    ----------
    wave_hist_algor, pulse_hist_algor, multi_wave_hist_algor : sequence
        PSD proportions for each EOD type; they may differ in length.
    """
    inch_factor = 2.54  # figure size is specified in centimetres
    sns.set_context("poster")
    # `sns.axes_style('white')` only *returns* a style dict, so the call
    # used here before had no effect; `set_style` actually applies it.
    sns.set_style('white')

    fig4 = plt.figure(figsize=(35. / inch_factor, 20. / inch_factor))

    # --- violin plot of the three classes ---------------------------------
    ax1 = fig4.add_subplot(2, 3, (1, 4))
    # One row per class, then transpose so that each class is a column.
    dafr = pd.DataFrame(
        [wave_hist_algor, multi_wave_hist_algor, pulse_hist_algor]
    ).transpose()
    dafr.columns = ['wave', 'multi-wave', 'pulse']
    # `palette` is the violinplot argument for per-violin colours; the
    # previous `col=` keyword is not a violinplot parameter.
    sns.violinplot(data=dafr, ax=ax1, palette=["blue", "green", "red"])
    ax1.set_ylabel('psd_proportion')
    ax1.set_xlabel('EOD-type')
    ax1.set_title('Fishsorting based on PSD')

    # --- histogram of the wave-fish PSD -----------------------------------
    wave_psd_data = np.load('wave_psd_data.npy')
    n_wave = len(wave_psd_data[0][wave_psd_data[0] < 1500])
    wave_hist_data = wave_psd_data[1][:n_wave]
    ax3 = fig4.add_subplot(2, 3, (2, 5))
    # `density=` replaces the `normed=` keyword removed from matplotlib.
    ax3.hist(wave_hist_data, 50, color='blue', alpha=0.7, density=True)
    ax3.set_ylabel('counts in histogram bin')
    ax3.set_xlabel('amplitude of PSD')
    # This panel shows the wave data; it was mislabelled "pulsefish".
    ax3.set_title('Histogram of wavefish PSD')

    # --- histogram of the pulse-fish PSD ----------------------------------
    pulse_psd_data = np.load('pulse_psd_data.npy')
    n_pulse = len(pulse_psd_data[0][pulse_psd_data[0] < 1500])
    pulse_hist_data = pulse_psd_data[1][:n_pulse]
    ax2 = fig4.add_subplot(2, 3, (3, 6))
    ax2.hist(pulse_hist_data, 50, color='red', alpha=0.7, density=True)
    ax2.set_ylabel('counts in histogram bin')
    ax2.set_xlabel('amplitude of PSD')
    ax2.set_title('Histogram of pulsefish PSD')

    fig4.tight_layout()
def view_distribution(df,x="type",y="rate", plt=plt):
    """Show a violin plot of ``y`` grouped by ``x`` with the points overlaid.

    The title names the plotted column and the first value of
    ``df.symbol``.  ``plt`` is the pyplot-like module used for the figure,
    title and display.
    """
    asset = df.symbol.values[0]
    axes_spec = dict(x=x, y=y, data=df)

    plt.figure(1, figsize=(15, 15))
    # Outline only (no inner annotation); the strip plot supplies the points.
    sns.violinplot(inner=None, **axes_spec)
    sns.stripplot(jitter=True, color="white", edgecolor="gray", **axes_spec)

    heading = y + ' distribution (' + asset + ')'
    plt.title(heading)
    plt.show()
def build_image(self):
    """Return a two-row figure of split violin plots, one group per row.

    The rows generated by ``generate()`` are split into two groups by their
    ``day`` value; each group is drawn on its own axes with the y ticks and
    tick labels taken from ``RANGES``.
    """
    fig, ax = plt.subplots(nrows=2, ncols=1)
    data = pd.DataFrame(list(generate()))

    state_rows = [
        ['WA', 'SA', 'Tas', 'NSW'],
        ['Qld', 'NT', 'Vic'],
    ]

    for subax, states in zip(ax, state_rows):
        to_display = data[data.day.isin(states)]
        sns.violinplot(
            x="day",
            y="total_bill",
            hue="sex",
            data=to_display,
            split=True,
            scale="count",
            palette="Set2",
            ax=subax,
        )

        # Axis cosmetics: no axis titles, custom ticks, no legend.
        subax.set_ylabel('')
        subax.set_xlabel('')
        tick_labels = RANGES[::, 0][::-1]
        tick_positions = list(map(int, RANGES[::, 1][::-1]))
        subax.set_yticklabels(tick_labels)
        subax.set_yticks(tick_positions)
        subax.legend_.remove()
        subax.set_ylim(0, 100)

    return fig
def plotResults(tr, resultKey='resultInputPsf', doRates=False, title='', asHist=False, doPrint=True, actuallyPlot=True):
    """Summarise and optionally plot detection counts for each method.

    Parameters
    ----------
    tr : iterable
        One result per run.  Entries that are ``None`` or whose
        ``resultKey`` value is falsy are ignored; the rest must provide
        ``t[resultKey][method][{'FN', 'FP', 'TP'}]`` for every method.
    resultKey : str
        Key selecting which set of results to use from each run.
    doRates : bool
        If true, divide all three counts by the number of true sources
        (``FN + TP`` of the raw counts).
    title : str
        Title placed on both panels.
    asHist : bool
        Draw overlaid distribution plots instead of violin/swarm/box plots.
    doPrint : bool
        Print the per-method means of FN, FP and TP.
    actuallyPlot : bool
        If false, return right after the (optional) printing.  The plotting
        libraries are then neither imported nor restyled.

    Returns
    -------
    (TP, FP, FN) : tuple of pandas.DataFrame
        One column per method, one row per usable run.
    """
    methods = ['ALstack', 'ZOGY', 'SZOGY', 'ALstack_decorr']
    tr = [t for t in tr if t is not None and t[resultKey]]

    def counts(kind):
        # One column per method holding the requested count of every run.
        return pd.DataFrame(
            {key: np.array([t[resultKey][key][kind] for t in tr])
             for key in methods})

    FN = counts('FN')
    FP = counts('FP')
    TP = counts('TP')
    title_suffix = 's'

    if doRates:
        # The denominator must be taken from the raw counts once.  The
        # previous in-place `FN /= (FN + TP)` overwrote FN first, so FP and
        # TP were then divided by (FN rate + TP) instead of (FN + TP).
        n_true = FN + TP
        FN = FN / n_true
        FP = FP / n_true
        TP = TP / n_true
        title_suffix = ' rate'

    if doPrint:
        # Single-argument print calls behave the same on Python 2 and 3.
        print('FN: \n%s' % (FN.mean(),))
        print('FP: \n%s' % (FP.mean(),))
        print('TP: \n%s' % (TP.mean(),))

    if not actuallyPlot:
        return TP, FP, FN

    # Plotting libraries are only needed (and global styles only changed)
    # when a figure is actually produced.
    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.style.use('ggplot')
    import seaborn as sns
    sns.set(style="whitegrid", palette="pastel", color_codes=True)

    matplotlib.rcParams['figure.figsize'] = (18.0, 6.0)
    fig, axes = plt.subplots(nrows=1, ncols=2)

    # (axes, data, axis label, extra violin keywords) for the two panels.
    panels = [
        (axes[0], TP, 'True positive', {'bw': 0.25, 'alpha': 0.5}),
        (axes[1], FP, 'False positive', {'bw': 0.5}),
    ]
    for ax, frame, label, violin_kw in panels:
        if not asHist:
            # NOTE(review): `cut` is numeric in seaborn; True acts as 1.
            sns.violinplot(data=frame, cut=True, linewidth=0.3,
                           scale='width', ax=ax, **violin_kw)
            # A swarm of individual points is only drawn for small samples.
            if frame.shape[0] < 500:
                sns.swarmplot(data=frame, color='black', size=3, alpha=0.3,
                              ax=ax)
            sns.boxplot(data=frame, saturation=0.5,
                        boxprops={'facecolor': 'None'},
                        whiskerprops={'linewidth': 0}, showfliers=False,
                        ax=ax)
            plt.setp(ax, alpha=0.3)
            ax.set_ylabel(label + title_suffix)
            ax.set_title(title)
        else:
            for method in frame:
                sns.distplot(frame[method], label=method, norm_hist=False,
                             ax=ax)
            ax.set_xlabel(label + title_suffix)
            ax.set_title(title)
            ax.legend(loc='upper left', shadow=True)

    return TP, FP, FN
def _plot_categorical_and_continuous(df, xlabel, ylabel, x_keys, y_keys, ax, cmap, n_cat=5, plottype="box"):
    """
    Plot a categorical variable and a continuous variable against each other.

    Types of plots include box plot, violin plot, strip plot and swarm plot.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    ylabel : str
        The column name for the variable on the y-axis

    x_keys, y_keys : object
        Exactly one of the two is expected to be the very same object as
        its axis label (``x_keys is xlabel`` or ``y_keys is ylabel``); the
        *other* one is then passed to seaborn as the category ``order``.
        The x axis is checked first.

    ax : matplotlib.Axes object
        The matplotlib.Axes object to plot into

    cmap : matplotlib.cm.colormap
        A matplotlib colormap used to build the palette

    n_cat : int
        The number of categories; used for creating the colour map

    plottype : {"box" | "violin" | "strip" | "swarm"}
        The type of plot to produce; default is a box plot

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    Raises
    ------
    ValueError
        If neither key argument is its axis label, or if ``plottype`` is
        not one of the supported names.
    """
    # Identity (not equality) is deliberate here: the keys may be arrays,
    # for which `==` against a string is not a plain boolean.
    if x_keys is xlabel:
        keys = y_keys
    elif y_keys is ylabel:
        keys = x_keys
    else:
        raise ValueError("Something went terribly, horribly wrong! "
                         "Expected x_keys to be xlabel or y_keys to be "
                         "ylabel.")

    # Validate the plot type before doing any plotting work.
    plotters = {
        "box": "boxplot",
        "strip": "stripplot",
        "swarm": "swarmplot",
        "violin": "violinplot",
    }
    if plottype not in plotters:
        raise ValueError("plottype not recognized!")

    current_palette = sns.color_palette(cmap, n_cat)
    draw = getattr(sns, plotters[plottype])
    draw(x=xlabel, y=ylabel, data=df, order=keys,
         palette=current_palette, ax=ax)

    return ax
def plot(self, event, logliks, logsumexps, modality_colors, renamed=''):
    """Draw the violin, log-likelihood and Bayes-factor panels for one event.

    Parameters
    ----------
    event : pandas.Series
        Values of the event; NAs are dropped before plotting and
        ``event.name`` is used in the figure caption.
    logliks : mapping
        Name -> log-likelihood curve.  Anything with an ``items()`` method
        works (dict, pandas Series or DataFrame).
    logsumexps : pandas.Series
        Name -> height of the bar in the Bayes-factor panel; the name with
        the largest value is reported as the guess.
    modality_colors : mapping
        Name -> colour.
    renamed : str
        Alternative event name shown in the caption.

    Returns
    -------
    self
    """
    modality = logsumexps.idxmax()

    sns.violinplot(event.dropna(), bw=0.2, ax=self.ax_violin,
                   color=modality_colors[modality])
    self.ax_violin.set_ylim(0, 1)
    self.ax_violin.set_title('Guess: {}'.format(modality))
    self.ax_violin.set_xticks([])
    self.ax_violin.set_yticks([0, 0.5, 1])

    # `items()` replaces `iteritems()`, which no longer exists on Python 3
    # dicts or on pandas >= 2.0 objects.
    for name, loglik in logliks.items():
        self.ax_loglik.plot(loglik, 'o-', label=name,
                            color=modality_colors[name])
    self.ax_loglik.legend(loc='best')
    self.ax_loglik.set_title('Log likelihoods at different '
                             'parameterizations')
    self.ax_loglik.grid()
    # NOTE(review): the white label is presumably an invisible placeholder
    # that reserves layout space - confirm.
    self.ax_loglik.set_xlabel('phantom', color='white')

    for i, (name, height) in enumerate(logsumexps.items()):
        self.ax_bayesfactor.bar(i, height, label=name,
                                color=modality_colors[name])
    # Raw string: same text, without the invalid '\l' escape sequence.
    self.ax_bayesfactor.set_title(r'$\log$ Bayes factors')
    self.ax_bayesfactor.set_xticks([])
    self.ax_bayesfactor.grid()

    self.fig.tight_layout()
    self.fig.text(0.5, .025, '{} ({})'.format(event.name, renamed),
                  fontsize=10, ha='center', va='bottom')
    sns.despine()
    return self
def violin_nocomp(lst_for_exclusion, binary_data_frame, tipo,xentry,df_name):
    """Compare row sums of listed vs. unlisted miRNAs and save a violin plot.

    Each row of ``binary_data_frame`` is summed.  Rows whose index label is
    in ``lst_for_exclusion`` form the "<tipo> miRNAs" class, all others the
    "Non-<tipo> miRNAs" class.  The means, medians and Mann-Whitney U result
    of the two classes are printed, and the violin plot is written to
    ``figures/nocomp_violin_<df_name>.pdf``.

    Parameters
    ----------
    lst_for_exclusion : container
        Index labels belonging to the "<tipo>" class.
    binary_data_frame : pandas.DataFrame
        Table whose rows are summed.
    tipo : str
        Class name used in the category labels.
    xentry : str
        Column name (and y-axis label) for the row sums.
    df_name : str
        Used in the output file name; if it contains ``'tis'`` or ``'tar'``
        the y axis is limited to [0, 20] or [0, 1000] respectively.
    """
    yes = []
    no = []
    datalst = []
    in_label = '%s miRNAs' % (tipo)
    out_label = 'Non-%s miRNAs' % (tipo)

    for alpha in binary_data_frame.index:
        # Sum each row once instead of once per use.
        total = sum(binary_data_frame.loc[alpha].tolist())
        if alpha in lst_for_exclusion:
            datalst.append([total, in_label])
            yes.append(total)
        else:
            datalst.append([total, out_label])
            no.append(total)

    # Single-argument print calls give the same output on Python 2 and 3.
    print('%s %s' % (mean(yes), mean(no)))
    print('%s %s' % (median(yes), median(no)))
    print(mannwhitneyu(yes, no))

    data_master = pd.DataFrame(datalst, columns=[xentry, 'miRNA Class'])
    sns.violinplot(x='miRNA Class', y=xentry, data=data_master, cut=0)

    # Fixed y ranges for the two known kinds of input table.
    if 'tis' in df_name:
        plt.gca().set_ylim([0, 20])
    if 'tar' in df_name:
        plt.gca().set_ylim([0, 1000])

    plt.savefig('figures/nocomp_violin_%s.pdf' % (df_name),
                bbox_inches='tight')
    plt.close()
def stripplot_mean_score(df, save_path, atlas=None, suffix=None, x=None, y=None, hue=None, style='whitegrid', fontsize=14, jitter=.2, figsize=(9, 3), leg_pos=2, axx=None):
    """Draw a violin plot with an overlaid strip plot of scores on ``axx``.

    The ``'no'``/``'yes'`` values in column ``y`` are replaced by readable
    labels, the ``'dataset'`` column is upper-cased, and the display name of
    ``atlas`` is written centred above the axes.  Positive x tick labels get
    an explicit ``+`` prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form data with at least the columns ``x``, ``y``, ``hue`` and
        ``'dataset'``.
    atlas : str, optional
        Atlas key (``'kmeans'``, ``'ica'``, ``'dictlearn'``, ``'basc'``);
        an unknown key is shown as-is, ``None`` suppresses the heading.
    x, y, hue : str
        Column names handed to seaborn.
    fontsize : int
        Legend font size.
    jitter : float
        Jitter of the strip plot.
    axx : matplotlib Axes, optional
        Axes to draw on; defaults to the current axes.
    save_path, suffix, style, figsize, leg_pos
        Accepted for interface compatibility; not used by this function.
    """
    aliases = {'kmeans': 'K-Means', 'ica': 'GroupICA',
               'dictlearn': 'Dictionary Learning', 'basc': 'BASC'}
    # The same relabelling applies to every atlas (the former if/else on
    # the atlas had two identical branches).
    new_names = {'no': 'Without\n regions extracted',
                 'yes': 'With\n regions extracted'}

    def change_label_name(row, label):
        row[label] = new_names[row[label]]
        return row

    df = df.apply(lambda row: change_label_name(row, y), axis=1)
    # Dataset names are shown in upper case.
    df['dataset'] = df['dataset'].str.upper()

    rc('xtick', labelsize=12)
    rc('ytick', labelsize=16)
    rc('axes', labelweight='bold')
    rc('legend', fontsize=fontsize)

    if axx is None:
        axx = plt.gca()

    # Reference line at x=0 spanning the whole y range.
    axx.axvline(x=0, linewidth=4, zorder=0, color='0.6')

    # `fliersize` and `boxprops` are box-plot options that a violin plot
    # does not use, so they are no longer passed.
    sns.violinplot(data=df, x=x, y=y, linewidth=2, width=0.5, ax=axx)
    # NOTE(review): `datasets_palette` is not defined in this function and
    # must exist at module level; newer seaborn spells `split` as `dodge`.
    sns.stripplot(data=df, x=x, y=y, hue=hue, edgecolor='gray',
                  size=3, split=True, palette=datasets_palette,
                  jitter=jitter, ax=axx)
    axx.set_xlabel('')
    axx.set_ylabel('')

    # Heading above the axes.  The old code referenced the undefined names
    # `key` and `ax` here; the atlas and `axx` are what is available.
    if atlas is not None:
        axx.text(.5, 1.02, aliases.get(atlas, atlas),
                 transform=axx.transAxes, size=15, ha='center')

    # Prefix positive tick labels with "+".
    axx_xticklabels = []
    for tick in axx.get_xticks():
        if tick > 0:
            axx_xticklabels.append('+' + str(tick))
        else:
            axx_xticklabels.append(str(tick))
    axx.set_xticklabels(axx_xticklabels)
def plot_balanced_accuracy_violin(balanced_accuracy_samples, ax=None):
    """ Make a violin plot of the balanced posterior accuracy.

    Parameters
    ----------
    balanced_accuracy_samples : dict
        Where the keys are the classifier names and each value is an array
        of sample points from which an empirical pdf can be approximated.

    ax : Matplotlib Axes object, optional
        The Axes to draw on; the current Axes are used when omitted.

    Returns
    -------
    ax : Matplotlib Axes object
        The matplotlib Axes instance where the figure is drawn.
    """
    # Test for "not given" explicitly rather than relying on truthiness.
    if ax is None:
        ax = plt.gca()

    sns.violinplot(data=balanced_accuracy_samples, ax=ax, inner='box', cut=2)

    def format_as_percent_plot(x, pos):
        # Tick formatter: fraction -> percentage with one decimal.
        return "{:.1f}%".format(x * 100)

    ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))
    return ax
def plot_against_y(self, function=None, y_margin=0.1, lim=10, context="talk"):
    """Plot each variable of interest against ``self.y`` in a row of panels.

    Continuous variables get a regression plot, categorical variables a
    violin plot and the remaining (time) variables a line plot.  All panels
    share the y axis, whose limits are the range of ``self.y`` widened by
    ``y_margin`` on both sides.

    Parameters
    ----------
    function : optional
        Unused; kept for interface compatibility.
    y_margin : float
        Fraction of the y range added below and above the data.
    lim : int
        Maximum number of variables (panels) to show.  Previously a
        hard-coded 10, which is still the default.
    context : str
        Seaborn plotting context.

    Returns
    -------
    matplotlib.figure.Figure
    """
    cat, cont, time = cat_cont_time(self.df[self.vars_of_interest])
    cols = (cat + cont + time)[:lim]

    sns.set_context(context)
    # squeeze=False keeps `axs` a 2-d array even for a single column, so
    # `.flat` works for any number of panels.
    fig, axs = plt.subplots(nrows=1, ncols=len(cols), sharey=True,
                            squeeze=False)

    for ax, col in zip(axs.flat, cols):
        if col in cont:
            sns.regplot(x=col, y=self.y, data=self.df, ax=ax)
        elif col in cat:
            sns.violinplot(x=col, y=self.y, data=self.df, ax=ax)
        else:
            # Time series.  The original *called* the DataFrame
            # (`self.df([...])`), which raises TypeError; selecting the two
            # columns is what the expression can only have meant.
            # NOTE(review): drawn on this panel's axes rather than in a new
            # figure - confirm this is the intended presentation.
            self.df[[self.y, col]].plot(ax=ax)

    y_min, y_max = self.df[self.y].min(), self.df[self.y].max()
    y_range = y_max - y_min
    # The y axis is shared, so setting the limits on one panel is enough.
    axs[0, 0].set_ylim(y_min - y_margin * y_range,
                       y_max + y_margin * y_range)
    return fig
def run(self):
    """ Run the experiment.

    For every test speed the corresponding twist is published repeatedly
    until ``self.num_samples`` errors have been collected in
    ``self.errors``; the violin plot of all observations is redrawn on each
    iteration.  Ends by calling ``spin()``.
    """
    step = abs(self.max_vel-self.min_vel)/self.num_speeds
    speeds = arange(self.min_vel, self.max_vel, step)

    # One NaN-filled column per speed, one row per sample.
    blank = empty((self.num_samples, len(speeds)))*NaN
    observations = DataFrame(blank, columns=(["%.2f" % sp for sp in speeds]))

    for speed in speeds:
        column = "%.2f" % speed
        self.errors = []
        self.curr_vel = speed

        # Command for this speed.
        twist_msg = Twist()
        twist_msg.linear.x = speed
        twist_msg.angular.z = self.radial_vel

        # Keep publishing that speed until we have enough samples.
        while len(self.errors) < self.num_samples:
            plt.cla()

            # Copy what has been collected so far into the table.
            upper = min(len(self.errors), self.num_samples)
            observations[column][0:upper] = self.errors[0:upper]

            # Redraw the plot.
            self.ax.set_xlabel('Linear Speed (m/s)',fontsize=16)
            self.ax.set_ylabel('Error (mm/s)',fontsize=16)
            self.ax.set_title('Currently driving at: %.2f m/s' % speed)
            violinplot(data=observations)
            set_context("notebook", font_scale=1.5,
                        rc={"lines.linewidth": 2.5})
            self.my_mpl.canvas.draw()

            self.pub.publish(twist_msg)
            sleep(0.01)

    spin()
def plot():
    """Create the EUI-by-region box, distribution and violin plots.

    Reads ``csv/eui.csv`` below the current working directory and writes,
    into ``plot2/``: a box plot of EUI by region, one EUI distribution plot
    per region, and a violin plot of EUI by region.
    """
    cwd = os.getcwd()

    # Read the EUI table.
    df = pd.read_csv(cwd + '/csv/eui.csv')
    logger.debug('finished reading file eui.csv')

    # Box plot of EUI per region.
    df.boxplot(column='EUI', by='Region')
    plt.ylabel('EUI')
    plt.xlabel('Region')
    plt.title('EUI by Region')
    P.savefig(cwd + '/plot2/EUIbyRegion.png')
    plt.close()

    import seaborn as sns

    # One distribution plot per region.
    for name, group in df.groupby('Region'):
        sns.distplot(group['EUI'])
        plt.xlabel('EUI')
        plt.title('EUI Distribution')
        P.savefig(cwd + '/plot2/Region-' + str(name)
                  + '-EUIdistribution.png')
        plt.close()

    # `DataFrame.sort` was removed from pandas; `sort_values` is the
    # replacement for sorting by column.
    df.sort_values(by='Region', inplace=True)
    sns.violinplot(x='Region', y='EUI', data=df)
    plt.ylabel('EUI')
    plt.xlabel('Region')
    plt.title('EUI by Region Violin Plot')
    P.savefig(cwd + '/plot2/EUIbyRegionViolin.png')
    plt.close()
def violin_subplot(ax, df, p, ylab):
    """Draw a gender-split violin plot of column ``p`` per group on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    df : pandas.DataFrame
        Data with the columns ``'group'``, ``'gender'`` and ``p``.
    p : str
        Name of the column plotted on the y axis.
    ylab : str
        Label for the y axis.
    """
    # `axis=1` was passed here before; it is not a violinplot option.
    sns.violinplot(x='group', y=p, hue='gender', data=df, split=True,
                   inner="quart", ax=ax)

    # Decorate the axes that were drawn on.  The pyplot calls used before
    # act on the *current* axes, which need not be `ax`.
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(10)
    ax.legend(loc=2)
    ax.set_xlabel('')
    ax.set_ylabel(ylab)
def violinplot(data_pd, feature_names):
    """Draw one violin plot per feature, grouped by ``'class'``.

    The plots are laid out four to a figure in a 2x2 grid; a new
    10x10 inch figure is opened for every fourth feature.
    """
    for column_index, column in enumerate(feature_names):
        slot = column_index % 4
        # The first slot of each group of four starts a fresh figure.
        if slot == 0:
            plt.figure(figsize=(10, 10))
        plt.subplot(2, 2, slot + 1)
        sb.violinplot(x='class', y=column, data=data_pd)
def CheckShannonIndex(self, labels=None, condition_dict=None, fig_title=None):
    """Compute the Shannon diversity index of every sample and plot it.

    The index of each row of ``self.abun_df`` is computed, joined with
    ``self.meta_df`` and stored in ``self.SDI``.  Figures are written only
    when ``fig_title`` is given.

    Parameters
    ----------
    labels : list of str, optional
        Metadata columns.  Each is converted to a categorical column of
        ``self.SDI`` and, if ``fig_title`` is given, gets its own violin and
        box plot.
    condition_dict : optional
        Unused; kept for interface compatibility.
    fig_title : str, optional
        Output file name of the form ``<base>.<ext>``.  Figures are saved as
        ``<base>_all.violinplot.<ext>``, ``<base>_all.boxplot.<ext>`` and
        ``<base>_<label>.{violinplot,boxplot}.<ext>``.  Without an
        extension, ``png`` is used.
    """
    def ShannonIndex(numList):
        # Shannon entropy -sum(f * ln f) over the non-zero frequencies.
        SU = sum(numList)
        SDI = 0.0
        for num in numList:
            freq = float(num)/SU
            if freq > 0:
                SDI = SDI - freq * np.log(freq)
        return SDI

    def save_and_close(ax, suffix):
        # Write the figure that owns `ax` and release it.
        fig = ax.get_figure()
        fig.savefig(fig_title + suffix + fig_ext)
        plt.close(fig)

    strip_style = dict(jitter=True, size=5, linewidth=0.6)

    print('Making Shannon Diversity boxplot for all samples')

    # Shannon entropy of every sample, then the metadata columns.
    samples = self.abun_df.index
    SDIs = pd.DataFrame(
        {'SDI': [ShannonIndex(self.abun_df.loc[s]) for s in samples]},
        index=samples)
    SDIs = pd.concat([SDIs, self.meta_df], axis=1)
    SDIs['SDI'] = SDIs['SDI'].astype('float64')
    self.SDI = SDIs

    # Split "<base>.<ext>"; fall back to png when there is no extension
    # (this used to raise IndexError).
    fig_ext = None
    if fig_title:
        base, dot, ext = fig_title.rpartition('.')
        if dot:
            fig_title, fig_ext = base, ext
        else:
            fig_ext = 'png'

    # All samples together.
    if fig_title:
        ax = sb.violinplot(x=SDIs['SDI'], inner=None, saturation=0.35)
        ax = sb.stripplot(x=SDIs['SDI'], **strip_style)
        save_and_close(ax, '_all.violinplot.')

        ax = sb.boxplot(x=SDIs['SDI'])
        ax = sb.stripplot(x=SDIs['SDI'], **strip_style)
        save_and_close(ax, '_all.boxplot.')

    if labels:
        if fig_title:
            print('Making boxplots separated by labels: ')
        for label in labels:
            SDIs[label] = SDIs[label].astype('category')
            # Without a file name there is nowhere to save the figures;
            # the old code failed here on `fig_title + ...`.
            if not fig_title:
                continue
            print(label + '...')

            ax = sb.violinplot(x=label, y='SDI', data=SDIs,
                               saturation=0.35, inner=None)
            ax = sb.stripplot(x=label, y='SDI', data=SDIs, **strip_style)
            save_and_close(ax, '_' + label + '.violinplot.')

            ax = sb.boxplot(x=label, y='SDI', data=SDIs, saturation=0.35)
            ax = sb.stripplot(x=label, y='SDI', data=SDIs, **strip_style)
            save_and_close(ax, '_' + label + '.boxplot.')
def main():
    """Produce the gallery of basic univariate plots.

    Draws, in order: scatter plot, two histograms, cumulative frequency,
    vertical and horizontal box plots, an error-bar plot and a violin plot.
    Most figures are finished through ``mystyle.printout``.
    """
    # ---- Univariate data -------------------------------------------------

    # Normally distributed sample.
    samples = randn(500)
    n_samples = len(samples)

    # Preferred fonts and style.
    sns.set_context('poster')
    sns.set_style('ticks')

    # Scatter plot
    scatter(arange(n_samples), samples)
    xlim([0, n_samples])
    mystyle.printout('scatterPlot.png', xlabel='x', ylabel='y',
                     title='Scatter')

    # Histogram, default binning and 25 bins
    hist(samples)
    mystyle.printout('histogram_plain.png', xlabel='Data Values',
                     ylabel='Frequency',
                     title='Histogram, default settings')

    hist(samples, 25)
    mystyle.printout('histogram.png', xlabel='Data Values',
                     ylabel='Frequency', title='Histogram, 25 bins')

    # Cumulative frequency
    numbins = 20
    cumulative = stats.cumfreq(samples, numbins)[0]
    plot(cumulative)
    mystyle.printout('CumulativeFrequencyFunction.png',
                     xlabel='Data Values', ylabel='Cumulative Frequency')

    # Boxplot
    # The box consists of the first, second (middle) and third quartile.
    boxplot(samples, sym='*')
    mystyle.printout('boxplot.png', xlabel='Values', title='Boxplot')

    boxplot(samples, sym='*', vert=False)
    title('Boxplot, horizontal')
    xlabel('Values')
    show()

    # Errorbars
    x = arange(5)
    y = x**2
    errorBar = x/2
    errorbar(x, y, yerr=errorBar, fmt='o', capsize=5, capthick=3)
    xlim([-0.2, 4.2])
    ylim([-0.2, 19])
    mystyle.printout('Errorbars.png', xlabel='Data Values',
                     ylabel='Measurements', title='Errorbars')

    # Violinplot: a standard normal sample and a shifted, wider one.
    girls = stats.norm.rvs(size=(100))
    boys = stats.norm(loc=3, scale=1.5).rvs(size=(100))

    # pandas and seaborn are used for the violin plot.
    df = pd.DataFrame({'Girls': girls, 'Boys': boys})
    sns.violinplot(df)
    mystyle.printout('violinplot.png')
def sns_violinplot(x, y, hue, bw, scale, inner, split, orient, color, saturation):  # pragma: no cover
    """Draw a seaborn violin plot from widget values.

    ``x``, ``y``, ``hue``, ``inner``, ``orient`` and ``color`` are first
    converted with ``ut.widget2py``.  The plot goes on the axes returned by
    ``ut.get_ax_fig_plt()``.  ``data`` and ``kwargs`` are taken from the
    enclosing scope.
    """
    x, y, hue, inner, orient, color = ut.widget2py(
        x, y, hue, inner, orient, color)
    ax, _fig, _ = ut.get_ax_fig_plt()

    sns.violinplot(
        # what to plot
        data=data, x=x, y=y, hue=hue,
        order=None, hue_order=None,
        # density estimate
        bw=bw, cut=2, gridsize=100,
        # violin geometry
        scale=scale, scale_hue=True, width=0.8,
        split=split, orient=orient, inner=inner,
        # appearance
        linewidth=None, color=color, palette=None, saturation=saturation,
        ax=ax,
        **kwargs
    )
def make_plot(self):
    """Draw the violin plot on ``self.ax`` and configure the subplot.

    ``self.plotdata`` is plotted with the keyword arguments in
    ``self.snskwargs``; ``self._configureaxes()`` then applies the axes
    configuration before the canvas is redrawn.
    """
    import seaborn as sns

    # Make our axes current so seaborn draws onto them.
    target_axes = self.ax
    plt.sca(target_axes)

    plot_options = self.snskwargs
    sns.violinplot(self.plotdata, **plot_options)

    self._configureaxes()
    plt.draw()
def twoviolins_nooutliers(df, dset='bcrp', model='logreg3', feats='ecfps1', pos_proportions_min=0.4, pos_proportions_max=0.8):
    """Violin-plot ``auc`` against ``lso`` for reasonably balanced results.

    Only rows of the given dataset/model/features combination whose
    ``pos_proportion`` lies strictly between the two bounds are kept; the
    rest are treated as outliers or degenerate ("too imbalanced") cases.
    """
    # Restrict to the requested experiment.
    wanted = (df.dset == dset) & (df.model == model) & (df.feats == feats)
    df = df[wanted]

    # Drop the too-imbalanced cases.
    proportion = df['pos_proportion']
    not_too_low = proportion > pos_proportions_min
    not_too_high = proportion < pos_proportions_max
    balanced = df[not_too_low & not_too_high]

    plt.figure()
    sns.violinplot(balanced.auc, balanced.lso)
    plt.draw()
def plot_quantile_returns_violin(return_by_q, ylim_percentiles=None, ax=None):
    """
    Plots a violin box plot of period wise returns for factor quantiles.

    Parameters
    ----------
    return_by_q : pd.DataFrame - MultiIndex
        DataFrame with date and quantile as rows MultiIndex,
        forward return windows as columns, returns as values.
    ylim_percentiles : tuple of integers
        Percentiles of observed data to use as y limits for plot.
    ax : matplotlib.Axes, optional
        Axes upon which to plot.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    return_by_q = return_by_q.copy()

    # y limits in basis points, or None to let matplotlib choose.
    ymin = ymax = None
    if ylim_percentiles is not None:
        observed = return_by_q.values
        ymin = np.nanpercentile(observed, ylim_percentiles[0]) * DECIMAL_TO_BPS
        ymax = np.nanpercentile(observed, ylim_percentiles[1]) * DECIMAL_TO_BPS

    if ax is None:
        f, ax = plt.subplots(1, 1, figsize=(18, 6))

    # Convert to basis points and reshape to long form:
    # one row per (index levels, forward period) with a 'return' column.
    returns_bps = return_by_q.multiply(DECIMAL_TO_BPS)
    returns_bps.columns = returns_bps.columns.set_names('forward_periods')
    long_form = returns_bps.stack()
    long_form.name = 'return'
    long_form = long_form.reset_index()

    sns.violinplot(
        data=long_form,
        x='factor_quantile',
        y='return',
        hue='forward_periods',
        orient='v',
        cut=0,
        inner='quartile',
        ax=ax,
    )
    ax.set(
        xlabel='',
        ylabel='Return (bps)',
        title="Period Wise Return By Factor Quantile",
        ylim=(ymin, ymax),
    )

    # Zero-return reference line.
    ax.axhline(0.0, linestyle='-', color='black', lw=0.7, alpha=0.6)

    return ax
def bar_box_violin_dot_plots(data, category_col, numeric_col, axes, file_name=None):
    """Draw bar, strip, box and violin plots of one numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data.
    category_col : str
        Column used for the categories on the x axis.
    numeric_col : str
        Column plotted on the y axis.
    axes : sequence of four matplotlib Axes
        ``axes[0]`` bar plot, ``axes[1]`` strip plot, ``axes[2]`` box plot
        (rows with a null ``numeric_col`` excluded), ``axes[3]`` violin
        plot.
    file_name : optional
        Unused; kept for interface compatibility.
    """
    # x and y are passed by keyword: newer seaborn releases no longer
    # accept them positionally.
    sns.barplot(x=category_col, y=numeric_col, data=data, ax=axes[0])
    sns.boxplot(x=category_col, y=numeric_col,
                data=data[data[numeric_col].notnull()], ax=axes[2])
    # `kind='violin'` (a factorplot/catplot option) was passed here before;
    # it is not a violinplot parameter.
    sns.violinplot(x=category_col, y=numeric_col, data=data,
                   inner="quartile", scale='count', split=True, ax=axes[3])
    sns.stripplot(x=category_col, y=numeric_col, data=data, jitter=True,
                  ax=axes[1])
    sns.despine(left=True)
def FacetGrid():
    """Show a 2x2 dashboard of movie plots on a dark background.

    ``Kde`` is called for indices 0 and 1 with the axes grid; the
    bottom-left panel shows budget by year as violins and the bottom-right
    panel two overlaid critic/audience rating density plots.
    """
    sns.set_style("dark", {"axes.facecolor": "black"})
    figure, axes = plt.subplots(2, 2, figsize=(12, 8))

    for i in range(0, 2):
        Kde(i, axes)

    # Bottom-left: budget per year.
    sns.violinplot(data=movies, x='Year', y='BudgetMillions',
                   ax=axes[1, 0], palette="YlOrRd")

    # Bottom-right: shaded density with contour lines on top.
    ratings_ax = axes[1, 1]
    sns.kdeplot(movies.CriticRating, movies.AudienceRating, shade=True,
                shade_lowest=False, cmap='Blues_r', ax=ratings_ax)
    sns.kdeplot(movies.CriticRating, movies.AudienceRating,
                cmap='gist_gray_r', ax=ratings_ax)

    plt.gcf().canvas.set_window_title('Facet Grid')
    plt.show()
aspect=2) # In[36]: #Countplot sns.countplot(df_merged['cab_type'], hue=df_merged['name'], palette='plasma') # In[37]: sns.boxplot(data=df_merged, x='source', y='price', palette='Blues') # In[38]: sns.violinplot(data=df_merged, x='destination', y='price', palette="Set3", scale="width") # In[39]: #Jointplot sns.jointplot("price", "rain", data=df_merged, kind="hex", bins=15) # In[40]: df_merged.set_index('date').groupby('name')['price'].plot(legend=True) # In[41]: sns.distplot(df_merged.price,
new_df = new_df[new_df.Signal != 0] new_df['lognorm'] = np.log(new_df['Signal']) df_list.append(new_df) df = pd.concat(df_list) a4_dims = (15.7, 8.27) fig, ax = plt.subplots(figsize=a4_dims) groups = pd.read_csv('inputs/Groups.csv') groups.loc[groups['Group'] == 'Blank', 'Color'] = '#FF35E7' ax = sns.violinplot(x="lognorm", y="Sample", data=df,scale="count",inner='box',palette=groups['Color'].tolist()) ax.set_title('Violin Plot - lognorm') ax.set_ylabel('Sample') ax.set_xlabel('Normalized Intensity') plt.savefig(results_folder+'QC/plot.distribution.png',dpi=400) sum_intensity = pd.DataFrame(full_matrix.sum()) sum_intensity['sample'] = sum_intensity.index sum_intensity.columns = ['Sum Signal','Sample'] a4_dims = (11.7, 8.27) fig, ax = plt.subplots(figsize=a4_dims) ax = sns.barplot(x="Sum Signal", y="Sample", data=sum_intensity,palette=groups['Color'].tolist()) ax.set_title('Sample Sum Intensities') ax.set_ylabel('Sample')
# attr = df[i] # sns.distplot(attr) #for column in df: # print(column) # columnSeriesObj = df[column] # print(columnSeriesObj) x.plot.hist(bins=4) #plotting histograms print("test") Attr5 = df['Attr 5'] Attr5.plot.hist(bins=5) # plotting horizontal violin plots sns.set(style="whitegrid") tips = sns.load_dataset("Frequency") ax = sns.violinplot(x=tips["Attributes"]) #plotting scatter materix pd.plotting.scatter_matrix(df, alpha=0.5, figsize=(15, 15)) plt.show() #covariance tables np.random.seed(42) df=pd.DataFrame(np.random.randn(1000,9),columns=['Attr 4','Attr 5', 'Attr 6','Attr 7', 'Attr 8', 'Attr 9', 'Attr 10', 'Attr 11','Attr 12']) df.cov() #correlation tables and heat maps of covariance and corr df_corr = df.corr() print(df_corr.head()) data1 = df_corr.values fig1 = plt.figure()
# Just for the last 5 years all_data13=all_data[all_data['year']>2012] palette=sns.cubehelix_palette(5, start=2, rot=0, dark=0, light=.95, reverse=False) sns.pairplot(all_data13[all_data13['name']=='RDSB.L'].drop(['share_price_scaled'],axis=1), hue='year',palette=palette,size=4,markers='o', plot_kws=dict(s=50, edgecolor='b', linewidth=0)) #============================================================================== # Violin Plot Oil price on last 5 years #============================================================================== sns.set_style('whitegrid') palette=sns.cubehelix_palette(5, start=2.8, rot=0, dark=0.2, light=0.8, reverse=False) sns.violinplot(x='year', y='oil_price', data=all_data13[all_data13['name']=='RDSB.L'], inner='quart', palette=palette, trim=True) #============================================================================== # Violin Plot Oil price on last 5 years #============================================================================== sns.factorplot(x='year', y='share_price_scaled', col='name', col_wrap=3,kind='violin', split=True, data=all_data13,inner='quart', palette=palette, trim=True,size=4,aspect=1.2) sns.despine(left=True) #============================================================================== # joint plot using 5 years for Premier Oil #============================================================================== sns.jointplot('oil_price', 'share_price',data=all_data13[all_data13['name']=='PMO.L'],kind='kde', hue='year',size=6,ratio=2,color='red').plot_joint(sns.kdeplot, zorder=0, n_levels=20)
# The other part of the plot, the "whiskers", shows the extent of the points beyond the center of the distribution. Individual circles beyond *that* are outliers. # # This boxplot shows us that although all five wines recieve broadly similar ratings, Bordeaux-style wines tend to be rated a little higher than a Chardonnay. # # Boxplots are great for summarizing the shape of many datasets. They also don't have a limit in terms of numeracy: you can place as many boxes in the plot as you feel comfortable squeezing onto the page. # # However, they only work for interval variables and nominal variables with a large number of possible values; they assume your data is roughly normally distributed (otherwise their design doesn't make much sense); and they don't carry any information about individual values, only treating the distribution as a whole. # # I find the slightly more advanced `violinplot` to be more visually enticing, in most cases: # In[ ]: sns.violinplot( x='variety', y='points', data=reviews[reviews.variety.isin(reviews.variety.value_counts()[:5].index)] ) # A `violinplot` cleverly replaces the box in the boxplot with a kernel density estimate for the data. It shows basically the same data, but is harder to misinterpret and much prettier than the utilitarian boxplot. # ## Why seaborn? # # Having now seen both `pandas` plotting and the `seaborn` library in action, we are now in a position to compare the two and decide when to use which for what. # # Recall the data we've been working with in this tutorial is in: # In[ ]: reviews.head()
import matplotlib.pyplot as plt
import seaborn as sns

# Each line of the input file is parsed as a single integer.
with open('./data/met/diff_met.bed', 'r') as f:
    data = [int(line) for line in f]

# Draw one violin over all parsed values and write it next to the input.
fig = plt.figure()
sns.set(style="whitegrid")
ax = sns.violinplot(x=data)
fig.savefig('./data/met/met_windows.png')
# Finish the figure that is current at this point: size, axes and title.
f = plt.gcf()
f.set_size_inches(wd, ht)
sns.despine()
plt.ylim(0, 1.1)
plt.yticks([0, .5, 1])
plt.xticks(rotation=45)
plt.title(
    'Pillow model does better longer time\nPillow has slow derivative only\nSTM has 3 derivatives'
)
plt.tight_layout()

# ==========================
# Split violins of the Pearson correlation per kernel, one half per model.
f = plt.figure(figsize=(wd, ht))
# `legend_out` is a FacetGrid/catplot option, not a violinplot parameter:
# older seaborn silently ignored it and current seaborn rejects it, so it
# is not passed here.
sns.violinplot(data=df_melt,
               x='kernels',
               y='Pearson Correlation',
               hue='model',
               split=True,
               inner='quartile',
               palette='Set2')
sns.despine()
# Pass a real boolean; matplotlib deprecated the 'on'/'off' strings.
plt.grid(True, axis='y')
plt.yticks([0, .5, 1])
plt.xticks(rotation=45)
plt.tight_layout()

# ======================
# plot pct_diff (how much better the pillow is)
wd = figsize[0] / 1.5
ht = figsize[0] / 1.5
df_merged = df_merged.merge(
    df_pillow_drops[['full', 'id', 'kernels', 'stim_responsive']],
    on=['id', 'kernels', 'stim_responsive'])
output = "/neurospin/brainomics/2016_schizConnect/2018_analysis_2ndpart_clinic/\ results/clustering/nudast_only_clustering/correction_age_sex_site/3_clusters_solution" df = pd.DataFrame() score = df_scores["vocabsca"].astype(np.float).values df["labels"]=labels_cluster[np.array(np.isnan(score)==False)] LABELS_DICT = {0: "cluster 1", 1: "cluster 2", 2: "cluster 3"} df["labels_name"] = df["labels"].map(LABELS_DICT) df["vocabsca"] = score[np.array(np.isnan(score)==False)] T,p = scipy.stats.f_oneway(df[df["labels"]==0]["vocabsca"],\ df[df["labels"]==1]["vocabsca"],\ df[df["labels"]==2]["vocabsca"]) ax = sns.violinplot(x="labels_name", y="vocabsca", data=df,order=["cluster 1","cluster 2","cluster 3"]) plt.title("ANOVA: t = %s, and p= %s"%(T,p)) plt.savefig(os.path.join(output,"vocabsca.png")) df = pd.DataFrame() score = df_scores["cvlfps"].astype(np.float).values df["labels"]=labels_cluster[np.array(np.isnan(score)==False)] df["labels"]=labels_cluster[np.array(np.isnan(score)==False)] LABELS_DICT = {0: "cluster 1", 1: "cluster 2", 2: "cluster 3"} df["labels_name"] = df["labels"].map(LABELS_DICT) df["cvlfps"] = score[np.array(np.isnan(score)==False)] T, p = scipy.stats.f_oneway(df[df["labels"]==0]["cvlfps"],\ df[df["labels"]==1]["cvlfps"],\
listkey.append(key) listval.append(val) # print key, val, u' ', df = pd.DataFrame(listval, columns=[u'次数']) df.index = listkey df.plot(kind='bar') plt.title(u'词频统计') plt.show() # Number of words in the text ## dataset["num_words"] = dataset["text"].apply(lambda x: len(str(x).split())) dataset['num_words'].loc[ dataset['num_words'] > 1000] = 1000 # truncation for better visuals plt.figure(figsize=(12, 8)) sns.violinplot(x='first_class', y='num_words', data=dataset) plt.xlabel('First Class', fontsize=12) plt.ylabel('Number of words in text', fontsize=12) plt.title("Number of words in First Class", fontsize=15) plt.show() # global len distribution. plt.figure(figsize=(12, 8)) plt.hist(dataset["num_words"], bins=200, range=[10, 1000], color=pal[1], normed=True, label='train') plt.title('Normalised histogram of words count in text', fontsize=15) plt.legend()
#
# ### But we can see from the table that winning a round is equally distributed on the maps Overpass and Train.
# ### Interesting...!

# Winner side, split by whether the bomb was planted.
plt.figure(figsize=(10, 12))
sns.countplot(data=df, x='winner_side', hue='is_bomb_planted',
              palette='RdBu_r')

# After planting the bomb there is more probability of CT side to win the round.

# Winner side, split by round type.
plt.figure(figsize=(10, 12))
sns.countplot(data=df, x='winner_side', hue='round_type',
              palette='coolwarm')

# Round types played on each map.
plt.figure(figsize=(10, 12))
sns.countplot(data=df, x='map', hue='round_type')

# Force buy and eco work well in Mirage..and works worst in inferno or overpass depending upon the frequency of matches you play.

# Winner side, split by round number.
plt.figure(figsize=(10, 12))
sns.countplot(data=df, x='winner_side', hue='round')

# Average match rank per map, first as boxes and then as violins.
plt.figure(figsize=(8, 10))
sns.boxplot(data=df, x='map', y='avg_match_rank', palette='magma')

plt.figure(figsize=(10, 12))
sns.violinplot(data=df, x='map', y='avg_match_rank', palette='coolwarm')

# Regression of average match rank on round, one colour per map.
sns.lmplot(data=df, x='round', y='avg_match_rank', hue='map')
#
# 1. Start by creating a variable `ax` and setting it equal to `sns.violinplot()`. This will instantiate a figure and give us access to the axes through the variable name `ax`.
# 2. Use `sns.violinplot()` and pass in the following arguments:
# + The `Quarter` column as the `x` values
# + The `Price` column as your `y` values
# + The `netflix_stocks_quarterly` dataframe as your `data`
# 3. Improve the readability of the chart by adding a title of the plot. Add `"Distribution of 2017 Netflix Stock Prices by Quarter"` by using `ax.set_title()`
# 4. Change your `ylabel` to "Closing Stock Price"
# 5. Change your `xlabel` to "Business Quarters in 2017"
# 6. Be sure to show your plot!
#

# In[105]:

plt.figure(figsize=(10, 7))
ax = sns.violinplot(x='Quarter', y='Price', data=netflix_stocks_quarterly)
ax.set_title('Netflix \'17 Stock Price Distribution',
             color='#400090',
             fontsize=25,
             fontweight='bold',
             pad=20)
ax.set_xlabel('Quarters')
ax.set_ylabel('Stock Price')

# Relabel the y ticks of the current axes as whole-dollar amounts.
current_axes = plt.gca()
dollar_labels = ['${:,.0f}'.format(x) for x in current_axes.get_yticks()]
current_axes.set_yticklabels(dollar_labels)

plt.savefig('Netflix 2017 Stock Price Dist by Quarter.png')

# ## Graph Literacy
# - What are your first impressions looking at the visualized data?
#
# Blank the x labels of both panels and title the first one.
for panel in (axes[1], axes[0]):
    panel.set(xlabel='')
axes[0].set_title('Indoor Water Use')

# #### Shower events duration compared to RWEUS2016 Study
# + Residential Water End Use Study (RWEUS2016) URL: https://www.circleofblue.org/wp-content/uploads/2016/04/WRF_REU2016.pdf
# + Using Violinplot with Seaborn and Matplotlib
# + Violin plot is a combination of box and kernel density plots
# + The width of the violin represents the probability, where skinnier sections represent a lower probability
# + Add a horizontal line that represents the average shower duration from the RWEUS2016 study

# In[8]:

is_shower = Events.Label == "shower"
ShowerEvents = Events[is_shower]
ax = sns.violinplot(data=ShowerEvents,
                    x="Label",
                    y="Duration(min)",
                    palette="colorblind")
sns.despine(right=True)
ax.set(xlabel='', ylabel='Duration(min)')
# Reference line at 8 on the duration axis.
ax.axhline(y=8, c='red')

# #### Daily and hourly water use

# In[9]:

# Aggregate pulses by hour and calculate the average number of pulses per each hour
hour_of_day = RawData.index.hour
Use_Hour = RawData.groupby(hour_of_day).mean()
Use_Hour.Pulses = Use_Hour.Pulses * 0.041619 * 15 * 60
# where 0.041619 is the meter resolution, 15 is the number of 4 seconds in one minute (60/4)
# and 60 is the number of minutes in an hour

# In[10]:
def plot_pm(report):
    """Show a violin plot of the ``"pm2.5"`` entry of *report*.

    Args:
        report: Any object indexable with the key ``"pm2.5"``; the value is
            handed to seaborn as the ``x`` data.
    """
    pm_values = report["pm2.5"]
    sns.violinplot(x=pm_values)
    plt.show()
def E1_vs_insulation_scatterplor(E1, E1_resolution, k, averaged, boundary_index, domains): figure_path = "results/" + os.path.basename( domains_file) + ".Insulation_violinplot.png" if os.path.isfile(figure_path) and not redraw_figs: return # first compute genome-wide average of E1 differences def doesE1overlapDomain(e1, boundaries): overlap = boundaries.loc[e1.chr].index.overlaps( pd.Interval(e1.start, e1.end, closed="both")) return np.any(overlap) # remove E1 overlaping TAD boundaries TADboundaries = pd.DataFrame({"chr": domains.chr, "vals": domains.chr}) TADboundaries["intervals"] = pd.arrays.IntervalArray.from_arrays( domains.start - k * E1_resolution, domains.start + k * E1_resolution, closed="both") TADboundaries.index = pd.MultiIndex.from_frame( TADboundaries[["chr", "intervals"]]) E1["contains_TAD_boundary"] = E1.apply(doesE1overlapDomain, boundaries=TADboundaries, axis="columns") print(sum(E1["contains_TAD_boundary"].values), " out of ", len(E1["contains_TAD_boundary"]), "E1 bins are located near TAD boundary") E1 = pd.DataFrame(E1) # copy E1 dataframe temp = [ E1["E1"].shift(periods=i).values for i in np.arange(0, 2 * k + 1)[::-1] ] temp = np.vstack(temp).T temp = temp[np.logical_and(~np.isnan(temp).any(axis=1), ~E1["contains_TAD_boundary"].values)] print("After filtering, ", len(temp), " bins left to compute expected E1 diff") expected_E1_average = np.vstack((np.average(temp[:, :boundary_index], axis=1), np.average(temp[:, boundary_index + 1:], axis=1))).T expected_E1_diff = np.abs( np.subtract(expected_E1_average[:, 0], expected_E1_average[:, 1])) E1diff = np.abs(np.subtract(averaged[:, 0], averaged[:, 1])) from scipy.stats import mannwhitneyu with open(figure_path + ".stats.txt", "w") as fout: fout.write("Obseved average: " + str(np.average(E1diff)) + "\n") fout.write("Obsrved average: " + str(np.average(E1diff)) + "\n") statistic, pval = mannwhitneyu(E1diff, expected_E1_diff, alternative="two-sided") fout.write("mannwhitneyu 2-sided test: " + str(pval) 
+ "\n") print("mannwhitneyu 2-sided test: " + str(pval)) print("--Drowing violinplot") plot_data = { "label": ["Expected cePC1 diff"] * len(expected_E1_diff) + ["TAD boundaries cePC1 diff"] * len(E1diff), "|cePC1_left-cePC1_right|": expected_E1_diff.tolist() + E1diff.tolist(), "x": [shortname] * (len(expected_E1_diff) + len(E1diff)) } plot_data = pd.DataFrame(plot_data) fig, ax = plt.subplots(figsize=(4, 8)) vp = sns.violinplot(ax=ax, x="x", y="|cePC1_left-cePC1_right|", hue="label", data=plot_data, split=True, inner="quartile") vp.legend_.remove() vp.set_xlabel("") plt.savefig(figure_path, dpi=300) plt.clf() return # Uncomment following to draw scatterplot """
def EDA(df,
        labels,
        target_variable_name,
        data_summary_figsize=(12, 12),
        corr_matrix_figsize=(12, 12),
        data_summary_figcol="Reds_r",
        corr_matrix_figcol='Blues',
        corr_matrix_annot=False,
        pairplt_col='all',
        pairplt=False,
        feature_division_figsize=(12, 12)):
    """Run an exploratory analysis of *df* and save the figures to disk.

    Figures written under ``../figures/``: missing-value heatmap, summary
    statistics heatmap, correlation matrix, per-column outlier boxplots,
    per-column distribution/violin plots, an optional pair plot, the target
    class balance, and one count/pie figure per categorical column that has
    fewer than 50 distinct values.

    *df* is modified in place: string class labels of the target column are
    replaced by integer codes, and columns whose null count exceeds half of
    the rows are dropped.

    Args:
        df: Input DataFrame.
        labels: Wedge labels for the target-variable pie chart.
        target_variable_name: Name of the target column in *df*.
        data_summary_figsize: Figure size of the summary-statistics heatmap.
        corr_matrix_figsize: Figure size of the correlation heatmap.
        data_summary_figcol: Colormap of the summary-statistics heatmap.
        corr_matrix_figcol: Colormap of the correlation heatmap.
        corr_matrix_annot: Whether to annotate the correlation heatmap.
        pairplt_col: ``'all'`` or the columns to use in the pair plot.
        pairplt: Whether to draw the pair plot.
        feature_division_figsize: Figure size of the count/pie figures.

    Returns:
        A tuple ``(df_wo_null, numerical_columns, categorical_columns)``:
        the frame returned by ``removeNullRows`` and the names of its
        non-object and object columns.
    """
    out_folder = '../figures/'
    start_time = timeit.default_timer()

    # for converting class labels into integer values
    if df[target_variable_name].dtype == 'object':
        class_labels = [
            x for x in df[target_variable_name].unique().tolist()
            if isinstance(x, str) and x != 'nan'
        ]
        for i, class_label in enumerate(class_labels):
            # Single .loc assignment instead of chained indexing, which may
            # write to a temporary copy.
            df.loc[df[target_variable_name] == class_label,
                   target_variable_name] = i
    df_orig = df

    # To check missing values
    ax1 = sns.heatmap(pd.isnull(df_orig),
                      cmap=sns.diverging_palette(240, 0, as_cmap=True))
    plt.title("Missing Values Summary", fontsize=(15), color="blue")
    fig1 = ax1.get_figure()
    fig1.savefig(f"{out_folder}Missing_Values_Summary.png")

    # Descriptive Statistics (the first describe() row, the count, is skipped)
    fig2 = plt.figure(figsize=data_summary_figsize)
    sns.heatmap(df_orig.describe()[1:].transpose(),
                annot=True,
                fmt=".1f",
                linecolor="black",
                linewidths=0.3,
                cmap=data_summary_figcol)
    plt.title("Data Summary", fontsize=(15), color="blue")
    fig2.savefig(f"{out_folder}Summary_Statistics.png")

    # Null counts per column, split into "many" (dropped) and "few" nulls.
    null_cutoff = 0.5
    split = numericalCategoricalSplit(df_orig)
    numerical = split[0]
    categorical = split[1]
    null_numerical = nullFind(numerical)[0]
    null_categorical = nullFind(categorical)[1]
    null = pd.concat([null_numerical, null_categorical])
    null_df = pd.DataFrame({
        'Null_in_Data': null
    }).sort_values(by=['Null_in_Data'], ascending=False)
    null_df_many = (null_df.loc[(null_df.Null_in_Data >
                                 null_cutoff * len(df_orig))])
    null_df_few = (
        null_df.loc[(null_df.Null_in_Data != 0)
                    & (null_df.Null_in_Data < null_cutoff * len(df_orig))])

    many_null_col_list = null_df_many.index
    few_null_col_list = null_df_few.index

    # remove many null columns
    df_orig.drop(many_null_col_list, axis=1, inplace=True)

    df_wo_null = (removeNullRows(df_orig, few_null_col_list))

    if df_wo_null[target_variable_name].dtype == 'object':
        df_wo_null[target_variable_name] = df_wo_null[
            target_variable_name].astype(str).astype(int)

    # From here on `df` holds only the non-object columns.
    df = df_wo_null[df_wo_null.select_dtypes(exclude=['object']).columns]
    categorical_df = df_wo_null.select_dtypes(include=['object'])

    # Check correlation matrix (upper triangle masked)
    corr = df.corr()
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    fig3 = plt.figure(figsize=corr_matrix_figsize)
    sns.heatmap(corr,
                mask=mask,
                cmap=corr_matrix_figcol,
                annot=corr_matrix_annot)
    plt.tight_layout()
    fig3.savefig(f"{out_folder}Correlation_Matrix.png")

    col = df.columns.values
    number_of_columns = len(col)
    number_of_rows = len(col) - 1 / number_of_columns

    # To check Outliers: one horizontal boxplot per column on a 2-wide grid.
    fig4 = plt.figure(figsize=(number_of_columns, number_of_rows))
    # Integer row count; plt.subplot does not accept the float that `/`
    # produced. ceil(n / 2) covers both the even and the odd case.
    grid_rows = (number_of_columns + 1) // 2
    for i, column in enumerate(col):
        plt.subplot(grid_rows, 2, i + 1)
        sns.set_style('whitegrid')
        sns.boxplot(df[column], color='green', orient='h')
        plt.tight_layout()
    fig4.savefig(f"{out_folder}Outliers.png")

    # To check distribution-Skewness
    for column in col:
        fig, axis = plt.subplots(1, 2, figsize=(16, 5))
        sns.distplot(df_orig[column], kde=True, ax=axis[0])
        axis[0].axvline(df_orig[column].mean(),
                        color="k",
                        linestyle="dashed",
                        label="MEAN")
        axis[0].legend(loc="upper right")
        axis[0].set_title('distribution of {}. Skewness = {:.4f}'.format(
            column, df_orig[column].skew()))

        sns.violinplot(x=target_variable_name,
                       y=column,
                       data=df_orig,
                       ax=axis[1],
                       inner='quartile')
        axis[1].set_title('violin of {}, split by target'.format(column))
        fig.savefig(f"{out_folder}Distribution Skewness of {column}.png")

    # to construct pairplot
    if pairplt:
        use_all = isinstance(pairplt_col, str) and pairplt_col == 'all'
        pair_vars = df.columns.values if use_all else pairplt_col
        pair_grid = sns.pairplot(data=df,
                                 vars=pair_vars,
                                 hue=target_variable_name)
        # pairplot returns a PairGrid, which has no get_figure(); the grid
        # saves itself.
        pair_grid.savefig(f"{out_folder}Pair plot.png")

    # Proportion of target variable in dataset
    st = df[target_variable_name].value_counts().sort_index()

    # how many belong to each class of target variable
    fig5 = plt.figure(figsize=feature_division_figsize)
    plt.subplot(121)
    ax = sns.countplot(y=df_orig[target_variable_name],
                       linewidth=1,
                       edgecolor="k")
    for i, j in enumerate(st):
        ax.text(.7, i, j, weight="bold", fontsize=27)
    plt.title("Count for target variable in datset")

    plt.subplot(122)
    plt.pie(st,
            labels=labels,
            autopct="%.2f%%",
            wedgeprops={
                "linewidth": 2,
                "edgecolor": "white"
            })
    my_circ = plt.Circle((0, 0), .7, color="white")
    plt.gca().add_artist(my_circ)
    plt.subplots_adjust(wspace=.2)
    plt.title("Proportion of target variable in dataset")
    fig5.savefig(f"{out_folder}Outcome Variable.png")

    # Proportion of categorical variables in dataset
    for cat_feat in categorical_df.columns.tolist():
        ct = categorical_df[cat_feat].value_counts().sort_values(
            ascending=False)
        print('\nThe categorical variable is divided into: \n', ct)
        # how many belong to each class of target variable
        if (ct.index.size) < 50:
            fig_cat = plt.figure(figsize=feature_division_figsize)
            plt.subplot(121)
            ax = sns.countplot(y=categorical_df[cat_feat],
                               linewidth=1,
                               edgecolor="k")
            for i, j in enumerate(ct):
                ax.text(.7, i, j, weight="bold", fontsize=27)
            plt.title("Count for categorical variable in datset")

            plt.subplot(122)
            # Label each wedge from the counts' own index so that labels
            # and values stay aligned after sorting by frequency.
            plt.pie(ct,
                    labels=ct.index.tolist(),
                    autopct="%.2f%%",
                    wedgeprops={
                        "linewidth": 2,
                        "edgecolor": "white"
                    })
            my_circ = plt.Circle((0, 0), .7, color="white")
            plt.gca().add_artist(my_circ)
            plt.subplots_adjust(wspace=.2)
            plt.title("Proportion of categorical variable in dataset")
            # One file per feature; a fixed name was overwritten on every
            # iteration.
            fig_cat.savefig(
                f"{out_folder}Categorical Variable {cat_feat}.png")
        else:
            print(
                '\nThe categorical variable %s has too many divisions to plot \n'
                % cat_feat)

    elapsed = timeit.default_timer() - start_time
    print('\nExecution Time for EDA: %.2f minutes' % (elapsed / 60))

    return df_wo_null, df_wo_null.select_dtypes(
        exclude=['object']).columns.tolist(), df_wo_null.select_dtypes(
            include=['object']).columns.tolist()
import seaborn as sns
import matplotlib.pyplot as plt

# Load seaborn's "tips" example dataset and apply the default theme.
df = sns.load_dataset("tips")
sns.set()

# Total bill per day, with the two sexes drawn as the halves of each violin.
sns.violinplot(data=df,
               x="day",
               y="total_bill",
               hue="sex",
               split=True)
plt.show()
plt.subplot(131) for a in range(5): for b in range(5): if a+1<b+1: scatter_plot_by_category('species',df.columns[a+1],df.columns[b+1]) plt.xlabel(df.columns[a+1]) plt.ylabel(df.columns[b+1]) plt.title('species') plt.show() plt.figure(figsize=(20, 10)) #利用seaborn库绘制三种Iris花不同参数图 for column_index, column in enumerate(df.columns): if column == 'species': continue plt.subplot(3,2, column_index + 1) sb.violinplot(x='species',y=column,data=df) plt.show() # 首先对数据进行切分,即划分出训练集和测试集 from sklearn.model_selection import train_test_split #调入sklearn库中交叉检验,划分训练集和测试集 all_inputs = df[['alcohol', 'malic_acid', 'ash', 'alcalinity ash', 'magnesium']].values all_species = df['species'].values (X_train, X_test, Y_train, Y_test) = train_test_split(all_inputs, all_species, train_size=0.7, random_state=1)#70%的数据选为训练集
depth2 = len(eles[6].split(',')) eles[15] = log(float(eles[15])) eles[13] = log(float(eles[13])) eles[14] = log(float(eles[14])) eles[18] = log(float(eles[18])) eles[16] = log(float(eles[16])) eles[17] = log(float(eles[17])) print >> outh, "\t".join( ["wt", eles[9], eles[11], eles[15], eles[13], eles[14]]) print >> outh, "\t".join( ["ko", eles[10], eles[12], eles[18], eles[16], eles[17]]) fh.close() outh.close() ''' violin plot df = pd.read_table (sys.argv[2]) fontsize = 10 fig, axes = plt.subplots() header = list(df) del header[0] fontsize = 10 fig, axes = plt.subplots() sns.violinplot(header[0],'deletions',data=df,ax=axes) axes.set_xlabel(header[0]) axes.set_ylabel('deletions') plt.savefig ("5.pdf",format="pdf") '''
def pushButtonClicked(self):
    """Redraw the canvas with the plot selected by the text in ``lineEdit``.

    Codes:
        "0"       bivariate KDE of NOX against LSTAT
        "1"-"3"   violin plots of MEDV per RM_int (plain, by CHAS, split
                  by CHAS)
        "4"-"9"   scatter plots of one column against MEDV
        "10"      regression plot of MEDV on RM

    Any other text leaves the canvas untouched. The object returned by the
    plotting call is stored in ``self.fig1``.
    """
    code = self.lineEdit.text()

    # Extra keyword arguments of the RM_int/MEDV violin plots, per code.
    violin_options = {
        "1": {},
        "2": {"hue": "CHAS", "orient": "v"},
        "3": {"hue": "CHAS", "split": True},
    }
    # (column plotted against MEDV, x-axis label) of the scatter plots.
    scatter_options = {
        "4": ("CRIM", "Per capita crime rate by town (CRIM)"),
        "5": ("RM", "Average number of rooms per dwelling(RM)"),
        "6": ("PTRATIO", "Pupil-teacher ratio by town(PTRATIO)"),
        "7": ("ZN",
              "proportion of residential land zoned for lots over 25,000 sq.ft.(ZN)"
              ),
        "8": ("INDUS",
              "proportion of non-retail business acres per town(INDUS)"),
        "9": ("NOX",
              "nitric oxides concentration (parts per 10 million(NOX)"),
    }

    known_code = (code in ("0", "10") or code in violin_options
                  or code in scatter_options)
    if not known_code:
        return

    # Start from an empty current axes. Clearing through pyplot instead of
    # through self.fig1 works whatever the previous plot stored there
    # (plt.scatter returns a collection, which has no clear()), and avoids
    # drawing a throwaway plot just to obtain an Axes to clear.
    plt.gca().clear()

    if code == "0":
        self.fig1 = sns.kdeplot(df["NOX"], df["LSTAT"])
    elif code in violin_options:
        self.fig1 = sns.violinplot(x="RM_int",
                                   y="MEDV",
                                   data=df,
                                   **violin_options[code])
    elif code in scatter_options:
        column, xlabel = scatter_options[code]
        self.fig1 = plt.scatter(df[column], df["MEDV"])
        plt.xlabel(xlabel)
        plt.ylabel("Housing Price")
        plt.title("Relationship between %s and Price" % column)
    else:  # code == "10"
        self.fig1 = sns.regplot(y="MEDV", x="RM", data=df, fit_reg=True)
    self.canvas.draw()
print(
    "no. The discrepencies between the life expectancy at birth in each country is much higher than the GDP in each country."
)

# ## Step 6. Violin Plots To Compare Life Expectancy Distributions

# Another way to compare two datasets is to visualize the distributions of each and to look for patterns in the shapes.
#
# We have added the code to instantiate a figure with the correct dimensions to observe detail.
# 1. Create an `sns.violinplot()` for the dataframe `df` and map `Country` and `LEABY` as its respective `x` and `y` axes.
# 2. Be sure to show your plot

# In[12]:

# plt.subplots returns a (figure, axes) pair; unpack it so `fig` is the figure.
fig, ax = plt.subplots(figsize=(15, 10))
sns.violinplot(x="Country", y="LEABY", data=df)
# Save before showing: once the figure has been shown and closed, pyplot no
# longer has it as the current figure and savefig would write an empty image.
plt.savefig('ViolinplotofLEABYbycountry.png')
plt.show()

# What do you notice about this distribution? Which country's life expectancy has changed the most?

#

# ## Step 7. Bar Plots Of GDP and Life Expectancy over time
#
# We want to compare the GDPs of the countries over time, in order to get a sense of the relationship between GDP and life expectancy.
#
# First, we can plot the progression of GDPs over the years by country in a barplot using Seaborn.
# We have set up a figure with the correct dimensions for your plot. Under that declaration:
# 1. Save `sns.barplot()` to a variable named `ax`
# 2. Chart `Country` on the x axis, and `GDP` on the `Y` axis on the barplot. Hint: `ax = sns.barplot(x="Country", y="GDP")`
trainX, trainy, cv=kfold, scoring=scoring) results.append(cv_results) names.append(name) msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) print(msg) # Compare Algorithms by accuracy measures during the 10-fold validation fig = plt.figure() fig.suptitle('Algorithm Comparison') ax = fig.add_subplot(111) #plt.boxplot(results) sns.violinplot(data=results, ax=ax) ax.set_xticklabels(names) plt.show() # Make predictions on validation dataset clf = LinearDiscriminantAnalysis() clf.fit(trainX, trainy) predictions = clf.predict(testX) print('accuracy score', accuracy_score(testy, predictions)) # from sklearn print('Confusion matrix from sklearn\n') print(confusion_matrix(testy, predictions)) # custom confusion matrix plt.figure() plt_cnf_matrix(confusion_matrix(testy, predictions),
# Dotted, semi-transparent grey grid shared by the plots in this section.
grid_style = dict(linestyle="dotted", color="gray", alpha=0.7)

plt.rc("grid", **grid_style)
plt.grid()
# creates a boxplot based off entire iris dataset [17]
# uses inbuilt sns colour palette 'colorblind' [18]
sns.boxplot(palette="colorblind", data=iris)
# adds bold title to boxplot
plt.title("Boxplot of Iris Variables", weight="bold")
# saves resulting plot to designated subfolder
plt.savefig("data-visualizations/boxplot - iris.png")
# displays plot to user in pop up window
plt.show()

plt.rc("grid", **grid_style)
plt.grid()
# creates a violinplot based off entire iris dataset [19]
sns.violinplot(palette="colorblind", data=iris)
plt.title("Violinplot of Iris Variables", weight="bold")
plt.savefig("data-visualizations/violinplot - iris.png")
plt.show()

#########################################################################################################################
# 6. Violinplots of Each Variable
#########################################################################################################################

# the below code works the same as the above but instead of using the entire dataset,
# individual numeric variables are chosen and plotted by species
# no palette is mentioned so it uses the default set 'colors' palette

# violinplot of sepal length
plt.rc("grid", **grid_style)
plt.grid()
# Mean numeric_class of the students in each Relation group.
relation_grade_ave = []
for i in relation:
    in_group = data[data.Relation == i]
    relation_grade_ave.append(
        sum(in_group.numeric_class) / float(len(in_group)))
ax = sns.barplot(x=relation, y=relation_grade_ave)
plt.title('Relation with father or mother affects success of students')

# * Having relation with mum has positive effect on these students
# * Students who have relation with their mum is more successful

# In[ ]:

# Let's look at how many times the students participate in discussion groups
discussion = data.Discussion
discussion_ave = sum(discussion) / len(discussion)
# Violin of all students, with every student overlaid in black and the
# rows of `unsuccess` overlaid in red.
ax = sns.violinplot(y=discussion, split=True, inner='quart')
ax = sns.swarmplot(y=discussion, color='black')
ax = sns.swarmplot(y=unsuccess.Discussion, color='red')
plt.title('Discussion group participation')

# * These two students are under the average of discussion.
# * Average is 43. Therefore, participating discussion groups can be important success of these two students

# In[ ]:

# Now lastly lets look at
absence_day = data.StudentAbsenceDays.unique()
# Mean numeric_class of the students in each absence-days group.
absense_day_ave = []
for i in absence_day:
    in_group = data[data.StudentAbsenceDays == i]
    absense_day_ave.append(
        sum(in_group.numeric_class) / float(len(in_group)))
x="labels_name", y="score", hue="Feature", data=df_complete, order=["Controls", "SCZ Cluster 1", "SCZ Cluster 2", "SCZ Cluster 3"]) plt.legend(loc='lower left') plt.savefig(os.path.join(output, "cluster_weights.png")) plt.figure() sns.set_style("whitegrid") ax = sns.barplot( x="labels_name", y="age", data=df, order=["Controls", "SCZ Cluster 1", "SCZ Cluster 2", "SCZ Cluster 3"]) plt.savefig(os.path.join(output, "age.png")) ############################################################################# #ANOVA on age T, p = scipy.stats.f_oneway(df[df["labels"]==0]["age"],\ df[df["labels"]==1]["age"],\ df[df["labels"]==2]["age"]) ax = sns.violinplot( x="labels_name", y="age", data=df, order=["Controls", "SCZ Cluster 1", "SCZ Cluster 2", "SCZ Cluster 3"]) plt.title("ANOVA patients: t = %s, and p= %s" % (T, p)) plt.savefig(os.path.join(output, "age_anova.png"))
# Columns reused by several of the plots below.
budget = movies.Budget_millions
audience = movies.Audience_ratings
rotten = movies.Rotten_ratings

#Kernel Density Estimate plot ... Bivariate Distribution
k1 = sns.kdeplot(rotten, audience, shade=True)

sns.set_style('dark')
k2 = sns.kdeplot(budget, audience)
k3 = sns.kdeplot(budget, rotten)

# SUBPLOTS
# Two side-by-side panels sharing both axes: budget against each rating.
f, axes = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)
k2 = sns.kdeplot(budget, audience, ax=axes[0])
k3 = sns.kdeplot(budget, rotten, ax=axes[1])
k2.set(xlim=(-20, 160))

#boxplots vs violinplot
v = sns.violinplot(x='Genre', y='Rotten_ratings', data=movies)
w = sns.boxplot(x='Genre', y='Rotten_ratings', data=movies)

#Genre specific violinplot broken down by year
dramas = movies[movies.Genre == 'Drama']
v2 = sns.violinplot(x='Year', y='Rotten_ratings', data=dramas)

#FacetGrid both lines of code must run together
g = sns.FacetGrid(movies, row='Genre', col="Year", hue='Genre')
kws = dict(s=50, linewidth=0.5, edgecolor='black')
g = g.map(plt.scatter, 'Rotten_ratings', 'Audience_ratings', **kws)

# facet grids can be populated with any type of chart
g = sns.FacetGrid(movies, row='Genre', col="Year", hue='Genre')
g = g.map(plt.hist, 'Budget_millions')

#controlling axes and adding diagonals
sizes = [104, 109, 93, 99, 64]
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'purple']
explode = (0, 0.1, 0, 0, 0)  # offset the 2nd slice

# Plot
plt.pie(sizes,
        explode=explode,
        labels=labels,
        colors=colors,
        autopct='%1.1f%%',
        shadow=True,
        startangle=140)

plt.axis('equal')
plt.title("Pie chart for Position")
plt.show()

#violin plot
# NOTE: despite its name, `fig` is the Text returned by set_title().
fig = sns.violinplot(y=DF["Position"],
                     x=DF["PointsPS"]).set_title("Violin plots for points")
plt.xlabel("Points per Second")
plt.ylabel("Position")
# plt.show() takes no positional figure argument; passing `fig` was read as
# `block` by old matplotlib and raises TypeError on current releases.
plt.show()

#correlogram
DF1 = DF[[
    "FieldGoalsMadePS", "FreeThrowsMadePS", "ThreePointersMadePS",
    "TwoPointersMadePS", "Position"
]]
DF1
sns.pairplot(DF1, kind="scatter", hue="Position")

#box
fig = sns.boxplot(x="Position", y="StealsPS",
                  data=DF).set_title("Steals boxplot")
plt.ylabel("Steals per Second")
plt.show()

#Marginal Plots
y='features', orient='h', data=dd, hue='Culture', ax=ax, notch=True, flierprops=flierprops, palette="hls" ) #Set2 is also somewhat okay, #sns.hls_palette(8, l=.3, s=.8)) else: sns.violinplot( x='value', y='features', orient='h', data=dd, hue='Culture', ax=ax, notch=True, flierprops=flierprops, palette="hls" ) #Set2 is also somewhat okay, #sns.hls_palette(8, l=.3, s=.8)) ax.get_legend().remove() ax.tick_params(axis="x", direction="in") ax.tick_params(axis="y", direction="in", pad=-2) ax.xaxis.set_major_locator(MultipleLocator(1)) if idx < 18: ax.set_title(r'$\bf{}$'.format(value_vars[0].replace('_mean', '')), loc='right', position=(1.0, 0.7), size=10) ax.yaxis.set_ticklabels(['mean', 'std'])
print("")
print("Std Goal and Pledged values")
goal_pledged_std = df_kick[["goal", "pledged"]].std()
print(round(goal_pledged_std, 2))

# <h2>Looking the State variable</h2>
# - pledge log by state
# - goal log by state
# - goal log x pledged log

# In[9]:

plt.figure(figsize=(12, 8))
plt.subplots_adjust(hspace=0.75, top=0.75)

# Top-left panel: pledged amount (log) per state.
ax1 = plt.subplot(221)
ax1 = sns.violinplot(data=df_kick, x="state", y="pledge_log", palette="hls")
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45)
ax1.set_title("Understanding the Pledged values by state", fontsize=15)
ax1.set_xlabel("State Description", fontsize=12)
ax1.set_ylabel("Pledged Values(log)", fontsize=12)

# Top-right panel: goal amount (log) per state.
ax2 = plt.subplot(222)
ax2 = sns.violinplot(data=df_kick, x="state", y="goal_log")
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45)
ax2.set_title("Understanding the Goal values by state", fontsize=15)
ax2.set_xlabel("State Description", fontsize=12)
ax2.set_ylabel("Goal Values(log)", fontsize=12)

# Bottom panel (full width): regression of pledged (log) on goal (log).
ax0 = plt.subplot(212)
ax0 = sns.regplot(data=df_kick, x="goal_log", y="pledge_log", x_jitter=False)
ax0.set_title("Better view of Goal x Pledged values", fontsize=15)
def plotParamsdf(df=None, number_points=0, box=False):
    """Plot the distribution of every model parameter on a 4x3 subplot grid.

    One violin plot (or box plot when ``box`` is true) is drawn per parameter,
    using the column of ``df`` keyed by the parameter's LaTeX label. The
    figure is resized to 15x12 inches, saved as a PDF inside the
    ``results_robustness`` directory and then shown.

    Args:
        df: Table whose columns are keyed by the LaTeX labels listed in
            ``params`` below. When ``None``, the table is obtained from
            ``getParamDistrib(number_points)``.
        number_points: Forwarded to ``getParamDistrib``; only used when
            ``df`` is ``None``.
        box: If true draw box plots and save to
            ``params_distrib_sns_box.pdf``; otherwise draw violin plots and
            save to ``params_distrib_sns.pdf``.
    """
    # The previous check, `not type(df)`, was never true (a type object is
    # always truthy), so the default df=None was never replaced.
    if df is None:
        df = getParamDistrib(number_points)

    # (LaTeX label, unit) pairs kept together so they cannot drift out of sync.
    params = [
        (r"$\gamma_L$", r"$nM/min$"),
        (r"$\eta_x$", r"$nM/min$"),
        (r"$\gamma_x$", r"$nM/min$"),
        (r"$\theta_L$", r"$nM^{-1}$"),
        (r"$\omega_x$", r"$nM^{-1}$"),
        (r"$\theta_x$", r"$nM^{-1}$"),
        (r"$\delta_L$", r"$min^{-1}$"),
        (r"$\delta_x$", r"$min^{-1}$"),
        (r"$\rho_x$", r"$min^{-1}$"),
        (r"$n_y$", ""),
        (r"$m_x$", ""),
    ]

    # 12 axes for 11 parameters: the last axis is intentionally left unused.
    fig, axes = plt.subplots(4, 3)

    for ax, (param_name, unit) in zip(axes.flat, params):
        if box:
            sns.boxplot(data=df[param_name], ax=ax)
        else:
            sns.violinplot(data=df[param_name], ax=ax, cut=0, color="#3274a1")

        # The parameter is identified by the y-label, so x ticks add nothing.
        ax.set_xticks([])
        if unit:
            ax.set_ylabel(param_name + " [" + unit + "]")
        else:
            ax.set_ylabel(param_name)

    fig.set_size_inches([15, 12])

    file_name = 'params_distrib_sns_box.pdf' if box else 'params_distrib_sns.pdf'
    plt.savefig(os.path.join('results_robustness', file_name), bbox_inches='tight')
    plt.show()
# ---- Text summary of the cohort and acquisition settings ----
male_count = np.sum(sex == 'M')
female_count = np.sum(sex == 'F')
print('Males: {m} , Females: {f}'.format(m=male_count, f=female_count))

print('Age : {m:.2f} +/- {s:.2f}'.format(m=np.mean(age), s=np.std(age)))

# Each acquisition setting is reported as a (min,max) pair.
print('X-ray Tube Current: ({a},{b})'.format(
    a=np.min(current), b=np.max(current)))
print('KVP: ({a},{b})'.format(
    a=np.min(kvp), b=np.max(kvp)))
print('Exposure Time: ({a},{b})'.format(
    a=np.min(time), b=np.max(time)))

# Exposure is computed as current * time / 1000 and reported twice:
# first as a range, then as mean and standard deviation.
exposure_mas = current * time / 1000
print('Exposure (mAs) min/max : ({a},{b})'.format(
    a=np.min(exposure_mas), b=np.max(exposure_mas)))
print('Exposure (mAs) mean/std : ({a:.2f},{b:.2f})'.format(
    a=np.mean(exposure_mas), b=np.std(exposure_mas)))
print('-----------------------------')

# ---- Visualization ----
# Exposure per diagnosis.
plt.figure()
sns_plot = sns.violinplot(x='Diagnosis', y='mAs', data=csv_data, split=False)
sns_plot.set_ylabel('Exposure (mAs)')
sns_plot.set_xlabel('Disease')
fig = sns_plot.get_figure()
# fig.savefig("mas.png",dpi=300)

# Case counts per diagnosis, split by patient sex.
plt.figure()
sns_plot = sns.countplot(x='Diagnosis', data=csv_data, hue='Patient Sex')
sns_plot.set_ylabel('Number of Cases')
sns_plot.set_xlabel('Disease')
fig = sns_plot.get_figure()
# fig.savefig("sex.png",dpi=300)

# Age per diagnosis.
plt.figure()
sns_plot = sns.boxplot(x='Diagnosis', y='Age', data=csv_data)
sns_plot.set_ylabel('Age (year)')