def plot_tests(genomes, pairs, out, plot, cats, y_lab, normalize = False):
    """
    Collect the 'test' results of the requested samples, dump them as a
    tab-separated table and draw a grouped box plot with the raw points.

    genomes   -- nested mapping: genomes[genome]['samples'][sample]['test']
                 holds a (lengths, n50s, slopes) triple
    pairs     -- iterable of (genome, sample) keys to include
    out       -- path of the tab-separated table that is written
    plot      -- path of the figure that is written
    cats      -- column name (and x axis) used for the lengths
    y_lab     -- column name (and y axis) used for the slopes
    normalize -- when equal to 'log2' the slopes go through log_trans()
    """
    lengths, slopes, samples, n50s = [], [], [], []
    for genome_id, sample_id in pairs:
        record = genomes[genome_id]['samples'][sample_id]
        # display name: drop the last extension, underscores become spaces
        label = sample_id.rsplit('.', 1)[0].replace('_', ' ')
        test_lengths, test_n50s, test_slopes = record['test']
        lengths.extend(test_lengths)
        slopes.extend(test_slopes)
        # one sample label per slope value
        samples.extend([label for _ in test_slopes])
        n50s.extend(test_n50s)
    if normalize == 'log2':
        slopes = log_trans(slopes)
    slope_fs = pd.DataFrame({cats:lengths, y_lab:slopes, 'sample':samples, 'n50':n50s})
    # the table on disk is written before rows are filtered for plotting
    slope_fs.to_csv(out, sep = '\t')
    # NOTE(review): `!= False` also removes rows whose value is 0 -- confirm intended
    slope_fs = slope_fs[slope_fs[y_lab] != False]
    sns.set_style('whitegrid')
    sns.set_context('poster')
    shared = dict(x = cats, y = y_lab, data = slope_fs, hue = 'sample', palette = 'deep')
    sns_plot = sns.boxplot(**shared)
    sns.stripplot(jitter = True, size = 5, edgecolor = 'gray', **shared)
    plt.legend(loc = 'upper right', bbox_to_anchor=(1.05, 1))
    sns_plot.figure.savefig('%s' % (plot), bbox_inches = 'tight')
def main(argv):
    """
    Read a results CSV, print it, and save a strip plot of write bandwidth
    per number of writers (one colour per scheme) to
    'dist_bandwidth_stats.png'. Terminates the process with exit status 0.

    argv -- accepted for the conventional main(argv) signature.
            NOTE(review): the input path is read from sys.argv[1], not from
            this argument; kept that way because callers are not visible here.
    """
    infile = sys.argv[1]
    resframe = pd.read_csv(infile)
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print("Summary of all results found:")
    print(resframe)
    fig, ax = plt.subplots()
    # `dodge` is the current name of the former `split` option; drawing on
    # `ax` explicitly keeps the plot and the axis limit on the same axes.
    sns.stripplot(x='Writers', y='Write Bandwidth (MiB/s)', data=resframe,
                  hue='Scheme', jitter=True, dodge=True, ax=ax)
    ax.set_ylim(bottom=0)
    ax.set_ylabel('Write Bandwidth / MiB/s')
    ax.set_xlabel('Writers')
    ax.legend()
    fig.savefig('dist_bandwidth_stats.png')
    plt.clf()
    sys.exit(0)
def plot_errors_for_elements(self, ax=None, **kwargs):
    """
    Box plot of the relative error (in percent) grouped by chemical element,
    with every observation overlaid as a point. Returns the figure.

    Each row contributes its relative error 100 * (this - ae) / ae once for
    every distinct symbol returned by species_from_formula(row.formula).
    """
    records = []
    for _, entry in self.iterrows():
        relative_error = 100 * (entry["this"] - entry["ae"]) / entry["ae"]
        records.extend(
            dict(element=symbol, rerr=relative_error,
                 formula=entry.formula, struct_type=entry.struct_type)
            for symbol in set(species_from_formula(entry.formula))
        )
    frame = DataFrame(records)
    order = sort_symbols_by_Z(set(frame["element"]))

    import seaborn as sns
    ax, fig, plt = get_ax_fig_plt(ax=ax)

    # boxes first, then the individual observations on top of them
    ax = sns.boxplot(x="element", y="rerr", data=frame, ax=ax, order=order,
                     whis=np.inf, color="c")
    sns.stripplot(x="element", y="rerr", data=frame, ax=ax, order=order,
                  jitter=True, size=5, color=".3", linewidth=0)

    sns.despine(left=True)
    ax.set_ylabel("Relative error %")
    ax.grid(True)
    return fig
def make_plots(groups):
    """
    Write four figures for the shot-group table `groups`:

    points.png  -- strip plot of 'moa' per 'ammo'
    boxplot.png -- box plot of 'moa' per 'ammo'
    avg_moa.png -- bar plot of 'mean' per 'ammo'
    qqplot.png  -- distribution and probability plot of the non-null
                   'standard' values

    postprocess() (defined elsewhere) is called on the first three figures
    before they are saved.
    """
    # x/y are passed by keyword: positional data variables are rejected by
    # recent seaborn releases, keywords work on every version.
    sns.stripplot(x="ammo", y="moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")
    plt.clf()

    sns.boxplot(x="ammo", y="moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")
    plt.clf()

    sns.barplot(x="ammo", y="mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")
    plt.clf()

    std = groups["standard"]
    std = std[std.notnull()]
    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")
def view_distribution(df,x="type",y="rate", plt=plt):
    """
    Show a violin plot of `y` per `x` with the single observations drawn
    on top in white. The title names the y column and the first value of
    df.symbol.
    """
    asset = df.symbol.values[0]
    columns = dict(x=x, y=y, data=df)
    plt.figure(1, figsize=(15, 15))
    sns.violinplot(inner=None, **columns)
    sns.stripplot(jitter=True, color="white", edgecolor="gray", **columns)
    heading = y + ' distribution (' + asset + ')'
    plt.title(heading)
    plt.show()
def Create_WildPlot(X, y1, y2, y3, y4):
    """
    Strip plot of X against the wilderness area and save it as a PDF under
    'Plots/'.

    y1..y4 are indicator sequences; entry i of the combined variable is k
    for the first yk whose entry i equals 1. Entries where no indicator is
    set keep the value found in y1.

    The combined variable is built on a copy, so the caller's y1 is no
    longer overwritten.
    """
    xlab = X.name.replace("_", " ")
    ylab = 'Wilderness Area'
    figlab = ylab + " vs " + xlab
    filelab = "Plots/" + figlab.replace(" ", "") + ".pdf"
    f, ax = plt.subplots(figsize=(5, 5))

    y = y1.copy()
    indicators = (y1, y2, y3, y4)
    for i in range(len(y)):
        for area, flags in enumerate(indicators, start=1):
            if flags[i] == 1:
                y[i] = area
                break

    sns.stripplot(x = X, y = y, jitter = True, size = 5, linewidth = 0.1, ax = ax)
    # Label through the axes object: the `sns.plt` alias is not available
    # in current seaborn releases.
    ax.set_title(figlab)
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    savefig(filelab)
def _plot_categorical_and_continuous(df, xlabel, ylabel, x_keys, y_keys, ax, cmap, n_cat=5, plottype="box"):
    """
    Plot a categorical variable and a continuous variable against each
    other as a box, violin, strip or swarm plot.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    ylabel : str
        The column name for the variable on the y-axis

    x_keys, y_keys :
        Exactly one of them is expected to be the very same object as its
        label; the other one is passed to seaborn as the category order.
        NOTE(review): the check uses identity (`is`), not equality.

    ax : matplotlib.Axes object
        The axes to draw into

    cmap : matplotlib.cm.colormap
        Colormap handed to seaborn.color_palette

    n_cat : int
        The number of colours requested from the palette

    plottype : {"box" | "violin" | "strip" | "swarm"}
        The type of plot to produce; default is a box plot

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    Raises
    ------
    Exception
        If neither key argument is its label, or plottype is unknown.
    """
    if x_keys is xlabel:
        category_order = y_keys
    elif y_keys is ylabel:
        category_order = x_keys
    else:
        raise Exception("Something went terribly, horribly wrong!")

    current_palette = sns.color_palette(cmap, n_cat)

    plotters = {
        "box": sns.boxplot,
        "strip": sns.stripplot,
        "swarm": sns.swarmplot,
        "violin": sns.violinplot,
    }
    if plottype not in plotters:
        raise Exception("plottype not recognized!")

    plotters[plottype](x=xlabel, y=ylabel, data=df, order=category_order,
                       palette=current_palette, ax=ax)
    return ax
def p7(data):
    """
    Distribution of survivors: Age per Pclass coloured by Survived, one
    panel for male and one for female passengers, then show the figure.
    """
    f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    for sex, ax in (("male", ax1), ("female", ax2)):
        # Both panels use the same category order and the same hue order,
        # so a colour means the same thing in each panel and the shared
        # x axis lines up. Variables are passed by keyword because recent
        # seaborn releases reject positional x/y/hue.
        sns.stripplot(
            x="Pclass",
            y="Age",
            hue="Survived",
            data=data[data["Sex"] == sex],
            palette="Set2",
            size=20,
            order=(1, 2, 3),
            hue_order=(1, 0),
            marker="D",
            alpha=0.25,
            jitter=True,
            ax=ax,
        )
        ax.set_title(sex.upper())
    plt.show()
def stripplot_to_pdf(data, save_path, x=None, y=None, hue=None, style='whitegrid', fontsize=2, rows=1, cols=1, figsize=(4, 4), **kwargs):
    """ Data plotted as stripplot using seaborn and saved in a pdf given
    in save_path

    Parameters
    ----------
    data : pd.DataFrame or path to csv file
        single or list of data to plot into pdf. When a list/tuple is
        given, one column of axes is created per item (``cols`` is
        overridden) and items that are paths are read with pd.read_csv.

    save_path : str
        Path to save the pdf plot.

    x, y, hue : str, optional
        Column names forwarded to seaborn.stripplot.

    style : str
        seaborn style name.

    fontsize : float
        Passed to seaborn as ``font_scale``.

    rows, cols : int
        Shape of the subplot grid.

    figsize : tuple
        Figure size in inches.

    **kwargs
        Forwarded to seaborn.stripplot.
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    if isinstance(data, (list, tuple)):
        cols = len(data)
    else:
        data = [data]
    data = [pd.read_csv(d) if isinstance(d, string_types) else d for d in data]

    # A single call: sns.set() on its own resets the style to its default,
    # which used to discard the style selected just before it.
    sns.set(style=style, font_scale=fontsize)

    with PdfPages(save_path) as pdf:
        # squeeze=False always yields a 2-D array of axes, so flattening
        # also works for the default 1x1 grid.
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=figsize,
                                 squeeze=False, sharey=True)
        try:
            for ax, d in zip(axes.ravel(), data):
                sns.stripplot(x=x, y=y, hue=hue, data=d, ax=ax, **kwargs)
            pdf.savefig(fig)
        finally:
            plt.close(fig)
def timePlotScatter(data):
    """
    Interactive strip plots of 'Values' over 'Time', one figure per gene.

    The user is asked whether to add a box plot per time point, whether to
    normalise the y axis and what the y label should be. All figures are
    shown together at the end.

    data -- long-format table with 'Gene', 'Time', 'Sample' and 'Values'
            columns
    """
    # Gene names in order of first appearance; collected before pivoting
    # because the pivot moves 'Gene' into the column index.
    geneList = list(dict.fromkeys(data['Gene']))
    data = data.pivot_table('Values', ['Sample'], ['Gene', 'Time'])

    box = input("Do you want a boxplot for each timepoint? (y/n): ")
    normalize = input("Would you like to normalize the y-axis? (y/n): ")
    ylabel = input("What should the y-axis label be?: ")

    for counter, key in enumerate(geneList, 1):
        title = key
        plt.figure(counter)
        tempTable = data[key]
        if normalize == 'y':
            # Missing cells are treated as 0 when looking for the maximum
            # (as the sibling plotting functions do); a plain amax over NaN
            # would turn the whole table into NaN.
            tempTable = tempTable / np.amax(tempTable.fillna(0).values)
            title = key + " Normalized"
        if box == "y":
            makeBoxplot(tempTable)
        sns.stripplot(data=tempTable, size=7, jitter=True,
                      palette=sns.color_palette("Set1", n_colors=8, desat=.9))
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Time(min)')
        plt.gca().set_ylim(bottom=0)
    plt.show()
def conditionPlot(data):
    """
    Interactive strip plot of 'Values' per Gene/Condition.

    Pivots `data` (rows: 'Sample', columns: 'Gene' x 'Condition'), asks the
    user whether to add a box plot and whether to normalise, draws the
    strip plot, asks for the y label, passes the pivoted table to
    exportExcel() and shows the figure.
    """
    data = data.pivot_table('Values', ['Sample'], ['Gene', 'Condition'])
    answer = input("Do you want a boxplot to go with your data? (y/n): ")
    normalize = input("Would you like to normalize the y-axis? (y/n): ")
    if normalize == 'y':
        # divide by the overall maximum; missing cells count as 0 for the maximum
        data = data / np.amax(data.fillna(0).values)
    if answer == "y":
        print("What kind of boxplot do you want?")
        print("\"1\" for a quartile boxplot.")
        print("\"2\" for a standard deviation boxplot.")
        boxStyle = input("Your answer: ")
        if boxStyle == "1":
            makeBoxplotQuartile(data)
        else:
            makeBoxplotDeviation(data)
        # NOTE(review): the nesting of this call is a best guess; it is
        # made in addition to the style-specific box plot above -- confirm
        # whether it is intended or a leftover.
        makeBoxplot(data)
    # Iterate through the columns, generating data to form the boxplot.
    sns.stripplot(data=data, size = 7, jitter = True, palette = sns.color_palette("Set1", n_colors=8, desat=.9))
    ylabel = input("What should the y-axis label be?: ")
    plt.ylabel(ylabel)
    plt.xlabel('Gene/Condition')
    exportExcel(data);
    plt.show()
def plot_compare_median_consensus(output_dir, df_order, metric, type = 'ts',DISPLAY = 0):
    """
    Compare algorithms on `metric` and save the figure below output_dir.

    type 'ts' -- horizontal box plot per algorithm with the observations
                 overlaid, logarithmic x axis
    type 'lm' -- seaborn lmplot of `metric` against 'order', coloured by
                 algorithm

    When DISPLAY is truthy the figure is also shown. The current figure is
    closed before returning.
    """
    plt.figure()
    if type == 'ts':
        ax = sb.boxplot(x=metric, y="algorithm", data=df_order, whis=np.inf, color="c")
        # every observation as a small dark point on top of its box
        sb.stripplot(x=metric, y="algorithm", data=df_order, jitter=True,
                     size=3, color=".3", linewidth=0)
        ax.set_xscale("log")
        sb.despine(trim=True)
        target = output_dir + '/ts_compare_median_with_consensus_'+metric+'.png'
        plt.savefig(target, format='png')
    elif type == 'lm':
        sb.lmplot(x="order", y=metric, hue="algorithm", data=df_order)
        plt.xlabel('images sorted by the average neuron distance of the median reconstruction')
        target = output_dir + '/lm_compare_median_with_consensus_'+metric+'.lm.png'
        plt.savefig(target, format='png')
    if DISPLAY:
        plt.show()
    plt.close()
def stripplot_mean_score(df, save_path, atlas=None, suffix=None, x=None, y=None, hue=None, style='whitegrid', fontsize=14, jitter=.2, figsize=(9, 3), leg_pos=2, axx=None):
    """
    Draw a grey box plot of `x` per `y` with the individual scores on top,
    coloured by `hue`, onto `axx`.

    df      -- table with a 'dataset' column and the x / y / hue columns;
               the values of column `y` must be 'no' or 'yes'
    atlas   -- key of the method shown as title ('kmeans', 'ica',
               'dictlearn' or 'basc'); other values are shown verbatim,
               None draws no title
    x, y, hue -- column names forwarded to seaborn
    fontsize -- legend font size (set through matplotlib rc)
    jitter  -- jitter forwarded to seaborn.stripplot
    axx     -- axes to draw into; the current axes when None

    save_path, suffix, style, figsize and leg_pos are accepted for
    compatibility and not used here. The strip plot palette is the
    module-level `datasets_palette`. Matplotlib rc settings are changed
    globally.
    """
    aliases = {'kmeans': 'K-Means', 'ica': 'GroupICA',
               'dictlearn': 'Dictionary Learning', 'basc': 'BASC'}
    new_names = {'no': 'Without\n regions extracted',
                 'yes': 'With\n regions extracted'}

    def change_label_name(row, label):
        row[label] = new_names[row[label]]
        return row

    if axx is None:
        axx = plt.gca()

    # the same relabelling applies to every atlas
    df = df.apply(lambda row: change_label_name(row, y), axis=1)
    # change the name of the dataset to upper
    df['dataset'] = df['dataset'].str.upper()

    rc('xtick', labelsize=12)
    rc('ytick', labelsize=16)
    rc('axes', labelweight='bold')
    rc('legend', fontsize=fontsize)

    # draw a default vline at x=0 that spans the yrange
    axx.axvline(x=0, linewidth=4, zorder=0, color='0.6')
    sns.boxplot(data=df, x=x, y=y, fliersize=0, linewidth=2,
                boxprops={'facecolor': '0.5', 'edgecolor': '.0'},
                width=0.5, ax=axx)
    # `dodge` is the current name of the former `split` option
    sns.stripplot(data=df, x=x, y=y, hue=hue, edgecolor='gray', size=5,
                  dodge=True, palette=datasets_palette, jitter=jitter, ax=axx)
    axx.set_xlabel('')
    axx.set_ylabel('')
    if atlas is not None:
        # title above the axes, in the coordinates of the axes drawn into
        axx.text(.5, 1.02, aliases.get(atlas, atlas),
                 transform=axx.transAxes, size=15, ha='center')

    # make the positive labels with "+"
    axx_xticklabels = []
    for tick in axx.get_xticks():
        prefix = '+' if tick > 0 else ''
        axx_xticklabels.append(prefix + str(tick) + r'$\%$')
    axx.set_xticklabels(axx_xticklabels)
def CheckShannonIndex(self, labels=None, condition_dict=None, fig_title=None):
    """
    Compute the Shannon entropy of every sample and optionally plot it.

    The entropies (natural logarithm) of the rows of self.abun_df are
    joined with self.meta_df and stored in self.SDI (column 'SDI').

    labels         -- metadata columns; for each one a violin plot and a
                      box plot of SDI per label value are saved
    condition_dict -- accepted for compatibility, not used
    fig_title      -- output file name, e.g. 'shannon.png'. Figures are
                      only produced when it is given; files are named
                      '<stem>_all.violinplot.<ext>', '<stem>_<label>.boxplot.<ext>'
                      and so on. 'png' is used when the name has no extension.
    """
    def ShannonIndex(numList):
        ## Shannon entropy of one sample; entries with zero frequency add nothing
        counts = np.asarray(numList, dtype=float)
        total = counts.sum()
        if total == 0:
            return 0.0
        freqs = counts / total
        freqs = freqs[freqs > 0]
        return float(-(freqs * np.log(freqs)).sum())

    print('Making Shannon Diversity boxplot for all samples')
    # Calculate shannon entropy for each sample
    SDIs = pd.DataFrame(
        {'SDI': [ShannonIndex(row) for row in self.abun_df.values]},
        index=self.abun_df.index)
    # Add metadata labels to the df containing SDIs
    SDIs = pd.concat([SDIs, self.meta_df], axis=1)
    SDIs['SDI'] = SDIs['SDI'].astype('float64')
    self.SDI = SDIs

    # Nothing can be saved without a file name.
    if not fig_title:
        return

    stem, dot, fig_ext = fig_title.rpartition('.')
    if not dot:
        stem, fig_ext = fig_title, 'png'

    def save_and_close(ax, suffix):
        fig = ax.get_figure()
        fig.savefig(stem + suffix + fig_ext)
        plt.close(fig)

    # First plot SDI of all samples
    sb.violinplot(x=SDIs['SDI'], inner=None, saturation=0.35)
    ax = sb.stripplot(x=SDIs['SDI'], jitter=True, size=5, linewidth=0.6)
    save_and_close(ax, '_all.violinplot.')

    sb.boxplot(x=SDIs['SDI'])
    ax = sb.stripplot(x=SDIs['SDI'], jitter=True, size=5, linewidth=0.6)
    save_and_close(ax, '_all.boxplot.')

    if labels:
        print('Making boxplots separated by labels: ')
        for label in labels:
            print(label + '...')
            SDIs[label] = SDIs[label].astype('category')

            sb.violinplot(x=label, y='SDI', data=SDIs, saturation=0.35, inner=None)
            ax = sb.stripplot(x=label, y='SDI', data=SDIs, jitter=True, size=5, linewidth=0.6)
            save_and_close(ax, '_' + label + '.violinplot.')

            sb.boxplot(x=label, y='SDI', data=SDIs, saturation=0.35)
            ax = sb.stripplot(x=label, y='SDI', data=SDIs, jitter=True, size=5, linewidth=0.6)
            save_and_close(ax, '_' + label + '.boxplot.')
def strip(X, y, description):
    '''for visualizing categorical data

    Saves one strip plot of `y` against every column of the DataFrame `X`
    as 'visuals/<column>_<description>_strips' and prints each path.
    '''
    # DataFrame.items() exists on every pandas version; iteritems() was
    # removed in pandas 2.0.
    for feature_title, column in X.items():
        sns.stripplot(x=column, y=y, jitter=True)
        target = 'visuals/'+feature_title+'_'+description+'_strips'
        plt.savefig(target)
        print(target)
        plt.close()
def bar_box_violin_dot_plots(data, category_col, numeric_col, axes, file_name=None):
    """
    Draw four views of `numeric_col` per `category_col` onto `axes`:
    axes[0] bar plot, axes[1] strip plot, axes[2] box plot (rows with a
    null numeric value removed), axes[3] violin plot.

    file_name is accepted for compatibility and not used here.
    """
    # Variables are passed by keyword: recent seaborn releases reject
    # positional x/y.
    sns.barplot(x=category_col, y=numeric_col, data=data, ax=axes[0])
    sns.boxplot(x=category_col, y=numeric_col,
                data=data[data[numeric_col].notnull()], ax=axes[2])
    # `kind` is not a violinplot option and is no longer passed.
    sns.violinplot(x=category_col, y=numeric_col, data=data,
                   inner="quartile", scale='count', split=True, ax=axes[3])
    sns.stripplot(x=category_col, y=numeric_col, data=data, jitter=True,
                  ax=axes[1])
    sns.despine(left=True)
def plot_domestic_origin(df, predicted=None):
    """
    Show a strip plot of domestic lifetime gross ('DomLifeGross', displayed
    in millions) per country of origin ('OriginC').

    predicted is accepted for compatibility and not used here.
    """
    ax = plt.subplot(111)
    # tick values are scaled by 1e-6 so the axis reads in millions
    ax.xaxis.set_major_formatter(tkr.FuncFormatter(lambda x, pos: ('%.0f')%(x*1e-6)))
    sns.stripplot(x="DomLifeGross", y="OriginC", data=df, ax=ax)
    # Labels are set on the axes: the `sns.plt` alias is not available in
    # current seaborn releases.
    ax.set_xlabel("Domestic Lifetime Gross (millions)")
    ax.set_ylabel("Country of Origin")
    sns.despine()
    plt.show()
def conditionPlot(data):
    """
    Pivot `data` to one column per Gene/Condition (rows: 'Sample', cells:
    'Values'), ask whether a box plot is wanted, and show a strip plot of
    the values, on top of the box plot when requested.
    """
    pivoted = data.pivot_table(values='Values', index=['Sample'],
                               columns=['Gene', 'Condition'])
    wants_boxplot = input("Do you want a boxplot to go with your data? (y/n): ") == "y"
    if wants_boxplot:
        sns.boxplot(data=pivoted)
    sns.stripplot(data=pivoted, size=6, jitter=True, edgecolor="black")
    plt.ylabel('Values')
    plt.xlabel('Gene/Condition')
    plt.show()
def plot_errors_for_elements(self, ax=None, **kwargs):
    """
    Box plot of the relative error (in percent) per chemical element with
    the observations overlaid and coloured by struct_type. Every second
    element label is moved to a twin x axis at the top. Returns the figure.
    """
    records = []
    for _, entry in self.iterrows():
        relative_error = 100 * (entry["this"] - entry["ae"]) / entry["ae"]
        records.extend(
            dict(element=symbol, rerr=relative_error,
                 formula=entry.formula, struct_type=entry.struct_type)
            for symbol in set(species_from_formula(entry.formula))
        )
    frame = DataFrame(records)
    order = sort_symbols_by_Z(set(frame["element"]))

    import seaborn as sns
    ax, fig, plt = get_ax_fig_plt(ax=ax)

    ax = sns.boxplot(x="element", y="rerr", data=frame, ax=ax, order=order,
                     whis=np.inf, color="c")
    # observations, without jitter, one colour per structure type
    sns.stripplot(x="element", y="rerr", data=frame, ax=ax, order=order,
                  hue='struct_type', jitter=0, size=4, color=".3",
                  linewidth=0, palette=sns.color_palette("muted"))

    sns.despine(left=True)
    ax.set_ylabel("Relative error %")

    # Split the tick positions into two interleaved sets; the label texts
    # are read before any tick is changed.
    tick_labels = ax.get_xticklabels()
    positions = ax.get_xticks()
    lowest, highest = min(positions), max(positions)
    bottom_ticks = range(lowest, highest + 1, 2)
    top_ticks = range(lowest + 1, highest + 1, 2)
    bottom_text = [tick_labels[i].get_text() for i in bottom_ticks]
    top_text = [tick_labels[i].get_text() for i in top_ticks]

    ax.set_xticks(bottom_ticks)
    ax.set_xticklabels(bottom_text, rotation=90)

    ax2 = ax.twiny()
    ax2.set_zorder(-1)
    ax2.set_xticks(top_ticks)
    ax2.set_xticklabels(top_text, rotation=90)
    ax2.set_xlim(ax.get_xlim())

    ax.grid(True)
    return fig
def plot_scatterBox(df,xData,yData,title,fileName,plotAspect=1,colorVal=None):
    """
    Box plot of yData per xData with the observations drawn on top, then
    hand fileName to process_plot().

    plotAspect scales the figure width (height is fixed at 6). A truthy
    colorVal is used as the single colour of boxes and points.
    """
    plt.figure(figsize=(6*plotAspect,6))
    columns = dict(x=xData, y=yData, data=df)
    if colorVal:
        columns['color'] = colorVal
    sns.boxplot(**columns)
    sns.stripplot(size=9, edgecolor='gray', linewidth=1, **columns)
    plt.title(title)
    process_plot(fileName)
def plot_feature_importance(features, fitted_forest):
    """Draw the feature importances of a fitted random forest as a
    cleveland dot plot, most important feature first.
    """
    plt.figure()
    importances = fitted_forest.feature_importances_
    descending = np.argsort(importances)[::-1]
    names = np.array(features)
    with sns.axes_style("whitegrid"):
        sns.stripplot(y=names[descending], x=importances[descending],
                      orient="h", color='red', size=10)
        # start the x axis at zero, keep the automatic upper limit
        upper = plt.xlim()[1]
        plt.xlim(0, upper)
        plt.grid(axis='y', linestyle=':')
        plt.xlabel('Feature importance score')
def plot_hints(self, with_soc=False, **kwargs):
    """
    Print a table of the last results of every item that has a dojo report
    and plot the 'normal' accuracy ecut hint per element symbol as a box
    plot with the observations overlaid. Returns the figure.

    Items without a dojo report are reported with cprint and skipped.

    with_soc -- forwarded to the report accessors
    """
    import pandas as pd
    import seaborn as sns

    # Build pandas dataframe with results.
    rows = []
    for p in self:
        if not p.has_dojo_report:
            cprint("Cannot find dojo_report in %s" % p.basename, "magenta")
            continue

        report = p.dojo_report
        row = {att: getattr(p, att) for att in ("basename", "symbol", "Z", "Z_val", "l_max")}

        # deltafactor and GBRV data with/without SOC
        row.update(report.get_last_df_results(with_soc=with_soc))
        for struct_type in ["fcc", "bcc"]:
            row.update(report.get_last_gbrv_results(struct_type, with_soc=with_soc))

        # hints for the plotted cutoff
        hint = p.hint_for_accuracy(accuracy="normal")
        row.update(dict(ecut=hint.ecut, pawecutdg=hint.pawecutdg))
        rows.append(row)

    frame = pd.DataFrame(rows)

    # print the complete frame, without pandas truncating rows or columns
    with pd.option_context('display.max_rows', len(frame),
                           'display.max_columns', len(list(frame.keys()))):
        print(frame)

    ax, fig, plt = get_ax_fig_plt(ax=None)

    ax = sns.boxplot(x="symbol", y="ecut", data=frame, ax=ax,
                     whis=np.inf, color="c")
    # Add in points to show each observation
    sns.stripplot(x="symbol", y="ecut", data=frame, ax=ax,
                  jitter=True, size=5, color=".3", linewidth=0)

    sns.despine(left=True)
    # The y axis shows the ecut hint; it used to carry the label of the
    # relative-error plot.
    ax.set_ylabel("ecut")
    ax.grid(True)
    return fig
def outcomeBoxplot(cyDf, cyVar, outcomeVar, printP=True, axh=None):
    """
    Box plot of cyVar for outcomeVar 0 versus 1 with the observations
    overlaid, optionally annotated with a rank-sum p-value, then shown.

    cyDf       -- table holding both columns
    cyVar      -- name of the numeric column (y axis)
    outcomeVar -- name of the 0/1 outcome column (x axis)
    printP     -- annotate the p-value of stats.ranksums between the two
                  groups (rows with a missing value are dropped first)
    axh        -- axes to draw into; the current axes when None. The axes
                  are cleared first.

    Ticks, limits and the annotation are applied to `axh` itself, so the
    function also works when `axh` is not the current axes.
    """
    if axh is None:
        axh = plt.gca()
    axh.cla()
    sns.boxplot(y=cyVar, x=outcomeVar, data=cyDf, ax=axh, order=[0,1])
    sns.stripplot(y=cyVar, x=outcomeVar, data=cyDf, jitter=True, ax=axh, order=[0,1])
    axh.set_xticks([0, 1])
    axh.set_xticklabels(['False', 'True'])
    if printP:
        tmp = cyDf[[cyVar, outcomeVar]].dropna()
        z, pvalue = stats.ranksums(tmp[cyVar].loc[tmp[outcomeVar] == 1],
                                   tmp[cyVar].loc[tmp[outcomeVar] == 0])
        annParams = dict(textcoords='offset points', xytext=(0,-5),
                         ha='center', va='top', color='black',
                         weight='bold', size='medium')
        # centred between the two boxes, just below the top of the axes
        axh.annotate('p = %1.3g' % pvalue, xy=(0.5, axh.get_ylim()[1]), **annParams)
    plt.show()
def plotCrossCompartmentBoxplot(cyDfA, cyDfB):
    """
    Box plot (with points) of the cross-correlations returned by
    crosscorr() for the column-sorted inputs, split into pairs whose row
    and column label are equal ('Same') and all other pairs ('Different').
    """
    rho, pvalue, qvalue = crosscorr(cyDfA[sorted(cyDfA.columns)],
                                    cyDfB[sorted(cyDfB.columns)])
    same, different = [], []
    for row_label, col_label in itertools.product(rho.index, rho.columns):
        bucket = same if row_label == col_label else different
        bucket.append(rho.loc[row_label, col_label])

    groups = ['Same'] * len(same) + ['Different'] * len(different)
    a = pd.DataFrame({'Group': groups, '$\\rho$': same + different})

    plt.clf()
    columns = dict(x='Group', y='$\\rho$', data=a)
    sns.boxplot(**columns)
    sns.stripplot(jitter=True, **columns)
    plt.xlabel('')
    plt.ylim((-1,1))
    plt.tight_layout()
def parseExcelManual(filename):
    """
    Read an Excel sheet, let the user pick the x and y columns, and show a
    strip plot of y per x value (optionally with a box plot and/or
    normalised to the largest value). The reshaped table is passed to
    exportExcel() before the figure is shown.
    """
    df = pd.read_excel(filename)
    print("We found the following columns:")
    print(df.columns.values)
    print("")

    def ask_for_column(prompt):
        # keep asking until the answer is an existing column name
        name = input(prompt)
        while name not in df.columns.values:
            print("Invalid column name")
            name = input("Please enter a valid column name: ")
        return name

    xcol = ask_for_column("Which column would you like to occupy the x-axis?: ")
    ycol = ask_for_column("Which column would you like to occupy the y-axis?: ")
    title = ycol + " vs " + xcol

    # keep only the two chosen columns and drop rows with a missing value
    reorg = pd.DataFrame(data={xcol: df[xcol], ycol: df[ycol]}).dropna(axis = 0)
    # one column per x value, one row per original row
    reorg = reorg.pivot_table(values=ycol, index=xcol, columns=reorg.index).T

    box = input("Do you want a boxplot? (y/n): ")
    normalize = input("Do you want to normalize the y-axis? (y/n): ")
    if normalize == 'y':
        # divide by the overall maximum; missing cells count as 0 for the maximum
        reorg = reorg / np.amax(reorg.fillna(0).values)
        title = title + " Normalized"
    if box == "y":
        makeBoxplot(reorg)

    point_colors = sns.color_palette("Set1", n_colors=8, desat=.9)
    sns.stripplot(data=reorg, size = 7, jitter = True, palette = point_colors)
    plt.ylabel(ycol)
    plt.xlabel(xcol)
    plt.title(title)
    plt.gca().set_ylim(bottom = 0)
    exportExcel(reorg)
    plt.show()
def Create_Plot(X,y):
    """
    Strip plot of X against y, saved as a PDF under 'Plots/'.

    Axis labels and title come from the `.name` of X and y with
    underscores replaced by spaces; the file name is the title without
    spaces.
    """
    xlab = X.name.replace("_"," ")
    ylab = y.name.replace("_"," ")
    figlab = ylab + " vs " + xlab
    filelab = "Plots/" + figlab.replace(" ","") + ".pdf"
    f, ax = plt.subplots(figsize=(5, 5))
    sns.stripplot(x = X, y = y, jitter = True, size = 5, linewidth = 0.1, ax = ax)
    # Label through the axes object: the `sns.plt` alias is not available
    # in current seaborn releases.
    ax.set_title(figlab)
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    savefig(filelab)
def BB_vs_Sidechain():
    """
    Plot point-mutant all-atom RMSD against binned backbone RMSD.

    Works on the module-level `bb_vs_sidechain_df` (modified in place:
    'BB Group', 'Experimental DDG Group' and 'Predicted DDG Group' columns
    are added) and appends the figure to the module-level `output_pdf`.
    """
    # Make bins for BB RMSDs
    number_of_bins = 5
    bin_size = len(bb_vs_sidechain_df['WT-Mutant Backbone RMSD']) / number_of_bins + 1

    # Assign arbitrary bin identifiers for BB Group
    # (bins follow the row index, not the RMSD value -- presumably the
    # frame is already sorted by backbone RMSD; verify against caller)
    for index, row in bb_vs_sidechain_df.iterrows():
        bb_vs_sidechain_df.loc[index, 'BB Group'] = ((index + 1) // bin_size)

    # Find bin boundaries for BB group and add to dict
    # (first and last backbone RMSD of each group, in row order)
    bin_rename_dict = {}
    for name, group in bb_vs_sidechain_df.groupby('BB Group'):
        bin_rename_dict[name] = '%s -\n%s' % (group['WT-Mutant Backbone RMSD'].iloc[0], group['WT-Mutant Backbone RMSD'].iloc[len(group) - 1])

    # Rename bin identifiers to bin boundary values in BB group
    for index, row in bb_vs_sidechain_df.iterrows():
        bb_vs_sidechain_df.loc[index, 'BB Group'] = bin_rename_dict[bb_vs_sidechain_df.loc[index, 'BB Group']]

    # Assign bin identifiers for DDG Group
    for DDG_type in ['Experimental DDG', 'Predicted DDG']:
        for index, row in bb_vs_sidechain_df.iterrows():
            if row[DDG_type] > 2.5 or row[DDG_type] < -2.5:
                bb_vs_sidechain_df.loc[index, DDG_type + ' Group'] = 'Extra Large DDG (DGG > 2.5 REU or DDG < -2.5 REU)'
            elif row[DDG_type] > 1 or row[DDG_type] < -1:
                bb_vs_sidechain_df.loc[index, DDG_type + ' Group'] = 'Large DDG (2.5 REU > DGG > 1 REU or -2.5 < DDG < -1 REU)'
            elif row[DDG_type] > 0.5 or row[DDG_type] < -0.5:
                bb_vs_sidechain_df.loc[index, DDG_type + ' Group'] = 'Medium DDG (1 REU > DGG > 0.5 REU or -1 < DDG < -0.5 REU)'
            else:
                bb_vs_sidechain_df.loc[index, DDG_type + ' Group'] = 'Small DDG (0.5 REU > DDG > -0.5 REU)'

    sns.set_style('white', {'axes.grid': True, 'axes.edgecolor': '0'})
    sns.set_context('paper', font_scale=1.5, rc={'lines.linewidth': 1})
    fig, ax = plt.subplots(figsize=(20, 10))
    fig.suptitle('WT PDB - Mutant PDB Neighborhood Backbone RMSD vs. \nMutant PDB - RosettaOut Point Mutant Residues All-Atom RMSD', fontsize = 24, y=1.0)

    # boxes per backbone bin
    with sns.cubehelix_palette(number_of_bins, start=0.5, rot=-.75):
        sns.boxplot(x=bb_vs_sidechain_df['BB Group'], y=bb_vs_sidechain_df['Point Mutant RMSD'], ax=ax )

    # points on top, coloured by DDG group.
    # NOTE(review): DDG_type is the variable left over from the loop above,
    # i.e. always 'Predicted DDG' here -- confirm that is intended.
    with sns.color_palette("husl", number_of_bins):
        sns.stripplot(x='BB Group', y='Point Mutant RMSD', hue= DDG_type + ' Group', data=bb_vs_sidechain_df, jitter=True, ax=ax )

    ax.set(xlabel='WT PDB - Mutant PDB Neighborhood Backbone RMSD', ylabel='Mutant PDB - RosettaOut Point Mutant Residues All-Atom RMSD')
    output_pdf.savefig(fig, pad_inches=1, bbox_inches='tight')
def pltvar(data, labels, stem):
    """
    Box plot of the 'deviation' column per x label with the observations
    overlaid, saved as 'variance-<xlabel>-<stem>.png' below the
    module-level `source` path.

    data   -- table to plot; it is now actually used (the body used to
              read an unrelated name `df` instead of this argument)
    labels -- (xlabel, ylabel): xlabel is the x column name, both are used
              in the axis captions
    stem   -- last part of the output file name
    """
    (xlabel, ylabel) = labels
    kwargs = {
        'x': xlabel,
        'y': 'deviation',
        'data': data,
    }
    sns.boxplot(palette="PRGn", whis=np.inf, **kwargs)
    sns.stripplot(jitter=True, size=3, color='.3', linewidth=0, **kwargs)

    ax = plt.gca()
    ax.set_xlabel(xlabel.title() + ' window (minutes)')
    ax.set_ylabel(ylabel.title() + ' window std. dev. (jams/day)')

    fname = '-'.join(['variance', xlabel, stem])
    dest = source.joinpath(fname).with_suffix('.png')
    plt.savefig(str(dest))
    plt.close()
def plot_similardishes(idx,xlim):
    """
    Plot the 20 recipes most similar to recipe `idx`, ranked by flavor
    cosine similarity and coloured by cuisine, each annotated with its
    recipe name.

    Reads the module-level `yum_ingr2` (recipe table) and `yum_cos`
    (similarity matrix). The best match itself (last position of the
    argsort) is left out, presumably because it is the recipe itself.

    idx  -- row of the reference recipe
    xlim -- x axis limits
    """
    nearest = yum_cos[idx].argsort()[-21:-1]
    # explicit copy: columns are added below and must not touch yum_ingr2
    match = yum_ingr2.iloc[nearest][::-1].copy()
    # .values / .loc replace Index.get_values() and .ix, which were removed
    # in pandas 1.0
    newidx = match.index.values
    match['cosine'] = yum_cos[idx][newidx]
    match['rank'] = range(1, 1 + len(newidx))
    recipe_names = match['recipeName'].tolist()

    plt.figure(figsize=(10,10))
    ax = sns.stripplot(y='rank', x='cosine', data=match, jitter=0.05,
                       hue='cuisine', size=15, orient="h")
    ax.set_title(yum_ingr2.loc[idx,'recipeName']+'('+yum_ingr2.loc[idx,'cuisine']+')',fontsize=18)
    ax.set_xlabel('Flavor cosine similarity',fontsize=18)
    ax.set_ylabel('Rank',fontsize=18)
    ax.yaxis.grid(color='white')
    ax.xaxis.grid(color='white')

    # rank r is drawn at categorical position r - 1
    for label, y, x in zip(recipe_names, match['rank'], match['cosine']):
        ax.text(x+0.001, y-1, label, ha = 'left')

    ax.legend(loc = 'lower right',prop={'size':14})
    ax.set_ylim([20,-1])
    ax.set_xlim(xlim)
def log2_oulierfilter(df_by_cell, plot=False):
    """
    Drop columns whose mean log2 expression over the top common genes is
    not above a cutoff and return the log2(x + 1) table of the rest.

    The top common genes come from find_top_common_genes(); the cutoff is
    derived from the average and the standard deviation of their log2
    values.

    df_by_cell -- expression table
    plot       -- show a box/strip plot of the kept columns and the
                  distribution of their means

    Returns (log2 table, its transpose). When no common genes are found a
    message is printed and the unfiltered log2 table is returned.
    """
    log2_df = np.log2(df_by_cell+1)
    top_log2 = find_top_common_genes(log2_df)
    if top_log2.empty:
        print("no common genes found")
        return log2_df, log2_df.transpose()

    # columns ordered by decreasing mean; reindex(columns=...) replaces
    # reindex_axis, which was removed in pandas 1.0
    log_mean = top_log2.mean(axis=0).sort_values(ascending=False)
    log2_sorted = top_log2.reindex(columns=log_mean.index)

    log2_cutoff = np.average(log2_sorted)-np.std(log2_sorted)
    avg_cutoff = np.average(log2_cutoff)

    xticks = []
    keep_col = []
    for col, m in zip(log2_sorted.columns.tolist(), log2_sorted.mean()):
        if m > avg_cutoff:
            keep_col.append(col)
            xticks.append(col+' '+str("%.2f" % m))

    filtered_df_by_cell = df_by_cell[keep_col]

    if plot:
        # only strictly positive values, so log2 is defined
        filtered_log2 = np.log2(filtered_df_by_cell[filtered_df_by_cell>0])
        ax = sns.boxplot(data=filtered_log2, whis= .75, notch=True)
        ax = sns.stripplot(x=filtered_log2.columns.values,
                           y=filtered_log2.mean(axis=0), size=4,
                           jitter=True, edgecolor="gray")
        xtickNames = plt.setp(ax, xticklabels=xticks)
        plt.setp(xtickNames, rotation=90, fontsize=9)
        plt.show()
        plt.clf()
        sns.distplot(filtered_log2.mean())
        plt.show()

    log2_expdf_cell = np.log2(filtered_df_by_cell+1)
    log2_expdf_gene = log2_expdf_cell.transpose()
    return log2_expdf_cell, log2_expdf_gene
def plot_results(
    df,
    reg_weight_col,
    out_dir,
    dataset,
):
    """
    For every metric in DIS_METRICS draw its distribution per value of
    `reg_weight_col` as a violin plot, a box plot and a plot of the group
    means, each kind on its own 3x4 figure, and save the three figures as
    '<dataset>_violin.png', '<dataset>_box.png' and '<dataset>_mean.png'
    in `out_dir`. The per-model mean of each metric is printed as well.
    """
    grid = dict(nrows=3, ncols=4, figsize=(30, 30))
    fig_violin, axes_violin = plt.subplots(**grid)
    fig_box, axes_box = plt.subplots(**grid)
    fig_mean, axes_mean = plt.subplots(**grid)

    def tilt_xticklabels(axis):
        for tick in axis.get_xticklabels():
            tick.set_rotation(45)

    panels = zip(
        DIS_METRICS,
        axes_violin.flatten(),
        axes_box.flatten(),
        axes_mean.flatten(),
    )
    for metric, ax_violin, ax_box, ax_mean in panels:
        metric_df, metric = get_metric_df(df, metric)

        print()
        per_model = metric_df.groupby(MODEL_COL_STR)[metric].mean().reset_index()
        print(per_model.sort_values(metric, ascending=False))

        metric_df = metric_df.sort_values(reg_weight_col)

        sns.violinplot(x=reg_weight_col, y=metric, data=metric_df, cut=0,
                       ax=ax_violin)
        tilt_xticklabels(ax_violin)

        sns.boxplot(x=reg_weight_col, y=metric, data=metric_df, ax=ax_box)
        tilt_xticklabels(ax_box)

        # one point per regularisation weight: the mean of the metric
        means = metric_df.groupby(reg_weight_col)[metric].mean()
        weight_names = ["{:.2E}".format(weight) for weight in means.index.values]
        sns.stripplot(x=weight_names, y=means.values, ax=ax_mean, size=25)
        ax_mean.set_ylabel(metric)
        tilt_xticklabels(ax_mean)

    fig_violin.savefig(out_dir / f'{dataset}_violin.png')
    fig_box.savefig(out_dir / f'{dataset}_box.png')
    fig_mean.savefig(out_dir / f'{dataset}_mean.png')

    for figure in (fig_violin, fig_box, fig_mean):
        plt.close(figure)
sns.palplot(sns.cubehelix_palette(n_colors=8, start=1.7, rot=0.2, dark=0, light=.95, reverse=True)) # *start* is always between 0 and 3. *rot* an abbreviation for rotation is kept between -1 and 1. *reverse* converses the color ordering and *hue* refers to plot appearance. # ## Generic Seaborn Plots: # In[36]: # Loading up built-in dataset: tips = sns.load_dataset("tips") # Creating Strip plot for day-wise revenue: sns.stripplot(x="day", y="total_bill", data=tips, color="g") # This does the job for us but let us try to get better results by plotting each day in different color instead of same color. For this, we shall replace `color` parameter with `palette` parameter: # In[40]: # Set Theme: sns.set_style('whitegrid') # Creating Strip plot for day-wise revenue: sns.swarmplot(x="day", y="total_bill", data=tips, palette="viridis") # In[ ]:
try: if len(item_box.find_elements_by_css_selector(".item-sold-out-badge")) > 0: sold = "SOLD" else: sold = "NOT SOLD" sub_title = item_box.find_element_by_class_name("items-box-body") title = sub_title.find_element_by_tag_name("h3").text item_price = item_box.find_element_by_css_selector(".items-box-price") price_text = item_price.text price_text = re.sub(r",", "", price_text).lstrip("¥ ") price_text_int = int(price_text) print(price_text_int) url = item_box.find_element_by_tag_name("a").get_attribute("href") data = pd.Series( [ sold,title,price_text_int,url ], index=df_main.columns ) grdata = pd.Series( [ sold,price_text_int ], index=df_graf.columns ) df_main = df_main.append( data, ignore_index=True ) df_graf = df_graf.append( grdata, ignore_index=True ) except Exception as e: print(e) else: print('No items anymore...') break print(df_main) sns.stripplot(x='SOLD', y='PRICE', data=df_graf) plt.show() sns.pairplot(df_graf,hue="SOLD") plt.show() print('Writing out to CSV file...') df_main.to_csv("pricedata.csv", encoding="utf_8_sig") print("Done")
plt.plot(np.array(all_struct_voxels), np.array(all_struct_voxels), color = "gray", linestyle = "dashdot", linewidth = 1) # identity line plt.ylabel("Voxels in 80um eroded volume") plt.xlabel("Voxels in original volume") plt.xlim([0,250000]);plt.ylim([0, 150000]) plt.savefig(os.path.join(fig_dst, "voxels_scatter_org_vs_eroded_250000_voxels.pdf"), bbox_inches = "tight") #%% missing_struct_voxels_sort = np.sort(np.array(missing_struct_voxels)) missing_struct_names_sort = np.array(missing_struct_names)[np.argsort(np.array(missing_struct_voxels))] df = pd.DataFrame() df["num_voxels"] = missing_struct_voxels+all_struct_voxels df["type"] = ["eroded"]*len(missing_struct_voxels) + ["original"]*len(all_struct_voxels) sns.stripplot(x = "num_voxels", y = "type", data = df, color = "crimson", orient = "h") sns.boxplot(x = "num_voxels", y = "type", data = df, orient = "h", showfliers=False, showcaps=False, boxprops={'facecolor':'None'}) plt.xlim([0, 200000]) plt.xlabel("Total number of voxels in structure") plt.ylabel("Structures 'zero'ed' out vs. all original structures") plt.savefig(os.path.join(fig_dst, "boxplot_total_voxels_org_vs_eroded.pdf"), bbox_inches = "tight") #%% #export missing structures name, id, and total voxel count dataf = pd.DataFrame() dataf["name"] = missing_struct_names dataf["id"] = missing_struct_ids dataf["parent_name"] = missing_struct_parents dataf["voxels_in_structure"] = missing_struct_voxels
return [] return insert_packet(spreading_factor - 1) + [spreading_factor] + insert_packet(spreading_factor - 1) sf_as_category = pd.Categorical(insert_packet(12), categories=[7, 8, 9, 10, 11, 12], ordered=True) pyramid = pd.DataFrame({'SF': sf_as_category}) pyramid['seq_num'] = pyramid.index cmap = sns.color_palette('Blues_d', 6) fig, ax = plt.subplots(figsize=(4, 3)) plot = sns.scatterplot(x=pyramid.index, y='SF', data=pyramid, hue='SF', legend=False, palette=cmap, ax=ax) plot.set_title('Spreading factor sequence') plot.set_ylabel('spreading factor') plot.set_xlabel('sequence number') fig.savefig("sf-sequence.svg") lora_mons_static = pd.read_pickle('data/lora_mons_static_clean.pkl.gz') channel = lora_mons_static.query('gtw_id == "eui-0000024b08030186"')[['received', 'dev_id', 'rssi', 'snr', 'data_rate']].set_index('received').sort_index() channel.index = channel.index.tz_convert('Europe/Brussels') channel['spreading_factor'] = channel['data_rate'].str.extract('SF([0-9]+)BW').astype(dtype=np.int64) ax = sns.stripplot(x='spreading_factor', y='rssi', data=channel, alpha=0.3) ax.set(ylabel='RSSI (dBm)', xlabel='Spreading Factor', title='Distribution of received packets RSSI'); ax.figure.savefig('rssi_sf.png') ax = sns.stripplot(x='spreading_factor', y='snr', data=channel, alpha=0.3) ax.set(ylabel='SNR (dB)', xlabel='Spreading Factor', title='Distribution of received packets SNR'); ax.figure.savefig('snr_sf.png') ax = sns.scatterplot(x='snr', y='rssi', data=channel, alpha=0.3) ax.set(xlabel='SNR (dB)', ylabel='RSSI (dBm)'); ax.figure.savefig('rssi_snr.png')
fig.suptitle(f"{name} ({n_verts})", fontsize=40, y=1.04) plt.tight_layout() stashfig(f"{g}-gridplot-sf-sorted") print() #%% shuffle_df = pd.DataFrame(shuffled_triu_outs) true_df = pd.DataFrame(true_triu_outs) fig, ax = plt.subplots(1, 1, figsize=(10, 6)) ax = sns.stripplot( data=shuffle_df, x="Graph", y="Proportion", linewidth=1, alpha=0.4, jitter=0.3, size=5, ax=ax, ) # ax = sns.violinplot(data=shuffle_df, x="Graph", y="Proportion", ax=ax) ax = sns.stripplot( data=true_df, x="Graph", y="Proportion", marker="_", linewidth=2, s=90, ax=ax, label="True",
pickle.dump(model,f) training['prob']=model.predict_proba(training[features])[:,1] testing['prob']=model.predict_proba(testing[features])[:,1] oot_data['prob']=model.predict_proba(oot_data[features])[:,1] build_cut,bins = model_method.ks_lift_chart(training['y'],training['prob'],'train') test_cut=model_method.ks_lift_chart(testing['y'],testing['prob'],'testing',bins=bins) oot_cut = model_method.ks_lift_chart(oot_data['y'],oot_data['prob'],'oot',bins=bins) month_ks=model_method.month_ks(testing,'app_date') feature_results = var_cut.get_feature_result(training[features+['y']],'y') importance_df = model_method.get_xgboost_importances(model,return_df=True) data_var = model_method.var_avg_plot([training,testing,oot_data],importance_df.index.tolist()[:10],q=10) model_method.var_lift_plot(training['y'],training['xx2337'],'xx2337') model_method.var_cut_plot([training,testing],importance_df.index.tolist()[:10],q=10) var_psi_ie = model_method.var_psi_chart(training,testing,importance_df.index.tolist()[:10],'app_date') #model_method.get_plot_tree(model) #PSI(testing[testing['y']==1]['prob'],training[training['y']==1]['prob']) #for i in importance_df.index.tolist()[:10]: # print(PSI(testing[i],training[i])) import seaborn as sns sns.set_style('whitegrid') sns.stripplot(x='app_date',y='xx2392',hue='y',data=training,jitter = True,dodge=True)
def taxa_abundance_box_plot(
    taxa, metadata=None, hue=None, hue_order=None,
    add_datapoints=False, level=1, by=None, ax=None,
    figsize=None, count=0, exclude_samples=None,
    include_samples=None, exclude_taxa=None, sort_by_names=False,
    sample_names=None, csv_file=None, size=5, pseudocount=False,
    taxa_names=None, brief_xlabels=False, show_means=False,
    meanprops=None, show_others=True, sort_by_mean=True,
    jitter=1, alpha=None, artist_kwargs=None
):
    """Draw box plots of relative taxa abundance (in percent).

    +----------------+-----------------------------------------------------+
    | q2-taxa plugin | Example                                             |
    +================+=====================================================+
    | QIIME 2 CLI    | qiime taxa barplot [OPTIONS]                        |
    +----------------+-----------------------------------------------------+
    | QIIME 2 API    | from qiime2.plugins.taxa.visualizers import barplot |
    +----------------+-----------------------------------------------------+

    Parameters
    ----------
    taxa : str or qiime2.Visualization
        Visualization file or object from the q2-taxa plugin.
    metadata : str or qiime2.Metadata, optional
        Metadata file or object. When given, it replaces the metadata
        columns stored in the visualization.
    hue : str, optional
        Metadata column used to split each taxon into colored boxes.
    hue_order : list, optional
        Order of the categorical levels of ``hue``.
    add_datapoints : bool, default: False
        Overlay the individual samples as a strip plot.
    level : int, default: 1
        Taxonomic level at which the features are collapsed.
    by : list, optional
        Column name(s) used to sort the samples. 'sample-id' sorts by
        sample name. Multiple items are applied in the given order.
    ax : matplotlib.axes.Axes, optional
        Axes to draw onto; a new figure is created when omitted.
    figsize : tuple, optional
        Width, height in inches. Format: (float, float). Only used when
        a new figure is created.
    count : int, default: 0
        Number of taxa to display. When 0, display all.
    exclude_samples : dict, optional
        Filtering logic used for sample exclusion.
        Format: {'col': ['item', ...], ...}.
    include_samples : dict, optional
        Filtering logic used for sample inclusion.
        Format: {'col': ['item', ...], ...}.
    exclude_taxa : list, optional
        Taxa names to exclude; any column containing one of them is
        dropped. Case insensitive.
    sort_by_names : bool, default: False
        If true, sort the displayed columns (i.e. species) by name.
    sample_names : list, optional
        Sample IDs to keep.
    csv_file : str, optional
        Path of the .csv file the plotted dataframe is written to.
    size : float, default: 5.0
        Radius of the markers, in points.
    pseudocount : bool, default: False
        Add a pseudocount of 1 to remove zeros.
    taxa_names : list, optional
        Taxa names to display.
    brief_xlabels : bool, default: False
        If true, only show the smallest taxa rank in the x-axis labels.
    show_means : bool, default: False
        Add mean markers to the boxes.
    meanprops : dict, optional
        The meanprops argument as in matplotlib.pyplot.boxplot.
    show_others : bool, default: True
        Include the 'Others' category.
    sort_by_mean : bool, default: True
        Sort taxa by their mean relative abundance after sample filtering.
    jitter : float, default: 1
        Amount of jitter (categorical axis only) for the data points.
    alpha : float, optional
        Proportional opacity of the data points.
    artist_kwargs : dict, optional
        Keyword arguments passed down to the _artist() method.

    Returns
    -------
    matplotlib.axes.Axes
        Axes object with the plot drawn onto it.

    See Also
    --------
    taxa_abundance_bar_plot
    addpairs

    Examples
    --------
    Below is a simple example showing taxonomic abundance at the phylum
    level (i.e. ``level=2``).

    >>> qzv_file = '/Users/sbslee/Desktop/dokdo/data/moving-pictures-tutorial/taxa-bar-plots.qzv'
    >>> dokdo.taxa_abundance_box_plot(qzv_file, level=2, figsize=(8, 7))
    >>> plt.tight_layout()

    .. image:: images/taxa_abundance_box_plot-1.png

    We can control how many taxa to display with ``count``. Also, we can
    make the x-axis tick labels pretty with ``brief_xlabels``. We can
    manually set the x-axis tick labels with ``xticklabels``. Lastly, we
    can select specific taxa to display with ``taxa_names``.

    >>> fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(10, 10))
    >>> kwargs = {'level' : 2}
    >>> artist_kwargs1 = dict(title='count=4')
    >>> artist_kwargs2 = dict(title='brief_xlabels=True')
    >>> artist_kwargs3 = dict(xticklabels=['A', 'B', 'C', 'D'], title="xticklabels=['A', 'B', 'C', 'D']")
    >>> artist_kwargs4 = dict(title="taxa_names=[...]")
    >>> dokdo.taxa_abundance_box_plot(qzv_file, ax=ax1, count=4, artist_kwargs=artist_kwargs1, **kwargs)
    >>> dokdo.taxa_abundance_box_plot(qzv_file, ax=ax2, count=4, brief_xlabels=True, artist_kwargs=artist_kwargs2, **kwargs)
    >>> dokdo.taxa_abundance_box_plot(qzv_file, ax=ax3, count=4, artist_kwargs=artist_kwargs3, **kwargs)
    >>> dokdo.taxa_abundance_box_plot(qzv_file, ax=ax4, taxa_names=['k__Bacteria;p__Firmicutes', 'k__Bacteria;p__Proteobacteria'], artist_kwargs=artist_kwargs4, **kwargs)
    >>> plt.tight_layout()

    .. image:: images/taxa_abundance_box_plot-2.png

    We can group the boxes by a metadata column with ``hue``. For this
    plot, we will draw the y-axis in log scale with ``ylog``. To do this,
    we actually need to adjust the y-axis limits with ``ymin`` and
    ``ymax``, and also add a pseudocount of 1 to remove 0s with
    ``pseudocount`` (because 0s cannot be shown in log scale). We will
    also add data points with ``add_datapoints=True``.

    >>> artist_kwargs = dict(ylog=True, ymin=0.05, ymax=200, show_legend=True)
    >>> dokdo.taxa_abundance_box_plot(qzv_file,
    ...                               level=2,
    ...                               figsize=(10, 7),
    ...                               hue='body-site',
    ...                               size=3,
    ...                               count=4,
    ...                               pseudocount=True,
    ...                               add_datapoints=True,
    ...                               artist_kwargs=artist_kwargs)
    >>> plt.tight_layout()

    .. image:: images/taxa_abundance_box_plot-3.png
    """
    # Extract the visualization and load the table for the requested level.
    with tempfile.TemporaryDirectory() as workdir:
        _parse_input(taxa, workdir)
        table = pd.read_csv(f'{workdir}/level-{level}.csv', index_col=0)

    # Swap the embedded metadata for the user-supplied one, if any.
    if metadata is not None:
        new_mf = dokdo.get_mf(metadata)
        table = table.drop(columns=_get_mf_cols(table))
        table = pd.concat([table, new_mf], axis=1, join='inner')

    table["sample-id"] = table.index

    # Optional sample ordering for the x-axis.
    if by:
        table = table.sort_values(by=by)

    # Optional taxa exclusion: case-insensitive substring match on columns.
    if isinstance(exclude_taxa, list):
        unwanted = {
            column
            for column in table.columns
            if any(tax.lower() in column.lower() for tax in exclude_taxa)
        }
        table = table.drop(columns=list(unwanted))

    # Separate the metadata columns from the abundance columns.
    mf_columns = _get_mf_cols(table)
    mf = table[mf_columns]
    abundance = table.drop(columns=mf_columns)

    abundance, mf = _filter_samples(abundance, mf,
                                    exclude_samples, include_samples)

    # Optional explicit sample selection.
    if isinstance(sample_names, list):
        abundance = abundance.loc[sample_names]
        mf = mf.loc[sample_names]

    if sort_by_mean:
        abundance = _sort_by_mean(abundance)

    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    if pseudocount:
        abundance = abundance + 1

    # Counts -> per-sample proportions.
    abundance = abundance.div(abundance.sum(axis=1), axis=0)

    abundance = _get_others_col(abundance, count, taxa_names, show_others)

    if sort_by_names:
        abundance = abundance.reindex(sorted(abundance.columns), axis=1)

    # Proportions -> percentages.
    abundance = abundance * 100

    # Long format expected by seaborn ('variable' / 'value' columns).
    if hue is None:
        long_df = pd.melt(abundance)
    else:
        with_hue = pd.concat([abundance, mf[hue]], axis=1, join='inner')
        long_df = pd.melt(with_hue, id_vars=[hue])

    default_meanprops = {'marker':'x',
                         'markerfacecolor':'white',
                         'markeredgecolor':'white',
                         'markersize':'10'}
    mean_options = {}
    if show_means:
        mean_options['showmeans'] = True
        mean_options['meanprops'] = meanprops if meanprops else default_meanprops

    sns.boxplot(x='variable', y='value', hue=hue, hue_order=hue_order,
                data=long_df, ax=ax, **mean_options)

    # The strip plot adds a second set of legend entries, which _artist()
    # is asked to de-duplicate.
    remove_duplicates = bool(add_datapoints)
    if add_datapoints:
        # Alternative method: sns.swarmplot()
        sns.stripplot(x='variable', y='value', hue=hue, hue_order=hue_order,
                      data=long_df, ax=ax, color='black', size=size,
                      dodge=True, jitter=jitter, alpha=alpha)

    # Optional .csv export of the plotted percentages plus metadata.
    if csv_file is not None:
        pd.concat([abundance, mf], axis=1, join='inner').to_csv(csv_file)

    xticklabels = None
    if brief_xlabels:
        xticklabels = [dokdo.pname(label.get_text())
                       for label in ax.get_xticklabels()]

    # Defaults first, so that user-provided artist_kwargs win.
    user_kwargs = {} if artist_kwargs is None else artist_kwargs
    artist_kwargs = {'xrot': 45,
                     'xha': 'right',
                     'xlabel': '',
                     'ylabel': 'Relative abundance (%)',
                     'xticklabels': xticklabels,
                     'remove_duplicates': remove_duplicates,
                     **user_kwargs}

    if hue is not None:
        artist_kwargs['legend_title'] = hue

    return _artist(ax, **artist_kwargs)
# Box plot of review length per star rating.
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (20, 10)
plt.style.use('fivethirtyeight')
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.boxplot(x='star_rating', y='length', data=df_hair_dryer, palette='Blues')
plt.title("Relations between Review Length and Star Rating", fontsize=50)
plt.show()

# Stripplot
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (20, 10)
plt.style.use('fivethirtyeight')
sns.stripplot(x='star_rating', y='length', data=df_hair_dryer, palette='Reds')
# Labels and tick sizes are applied after plotting; seaborn sets its own
# axis labels and ticks while drawing and would overwrite earlier ones.
plt.xlabel('star_rating', fontsize=50)
plt.ylabel('review_length', fontsize=50)
plt.xticks(fontsize=40)
plt.yticks(fontsize=40)
plt.title("Relations between Review Length and Star Rating", fontsize=60)
plt.show()

''' ---------------------------------2---------------------------------- '''
''' ------------------------------Part a-------------------------------- '''

# Cleaning the reviews
import re
import nltk
nltk.download('stopwords')
# Empirical p-value for every (link, run) pair: one minus the relative
# position of the first surrogate value that exceeds the measured TE.
# NOTE(review): taking the first exceeding index only yields a valid
# p-value if `surrogates` is sorted ascending along its last axis -- confirm
# against the code that fills `surrogates`.
for l in range(len(links)):
    for r in range(num_runs):
        if 1 in (surrogates[l, r, :] > TE[l, r]):
            p_vals[l, r] = 1-(np.argmax(surrogates[l, r, :] > TE[l, r])/num_surrogates)
        else:
            # No surrogate exceeds the measured TE.
            p_vals[l, r] = 0
print(p_vals)
# Drop the second run (column index 1) from every link before plotting.
p_vals = np.delete(p_vals, obj = 1, axis = 1)
print(p_vals)
fig, axs = plt.subplots(figsize = (6, 6))
#sns.boxplot(data = np.transpose(p_vals[:, :]), palette = "Set3", linewidth = 2, width = 0.5, fliersize = 4)
# Transposed so that each link becomes one box / one strip of points.
sns.boxplot(data = np.transpose(p_vals[:, :]), palette = "colorblind", linewidth = 4, width = 0.5, fliersize = 0)
sns.stripplot(data = np.transpose(p_vals[:, :]), palette = "colorblind", linewidth = 3, size = 10)
# Dashed reference line at p = 0.05 across all six categories.
plt.hlines(0.05, -0.5, 5.5, color = "black", linewidth = 2, linestyle='--')
plt.xticks([0, 1, 2, 3, 4, 5], LINKS)
#plt.xlabel("connection")
plt.ylabel("p value")
plt.ylim([-0.1, 1.19])
#for i in [0, 1, 2, 3, 5]:
# Green check mark drawn above each of the six categories.
for i in range(6):
    plt.scatter(i, 1.1, s=1000, c='green', marker='$✓$')
#for i in [4]:
#    plt.scatter(i, 1.1, s=1000, c='red', marker='$×$')
plt.tight_layout()
def df_function(collection_df, attribute, ax):
    """Draw a jittered, dodged strip plot of SCORE per ``attribute`` value.

    Points are colored by the enclosing scope's ``split`` column and the
    categories on the x-axis are shown in sorted order. Draws onto ``ax``.
    """
    category_order = sorted(collection_df[attribute].unique())
    point_style = {'jitter': 1, 'dodge': True, 'alpha': 0.5}
    sns.stripplot(
        data=collection_df,
        x=attribute,
        y=SCORE,
        hue=split,
        order=category_order,
        ax=ax,
        **point_style,
    )
def df_function(collection_df, ax):
    """Draw a jittered, dodged strip plot of SCORE per ``attribute`` value.

    ``attribute`` and ``split`` come from the enclosing scope. When
    ``split`` is set, its levels are shown in sorted order as the hue.
    Draws onto ``ax``.
    """
    plot_kwargs = {
        'x': attribute,
        'y': SCORE,
        'hue': split,
        'data': collection_df,
        'order': sorted(collection_df[attribute].unique()),
        'jitter': 1,
        'dodge': True,
        'alpha': 0.5,
        'ax': ax,
    }
    # Only pin the hue order when there actually is a hue column.
    if split:
        plot_kwargs['hue_order'] = sorted(collection_df[split].unique())
    sns.stripplot(**plot_kwargs)
ax = axs[row, 0] # sns.violinplot( # data=neuron_df[neuron_df["neuron_type"].isin(row_neuron_types)], # x="neuron_type", # y=f"component_score_{i}", # hue="neuron_type", # palette=neuron_type_palette, # ax=ax, # inner=None, # ) sns.stripplot( data=neuron_df[neuron_df["neuron_type"].isin(row_neuron_types)], x="neuron_type", y=f"component_score_{i}", hue="neuron_type", hue_order=row_neuron_types, # ensures sorting stays the same order=row_neuron_types, # ensures sorting stays the same palette=neuron_type_palette, ax=ax, s=2, ) ax.get_legend().remove() ax.set(xlim=(-1, n_per_row), ylim=(y_min, y_max), xlabel="", ylabel="", yticks=[]) ax.axhline(0, color="black", linestyle=":", linewidth=1) ax.tick_params(length=0) plt.setp(ax.get_xticklabels(), rotation=45) for tick in ax.get_xticklabels():
def visulaization(cv_df):
    """Show per-model accuracy as box plots with the single scores on top.

    Expects ``cv_df`` to provide the columns 'model_name' and 'accuracy'.
    Opens a 30x30 inch figure and displays it.
    """
    fig, ax = plt.subplots(figsize=(30,30))
    axes_mapping = dict(x='model_name', y='accuracy', data=cv_df)
    # Distribution summary per model.
    sns.boxplot(**axes_mapping)
    # Individual scores, jittered so overlapping values stay visible.
    sns.stripplot(size=8, jitter=True, edgecolor="gray", linewidth=2,
                  **axes_mapping)
    plt.show()
# Drop the first structure: column 0 of `density`, row 0 of
# `counts_per_struct`.
density = density[:, 1:]
counts_per_struct = counts_per_struct[1:, :]

#%%
#boxplots for counts
import seaborn as sns

# Rank structures by median count, largest first (argsort is ascending,
# [::-1] reverses it), and keep the top 10 for display.
order = np.argsort(np.median(counts_per_struct.T, axis=0))[::-1]
sois_sort = np.array(nuclei)[order][:10]

#boxplots of percent counts
plt.figure(figsize=(5, 4))
df = pd.DataFrame(pcounts)
df.columns = nuclei
# Individual values as grey points ...
g = sns.stripplot(data=df, color="dimgrey", orient="h", order=sois_sort)
# ... with a transparent box (no fliers, no caps) on the same axes.
sns.boxplot(data=df, orient="h", showfliers=False, showcaps=False,
            boxprops={'facecolor': 'None'}, order=sois_sort)
# NOTE(review): the plotted frame is built from `pcounts` ("percent
# counts" above) while the axis label says "# Neurons" -- confirm the label.
plt.xlabel("# Neurons")
plt.ylabel("Subnucleus")
plt.savefig(os.path.join(fig_dst, "thal_counts_boxplots.pdf"), bbox_inches="tight")

#%%
#boxplots of density
#first, rearrange structures in ASCENDING order (will be plotted as descending, -_-) by density and counts
#explore the number of distinct values of the categorical variable Genre
len(mov.Genre.unique())

#filter the dataframe by genre
genres_of_interest = ['action', 'adventure', 'animation', 'comedy', 'drama']
mov2 = mov[mov.Genre.isin(genres_of_interest)]

#filter the mov2 dataframe by studio
studios_of_interest = ['Buena Vista Studios', 'Fox', 'Paramount Pictures',
                       'Sony', 'Universal', 'WB']
mov3 = mov2[mov2.Studio.isin(studios_of_interest)]

#check how the filters worked
print(mov3.Genre.unique())
print(mov3.Studio.unique())
print(len(mov3))

#define the style
sns.set(style="darkgrid", palette="muted", color_codes=True)

#plot the boxplots; the box transparency is set through boxprops because
#the box patches are not reliably exposed through ax.artists
ax = sns.boxplot(data=mov3, x='Genre', y='Gross % US', orient='v',
                 color='lightgray', showfliers=False,
                 boxprops={'alpha': 0.5})

#add in points to show each observation
sns.stripplot(x='Genre', y='Gross % US', data=mov3, jitter=True, size=6,
              linewidth=0, hue='Studio', alpha=0.7)

ax.axes.set_title('Domestic Gross % by Genre', fontsize=30)
ax.set_xlabel('Genre', fontsize=20)
ax.set_ylabel('Gross % US', fontsize=20)

#define where to place the legend
ax.legend(bbox_to_anchor=(1.05, 1), loc=2)
}) ObjectiveC = pd.DataFrame({ 'Linguagem de Programação': np.repeat('Objective-C', 40), 'Quantidade de Palavras': (322, 443, 446, 462, 710, 219, 446, 463, 461, 461, 764, 1059, 37, 446, 446, 37, 446, 37, 39, 866, 462, 446, 37, 446, 666, 462, 461, 446, 461, 39, 462, 443, 37, 443, 8, 446, 446, 461, 324, 461) }) df = MATLAB.append(Julia).append(Clojure).append(Perl).append(ObjectiveC) # boxplot ax = sns.boxplot(x='Linguagem de Programação', y='Quantidade de Palavras', data=df) # add stripplot ax = sns.stripplot(x='Linguagem de Programação', y='Quantidade de Palavras', data=df, color="orange", jitter=0.2, size=2.5) # add title plt.title( "Boxplot da contagem de palavras das 5 linguagens de programação com menos códigos de conduta", loc="left") # show the graph plt.show()
from pydataset import data
import seaborn as sns

# Load the 'mtcars' example dataset; the bare expressions below only
# display values when run interactively.
df = data('mtcars')
df

#%%quantiles
# Eleven evenly spaced probabilities from 0 to 1 (the deciles).
intervals = np.linspace(0, 1, 11)
intervals
df.mpg.sort_values()
# Value at position 16 of the sorted mpg values.
np.sort(df.mpg)[16]
df.quantile(q=0.5, axis=0) #columns
df.quantile(q=intervals, axis=0) #columns
df.boxplot()
df.boxplot(column=['mpg'])
ax = sns.stripplot(x="gear", y="mpg", data=df)

#quantiles
# Third and first quartile of 'hp'; their difference is the IQR.
q3, q1 = np.percentile(df['hp'], [75, 25])
q3, q1
q3 - q1
from scipy import stats
# Same quantity computed by scipy.
IQR = stats.iqr(df['hp'])
IQR

#define function to calculate interquartile range
def find_iqr(x):
    """Return the interquartile range (75th minus 25th percentile) of x."""
    return np.subtract(*np.percentile(x, [75, 25]))
412, 413, 414, 415, 417, 418, 419, 421, 422, 423, 425, 426, 426, 427, 427, 429, 430, 431, 432, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 444, 445, 446, 447, 448, 449, 450, 453, 454, 455, 457, 458, 459, 460, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 540 ] df = pd.DataFrame(pd_data) df["targeted"] = [int(y) in our_loci for y in [x[12:-12] for x in df["locus"]]] # df["evalue"].plot.kde() # plt.xlim(0.1,0) # plt.savefig("./tblastx_evals.png") # plt.clf() #evalue distribution sns.stripplot(data=df, x="targeted", y="evalue", alpha=0.5) plt.yscale('log', nonpositive='clip') plt.tight_layout() plt.savefig("./tblastx_evals.png") plt.clf() #bitscore distribution sns.stripplot(data=df, x="targeted", y="bitscore", alpha=0.5) plt.tight_layout() plt.savefig("./tblastx_bitscores.png") plt.clf()
X, y, cv=RepeatedKFold(n_splits=5, n_repeats=5), return_estimator=True, n_jobs=-1, ) coefs = pd.DataFrame( [ est.named_steps["transformedtargetregressor"].regressor_.coef_ * X_train_preprocessed.std(axis=0) for est in cv_model["estimator"] ], columns=feature_names, ) plt.figure(figsize=(9, 7)) sns.stripplot(data=coefs, orient="h", color="k", alpha=0.5) sns.boxplot(data=coefs, orient="h", color="cyan", saturation=0.5) plt.axvline(x=0, color=".5") plt.xlabel("Coefficient importance") plt.title("Coefficient importance and its variability") plt.subplots_adjust(left=0.3) # %% # The problem of correlated variables # ----------------------------------- # # The AGE and EXPERIENCE coefficients are affected by strong variability which # might be due to the collinearity between the 2 features: as AGE and # EXPERIENCE vary together in the data, their effect is difficult to tease # apart. #
# Quick look at the data: first rows, summary statistics, class balance.
Cars.head(10)
Cars.describe()
Cars.choice.value_counts()

# One shared list keeps the box plots and the strip plots in step
# (the strip plots used to reference 'time.cars' and skipped 'cost.bus').
choice_feature_columns = [
    "cost.car",
    "cost.carpool",
    "cost.bus",
    "cost.rail",
    "time.car",
    "time.bus",
    "time.rail",
]

# Boxplot of independent variable distribution for each category of choice
for feature_column in choice_feature_columns:
    sns.boxplot(x="choice", y=feature_column, data=Cars)

# Scatter plot for each categorical choice of car
for feature_column in choice_feature_columns:
    sns.stripplot(x="choice", y=feature_column, jitter=True, data=Cars)

# Scatter plot between each possible pair of independent variable and also histogram for each independent variable
sns.pairplot(
    Cars, hue="choice"
)  # With showing the category of each car choice in the scatter plot
sns.pairplot(Cars)  # Normal

# Correlation values between each independent features.
# Restricted to numeric/boolean columns: recent pandas raises on
# non-numeric columns such as a string-valued 'choice'.
Cars.select_dtypes(include=["number", "bool"]).corr()
def _plot_profile_panel(ax, profileValueDictionary, title, ylabel):
    """Draw one genotype-vs-value strip plot, colored by experiment, on ax."""
    y = profileValueDictionary["value"]
    x = profileValueDictionary["genotype"]
    group = profileValueDictionary["exp"]

    print("y: ", y)
    print("x: ", x)
    print("group: ", group)
    print("Nb of experiments: ", len(set(group)))

    ax.set_xlim(-0.5, 1.5)
    # Pad the y-range by 20% of the maximum; skipped when there is no data,
    # where min()/max() would raise.
    if len(y) > 0:
        ax.set_ylim(min(y) - 0.2 * max(y), max(y) + 0.2 * max(y))
    # x/y by keyword: seaborn >= 0.12 rejects positional data vectors.
    sns.stripplot(x=x, y=y, jitter=True, hue=group, s=5, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.legend().set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)


def plotProfileDataDuration(profileData, night, valueCat):
    """Plot one strip-plot panel per behavioural event for a given night.

    For every event in ``behaviouralEventOneMouse`` except the last two,
    the values of ``event + valueCat`` are plotted per genotype. A final
    panel shows the total distance travelled. Panels fill a 5x6 grid row
    by row and the figure is saved as
    ``FigProfile{valueCat}_Events_night_{night}.pdf``.

    Parameters
    ----------
    profileData
        Profile data forwarded unchanged to ``getProfileValues``.
    night
        Night identifier, forwarded to ``getProfileValues`` and used in
        the title and file name.
    valueCat : str
        Suffix appended to each event name to select the value category.
    """
    fig, axes = plt.subplots(nrows=5, ncols=6, figsize=(14, 12))
    fig.suptitle(t="{} of events (night {})".format(valueCat, night),
                 y=1.2, fontweight='bold')

    # plot the data for each behavioural event
    events = behaviouralEventOneMouse[:-2]
    for panel_index, behavEvent in enumerate(events):
        event = behavEvent + valueCat
        print("event: ", event)
        values = getProfileValues(profileData=profileData, night=night,
                                  event=event)
        # axes.flat walks the grid row by row.
        _plot_profile_panel(axes.flat[panel_index], values, behavEvent,
                            "{} (frames)".format(valueCat))

    # plot the data for the total distance traveled, in the next free panel
    values = getProfileValues(profileData=profileData, night=night,
                              event="totalDistance")
    _plot_profile_panel(axes.flat[len(events)], values, "Activity",
                        "total distance (m)")

    fig.tight_layout()
    fig.savefig("FigProfile{}_Events_night_{}.pdf".format(valueCat, night),
                dpi=100)
    plt.close(fig)
Y


# In[61]:


import seaborn as sns
import matplotlib.pyplot as plt

# One strip plot per feature column against the target, on a 3x3 grid.
grid_rows, grid_cols = 3, 3
plt.figure(figsize=(20,15), facecolor='white')
plotnumber = 1

for column in X:
    # Stop at the grid capacity (the old guard compared against the number
    # of rows in X, so a 10th column would have raised in plt.subplot).
    if plotnumber > grid_rows * grid_cols:
        break
    ax = plt.subplot(grid_rows, grid_cols, plotnumber)
    # x/y by keyword: seaborn >= 0.12 rejects positional data vectors.
    sns.stripplot(x=Y, y=X[column])
    plotnumber+=1
plt.show()


# In[45]:


from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, Y,test_size=.30,random_state=355)


# In[46]:


from sklearn.tree import DecisionTreeClassifier
#!/usr/bin/env python # coding: utf-8 # In[1]: import seaborn as sns sns.set_theme(style="whitegrid") tips = sns.load_dataset("tips") ax = sns.stripplot(x=tips["total_bill"]) # In[70]: sns.__version__ # In[16]: import pandas as pd # In[46]: tips[tips.day == "Sun"] # In[57]: rec = tips.iloc[[77, 90, 19, 1]] # In[58]: # In[59]: type(tips)
def rank_genes_groups_violin(adata, groups=None, n_genes=20,
                             use_raw=None,
                             split=True,
                             scale='width',
                             strip=True, jitter=True, size=1,
                             computed_distribution=False,
                             ax=None, show=None, save=None):
    """Plot ranking of genes for all tested comparisons.

    One violin plot is produced per group, comparing the group against
    the reference stored by :func:`~scanpy.api.tl.rank_genes_groups`.

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        Annotated data matrix.
    groups : list of `str`, optional (default: `None`)
        List of group names.
    n_genes : `int`, optional (default: 20)
        Number of genes to show.
    use_raw : `bool`, optional (default: `None`)
        Use `raw` attribute of `adata` if present. Defaults to the value that
        was used in :func:`~scanpy.api.tl.rank_genes_groups`.
    split : `bool`, optional (default: `True`)
        Whether to split the violins or not.
    scale : `str` (default: 'width')
        See `seaborn.violinplot`.
    strip : `bool` (default: `True`)
        Show a strip plot on top of the violin plot.
    jitter : `int`, `float`, `bool`, optional (default: `True`)
        If set to 0, no points are drawn. See `seaborn.stripplot`.
    size : `int`, optional (default: 1)
        Size of the jitter points.
    computed_distribution : `bool`, optional (default: `False`)
        Set to `True` if you want to use the scaled and shifted distribution
        previously computed with the `compute_distribution` in
        :func:`scanpy.api.tl.rank_genes_groups`
    show : `bool`, optional (default: `None`)
        Show the plot, do not return axis.
    save : `bool` or `str`, optional (default: `None`)
        If `True` or a `str`, save the figure. A string is appended to the
        default filename. Infer the filetype if ending on
        {'.pdf', '.png', '.svg'}.
    ax : `matplotlib.Axes`, optional (default: `None`)
        A `matplotlib.Axes` object.

    Raises
    ------
    ValueError
        If `computed_distribution` is set but the precomputed distribution
        of a gene is not found in `adata.obs_keys()`.
    """
    import seaborn as sns
    from ..tools import rank_genes_groups

    params = adata.uns['rank_genes_groups']['params']
    groups_key = str(params['groupby'])
    if use_raw is None:
        use_raw = bool(params['use_raw'])
    reference = str(params['reference'])
    groups_names = (adata.uns['rank_genes_groups']['names'].dtype.names
                    if groups is None else groups)
    if isinstance(groups_names, str):
        groups_names = [groups_names]

    for group_name in groups_names:
        gene_names = adata.uns['rank_genes_groups']['names'][
            group_name][:n_genes]
        if computed_distribution:
            # The check used to reference the undefined name
            # `compute_distribution`, which raised NameError here.
            obs_keys = set(adata.obs_keys())
            keys = []
            for gene_counter, gene_name in enumerate(gene_names):
                identifier = rank_genes_groups._build_identifier(
                    groups_key, group_name, gene_counter, gene_name)
                if identifier not in obs_keys:
                    raise ValueError(
                        'You need to set `compute_distribution=True` in '
                        '`sc.tl.rank_genes_groups()`.')
                keys.append(identifier)
            # NOTE(review): these identifiers are checked against obs keys
            # but are used below to index variables (`adata[:, key]`) --
            # confirm this lookup is intended.
        else:
            keys = gene_names

        # One column of expression values per gene / identifier.
        df = pd.DataFrame()
        for key in keys:
            if adata.raw is not None and use_raw:
                X_col = adata.raw[:, key].X
            else:
                X_col = adata[:, key].X
            if issparse(X_col):
                X_col = X_col.toarray().flatten()
            df[key] = X_col

        # Group label per observation, collapsed to group vs. reference.
        # `.loc` avoids chained assignment, which may silently not write.
        df['hue'] = adata.obs[groups_key].astype(str).values
        if reference == 'rest':
            df.loc[df['hue'] != group_name, 'hue'] = 'rest'
        else:
            df.loc[~df['hue'].isin([group_name, reference]), 'hue'] = np.nan
        df['hue'] = df['hue'].astype('category')

        df_tidy = pd.melt(df, id_vars='hue', value_vars=keys)
        x = 'variable'
        y = 'value'
        hue_order = [group_name, reference]

        # Keep the caller's `ax` untouched so every group honours it
        # (reassigning `ax` made later groups draw on the first group's axes).
        _ax = sns.violinplot(x=x, y=y, data=df_tidy, inner=None,
                             hue_order=hue_order, hue='hue', split=split,
                             scale=scale, orient='vertical', ax=ax)
        if strip:
            _ax = sns.stripplot(x=x, y=y, data=df_tidy,
                                hue='hue', dodge=True, hue_order=hue_order,
                                jitter=jitter, color='black', size=size,
                                ax=_ax)
        _ax.set_xlabel('genes')
        _ax.set_title('{} vs. {}'.format(group_name, reference))
        _ax.legend_.remove()
        if computed_distribution:
            _ax.set_ylabel('z-score w.r.t. to bulk mean')
        else:
            _ax.set_ylabel('expression')
        _ax.set_xticklabels(gene_names, rotation='vertical')
        writekey = ('rank_genes_groups_' + groups_key + '_' + group_name)
        utils.savefig_or_show(writekey, show=show, save=save)
model_name = model.__class__.__name__ accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV) for fold_idx, accuracy in enumerate(accuracies): entries.append((model_name, fold_idx, accuracy)) cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy']) #绘制箱线图 sns.boxplot(x='model_name', y='accuracy', data=cv_df) sns.stripplot(x='model_name', y='accuracy', data=cv_df, size=8, jitter=True, edgecolor="gray", linewidth=2) plt.show() #线性SVC模型调用 model = LinearSVC() X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split( features, labels, data_after_stop.index, test_size=0.3, stratify=labels, random_state=0) model.fit(X_train, y_train)
def window_boxplot_fepD_vs_BW(metadata,
                              features,
                              feat='motion_mode_paused_fraction',
                              windows=None,
                              save_dir=None):
    """Box plot of one feature per window, comparing strains BW and fepD.

    Boxes are drawn per window and strain. Individual values are overlaid
    as diamonds in a grey shade per imaging date. Each window is annotated
    with the p-value read from
    ``stats_dir / 'pairwise_ttests' / 'fepD_window_results.csv'``.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Must provide 'bacteria_strain', 'window' and 'date_yyyymmdd'.
    features : pandas.DataFrame
        Must provide the column ``feat``; joined to ``metadata`` on index.
    feat : str
        Feature to plot.
    windows : list, optional
        Windows to include. Defaults to all windows present, sorted.
    save_dir : path-like, optional
        If given, the figure is saved as
        ``save_dir/<n>_windows/<feat>.png``; otherwise it is shown.
    """
    import seaborn as sns
    from matplotlib import transforms
    from matplotlib import pyplot as plt

    plot_df = metadata[['bacteria_strain','window','date_yyyymmdd']].join(features[[feat]])

    available_windows = sorted(plot_df['window'].unique())
    if windows is not None:
        assert all(w in available_windows for w in windows)
        plot_df = plot_df[plot_df['window'].isin(windows)]
    else:
        windows = available_windows

    bacteria_strain_list = ['BW', 'fepD']
    n_strains = len(bacteria_strain_list)

    plt.close('all')
    fig, ax = plt.subplots(figsize=(max(8,len(windows)),8))
    sns.boxplot(x='window', y=feat, order=windows,
                hue='bacteria_strain', hue_order=bacteria_strain_list,
                dodge=True, ax=ax, palette='tab10', showfliers=False,
                data=plot_df)

    # One grey shade per date, used for both strains of that date.
    dates = list(plot_df['date_yyyymmdd'].unique())
    date_col_dict = dict(zip(dates, sns.color_palette('Greys', n_colors=len(dates))))
    for date in dates:
        # The shade is passed as an explicit palette. Previously
        # `color=sns.set_palette(...)` passed None and only worked by
        # changing the global seaborn palette as a side effect.
        sns.stripplot(x='window', y=feat, order=windows,
                      hue='bacteria_strain', hue_order=bacteria_strain_list,
                      dodge=True, ax=ax, s=3, marker='D',
                      palette=[date_col_dict[date]] * n_strains,
                      data=plot_df[plot_df['date_yyyymmdd']==date])

    # scale plot y-axis (disabled; kept as a switch)
    scale_outliers = False
    if scale_outliers:
        grouped_strain = plot_df.groupby('window')
        y_bar = grouped_strain[feat].median() # median is less skewed by outliers
        Q1, Q3 = grouped_strain[feat].quantile(0.25), grouped_strain[feat].quantile(0.75)
        IQR = Q3 - Q1
        plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))

    # load t-test results for fepD vs BW at each window
    t_test_path = stats_dir / 'pairwise_ttests' / 'fepD_window_results.csv'
    ttest_df = pd.read_csv(t_test_path, index_col=0)
    pvals = ttest_df[[c for c in ttest_df if 'pvals_' in c]]

    # annotate p-values: x in data coordinates, y in axes coordinates, so
    # the brackets sit just below the top edge whatever the y-range is
    trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
    for ii, window in enumerate(windows):
        p = pvals.loc[feat, 'pvals_{}'.format(window)]
        text = ax.get_xticklabels()[ii]
        assert text.get_text() == str(window)
        p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
        ax.plot([ii-.2, ii-.2, ii+.2, ii+.2],
                [0.98, 0.99, 0.99, 0.98], lw=1.5, c='k', transform=trans)
        ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans)

    # legend and labels: keep only the box-plot entries, one per strain
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:n_strains], labels[:n_strains], fontsize=12, frameon=False,
              loc=(1.01, 0.9), handletextpad=0.2)
    ax.set_xlabel('')
    ax.set_xticklabels([WINDOW_DICT_STIM_TYPE[w] for w in windows])
    ax.set_ylabel(feat.replace('_',' '), fontsize=12, labelpad=10)

    plt.subplots_adjust(right=0.85)

    if save_dir is not None:
        save_path = Path(save_dir) / '{}_windows'.format(len(windows)) / '{}.png'.format(feat)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300)
    else:
        plt.show()
def _ask_choice(prompt, choices, retry_message):
    """Prompt on stdin until the lower-cased reply is in choices; return it."""
    while True:
        reply = input(prompt).lower()
        if reply in choices:
            return reply
        print(retry_message)


def _drop_outliers(sheet_df):
    """Return sheet_df without rows more than 2 SD from any column mean."""
    for col in sheet_df.select_dtypes(include="number").columns:
        mean = sheet_df[col].mean()
        sd = sheet_df[col].std()
        # A constant or single-value column has no usable spread; filtering
        # on it would discard every row.
        if not sd > 0:
            continue
        sheet_df = sheet_df[(sheet_df[col] - mean).abs() < 2 * sd]
    return sheet_df


def _finish_plot(xlabel, ylabel, title):
    """Apply labels, zero-based axis limits and title, then show the plot."""
    if xlabel is not None:
        plt.xlabel(xlabel)
        plt.xlim(0, None)
    plt.ylabel(ylabel)
    plt.ylim(0, None)
    plt.title(title)
    plt.show()


def generate_graphs(fname, fluoros, tps, conditions):
    """
    Takes transposed data and generates all sets of dot plots and scatter plots
    across all conditions, fluorophores, and timepoints.

    The user is asked on stdin whether to filter outliers (values more than
    2 SD from the column mean), whether to add a trend line to the scatter
    plots, and whether to overlay a box or violin plot on the dot plots.

    Parameters
    ----------
    fname : path to the transposed intermediate Excel file; every sheet is
        read as one dataframe.
    fluoros : fluorophore column names; every pair is plotted as a scatter
        plot and each one as a dot plot.
    tps : timepoint labels; the i-th sheet of a condition gets tps[i].
    conditions : accepted for interface compatibility; the conditions that
        are plotted are parsed from the sheet names.

    Raises
    ------
    IndexError
        If a condition has fewer matching sheets than there are timepoints.
    """
    yes_no_retry = "You did not type Y or N. Please reenter. \n"

    #Step 0: Generate scatter list of permutations of fluorophores to graph
    scatter_list = list(itertools.combinations(fluoros, 2))
    print("Fluorophores to be plotted against each other: ")
    for pair in scatter_list:
        print(pair[0], " vs. ", pair[1], "\n")

    #Step 1: Load the transposed file
    #sheet_name=None returns a dictionary - the keys are the sheet names,
    #and the values are the sheets as dataframes. The workbook handle is
    #closed as soon as the sheets and sheet names have been read.
    with pd.ExcelFile(fname) as xls:
        df_dic = pd.read_excel(xls, sheet_name=None)
        #parse conditions from sheetnames
        sheet_conds, sheet_tps, sheet_cd_tps = parse_sheetnames(xls)
    #Unique conditions, sorted so plots appear in a reproducible order
    sheet_conds = sorted(set(sheet_conds))

    #Step 2: Generate dictionary of all dfs grouped by condition
    #plot_dic = {'BM' : [bm1.df, bm2.df, ...], ...}
    #A sheet belongs to a condition when its name contains the condition
    #name (case-insensitive).
    plot_dic = {
        cond: [sheet for key, sheet in df_dic.items()
               if cond.lower() in key.lower()]
        for cond in sheet_conds
    }

    #Step 3: Remove statistical outliers and add Timepoint column to each dataframe
    filter_outliers = _ask_choice(
        "Would you like to filter outliers from your plots?\nOutliers are values >2 SD from the mean. Answer Y or N: ",
        ("y", "n"), yes_no_retry) == "y"
    for sheets_list in plot_dic.values():
        for index, timepoint in enumerate(tps):
            #Remove the unnamed index column first so it never takes part
            #in the outlier filtering
            sheet_df = sheets_list[index].drop(columns='Unnamed: 0',
                                               errors='ignore')
            if filter_outliers:
                sheet_df = _drop_outliers(sheet_df)
            #Store the result back: the filtered frame used to be
            #discarded, so filtering never had any effect.
            sheets_list[index] = sheet_df.assign(Timepoint=timepoint)

    #Step 4: Plot Scatter plots with or without trendline
    with_trend = _ask_choice("Would you like a trend line? Answer Y or N: ",
                             ("y", "n"), yes_no_retry) == "y"
    for cond in sheet_conds:
        cond_df = pd.concat(plot_dic[cond])
        for pair in scatter_list:
            if with_trend:
                #lmplot == scatter plot with trendline
                sns.lmplot(x=pair[1], y=pair[0], hue='Timepoint',
                           data=cond_df, ci=None,
                           scatter_kws={'edgecolor': "white"})
            else:
                #relplot == scatter plot without trendline
                sns.relplot(x=pair[1], y=pair[0], hue='Timepoint',
                            data=cond_df, kind='scatter')
            _finish_plot(pair[1] + " Intensity (AU)",
                         pair[0] + ' Intensity (AU)', cond)

    #Step 5: Restructure data frames for Dot Plots
    #Add a Condition column to each dataframe and stack them in one go
    tagged_frames = [sheet.assign(Condition=cond)
                     for cond, sheets in plot_dic.items()
                     for sheet in sheets]
    dotplot_df = pd.concat(tagged_frames) if tagged_frames else pd.DataFrame()

    #Step 6: Plot Dot Plots
    print("\nDot plots to be plotted: ")
    for fluoro in fluoros:
        print(f"{fluoro}\n")

    #ADD BOXPLOT OR VIOLINPLOT OR NOT:
    overlay = None
    wants_overlay = _ask_choice(
        "Would you like a box plot or violin plot overlaid on the dot plots? Answer Y or N: ",
        ("y", "n"), yes_no_retry) == "y"
    if wants_overlay:
        kind = _ask_choice(
            "Please enter box for box plot, and enter violin for violin plot: ",
            ("box", "violin"),
            "You did not type box or violin. Please reenter. \n")
        overlay = sns.boxplot if kind == "box" else sns.violinplot

    for fluoro in fluoros:
        if overlay is not None:
            overlay(x="Condition", y=fluoro, data=dotplot_df, hue='Timepoint')
        sns.stripplot(x='Condition', y=fluoro, hue="Timepoint",
                      data=dotplot_df, jitter=True, dodge=True,
                      edgecolor='w', linewidth=0.5)
        _finish_plot(None, fluoro + " Intensity (AU)", fluoro)
def df_function(collection_df, other, ax):
    """Draw a lightly jittered, dodged strip plot of SCORE on ``ax``.

    ``other`` and the enclosing scope's ``attribute`` serve as x-axis and
    hue; ``reverse`` (also from the enclosing scope) swaps their roles.
    The x-axis categories are shown in sorted order.
    """
    if reverse:
        x_column, hue_column = attribute, other
    else:
        x_column, hue_column = other, attribute

    category_order = sorted(collection_df[x_column].unique())
    sns.stripplot(
        data=collection_df,
        x=x_column,
        y=SCORE,
        hue=hue_column,
        order=category_order,
        jitter=0.1,
        dodge=True,
        alpha=0.5,
        ax=ax,
    )
# Pairwise relationships between the selected features, colored by survival.
g = sns.pairplot(train[features_of_interest], hue='Survived', palette = 'seismic',
                 diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
g.set(xticklabels=[])


# ## 4.2 Breakdown by Categories

# The correlation is a nice start. Now let's show how survival changes with some of these categories

# In[ ]:


# Plot
sns.set_style('white')
fig = plt.figure(figsize=(12,12))
# `dodge` is the current name of the former `split` option; False keeps
# both survival groups in the same strip.
ax = sns.stripplot(x='Title', y='fare_pp', data=train, jitter=0.2, alpha=0.9,
                   hue='Survived', dodge=False, palette="RdBu")

# Label
title = plt.title("Titles and Money", fontsize=14, fontweight='bold')
title.set_position([.5, 1.03])
plt.ylabel('Fare per Person ($)', fontsize=11, fontweight='bold')
plt.xlabel('Title', fontsize=11, fontweight='bold')
ax.set_ylim(-1,100)

# Y-Axis Ticks
def dollars(x, pos):
    """Format a tick value as dollars with two decimals (pos is unused)."""
    return '$%1.2f' % (x)

formatter = FuncFormatter(dollars)
ax.yaxis.set_major_formatter(formatter)