def ca_box_plot_shopping():
    """Draw the purchasing-index box plot for the two shopping classes.

    Each of the two class files (``+1`` and ``-1``) is read as a
    space-separated table without a header row, and column 2 is divided by
    column 1. The two resulting series are drawn side by side; the first is
    labelled 'Extroverts' and the second 'Introverts'. The figure is written
    to ``figure/purchase_box.eps`` and then shown.
    """
    # Read the data: one file per class, reduced to the column-2 / column-1 ratio.
    ratios = []
    for class_file in ('data/split_class/large_IGNORE_425_shopping_+1.txt',
                       'data/split_class/large_IGNORE_425_shopping_-1.txt'):
        table = pd.read_csv(class_file, sep=' ', header=None)
        ratios.append(table[2] / table[1])

    plt.figure(figsize=(8, 4))
    sns.boxplot(data=ratios, fliersize=0.1, width=0.3)

    # Axis cosmetics; the y range is clipped to [0, 0.12].
    plt.xticks((0, 1), ('Extroverts', 'Introverts'), fontsize=20)
    plt.yticks(fontsize=20)
    plt.ylabel("Purchasing Index", fontsize=20)
    plt.ylim(0, 0.12)

    plt.savefig('figure/purchase_box.eps', dpi=300)
    plt.show()
def plot_return_quantiles(returns, df_weekly, df_monthly, ax=None, **kwargs):
    """
    Draw daily, weekly and monthly returns as three side-by-side box plots.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, non-cumulative.
    df_weekly : pd.Series
        Weekly returns of the strategy, non-cumulative.
    df_monthly : pd.Series
        Monthly returns of the strategy, non-cumulative.
    ax : matplotlib.Axes, optional
        Axes upon which to plot. The current axes are used when omitted.
    **kwargs, optional
        Passed to seaborn plotting function.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    target_ax = plt.gca() if ax is None else ax

    # One box per aggregation period, in the same order as the tick labels.
    series_by_period = [returns, df_weekly, df_monthly]
    sns.boxplot(data=series_by_period, ax=target_ax, **kwargs)

    target_ax.set_xticklabels(['daily', 'weekly', 'monthly'])
    target_ax.set_title('Return quantiles')
    return target_ax
def plot_retest_data(retest_data, size=4.6, save_dir=None):
    """Plot the 'icc3.k' values per measure category as boxes plus densities.

    The lower axes hold a horizontal box plot of the 'icc3.k' column grouped
    by 'Measure Category'; the upper axes hold one kernel-density curve per
    category on the same x range.

    Args:
        retest_data: DataFrame with 'icc3.k' and 'Measure Category' columns.
        size: figure width; line widths and font sizes are scaled from it.
        save_dir: if truthy, passed to ``plt.savefig`` as the output path.

    NOTE(review): ``Task_N``, ``Survey_N`` and ``dpi`` are not defined in this
    function; they are presumably module-level names - confirm.
    """
    survey_color = sns.color_palette('Reds_d', 3)[0]
    task_color = sns.color_palette('Blues_d', 3)[0]
    colors = [survey_color, task_color]
    fig = plt.figure(figsize=(size, size*.75))

    # plot boxes
    with sns.axes_style('white'):
        box_ax = fig.add_axes([.15, .1, .8, .5])
        sns.boxplot(x='icc3.k', y='Measure Category', ax=box_ax,
                    data=retest_data,
                    palette={'Survey': survey_color, 'Task': task_color},
                    saturation=1, width=.5, linewidth=size/4)
        box_ax.text(0, 1, '%s Task measures' % Task_N,
                    color=task_color, fontsize=size*2)
        box_ax.text(0, 1.2, '%s Survey measures' % Survey_N,
                    color=survey_color, fontsize=size*2)
        box_ax.set_ylabel('Measure category', fontsize=size*2, labelpad=size)
        box_ax.set_xlabel('Intraclass correlation coefficient',
                          fontsize=size*2, labelpad=size)
        box_ax.tick_params(labelsize=size*1.5, pad=size, length=2)
        for spine in box_ax.spines.values():
            spine.set_linewidth(size/5)

    # plot distributions on axes stacked above the boxes, sharing their x range
    dist_ax = fig.add_axes([.15, .6, .8, .4])
    dist_ax.set_xlim(*box_ax.get_xlim())
    dist_ax.set_xticklabels('')
    dist_ax.tick_params(length=0)
    grouped = retest_data.groupby('Measure Category')
    for position, (_, category_rows) in enumerate(grouped):
        sns.kdeplot(category_rows['icc3.k'], color=colors[position],
                    ax=dist_ax, linewidth=size/3, shade=True, legend=False)
    dist_ax.set_ylim((0, dist_ax.get_ylim()[1]))
    dist_ax.axis('off')

    if save_dir:
        plt.savefig(save_dir, dpi=dpi, bbox_inches='tight')
def ca_box_plot_driving():
    """Draw the drive-index box plot for the two classes in the drive file.

    Rows of ``data/drive_index.txt`` are split by the value in column 1
    (0 first, then 1). Column 9 of each subset, restricted to values not
    above 0.2, is drawn as one box. The first box is labelled 'Extroverts'
    and the second 'Introverts'. The figure is shown, not saved.
    """
    # Read the data (no header row).
    table = pd.read_csv('data/drive_index.txt', header=None)

    series = []
    for class_label in (0, 1):
        index_values = table[table[1] == class_label][9]
        # Drop values above 0.2 before plotting.
        series.append(index_values[index_values <= 0.2])

    plt.figure(figsize=(8, 4))
    sns.boxplot(data=series, width=0.3)
    plt.xticks((0, 1), ('Extroverts', 'Introverts'))
    plt.ylabel("Drive Index")
    plt.ylim(0, 0.015)
    plt.show()
def plot_op(operation):
    """Plot the query times of one operation for all models and save the figure.

    Reads columns 1-3 of ``RESULT_FOLDER + RESULT_FILE`` (renamed to 'mo',
    'node' and 'time'), keeps the rows whose 'mo' value equals one of the
    first four entries returned by ``mo(operation)``, and draws one box per
    'mo' value on a log-scaled y axis. The figure is written to
    ``RESULT_FOLDER + operation + '.png'`` and then closed.

    Parameters
    ----------
    operation : str
        Operation name; passed to ``mo`` and used in the output file name.
    """
    df = pd.read_csv(RESULT_FOLDER + RESULT_FILE, usecols=[1, 2, 3])
    print(df.columns)
    df.columns = ['mo', 'node', 'time']

    ele = mo(operation)
    # DataFrame.append no longer exists in pandas 2.x; concat keeps the
    # same row order (all rows of ele[0], then ele[1], ...).
    qpare = pd.concat(
        [df[df.mo == model_op]
         for model_op in (ele[0], ele[1], ele[2], ele[3])]
    )

    f, ax = plt.subplots()
    ax.set(yscale="log")
    ax.set_title('Query time')
    sns.set_style("whitegrid")
    sns.boxplot(x='mo', y='time', data=qpare, ax=ax)
    ax.set_xlabel("model-operation")
    ax.set_ylabel("time [s]")

    # ``sns.plt`` is not available in current seaborn; use the figure and
    # pyplot directly. Closing (rather than clearing) releases the figure.
    f.savefig(RESULT_FOLDER + operation + '.png')
    plt.close(f)
def MAF_comparison_boxplot(self):
    """Draw one MAF box plot per population level, grouped by panel.

    For every (level, long-format frame) pair returned by
    ``self._generate_maf_long_df()`` the frame is restricted to the
    populations configured for that level, a 'population' vs 'MAF' box plot
    with 'panel' as hue is drawn, and the figure is saved under
    ``self.PLOTS_DIR`` as ``MAF_comparison__<level>`` and shown.
    """
    long_format_mafs = self._generate_maf_long_df()
    populations_to_plot = {
        "superpopulation": ['AFR', 'EUR', 'AMR'],
        "population": Dataset.used_populations(),
    }

    for population_level, long_df in long_format_mafs.items():
        wanted = populations_to_plot[population_level]
        subset = long_df[long_df["population"].isin(wanted)]

        # The per-population figure is wider than the superpopulation one.
        if population_level == "population":
            fig_width = 13
        else:
            fig_width = 7
        fig = plt.figure(figsize=(fig_width, 4))
        ax = fig.add_subplot(1, 1, 1)

        # Keep only the colors of panels that occur in the filtered data.
        panel_labels = subset["panel"].unique()
        colors = [color for panel, color in panel_colors().items()
                  if panel in panel_labels]

        sns.boxplot(data=subset, x="population", y="MAF", hue="panel",
                    ax=ax, linewidth=0.3, showcaps=False, showfliers=False,
                    palette=sns.color_palette(colors), width=0.70)
        self._boxplot_aesthetics(ax)

        filename = "MAF_comparison__{}".format(population_level)
        plt.savefig(join(self.PLOTS_DIR, filename), bbox_inches="tight")
        plt.show()
def trust_perspectives_wrt_someone(trust_frame, wrt='targets'):
    """
    Generates a 'matrix' of trust assessments of each node's perspective from
    every other one, grouped by 'var'.

    One row of axes is drawn per 'var' group and one column per node; each
    cell holds a box plot of the corresponding sub-frame.

    :param trust_frame: frame that is unstacked on the base level, stacked on
        the comparison level and grouped by its 'var' index level
    :param wrt: 'targets' (observer-based, "objective" view) or 'observer'
        (target-based, "subjective" view)
    :return: the matplotlib figure holding the grid of box plots
    :raises ValueError: if ``wrt`` is neither 'targets' nor 'observer'
    """
    if wrt == 'targets':
        base, comp, perspective = "observer", "target", "objective"
    elif wrt == 'observer':
        base, comp, perspective = "target", "observer", "subjective"
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError(
            "wrt must be 'targets' or 'observer', got {0!r}".format(wrt))

    groups = trust_frame.unstack(base).stack(comp).groupby(level=['var'])
    n_nodes = trust_frame.shape[1]
    n_groups = len(groups)

    f, ax = plt.subplots(n_groups, n_nodes,
                         figsize=(16, 2 * n_groups), sharey=True)
    plt.subplots_adjust(hspace=0.2, wspace=0.05, top=0.951)

    for i, (var, group) in enumerate(groups):
        for j, (jvar, jgroup) in enumerate(group.groupby(level=comp)):
            sns.boxplot(jgroup, ax=ax[i][j], **_boxplot_kwargs)
            if not i:  # first plot
                ax[i][j].set_title(jvar)
        # Explicit loop: ``map`` is lazy on Python 3, so the previous
        # ``map(lambda ...)`` call never cleared the labels.
        for row_ax in ax[i]:
            row_ax.set_xlabel("")
        if i + 1 < n_groups:
            ax[i][0].set_xlabel(base.capitalize())
        ax[i][0].set_ylabel("{0:.4f}".format(float(var)))

    f.suptitle(
        "Plots of Per-Node {0} Trust Values".format(perspective.capitalize()),
        fontsize=24)
    return f
def do_nb_linear(case, models, name, fun):
    """Box-plot the upper and lower confidence bounds for each model.

    For every ``(model, n0)`` pair the "Newb" results for 50 individuals and
    100 SNPs are passed through ``fun``; pairs that yield no values are
    skipped. The upper bounds and the lower bounds of the confidence
    intervals are then drawn as two overlaid box plots, with one tick per
    retained model labelled by its ``Nbs`` value.

    Parameters
    ----------
    case : mapping
        Results; indexed as ``case["Newb"][(model, n0)][(None, 50, 100, "SNP")]``.
    models : iterable of (model, n0)
        Model / N0 pairs to plot.
    name : object
        Not used by this function.
    fun : callable
        Called as ``fun(n0, bname, nindivs, vals, ci, r2=, sr2=, j=)`` and
        expected to return ``(vals, ci)``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn on (returned for parity with ``do_cohort``).
    """
    fig, ax = plt.subplots(figsize=(16, 9))
    nbs = []
    tops = []
    bottoms = []
    nindivs = 50
    for model, n0 in models:
        nb = Nbs[(model, n0)]
        vals, ci, r2, sr2, j, ssize = \
            case["Newb"][(model, n0)][(None, nindivs, 100, "SNP")]
        vals, ci = fun(n0, get_bname(model), nindivs, vals, ci,
                       r2=r2, sr2=sr2, j=j)
        if len(vals) == 0:
            continue
        # ci is a sequence of (bottom, top) pairs; split it into two tuples.
        bottom, top = list(zip(*ci))
        nbs.append(nb)
        tops.append(top)
        bottoms.append(bottom)

    sns.boxplot(tops, notch=0, sym="")
    sns.boxplot(bottoms)
    ax.set_xticks(1 + np.arange(len(nbs)))
    ax.set_xticklabels([str(nb) for nb in nbs])
    ax.set_ylim(0, max(nbs))
    # Raw strings: "\h" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r"$\hat{N}_{e}$", fontsize=32)
    ax.set_xlabel("Target (simulated) ${N}_{b}$", fontsize=32)
    return fig
def fig_boxplotcomparison(regular, phospho, sheet, column, bounds, figpath):
    """Figure 2c from Aurora paper.

    Stacks four value series (``regular[sheet]``, ``regular[column]``,
    ``phospho[sheet]`` and ``phospho[column]``) into one long table tagged
    with a "peptide type" label, draws one box per type, marks
    ``bounds[0]`` and ``bounds[1]`` with dashed red horizontal lines and
    saves the figure through ``cutils.save_fig``.
    """
    labelled_series = [
        ("peptides", regular[sheet]),
        ("norm_peptides", regular[column]),
        ("phospho", phospho[sheet].values),
        ("norm_phospho", phospho[column].values),
    ]
    xvals = []
    yvals = []
    for peptide_type, values in labelled_series:
        xvals.extend(values)
        yvals.extend([peptide_type] * len(values))

    df = pd.DataFrame([xvals, yvals]).transpose()
    df.columns = [column, "peptide type"]

    lower_bound, upper_bound = bounds[0], bounds[1]
    bmap = brewer2mpl.get_map('Paired', 'Qualitative', 6).mpl_colors

    f, ax = plt.subplots(1, figsize=(11.69, 8.27))
    sns.boxplot(x="peptide type", y=column, data=df, palette=bmap)
    ax.set(xticks=[0, 1, 2, 3],
           xticklabels=["peptides", "peptides \n(normalized)",
                        "phospho-\npeptides",
                        "phosphopeptides \n(normalized)"])
    ax.set(ylim=(-2, 2), ylabel="log2 (fold change)", title=column)

    # Dashed guides at the two bounds.
    for bound in (lower_bound, upper_bound):
        ax.axhline(bound, ls="--", lw=2, color="red", alpha=0.7)

    ax.yaxis.set_major_locator(MaxNLocator(4))
    sns.despine()
    cutils.save_fig(f, figpath+"162_BoxplotNorm_{}".format(column))
def plot_boxes(self, peaks):
    """Draw a boxplot to show the distribution of copes at peaks.

    For each peak (numbered from 1) the cope data inside the matching
    sphere label is averaged along the first axis, giving one column of
    values per peak. The columns are drawn as horizontal boxes, the figure
    is saved as ``peak_boxplot.png`` and its path is appended to
    ``self.out_files``.
    """
    cope_data = nib.load(self.inputs.cope_file).get_data()
    peak_spheres = self._peaks_to_spheres(peaks).get_data()

    n_peaks = len(peaks)
    peak_dists = np.zeros((cope_data.shape[-1], n_peaks))
    for sphere_label in range(1, n_peaks + 1):
        in_sphere = peak_spheres == sphere_label
        peak_dists[:, sphere_label - 1] = cope_data[in_sphere].mean(axis=0)

    with sns.axes_style("whitegrid"):
        f, ax = plt.subplots(figsize=(9, float(n_peaks) / 3 + 0.33))

    try:
        # seaborn >= 0.6
        sns.boxplot(data=peak_dists, palette="husl", orient="h", ax=ax)
        labels = np.arange(n_peaks) + 1
    except TypeError:
        # seaborn < 0.6: reverse the columns and palette and draw horizontally
        pal = sns.husl_palette(peak_dists.shape[1])[::-1]
        sns.boxplot(peak_dists[:, ::-1], color=pal, ax=ax, vert=False)
        labels = np.arange(n_peaks)[::-1] + 1

    sns.despine(left=True, bottom=True)
    ax.axvline(0, c=".3", ls="--")
    ax.set(yticklabels=labels, ylabel="Local Maximum", xlabel="COPE Value")

    out_fname = op.realpath("peak_boxplot.png")
    self.out_files.append(out_fname)
    f.savefig(out_fname, bbox_inches="tight")
    plt.close(f)
def do_cohort(case, model, N0, nindiv, corr_name):
    """Box-plot the estimates of every cohort for one model and correction.

    For each entry of the module-level ``cohorts`` the values and confidence
    intervals stored in ``case`` are replaced by those of the correction
    named ``corr_name`` (when ``get_corrs`` yields one). The values are
    drawn as boxes; the 90th percentile of the upper bounds and the 10th
    percentile of the lower bounds are marked with red crosses and the
    ``hmean`` of the values with a black plus. Missing (``None``) bounds
    are replaced by 100000 before the percentiles are taken.

    Parameters
    ----------
    case : mapping
        Results; indexed as ``case[cohort][(model, N0)][(None, nindiv, 100, "SNP")]``.
    model, N0 : object
        Key into ``Nbs`` and into the per-cohort results.
    nindiv : int
        Number of individuals sampled (also shown in the annotation).
    corr_name : str
        Name of the correction to plot.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn on.
    """
    last = 0.5
    fig, ax = plt.subplots(figsize=(16, 9))
    nb = Nbs[(model, N0)]
    fig.suptitle("Nb: %d - different cohorts - 100 SNPs - %s" %
                 (nb, corr_name), fontsize=24)
    box_vals = []
    labels = []
    tops = []
    bottoms = []
    hmeans = []
    bname = get_bname(model)
    for cohort in cohorts:
        vals, ci, r2, sr2, j, ssize = \
            case[cohort][(model, N0)][(None, nindiv, 100, "SNP")]
        # Use the first correction whose name matches, if any.
        for cname, corrections in get_corrs(N0, bname, nindiv, vals, ci,
                                            r2, sr2, j):
            if cname == corr_name:
                vals, ci = corrections
                break
        box_vals.append(vals)
        hmeans.append(hmean(vals))
        bottom, top = list(zip(*ci))
        top = [100000 if x is None else x for x in top]
        bottom = [100000 if x is None else x for x in bottom]
        tops.append(np.percentile(top, 90))
        bottoms.append(np.percentile(bottom, 10))
        if cohort == 'c2c':
            labels.append("2 cohorts")
        elif cohort == 'c3c':
            labels.append("3 cohorts")
        else:
            labels.append("%s" % cohort)
        if cohort == cohorts[-1]:
            # Separator and caption after the last cohort.
            pos = len(labels) + 0.5
            ax.axvline(pos, color="k", lw=0.2)
            ax.text(last + (pos - last) / 2, 0,
                    "%d Individuals sampled" % nindiv,
                    ha="center", va="bottom", size=24,
                    rotation="horizontal")
            last = pos

    ax.set_ylim(0, nb * 3)
    # Raw string: "\h" is not a valid escape sequence in a normal literal.
    ax.set_ylabel(r'$\hat{N}_{e}$', fontsize=32)
    ax.axhline(nb, color="k", lw=0.3)
    sns.boxplot(box_vals, notch=0, sym="")
    ax.set_xticks(1 + np.arange(len(labels)))
    ax.set_xticklabels(labels, fontsize=24)
    # Markers are placed at 1-based positions, one per cohort.
    ax.plot(list(range(1, len(tops) + 1)), tops, "rx")
    ax.plot(list(range(1, len(bottoms) + 1)), bottoms, "rx")
    ax.plot(list(range(1, len(hmeans) + 1)), hmeans, "k+")
    yticks = [0, nb // 2, nb, 2 * nb, 3 * nb]
    ax.set_yticks(yticks)
    ax.set_yticklabels([str(y) for y in yticks], fontsize=14)
    return fig
def stratify_numtot_age(classifier, numtot_dict, class_name, class2_enter,
                        mirna2age, finname):
    """Box-plot a per-miRNA quantity against miRNA age and save it as a PDF.

    Every key of ``numtot_dict`` that also occurs in ``mirna2age`` becomes one
    row holding its value, an 'In miRNA <class_name>' / 'Not in miRNA
    <class_name>' tag (depending on membership in the flattened values of
    ``classifier``) and its age. The Spearman correlation between the value
    and the age is printed, and a box plot of the value per age is written to
    ``../figures/<finname>.pdf``.

    Parameters
    ----------
    classifier : dict
        Its values are flattened with ``flatten`` to obtain the class members.
    numtot_dict : dict
        Maps miRNA to the quantity plotted on the y axis.
    class_name : str
        Class name used in the 'miRNA Class' tag.
    class2_enter : str
        Column name (and y-axis label) for the plotted quantity.
    mirna2age : dict
        Maps miRNA to its age.
    finname : str
        Base name of the output PDF.
    """
    class_vals = flatten(classifier.values())

    pd_precursor = []
    for val in numtot_dict:
        if val not in mirna2age:
            continue
        if val in class_vals:
            membership = 'In miRNA %s' % (class_name)
        else:
            membership = 'Not in miRNA %s' % (class_name)
        pd_precursor.append([numtot_dict[val], membership, mirna2age[val]])

    db = pd.DataFrame(pd_precursor,
                      columns=[class2_enter, 'miRNA Class', 'Age (MY)'])

    # print() call: the Python 2 print statement is a syntax error on Python 3.
    print(spearmanr(db[class2_enter].tolist(), db['Age (MY)'].tolist()))

    sns.boxplot(x='Age (MY)', y=class2_enter, showfliers=False, data=db)
    # ``sns.plt`` is not available in current seaborn; use pyplot directly.
    plt.savefig('../figures/%s.pdf' % (finname), bbox_inches='tight')
    plt.close()
def plot_return_quantiles(returns, df_weekly, df_monthly, ax=None, **kwargs):
    """Box-plot the daily, weekly and monthly return distributions together.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, noncumulative.
         - See full explanation in tears.create_full_tear_sheet.
    df_weekly : pd.Series
        Weekly returns of the strategy, noncumulative.
         - See timeseries.aggregate_returns.
    df_monthly : pd.Series
        Monthly returns of the strategy, noncumulative.
         - See timeseries.aggregate_returns.
    ax : matplotlib.Axes, optional
        Axes upon which to plot; defaults to the current axes.
    **kwargs, optional
        Passed to seaborn plotting function.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    if ax is not None:
        axes = ax
    else:
        axes = plt.gca()

    # Box order matches the tick labels set below.
    distributions = [returns, df_weekly, df_monthly]
    sns.boxplot(data=distributions, ax=axes, **kwargs)
    axes.set_xticklabels(["daily", "weekly", "monthly"])
    axes.set_title("Return quantiles")
    return axes
def posterior_predictive_bin_fracs(post_bin_counts, bin_counts):
    """Compare observed bin fractions with their posterior-predictive spread.

    Row-wise fractions are computed for the observed counts and for the
    posterior counts. The posterior fractions are reduced to a mean and a
    standard deviation per value of index level 1, and these are drawn as box
    plots (one box per bin) with the observed mean / standard deviation
    overlaid as black dots.

    Parameters
    ----------
    post_bin_counts : pd.DataFrame
        Posterior bin counts; rows carry an index with at least two levels
        (presumably sample and data row - confirm against the caller).
    bin_counts : pd.DataFrame
        Observed bin counts, one column per bin.

    Returns
    -------
    (ax1, ax2) : tuple of matplotlib.Axes
        The axes of the mean plot and of the standard-deviation plot.
    """
    # compare with mean and variance in category fractions
    total_counts = bin_counts.sum(axis=1)
    bin_fracs = bin_counts.apply(lambda x: x / total_counts, axis=0)
    mean_bin_frac = bin_fracs.mean(axis=0)
    std_bin_frac = bin_fracs.std(axis=0)

    # get expected mean and expected variance from MCMC samples;
    # the row totals are computed once instead of once per column
    post_totals = post_bin_counts.sum(axis=1)
    post_bin_fracs = post_bin_counts.apply(lambda x: x / post_totals, axis=0)
    # The ``level=`` keyword of mean/std is gone in pandas 2.x; an unsorted
    # level group-by is what it used internally.
    by_level = post_bin_fracs.groupby(level=1, sort=False)
    post_bin_mean = by_level.mean()
    post_bin_std = by_level.std()

    fig = plt.figure()
    ax1 = plt.subplot(211)
    sns.boxplot(post_bin_mean, ax=ax1)
    plt.plot(1 + np.arange(len(mean_bin_frac)), mean_bin_frac, 'ko')
    ax1.set_ylabel('Mean over data')

    ax2 = plt.subplot(212)
    sns.boxplot(post_bin_std, ax=ax2)
    ax2.plot(1 + np.arange(len(std_bin_frac)), std_bin_frac, 'ko')
    ax2.set_ylabel('Std over data')
    ax2.set_xlabel('Bin ID')

    plt.tight_layout()
    return ax1, ax2
def plotResults(tr, resultKey='resultInputPsf', doRates=False, title='',
                asHist=False, doPrint=True, actuallyPlot=True):
    """Summarise and optionally plot detection counts for four methods.

    Parameters
    ----------
    tr : iterable
        Test results; entries that are ``None`` or whose ``resultKey`` item is
        falsy are ignored. Each remaining entry is indexed as
        ``t[resultKey][method]['FN' | 'FP' | 'TP']``.
    resultKey : str
        Key selecting which result set of every entry is used.
    doRates : bool
        If true, FN, FP and TP are each divided by the original ``FN + TP``.
    title : str
        Title for both panels.
    asHist : bool
        If true, draw per-method distribution plots instead of
        violin / swarm / box plots.
    doPrint : bool
        If true, print the per-method means of FN, FP and TP.
    actuallyPlot : bool
        If false, return right after the (optional) printing; the plotting
        libraries are then neither imported nor configured.

    Returns
    -------
    (TP, FP, FN) : tuple of pd.DataFrame
        One column per method, one row per retained entry of ``tr``.
    """
    methods = ['ALstack', 'ZOGY', 'SZOGY', 'ALstack_decorr']
    tr = [t for t in tr if t is not None and t[resultKey]]

    def collect(field):
        # One column per method holding ``field`` of every retained result.
        return pd.DataFrame(
            {key: np.array([t[resultKey][key][field] for t in tr])
             for key in methods})

    FN = collect('FN')
    FP = collect('FP')
    TP = collect('TP')

    title_suffix = 's'
    if doRates:
        # The denominator must be taken before any of the frames is
        # rescaled; dividing FN in place first corrupted the FP and TP rates.
        positives = FN + TP
        FN = FN / positives
        FP = FP / positives
        TP = TP / positives
        title_suffix = ' rate'

    if doPrint:
        for label, frame in (('FN', FN), ('FP', FP), ('TP', TP)):
            print('%s: \n%s' % (label, frame.mean()))

    if not actuallyPlot:
        return TP, FP, FN

    import matplotlib
    import matplotlib.pyplot as plt
    import seaborn as sns
    matplotlib.style.use('ggplot')
    sns.set(style="whitegrid", palette="pastel", color_codes=True)

    matplotlib.rcParams['figure.figsize'] = (18.0, 6.0)
    fig, axes = plt.subplots(nrows=1, ncols=2)

    if not asHist:
        sns.violinplot(data=TP, cut=True, linewidth=0.3, bw=0.25,
                       scale='width', alpha=0.5, ax=axes[0])
        if TP.shape[0] < 500:
            # Individual points are only overlaid for small samples.
            sns.swarmplot(data=TP, color='black', size=3, alpha=0.3,
                          ax=axes[0])
        sns.boxplot(data=TP, saturation=0.5,
                    boxprops={'facecolor': 'None'},
                    whiskerprops={'linewidth': 0}, showfliers=False,
                    ax=axes[0])
        plt.setp(axes[0], alpha=0.3)
        axes[0].set_ylabel('True positive' + title_suffix)
        axes[0].set_title(title)

        sns.violinplot(data=FP, cut=True, linewidth=0.3, bw=0.5,
                       scale='width', ax=axes[1])
        if FP.shape[0] < 500:
            sns.swarmplot(data=FP, color='black', size=3, alpha=0.3,
                          ax=axes[1])
        sns.boxplot(data=FP, saturation=0.5,
                    boxprops={'facecolor': 'None'},
                    whiskerprops={'linewidth': 0}, showfliers=False,
                    ax=axes[1])
        plt.setp(axes[1], alpha=0.3)
        axes[1].set_ylabel('False positive' + title_suffix)
        axes[1].set_title(title)
    else:
        for t in TP:
            sns.distplot(TP[t], label=t, norm_hist=False, ax=axes[0])
        axes[0].set_xlabel('True positive' + title_suffix)
        axes[0].set_title(title)
        axes[0].legend(loc='upper left', shadow=True)

        for t in FP:
            sns.distplot(FP[t], label=t, norm_hist=False, ax=axes[1])
        axes[1].set_xlabel('False positive' + title_suffix)
        axes[1].set_title(title)
        axes[1].legend(loc='upper left', shadow=True)

    return TP, FP, FN
def compare_seq_counts_among_param_sets(out_dir, qual_vals, length_vals, ylim=None):
    """Box-plot sequence counts per data type for every parameter combination.

    For each (quality, length) pair the records returned by
    ``sequence_counts`` for the ``minqual<q>_minlength<l>/`` sub-directory of
    ``out_dir`` are collected and tagged with the parameter-set name. The
    counts are then drawn as a nested box plot: one group per 'data type',
    one hue per 'param set'.

    Args:
        out_dir: directory holding one sub-directory per parameter set.
        qual_vals: minimum-quality values to combine.
        length_vals: minimum-length values to combine.
        ylim: optional y-axis limits passed to ``plt.ylim``.
    """
    base_dir = out_dir.rstrip('/')+'/'

    rows = []
    for param_set in product(qual_vals, length_vals):
        workdir = base_dir+'minqual%i_minlength%i/' % param_set
        for record in sequence_counts(workdir):
            # Each record gets the parameter-set name appended as last field.
            rows.append(record+['minqual%i_minlength%i' % param_set])

    headers = ['smpl', 'data type', 'count', 'param set']
    df = pd.DataFrame(rows, columns=headers)

    sns.set(style="ticks")
    # Nested boxplot: counts per data type, split by parameter set.
    sns.boxplot(x="data type", y="count", hue="param set", data=df,
                palette="PRGn")
    plt.xticks(rotation=30)
    if ylim:
        plt.ylim(ylim)
def stripplot_mean_score(df, save_path, atlas=None, suffix=None, x=None,
                         y=None, hue=None, style='whitegrid', fontsize=14,
                         jitter=.2, figsize=(9, 3), leg_pos=2, axx=None):
    """Draw a grey box plot with a per-dataset strip plot on top of it.

    The values of column ``y`` ('no' / 'yes') are replaced by long display
    labels, the 'dataset' column is upper-cased, and the data are drawn on
    ``axx`` with a vertical reference line at x=0. Tick labels on the x axis
    get a percent sign, and positive ticks an explicit '+'.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'dataset' column plus the columns named by ``x``,
        ``y`` and ``hue``.
    x, y, hue : str
        Column names forwarded to seaborn.
    jitter : float
        Jitter forwarded to ``sns.stripplot``.
    fontsize : int
        Legend font size.
    axx : matplotlib.Axes
        Axes to draw on.
    save_path, atlas, suffix, style, figsize, leg_pos
        Accepted but not used by the code below.

    NOTE(review): ``datasets_palette``, ``key`` and ``ax`` are not defined in
    this function; they are presumably module-level names set by the calling
    script (``key`` / ``ax`` look like they mirror ``atlas`` / ``axx``) -
    confirm before relying on this outside that script.
    """
    # Both branches of the former ``atlas == 'kmeans'`` test were identical,
    # so the relabelling is done unconditionally.
    new_names = {'no': 'Without\n regions extracted',
                 'yes': 'With\n regions extracted'}

    def change_label_name(row, label):
        row[label] = new_names[row[label]]
        return row

    aliases = {'kmeans': 'K-Means', 'ica': 'GroupICA',
               'dictlearn': 'Dictionary Learning', 'basc': 'BASC'}

    df = df.apply(lambda row: change_label_name(row, y), axis=1)

    # change the name of the dataset to upper
    df['dataset'] = df['dataset'].str.upper()

    rc('xtick', labelsize=12)
    rc('ytick', labelsize=16)
    rc('axes', labelweight='bold')
    rc('legend', fontsize=fontsize)

    n_data = len(df['dataset'].unique())
    # Result is not used below; the call is kept in case it has side effects.
    color_palette(n_data)

    # draw a default vline at x=0 that spans the yrange
    axx.axvline(x=0, linewidth=4, zorder=0, color='0.6')

    sns.boxplot(data=df, x=x, y=y, fliersize=0, linewidth=2,
                boxprops={'facecolor': '0.5', 'edgecolor': '.0'},
                width=0.5, ax=axx)
    sns.stripplot(data=df, x=x, y=y, hue=hue, edgecolor='gray', size=5,
                  split=True, palette=datasets_palette, jitter=jitter,
                  ax=axx)
    axx.set_xlabel('')
    axx.set_ylabel('')
    plt.text(.5, 1.02, aliases[key], transform=ax.transAxes, size=15,
             ha='center')

    # make the positive labels with "+"; raw strings because "\%" is not a
    # valid escape sequence in a normal literal
    axx_xticklabels = []
    for tick in axx.get_xticks():
        prefix = '+' if tick > 0 else ''
        axx_xticklabels.append(prefix + str(tick) + r'$\%$')
    axx.set_xticklabels(axx_xticklabels)
def make_plot(X_train, y_train, X, y, test_data, model, model_name, features, response):
    """Produce diagnostic plots for a fitted model and save them as images.

    Three figures are written, each prefixed with ``model_name``:
    a 2x2 panel for the fifth column of ``X`` (regression, box plot, squared
    residuals, two-variable interaction), a correlation plot of
    ``test_data``, and a coefficient plot for ``"diagnosis ~ <features>"``.

    Parameters
    ----------
    X_train, y_train
        Training data; ``model.fit`` is called on them.
    X, y
        Data used for the plots; ``X`` must have at least five columns.
    test_data
        Frame passed to the regression, correlation and coefficient plots.
    model
        Estimator exposing ``fit`` and ``predict``.
    model_name : str
        'linear' or 'logistic' selects the interaction plot; any other value
        leaves the fourth panel empty.
    features : list of str
        Terms joined into the coefficient-plot formula.
    response
        Not used by this function.
    """
    feature = X.columns
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=False)
    sns.regplot(X[feature[4]], y, test_data, ax=ax1)
    sns.boxplot(X[feature[4]], y, color="Blues_r", ax=ax2)
    model.fit(X_train, y_train)
    sns.residplot(X[feature[4]], (model.predict(X) - y) ** 2,
                  color="indianred", lowess=True, ax=ax3)

    # Compare by value: ``is`` on string literals depends on interning.
    if model_name == 'linear':
        sns.interactplot(X[feature[3]], X[feature[4]], y, ax=ax4,
                         filled=True, scatter_kws={"color": "dimgray"},
                         contour_kws={"alpha": .5})
    elif model_name == 'logistic':
        pal = sns.blend_palette(["#4169E1", "#DFAAEF", "#E16941"],
                                as_cmap=True)
        levels = np.linspace(0, 1, 11)
        sns.interactplot(X[feature[3]], X[feature[4]], y, levels=levels,
                         cmap=pal, logistic=True)

    ax1.set_title('Regression')
    ax2.set_title(feature[4]+' Value')
    ax3.set_title(feature[4]+' Residuals')
    ax4.set_title('Two-value Interaction')
    f.tight_layout()
    plt.savefig(model_name+'_'+feature[4], bbox_inches='tight')

    # Multi-variable correlation significance level
    f, ax = plt.subplots(figsize=(10, 10))
    cmap = sns.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF",
                              "#FFE6F8", "#C71585", "#8B0000"],
                             as_cmap=True)
    sns.corrplot(test_data, annot=False, diag_names=False, cmap=cmap)
    ax.grid(False)
    ax.set_title('Multi-variable correlation significance level')
    plt.savefig(model_name+'_multi-variable_correlation', bbox_inches='tight')

    # complete coefficient plot - believe this is only for linear regression
    sns.coefplot("diagnosis ~ "+' + '.join(features), test_data,
                 intercept=True)
    plt.xticks(rotation='vertical')
    plt.savefig(model_name+'_coefficient_effects', bbox_inches='tight')
def run(self):
    """Evaluate three training variants against the test set and plot errors.

    The first input is the training set (with a "weights" column), the
    second the test set, and the third a training set labelled as having no
    covariance shift. Each variant is evaluated with the same setup and
    tagged in a "data" column as "adapted" (training set as is), "source"
    (training set without "weights") or "target" (third input). The errors
    are drawn per noise level and the figure is written to the task output.

    NOTE(review): ``rf`` and ``noise_levels`` are not defined in this
    method; presumably module-level names - confirm.
    """
    # get data
    df_train = pd.read_csv(self.input()[0].path, header=[0, 1])
    df_test = pd.read_csv(self.input()[1].path, header=[0, 1])
    df_train_no_covariance_shift = pd.read_csv(self.input()[2].path,
                                               header=[0, 1])

    evaluation_setups = [EvaluationStruct("Proposed", rf)]

    def evaluate(train_frame, tag):
        # Evaluate one training variant and label its rows with ``tag``.
        result = evaluate_data(train_frame, noise_levels,
                               df_test, noise_levels,
                               evaluation_setups=evaluation_setups)
        result["data"] = tag
        return result

    # evaluate the different methods
    df_adapted = evaluate(df_train, "adapted")
    df_no_adaptation = evaluate(df_train.drop("weights", axis=1), "source")
    df_no_covariance_shift = evaluate(df_train_no_covariance_shift, "target")
    df = pd.concat([df_adapted, df_no_adaptation, df_no_covariance_shift])

    # plot it
    sns.boxplot(data=df, x="noise added [sigma %]", y="Errors",
                hue="data", hue_order=["source", "adapted", "target"],
                fliersize=0)

    # tidy up plot
    plt.ylim((0, 40))
    plt.legend(loc='upper left')

    # finally save the figure
    plt.savefig(self.output().path, dpi=500)
def plot_perf_stats(returns, factor_returns, ax=None):
    """Box-plot bootstrapped performance metrics of the strategy.

    The plotted values come from ``timeseries.perf_stats_bootstrap`` (called
    with ``return_stats=False``); the 'kurtosis' column is dropped before
    plotting and the boxes are drawn horizontally.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, noncumulative.
         - See full explanation in tears.create_full_tear_sheet.
    factor_returns : pd.DataFrame, optional
        data set containing the Fama-French risk factors. See
        utils.load_portfolio_risk_factors.
    ax : matplotlib.Axes, optional
        Axes upon which to plot; defaults to the current axes.

    Returns
    -------
    ax : matplotlib.Axes
        The axes that were plotted on.
    """
    target_ax = plt.gca() if ax is None else ax

    samples = timeseries.perf_stats_bootstrap(returns, factor_returns,
                                              return_stats=False)
    samples = samples.drop('kurtosis', axis='columns')

    sns.boxplot(samples, orient='h', ax=target_ax)
    return target_ax
def main():
    """Assign a period to every metadata row and optionally plot each period.

    The tab-separated metadata table is read with its first column as index,
    trailing whitespace is stripped from the column names, a Julian-day
    column is added via ``addJulDay`` and the rows are matched against
    ``ranges`` on "Chla", "Temperature" and "Phosphate". The resulting
    "Period" column is written to stdout as tab-separated text. With
    ``--plot`` one box plot per entry of ``plotvars`` is saved as
    ``<variable>.pdf``.

    NOTE(review): ``ranges``, ``plotvars`` and ``order`` are not defined in
    this function; presumably module-level names - confirm.
    """
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata", type=str, required=True,
                        help="Metadata table")
    # A -v/--vars option (comma-separated variables used for the
    # definitions) exists only as disabled code; the keys are fixed below.
    parser.add_argument("-r", "--rangedef", type=str,
                        help="Range definitions. Not implemented yet.")
    parser.add_argument("-p", "--plot", action="store_true",
                        help="Produce boxplots of each period")
    args = parser.parse_args()

    # Read metadata
    meta = pd.read_csv(args.metadata, header=0, index_col=0, sep="\t")
    meta.rename(columns=lambda x: x.rstrip(), inplace=True)

    # Add Julian day column
    meta = addJulDay(meta)

    # Match ranges
    meta_m = match_ranges(ranges, meta,
                          keys=["Chla", "Temperature", "Phosphate"])

    # Write definitions
    meta_m["Period"].to_csv(sys.stdout, sep="\t")

    if not args.plot:
        return
    for variable in plotvars:
        sns.boxplot(data=meta_m, x="Period", y=variable, order=order)
        plt.savefig(variable+".pdf", bbox_inches="tight")
        plt.close()
def make_plots(groups):
    """Write the four summary plots for the shot groups to PNG files.

    Produces ``points.png`` (strip plot of 'moa' per 'ammo'), ``boxplot.png``
    (box plot of the same), ``avg_moa.png`` (bar plot of 'mean' per 'ammo')
    and ``qqplot.png`` (distribution and probability plot of the non-null
    'standard' values).

    Parameters
    ----------
    groups : pd.DataFrame
        Must contain the columns 'ammo', 'moa', 'mean' and 'standard'.
    """
    # x / y are passed by keyword: recent seaborn releases accept only
    # ``data`` positionally, and keywords work on older releases as well.
    sns.stripplot(x="ammo", y="moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")
    plt.clf()

    sns.boxplot(x="ammo", y="moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")
    plt.clf()

    sns.barplot(x="ammo", y="mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")
    plt.clf()

    # Distribution and probability plot of the non-null 'standard' values.
    std = groups["standard"]
    std = std[std.notnull()]
    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")
def _plot_categorical_and_continuous(df, xlabel, ylabel, x_keys, y_keys, ax, cmap, n_cat=5, plottype="box"): """ Plot a categorical variable and a continuous variable against each other. Types of plots include box plot, violin plot, strip plot and swarm plot. Parameters ---------- df : pd.DataFrame A pandas DataFrame with the data xlabel : str The column name for the variable on the x-axis ylabel : str The column name for the variable on the y-axis ax : matplotlib.Axes object The matplotlib.Axes object to plot the bubble plot into cmap : matplotlib.cm.colormap A matplotlib colormap to use for shading the bubbles n_cat : int The number of categories; used for creating the colour map plottype : {"box" | "violin" | "strip" | "swarm"} The type of plot to produce; default is a box plot Returns ------- ax : matplotlib.Axes object The same matplotlib.Axes object for further manipulation """ if x_keys is xlabel: keys = y_keys elif y_keys is ylabel: keys = x_keys else: raise Exception("Something went terribly, horribly wrong!") current_palette = sns.color_palette(cmap, n_cat) if plottype == "box": sns.boxplot(x=xlabel, y=ylabel, data=df, order=keys, palette=current_palette, ax=ax) elif plottype == "strip": sns.stripplot(x=xlabel, y=ylabel, data=df, order=keys, palette=current_palette, ax=ax) elif plottype == "swarm": sns.swarmplot(x=xlabel, y=ylabel, data=df, order=keys, palette=current_palette, ax=ax) elif plottype == "violin": sns.violinplot(x=xlabel, y=ylabel, data=df, order=keys, palette=current_palette, ax=ax) else: raise Exception("plottype not recognized!") return ax
def hist_boxplot(data, obs = None):
    """
    Draw a histogram (top) and a horizontal box plot (bottom) of ``data``.

    :param data: The data to be plotted, typically forecast and reference
     distributions.
    :type data: pandas.DataFrame
    :param obs: Observed value to mark with a dashed vertical line on both
     subplots if not None. Defaults to None.
    :type obs: float
    :returns: (matplotlib.figure.Figure, matplotlib.axes.Axes)
    """
    fig, axes = plt.subplots(nrows=2, ncols=1)
    hist_ax, box_ax = axes[0], axes[1]

    data.plot(kind='hist', bins=25, alpha=0.6, ax=hist_ax)
    sns.boxplot(data=data, orient='h', ax=box_ax)

    if obs is not None:
        for panel in axes:
            # The line spans the panel's current y range.
            y_low, y_high = panel.get_ylim()
            marker = panel.vlines(obs, y_low, y_high, linestyle='dashed')
            marker.set_label('Observed')
            panel.legend()

    fig.tight_layout()
    return fig, axes
def plot_entropies(results, rotate='oblimin', dpi=300, figsize=(20,8), ext='png', plot_dir=None):
    """ Plots real and null entropies across factor solutions as box plots

    Args:
        results: a dimensional structure results object; its ``EFA.results``
            must hold 'entropies_<rotate>' and 'null_entropies_<rotate>'
        rotate: name of the rotation whose entropies are plotted
        dpi: the final dpi for the image
        figsize: (width, height) of the figure
        ext: the extension for the saved figure
        plot_dir: the directory to save the figure. If none, do not save
    """
    EFA = results.EFA
    # plot entropies
    entropies = EFA.results['entropies_%s' % rotate].copy()
    null_entropies = EFA.results['null_entropies_%s' % rotate].copy()
    entropies.loc[:, 'group'] = 'real'
    null_entropies.loc[:, 'group'] = 'null'

    # axis is passed by keyword: pandas 2.x accepts only ``objs`` positionally
    combined = pd.concat([entropies, null_entropies], axis=0)
    # Long format: one row per (group, factor solution, entropy value).
    combined = combined.melt(id_vars='group', var_name='EFA',
                             value_name='entropy')

    with sns.plotting_context('notebook', font_scale=1.8):
        f = plt.figure(figsize=figsize)
        sns.boxplot(x='EFA', y='entropy', data=combined, hue='group')
        plt.xlabel('# Factors')
        plt.ylabel('Entropy')
        plt.title('Distribution of Measure Specificity across Factor Solutions')
        if plot_dir is not None:
            f.savefig(path.join(plot_dir, 'entropies_across_factors.%s' % ext),
                      bbox_inches='tight', dpi=dpi)
            plt.close()
def boxplotify(df, feature, path, title, save=True):
    """Box-plot ``feature`` per decade, split by whether the row charted.

    Rows with ``charted == 0`` are drawn in red and rows with
    ``charted == 1`` in green; a matching "Charted" legend (Yes / No) is
    placed outside the axes. The figure is optionally saved to
    ``staticDir.format(file=path)`` and always shown.

    Args:
        df: DataFrame with 'decade', 'charted' and ``feature`` columns.
        feature: name of the column plotted on the y axis.
        path: value substituted for ``file`` in ``staticDir``.
        title: figure title.
        save: whether to write the figure to disk before showing it.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    fig.suptitle(title, fontsize=20)

    charted_colors = {0: 'r', 1: 'g'}
    boxplot(x=df['decade'], y=df[feature], hue=df['charted'],
            linewidth=2, ax=ax, palette=charted_colors)

    # Hand-built legend entries matching the palette above.
    yes = Patch(color='g', label='Yes')
    no = Patch(color='r', label='No')
    plt.legend(bbox_to_anchor=(1, 1), loc=2, ncol=1, shadow=True,
               title="Charted", handles=[yes, no])

    if save:
        fig.savefig(staticDir.format(file=path))
    plt.show()
def caixas(exames=["BAC", "RBC", "MUC", "CAOXD", "HYA", "PAT", "WBC", "EPI", "TRI", "URI", "YEA", "AMO"]):
    """Save one box plot per exam code as ``boxplot<code>.png``.

    For every code the rows of the module-level ``dfcontmelt`` frame whose
    EXAME equals that code are drawn as MEDIDA per HORA, split by LOCAL.
    The current figure is cleared and closed before each plot.

    Args:
        exames: exam codes to plot; the default list is only read.
    """
    for exame in exames:
        # Start from a clean state so plots do not accumulate.
        plt.clf()
        plt.close()

        rows = dfcontmelt[dfcontmelt.EXAME == exame]
        sns.boxplot(x="HORA", y="MEDIDA", hue="LOCAL", data=rows,
                    palette="Blues", sym="")
        plt.savefig("boxplot" + exame + ".png")
def plot_times(config, segment_id, distribution):
    """
    Generates a plot to visualize the performance of the current athlete at a
    specific segment in comparison to other athletes.

    The first series holds the elapsed time (in seconds) of every effort the
    client returns for the segment; the second holds the times stored for the
    segment in the data file.

    :param config: Config object providing API access via security token
    :param segment_id: ID of the strava segment in question
    :param distribution: Whether to plot the time distribution over efforts
        instead of a boxplot
    :return: None; the plot is shown
    """
    client = config.client
    ridden_segs = read_data(DATAFILE)
    all_efforts = client.get_segment_efforts(segment_id)

    X = np.array([datetime.timedelta.total_seconds(e.elapsed_time)
                  for e in all_efforts])
    Y = np.array(list(ridden_segs[segment_id].times))

    if distribution:
        plt.xlabel('Time in seconds')
        sns.distplot(X, hist=False, rug=True)
        sns.distplot(Y, hist=False, rug=True)
        plt.show()
    else:
        plt.ylabel('Time in seconds')
        # A list of the two series gives exactly two boxes. Wrapping them in
        # np.array builds a ragged array when the lengths differ and is read
        # as one column per effort when they happen to match.
        sns.boxplot(data=[X, Y], orient='v')
        plt.show()
def main():
    """Plot unzipped vs zipped output size per pipeline version.

    The run-size CSV is created with ``scan_run_folders`` if it does not
    exist yet, then loaded. Version labels are shortened, the rows are
    sorted by version, and the 'outputs' and 'zipped' columns are stacked
    into a single 'size' column tagged by 'type' so both can be drawn in one
    horizontal box plot.
    """
    args = parse_args()
    run_sizes_path: Path = args.run_sizes_csv
    if not run_sizes_path.exists():
        with run_sizes_path.open('w') as run_sizes_csv:
            scan_run_folders(args.runs, run_sizes_csv, args.group_size)

    runs = pd.read_csv(run_sizes_path)

    # Shorten the version labels, applied in this order.
    replacements = (
        ('version_', 'v'),
        ('-UPDATED-BOWTIE', ''),
        ('RC1', 'r1'),
    )
    for old_text, new_text in replacements:
        runs['version'] = runs['version'].str.replace(old_text, new_text)
    runs.sort_values('version', inplace=True)

    # Two copies of the table, one per size column, stacked into long form.
    plain_runs = runs.assign(size=runs['outputs'], type='unzipped')
    zipped_runs = runs.assign(size=runs['zipped'], type='zipped')
    all_runs = pd.concat([plain_runs, zipped_runs])

    sns.boxplot(x='size', y='version', hue='type', data=all_runs)
    plt.xlabel('Output size (MB)')
    plt.title('MiSeq Disk Usage')
    plt.show()
def plotboxplots(cufflinks_t, target, out):
    """Plot negative and positive fold-change distributions side by side.

    Accepts a tab delimited FPKM table generated via Cufflinks, draws one
    box plot for fold changes <= -1 and one for fold changes >= 1, and
    marks the fold change of the ``Dmpk`` row with a dashed red line on the
    panel matching its sign.

    Args:
        cufflinks_t (str/path): Tab delimited FPKM table.
        target (str): Gene of interest. NOTE(review): this argument is
            never read; the gene is hard-coded as ``'Dmpk'`` below.
        out (str): Path the figure is written to, in EPS format.

    Returns:
        Nothing. The figure is saved to ``out``.
    """
    fc_filt, samples = returnfilterfc(cufflinks_t)
    dmpk = array(samples['Dmpk']).astype(float)
    # log10 fold change: mean of the last two values over the mean of the
    # first three.
    fc = log10(dmpk[-2:].mean()) - log10(dmpk[:3].mean())
    neg_filt = fc_filt[fc_filt <= -1]
    pos_filt = fc_filt[fc_filt >= 1]

    fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
    # The marker line goes on the panel whose sign matches the fold change.
    symb = ax2 if fc >= 0 else ax1
    ax1.set_xlabel('Negative Fold change')
    ax2.set_xlabel('Positive Fold change')

    # The values are passed explicitly as ``y``: a bare positional argument
    # is deprecated in seaborn and its meaning differs between versions.
    sns.boxplot(y=neg_filt, showfliers=False, color='Orange', notch=True,
                orient='v', ax=ax1)
    sns.boxplot(y=pos_filt, showfliers=False, color='Grey', notch=True,
                orient='v', ax=ax2)

    symb.axhline(y=fc, linewidth=.8, color='red', linestyle='dashed')
    red_patch = mpatches.Patch(
        color='red',
        label='DMPK \nFold change = %s' % (round(fc, 2)))
    plt.legend(handles=[red_patch], loc=(.09, .01))
    plt.suptitle('Fold Change Distributions')
    plt.savefig(out, format='eps', dpi=1000)
matplotlib.rc('font', **font)
sns.set_style('white')

df = pd.read_csv('Predictions.csv')
# Mean MSE of each feature set, broadcast back onto its rows. A single
# grouped pass replaces the former per-row filter, which rescanned the whole
# frame for every row.
df['Average'] = df.groupby('Features')['MSE'].transform('mean')
df = df.sort_values('Average')
# One row per feature set, in the same order as the sorted frame.
unique_df = df.drop_duplicates(['Features'])

plt.subplots(figsize=(12, 8))
chart = sns.boxplot(x='Features', y='MSE', data=df, linewidth=1.0,
                    fliersize=2)

palette = matplotlib.cm.get_cmap('BrBG')
# Fixed colour range rather than the observed min/max of 'Correlation'.
min_val = -1.0
max_val = 1.0

# Colour each box by the correlation of its feature set, rescaled from
# [min_val, max_val] to the colormap's [0, 1] range.
# NOTE(review): relies on the boxes being exposed through `chart.artists`
# in the same order as `unique_df` -- confirm for the installed
# seaborn/matplotlib versions.
for i, box in enumerate(chart.artists):
    corr = unique_df.iloc[i]['Correlation']
    box.set_facecolor(palette((corr - min_val) / (max_val - min_val)))
def result_plots(df):
    """Create four result figures from the frame ``df``.

    1. Box plot of ``CRF`` over ``BLEL`` for rows with ``delta_t == 30``.
    2. The same box plot over all rows, split by ``delta_t`` (30 and 15).
    3. Scatter plot of ``CRF`` over ``n_chargers`` for ``delta_t == 30``.
    4. Box plot of ``share_of_demand`` / ``share_of_demand_z`` over ``BLEL``.

    The figures are created on the pyplot state machine; nothing is
    returned or saved here.
    """
    plot_width = 255*2
    box_plot_len = 170*2
    font_size = 12

    # Settings shared by several figures.
    fig_size = (plot_width/72, box_plot_len/72)
    blel_ticks = [0, 1, 2, 3, 4, 5]
    blel_names = ['30', '50', '70', '80', '90', '100']
    blel_caption = 'Bus-line electrification level in %'
    crf_caption = 'CRF in %'
    black_fliers = {'markersize': 2, 'markeredgecolor': 'black'}
    mean_cross = {'marker': "x", 'markeredgecolor': 'black'}
    half_hour = df[df['delta_t'] == 30]

    # CRF over BLEL
    plt.figure(constrained_layout=True, figsize=fig_size)
    plt.grid(axis='y', color='gray', linestyle='-', linewidth=0.5)
    sn.boxplot(
        data=half_hour,
        x='BLEL',
        y='CRF',
        linewidth=0.5,
        color='aliceblue',
        flierprops=dict(black_fliers),
        showmeans=True,
        meanprops=dict(mean_cross),
        whis=(5, 95),
    )
    plt.xticks(blel_ticks, blel_names, fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.xlabel(blel_caption, fontsize=font_size)
    plt.ylabel(crf_caption, fontsize=font_size)
    plt.yticks(list(range(0, 13, 2)))

    # CRF over BLEL comparison between two peak averaging durations
    plt.figure(constrained_layout=True, figsize=fig_size)
    plt.grid(axis='y', color='gray', linestyle='-', linewidth=0.5)
    Comparison = sn.boxplot(
        data=df,
        x='BLEL',
        y='CRF',
        hue='delta_t',
        hue_order=[30, 15],
        linewidth=0.5,
        color='skyblue',
        flierprops=dict(black_fliers),
        showmeans=True,
        meanprops=dict(mean_cross),
        whis=(5, 95),
    )
    plt.xticks(blel_ticks, blel_names, fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.xlabel(blel_caption, fontsize=font_size)
    plt.ylabel(crf_caption, fontsize=font_size)
    plt.yticks(list(range(0, 17, 2)))
    # Replace the automatic legend with hand-written entries.
    Comparison.legend_.set_title('')
    handles, _ = Comparison.get_legend_handles_labels()
    Comparison.legend(handles, ['Δt = 30 minutes', 'Δt = 15 minutes'],
                      fontsize=font_size)

    # Scatter plot (CRF over number of chargers)
    plt.figure(constrained_layout=True, figsize=fig_size)
    plt.grid(axis='y', color='gray', linestyle='-', linewidth=0.5)
    plt.scatter(
        x='n_chargers',
        y='CRF',
        s=12,
        marker="o",
        edgecolors='black',
        c='aliceblue',
        data=half_hour,
    )
    plt.xlabel('Number of chargers', fontsize=font_size)
    plt.ylabel(crf_caption, fontsize=font_size)
    plt.xticks(list(range(0, 31, 5)), fontsize=font_size)
    plt.yticks(list(range(0, 13, 2)))
    plt.yticks(fontsize=font_size)

    # Share of demand charge over BLEL: reshape to long form so the two
    # share columns can be used as the hue.
    df_modified = pd.melt(df, id_vars=['BLEL'],
                          value_vars=['share_of_demand', 'share_of_demand_z'])
    for column_name, legend_name in (('share_of_demand', 'with SES'),
                                     ('share_of_demand_z', 'without SES')):
        df_modified.loc[df_modified['variable'] == column_name,
                        'variable'] = legend_name
    plt.figure(constrained_layout=True, figsize=fig_size)
    plt.grid(axis='y', color='gray', linestyle='-', linewidth=0.5)
    sn.boxplot(
        data=df_modified,
        x='BLEL',
        y='value',
        hue='variable',
        hue_order=['without SES', 'with SES'],
        linewidth=0.5,
        flierprops={'markersize': 2, 'markeredgecolor': 'gray',
                    'linewidth': 0.2},
        showmeans=True,
        meanprops=dict(mean_cross),
        whis=(5, 95),
    )
    plt.xticks(blel_ticks, blel_names, fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.xlabel(blel_caption, fontsize=font_size)
    plt.ylabel(r'$C_{demand}/C^*_{tot}$', fontsize=font_size)
    plt.ylim(0, 0.5)
    plt.legend(ncol=1, fontsize=font_size)
axes[0].set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
axes[0].tick_params(labelsize=8)

# ---------------------------- TS start times ----------------------------
ts_stat = get_gpats_start_end_duration(
    get_gpats_data(cur_dir, sta=sta, res='1min'))
print(ts_stat.info())
print(ts_stat)

# Duration in hours, rounded to one decimal place.
ts_stat['duration'] = round(
    (ts_stat['last'] - ts_stat['first']) / np.timedelta64(1, 'h'), 1)

# Convert start and end time to numeric: format as 'HH:MM', then map the
# text through `conversion`.
for _bound in ('first', 'last'):
    ts_stat[_bound] = ts_stat[_bound].dt.strftime('%H:%M').apply(conversion)

sns.boxplot(data=ts_stat, x=ts_stat.index.month, y='first', linewidth=2,
            ax=axes[1])
axes[1].set_ylabel('Onset (UTC)', color='g', fontsize=15)
# NOTE(review): tick position p is labelled as month p + 1, which assumes
# the months present start at January without gaps -- confirm.
xlabels = [dict_mon[tick + 1] for tick in axes[1].get_xticks()]
axes[1].set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
axes[1].tick_params(labelsize=8)
axes[1].set_xlabel('', color='g', fontsize=15)

# ----------------------------- TS end times -----------------------------
sns.boxplot(data=ts_stat, x=ts_stat.index.month, y='last', linewidth=2,
            ax=axes[2])
axes[2].set_ylabel('Finish (UTC)', color='g', fontsize=15)
xlabels = [dict_mon[tick + 1] for tick in axes[2].get_xticks()]
axes[2].set_xticklabels(xlabels, rotation=45, horizontalalignment='right')
# EDA
data["Status"].value_counts()
data["Country"].value_counts()

# Overall distribution of life expectancy.
plt.figure(figsize=(10, 8))
data.boxplot('Life expectancy ')
plt.show()

# Life expectancy by development status. x and y are passed by keyword:
# seaborn >= 0.12 rejects them as positional arguments.
plt.figure(figsize=(10, 8))
sns.boxplot(x="Status", y='Life expectancy ', data=data)
# shows that life expectancy is higher in developed countries.
plt.xlabel("Status", fontsize=16)
# The y axis shows life expectancy; the label previously read
# "Total expenditure".
plt.ylabel("Life expectancy", fontsize=16)
plt.show()

# Pairwise correlations between life expectancy and selected predictors.
data_corr = data[["Life expectancy ", "Adult Mortality", "Schooling",
                  "Total expenditure", "Diphtheria ", "GDP",
                  "Population"]].corr()
data_corr
### 3. HANDLING OUTLIERS, EXTREME VALUES & SKEWNESS ###########################
df.shape  # rows: 10296

# First policy year: the database dates from 2016, so later years are
# removed.
df['1stPolYear'].describe()
df = df.drop(index=df.index[df['1stPolYear'] > 2016])
sns.kdeplot(df['1stPolYear']).set_title('1st Policy Year')

# Birth year: remove values before 1900.
df['BirthYear'].describe()
df = df.drop(index=df.index[df['BirthYear'] < 1900])
df['BirthYear'].hist(bins=50).set_title('Birth Year')

# Gross monthly salary: remove values above 30000.
df['GrossMthSalary'].describe()
sns.boxplot(x=df['GrossMthSalary'])
df = df.drop(index=df.index[df['GrossMthSalary'] > 30000])
df['GrossMthSalary'].hist(bins=50).set_title('Gross Monthly Salary')

# Customer monetary value: remove values below -2000.
df['CustMonetVal'].describe()
sns.boxplot(x=df['CustMonetVal'])
df = df.drop(index=df.index[df['CustMonetVal'] < -2000])

# Claim rate: remove values above 3.
df['ClaimRate'].describe()
sns.boxplot(x=df['ClaimRate'])
df = df.drop(index=df.index[df['ClaimRate'] > 3])

df['PremLOBMotor'].describe()
# Every column except the grouping column gets its own box plot.
paralist = list(filter(lambda column: column != 'label', Hist_df.columns))
# paralist = ['total_distance','avg_moving_speed01' , 'linearity'] #testlist
# paralist = [p for p in Hist_df.columns if p.startswith('avg_moving_speed')] #testlist
print(f'{len(paralist)} boxplots will be created\nfinished boxplots:',
      end=' ')

# One figure per parameter.
for i, para in enumerate(paralist):
    plt.figure(para)

    # Non-positive avg_moving_speed values are blanked out (set to NaN).
    if para.startswith('avg_moving_speed'):
        Hist_df[para] = Hist_df[para].where(Hist_df[para] > 0, np.nan)

    # Box plot without fliers, overlaid with the individual data points.
    ax = sns.boxplot(x='label', y=para, data=Hist_df, showfliers=False)
    ax = sns.swarmplot(x='label', y=para, data=Hist_df, color='black',
                       alpha=0.5)

    # Raise the upper y limit by 7.5 % and hide the x axis label.
    lower, upper = ax.get_ylim()
    ax.set_ylim(lower, upper * 1.075)
    ax.xaxis.label.set_visible(False)

    # Comparative statistics across the groups in `samples`, reported in
    # the figure title.
    per_group = [list(g[para]) for g in samples]
    Anov_F, Anov_p = anova(*per_group)
    kw_H, kw_p = kwtest(*per_group)
    plt.title(
        f'ANOVA p={round(Anov_p,p_dec)}; Kruskal-Wallis p={round(kw_p,p_dec)}'
    )
ve.append(k) amostra_paci_2['Subject_ID']=base_ids2['Subject_ID'] amostra_paci_2['fl_severidade']=base_ids2['fl_severidade'] df_vetor={'score':ve} df_vetor_final=pd.DataFrame(df_vetor,columns=['score']) print(df_vetor_final) print() print(base_unificada5_filtrada) print() amostra_paci_2['score']=df_vetor_final['score'] amostra_paci_2=amostra_paci_2.sort_values('score',ascending=False) amostra_paci_2.to_csv("base_scorada_amostra.csv") import seaborn as sns sns.boxplot(x=amostra_paci_2['score']) plt.savefig('boxplot_random.png') plt.plot(amostra_paci_2['score'],'*') plt.savefig('distribuicao_random.png') ################### GRADIENT BOOSTING ############################################ ######## from sklearn.ensemble import GradientBoostingClassifier tuned_parameters1= { "loss":["deviance","exponential"], "learning_rate": [0.05,0.075], "min_samples_split": np.linspace(0.1, 0.5, 6), "min_samples_leaf": np.linspace(0.1, 0.5, 6), "max_depth":[4,5,6], "max_features":["log2","sqrt"],
# Names of the object-dtype (categorical) columns.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols

print("\n>> Dtypes:\n{}".format(df.dtypes))
df.describe()
print(df["Y"].value_counts())

# Numerical data analysis: for each feature, draw its distribution and then
# a box plot of the feature split by the target column "Y".
for _feature in ("nr.employed", "euribor3m", "cons.conf.idx"):
    plt.figure(figsize=(10, 8))
    sns.distplot(df[_feature])
    get_ipython().run_line_magic('matplotlib', 'inline')
    sns.boxplot(data=df, x="Y", y=_feature)
    plt.show()
# The percentage of data retained from the initial dataset len(lead_df) / initial[0] * 100 # - We have 70.88% of rows which is quite enough for analysis # ### Data Visualization # #### Univariate Analysis # In[30]: # Plotting the numerical variables plt.figure(figsize=(14, 10)) plt.subplot(2, 3, 1) sns.boxplot(lead_df['Total Time Spent on Website']) plt.subplot(2, 3, 2) sns.boxplot(lead_df['TotalVisits']) plt.subplot(2, 3, 3) sns.boxplot(lead_df['Page Views Per Visit']) plt.show() # The columns `TotalVisits` and `Page Views Per Visit` have outliers in it and needs to be treated # **Handling the Outliers** # In[31]: # Capping the outliers to its 99th quantile value in Total Visits column quant = lead_df['TotalVisits'].quantile([0.99]) lead_df['TotalVisits'] = np.clip(
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 29 16:24:29 2018

@author: Raktim Mondol
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style="whitegrid")

# Long-form table with one score per (metric, method) observation.
data = pd.read_csv('./figure_data/log_loss_data.csv')

# One group of boxes per metric, one box per method within each group.
ax = sns.boxplot(
    data=data,
    x="Metrics",
    y="Score",
    hue="Method",
    palette="Set2",
    linewidth=2,
)

# Write the figure that owns the axes to disk.
fig = ax.get_figure()
fig.savefig("./saved_figures/box_plot_log_loss.png", dpi=300)
cols = ['temp', 'atemp', 'windspeed', 'humidity'] pp = sns.pairplot(df[cols], diag_kws=dict(shade=True), diag_kind="kde", kind="reg") fig = pp.fig fig.subplots_adjust(top=0.93, wspace=0.3) fig.suptitle('Correlação das variáveis numéricas', fontsize=14, fontweight='bold') # In[80]: sns.boxplot(data=df[['temp', 'atemp', 'humidity', 'windspeed', 'count']], orient='h') fig = plt.gcf() fig.set_size_inches(12, 6) fig.suptitle('Análise de Outliers', fontsize=14, fontweight='bold') # In[81]: fig, (ax1, ax2, ax3) = plt.subplots(nrows=3) fig.set_size_inches(12, 18) sns.factorplot(x="month", y="count", data=df, kind='bar', size=5, aspect=1.5,
def explore_integrity(interpolated_acti, path):
    """
    Explore integrity of data after linear interpolation across trials

    For every integer threshold from 0 to 49, counts how many ROI and how
    many trials contain at least one per-(ROI, trial) maximum above the
    threshold, and writes two figures into ``path``:

    * ``explore_integrity1.png`` -- number of flagged ROI and trials as a
      function of the threshold.
    * ``explore_integrity2.png`` -- per-trial box plots of the maxima across
      ROI, and a heat map of the maxima exceeding the last threshold (49).

    Arguments:
        interpolated_acti {array} -- activation tensor corrected via linear
            interpolation; indexed as (ROI, T, trial), the maximum being
            taken over the middle axis
        path {str} -- directory the two PNG files are written to

    Returns:
        None
    """
    N, T, K = interpolated_acti.shape
    thresholds = np.arange(50)

    # NaN-ignoring maximum over the middle axis, shape (N, K). It does not
    # depend on the threshold, so it is computed once.
    max_dist = np.nanmax(interpolated_acti, axis=1)

    nb_flag_roi = []
    nb_flag_trial = []
    for tol in thresholds:
        max_mask = max_dist > tol
        # A ROI (row) or trial (column) is flagged when any of its maxima
        # exceeds the threshold.
        nb_flag_roi.append(int(max_mask.any(axis=1).sum()))
        nb_flag_trial.append(int(max_mask.any(axis=0).sum()))

    # Maxima above the last threshold; everything else becomes zero.
    big_flag = max_mask * max_dist

    fig = plt.figure(figsize=(10, 5))
    fig.add_subplot(1, 2, 1)
    plt.plot(thresholds, nb_flag_roi)
    plt.title('ROI lost')
    plt.xlabel('Threshold')
    plt.ylabel('Number of ROI')
    fig.add_subplot(1, 2, 2)
    plt.plot(thresholds, nb_flag_trial)
    plt.title('Trials lost')
    plt.xlabel('Threshold')
    plt.ylabel('Number of trials')
    plt.savefig(os.path.join(path, 'explore_integrity1.png'))

    label_font = {'fontsize': 'large', 'fontweight': 'roman'}
    trial_ticks = np.arange(K)[::40]

    fig = plt.figure(figsize=(25, 15))
    fig.add_subplot(2, 1, 1)
    # Wide-form input: each column of max_dist (one trial) becomes one box.
    # The former call paired a length-K x vector with a list of K arrays as
    # y, which is not a valid long-form specification.
    sns.boxplot(data=max_dist)
    plt.xticks(trial_ticks, trial_ticks, rotation='horizontal')
    plt.xlabel('Trials', label_font)
    plt.ylabel('Distribution of maxima across ROI', label_font)
    fig.add_subplot(2, 1, 2)
    plt.imshow(big_flag, cmap='hot')
    plt.xticks(trial_ticks, trial_ticks, rotation='horizontal')
    plt.xlabel('Trials', label_font)
    plt.ylabel('ROI', label_font)
    plt.savefig(os.path.join(path, 'explore_integrity2.png'))
# Rows of `mer` belonging to the carriers of each gene. Each selection is
# copied explicitly so that the column assignments below modify an
# independent frame instead of a slice of `mer` (which triggers pandas'
# SettingWithCopyWarning).
palb_risk = mer[mer['IndivID'].isin([str(i) for i in palb_ind])].copy()
chek_risk = mer[mer['IndivID'].isin([str(i) for i in chek_ind])].copy()
atm_risk = mer[mer['IndivID'].isin([str(i) for i in atm_ind])].copy()

# Relabel the version string with the gene name; the column is renamed to
# 'Genes' further down.
palb_risk['Version'] = palb_risk['Version'].str.replace('v4beta14', 'PALB2')
chek_risk['Version'] = chek_risk['Version'].str.replace('v4beta14', 'CHEK2')
atm_risk['Version'] = atm_risk['Version'].str.replace('v4beta14', 'ATM')

mer_ver = pd.concat([chek_risk, atm_risk, palb_risk], axis=0)
mer_ver['Age'] = mer_ver['Age'].astype('int64')
# Ten-year age bins from 20 to 80.
mer_ver['age_range'] = pd.cut(mer_ver['Age'],
                              bins=[20, 30, 40, 50, 60, 70, 80])
mer_ver['BrCaRisk%'] = mer_ver['BrCaRisk%'].astype('float')
mer_ver = mer_ver.rename({'Version': 'Genes'}, axis='columns')

# NOTE: `bx` holds the value returned by set_title(), not the Axes.
bx = sns.boxplot(x="age_range", y="ratio", hue="Genes", data=mer_ver,
                 palette='colorblind', sym='').set_title('Risk Ratio')
plt.savefig('risk_ratio.png', dpi=500)

out_data_v4['BrCaRisk%'] = out_data_v4['BrCaRisk%'].astype('float')
out_data_v4['Age'] = out_data_v4['Age'].astype('int64')
# Families at the extremes of the risk range at age 80.
top_ext_v4 = out_data_v4.loc[
    (out_data_v4['Age'] == 80) & (out_data_v4['BrCaRisk%'] > 70),
    ('FamID', 'BrCaRisk%')]
bottom_ext = out_data_v4.loc[
    (out_data_v4['Age'] == 80) & (out_data_v4['BrCaRisk%'] < 2.8),
    ('FamID', 'BrCaRisk%')]
# Express the interest rate as a fraction instead of a percentage.
X_train['int.rate'] = X_train['int.rate'] / 100
# The test column is still text: replace the percent sign with a space,
# convert to float, then rescale.
X_test['int.rate'] = (
    X_test['int.rate'].str.replace('%', " ").astype(float) / 100
)

# Split the training features by dtype.
num_df = X_train.select_dtypes(include=[np.number])
cat_df = X_train.select_dtypes(include='object')

# Code ends here

# --------------
#Importing header files
import seaborn as sns

# Code starts here
# One box plot per numeric column (the first nine), split by the target.
cols = num_df.columns
fig, axes = plt.subplots(9, 1, figsize=(10, 10))
for position, axis in enumerate(axes):
    sns.boxplot(x=y_train, y=num_df[cols[position]], ax=axis)

# --------------
# Code starts here
# Count plots of the first four categorical columns on a 2x2 grid, filled
# row by row.
cols = list(cat_df.columns)
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for position, axis in enumerate(axes.flat):
    sns.countplot(x=X_train[cols[position]], hue=y_train, ax=axis)

# Code ends here

# --------------
#Importing header files
from sklearn.tree import DecisionTreeClassifier
from datetime import timedelta

# Load the dataset.
data = pd.read_csv('loan_train.csv')
data.head()
print("csv = \n", data.head())

# Check for null values in the dataset.
data.isnull()
print("null = \n", data.isnull())

# Drop the two unnamed index columns.
data = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
print("after droping column = \n ", data.head(20))

# Explore the data: age by education level.
sns.boxplot(data=data, x='education', y='age')
plt.title("education based on the age ")
#plt.show()

# Distribution of the principal.
sns.distplot(data['Principal'])
#plt.show()

# Annotated correlation heat map.
Var_Corr = data.corr()
corr_labels = Var_Corr.columns
sns.heatmap(Var_Corr, xticklabels=corr_labels, yticklabels=corr_labels,
            annot=True)
#plt.show()

# Number of rows per gender.
sns.countplot(data=data, x='Gender')
plt.title("count of male and female ")
def visualization(self):
    """
    Build the descriptive-statistics table and the requested plots.

    Request parameters read from ``self.config``:
    "tableName": "advertising",  # str, database table name
    "X": ["TV", "radio", "newspaper"],  # list, independent variables; several variable names when the table orientation is h, the categorical field when it is v
    "Y": ["sales"],  # list, dependent variable, used when the table orientation is v
    "show_options": ["y_count", "pairs", "corr", "y_corr"],  # display options
    "x_count": [],  # list, independent variables to show a frequency histogram for
    "box": [],  # list, independent variables to show a box plot for
    :return: dict with "res" (list of results), "code" and "msg"; on any
        exception a dict with code "500" and the exception args as "msg"
    """
    try:
        res = []

        def add_plot(title):
            # Queue the output of plot_and_output_base64_png under *title*.
            res.append({
                "title": title,
                "base64": "{}".format(self.plot_and_output_base64_png(plt)),
            })

        # Everything below works on a float copy of the table.
        self.table_data = self.table_data.astype("float")
        data = self.table_data.describe()
        summary_table = {
            "data": data.values.tolist(),
            "title": "描述性统计分析",
            "col": data.columns.tolist(),
            "row": data.index.tolist()
        }
        res.append(transform_table_data_to_html(summary_table))

        cfg = self.config

        # Frequency histograms of the selected independent variables.
        hist_columns = cfg.get("x_count")
        if hist_columns and hist_columns[0]:
            for x in cfg["x_count"]:
                sns.distplot(self.table_data[x], kde=False)
                # y axis label
                plt.ylabel("frequency")
                add_plot("{} - 频率分布".format(x))

        options = cfg["show_options"]

        # Frequency histogram of the dependent variable.
        if "y_count" in options:
            y_name = cfg["Y"][0]
            sns.distplot(self.table_data[y_name], kde=False)
            # x and y axis labels
            plt.xlabel("section")
            plt.ylabel("frequency")
            add_plot("{} - 频率分布".format(y_name))

        # Box plots of the selected independent variables.
        box_columns = cfg.get("box")
        if box_columns and box_columns[0]:
            for x in cfg["box"]:
                sns.boxplot(self.table_data[x], palette="Set2", orient="v")
                add_plot("{} - 箱型图".format(x))

        # Pairwise relations between all variables.
        if "pairs" in options:
            sns.pairplot(self.table_data)
            add_plot("变量两两关系图")

        # Correlation matrix heat map.
        if "corr" in options:
            corr = self.table_data.corr()
            sns.heatmap(corr,
                        xticklabels=corr.columns,
                        yticklabels=corr.columns,
                        linewidths=0.2,
                        cmap="YlGnBu",
                        annot=True)
            add_plot("相关系数图")

        # Correlation of every variable with the dependent variable,
        # sorted in descending order.
        if "y_corr" in options:
            y_correlations = self.table_data.corr()[cfg["Y"][0]]
            y_correlations.sort_values(ascending=False).plot(kind='bar')
            add_plot("因变量和各自变量的相关系数图")

        return {"res": res, "code": "200", "msg": "ok!"}
    except Exception as e:
        return {"data": "", "code": "500", "msg": "{}".format(e.args)}
#%%
# Box plots of percent counts.
# Rank the structures by their median percent count, largest first, and
# keep the names of the top ten for the figure.
order = np.argsort(np.median(pcounts, axis=0))[::-1]
sois_sort = np.array(sois)[order[:10]]

plt.figure(figsize=(5, 4))
df = pd.DataFrame(pcounts)
df.columns = sois

# Individual observations, with an unfilled box plot drawn over them.
shared_layout = {"data": df, "orient": "h", "order": sois_sort}
g = sns.stripplot(color="dimgrey", **shared_layout)
sns.boxplot(showfliers=False, showcaps=False,
            boxprops={"facecolor": "None"}, **shared_layout)
plt.xlabel("% of neocortical neurons")
plt.ylabel("Region")

# Hide the right and top spines.
sns.despine(top=True, right=True, left=False, bottom=False)
plt.tick_params(length=6)

plt.savefig(os.path.join(dst, "prv_nc_pcounts_boxplots.pdf"),
            bbox_inches="tight")
#%%
clf = SVC()
clf.fit(X_train, y_train)
linear_svc = LinearSVC()
print("Accuracy:{}".format(clf.score(X_test, y_test)))

# Create initial prediction; missing numeric values become -1000.
test = df_test[Numeric_Columns].fillna(-1000)
SubMission['Survived'] = clf.predict(test)

# Make first submission
SubMission.set_index("PassengerId", inplace=True)
SubMission.to_csv('myFirstSubmission.csv', sep=',')

# Fare by passenger class and by port of embarkation, on a shared y axis.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(12, 6))
sns.boxplot(data=df_train, x="Pclass", y="Fare", ax=ax1)
plt.figure(1)
sns.boxplot(data=df_train, x="Embarked", y="Fare", ax=ax2)
plt.show()

# Fill missing fares with the median fare of the passenger's port.
# The medians are looked up by port LABEL. The former code indexed the
# grouped result by the port's position in `embarked`, but groupby sorts
# its keys, so each port received another port's median.
embarked = ['S', 'C', 'Q']
median_fare_by_port = df_data.groupby('Embarked')['Fare'].median()
for port in embarked:
    if port not in median_fare_by_port.index:
        # No passenger embarked here, so there is nothing to impute.
        continue
    fare_to_impute = median_fare_by_port[port]
    df_data.loc[(df_data['Fare'].isnull()) & (df_data['Embarked'] == port),
                'Fare'] = fare_to_impute

# Copy the completed fares back: first 891 rows to df_train, rest to df_test.
df_train["Fare"] = df_data["Fare"][:891]
df_test["Fare"] = df_data["Fare"][891:]
print("Missing Fares Estimated")
# Percentile rank of each value in `b` within the 'numberall' column.
a = list(range(1, 10))
numberall_values = summary['numberall'].to_list()  # converted once, not per item
for i in b:
    a[i - 1] = stats.percentileofscore(numberall_values, i)
plt.plot(b, a)

summary['numberall'].hist()
df_total['averagerating'].hist()
plt.scatter(df_total['startyear'], df_total['age'])

# At which age does a director most often produce films?
year_a
year_a['startyearlen'][year_a['count'] > 1].describe()
year_a['startyearlen'][year_a['count'] > 1].hist()

# Glimpse of the data. The column is passed as `x` explicitly because a
# bare positional argument is interpreted differently across seaborn
# versions.
sns.boxplot(x=df['numVotes'])
df['numVotes'].describe()
# numVotes is heavily tailed: most movies have fewer than 300 votes.
np.corrcoef(df['numVotes'], df['averageRating'])
sns.scatterplot(x='numVotes', y='averageRating', data=df)

con = psycopg2.connect(
    # user name
)
# Create a cursor on the connection opened above (the code previously
# referred to an undefined name `conn`).
cur = con.cursor()
# Execute a query
cur.execute('SELECT title, directors FROM data')
# Fetch the first result row. `cur.fetch` is not a cursor method and was
# never called.
# NOTE(review): use fetchall() instead if every row is wanted.
row = cur.fetchone()
# commit the changes
sampled_by_month = forex_close_price.resample(rule='1m', how='last') log_returns = np.log(sampled_by_month / sampled_by_month.shift(1)) log_returns = log_returns[1:] pct_returns = sampled_by_month.pct_change() log_returns.drop(log_returns.index[0], inplace=True) pct_returns.drop(pct_returns.index[0], inplace=True) log_returns.gbpusd = log_returns.gbpusd.dropna() """ plot monthly log return of each pair at month """ monthinteger = 9 month = datetime.date(1900, monthinteger, 1).strftime('%B') ax, fig = plt.subplots(1, 1, figsize=(8, 4)) plt.title('Forex seasonality for %s' % month) plt.ylabel('Monthly log return') sns.boxplot(log_returns.ix[log_returns.index.month == monthinteger]) ax.autofmt_xdate() ax.savefig(result_dir + 'Monthly log return in %s.png' % month) """ boxplot monthly log return of a pair from Jan to Dec """ for sym in forex_list: ax, fig = plt.subplots(1, 1, figsize=(8, 4)) sns.boxplot(data=[log_returns.loc[log_returns.index.month==s, sym].dropna().values \ for s in range(1,13)]) # sns.boxplot(data=[log_returns[sym][log_returns[sym].index.month==s].dropna().values \ # for s in range(1,13)]) plt.title('seasonality in %s from 2005-2017' % sym) plt.xlabel('Months') ax.savefig(result_dir + ('seasonality in %s from 2005-2017.png' % sym)) """
# Keyword arguments that select the CJK-capable font for a text element.
cjk_font = {'fontproperties': siyuanheiti}

plt.xticks(**cjk_font)
ax1.set_title('北京各大区二手房每平米单价对比', fontsize=15, **cjk_font)
ax1.set_xlabel('区域', **cjk_font)
ax1.set_ylabel('每平米单价', **cjk_font)

# Number of second-hand homes per region.
f2, ax2 = plt.subplots(1, 1, figsize=(20, 7))
sns.barplot(data=df_house_count, x='Region', y='Price',
            palette="Greens_d", ax=ax2)
plt.xticks(**cjk_font)
ax2.set_title('北京各大区二手房数量对比', fontsize=15, **cjk_font)
ax2.set_xlabel('区域', **cjk_font)
ax2.set_ylabel('数量', **cjk_font)

# Total price of second-hand homes per region.
f3, ax3 = plt.subplots(1, 1, figsize=(20, 7))
sns.boxplot(data=df, x='Region', y='Price', ax=ax3)
plt.xticks(**cjk_font)
ax3.set_title('北京各大区二手房房屋总价', fontsize=15, **cjk_font)
ax3.set_xlabel('区域', **cjk_font)
ax3.set_ylabel('房屋总价', **cjk_font)
# plt.show()

# Analysis of the Size feature.
# Left panel: distribution of home size; right panel: size against price.
f4, [ax4, ax5] = plt.subplots(1, 2, figsize=(15, 5))
sns.distplot(df['Size'], bins=20, ax=ax4, color='r')
sns.kdeplot(df['Size'], shade=True, ax=ax4)
ax4.set_title("北京各大区二手房大小分布", **cjk_font)
sns.regplot(data=df, x='Size', y='Price', ax=ax5)
ax5.set_title("北京各大区二手房大小与价格分布", **cjk_font)
y_val_fin = pd.concat(y_val_2) return X_train_fin, X_test_fin, X_val_fin, y_test_fin, y_train_fin, y_val_fin X_train, X_test, X_val, y_test, y_train, y_val = test_train_splitter(test_df) X_train = X_train.reset_index(drop=True).copy() X_test = X_test.reset_index(drop=True).copy() y_test = y_test.reset_index(drop=True).copy() y_train = y_train.reset_index(drop=True).copy() X_val = X_val.reset_index(drop=True).copy() y_val = y_val.reset_index(drop=True).copy() # In[35]: sns.boxplot(X_train['Attr27']) # In[36]: sns.distplot(X_train['Attr27']) # In[4]: X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape # In[5]:
# Looking at the previous plot, people who did not survive were more likely
# to belong to third class, i.e. the lowest and cheapest class, while
# people who did survive tended to belong to the higher classes.

# Distribution plot of passenger age.
sns.distplot(train['Age'].dropna(), kde=False, bins=30, color='Green')
# The most common age group is somewhere between 20 and 30; the older the
# age, the fewer passengers there are on board.

# Count plot of the number of siblings or spouses on board.
sns.countplot(x='SibSp', data=train)
# Most people on board had neither siblings nor a spouse with them. The
# second most common value is 1, which is most likely a spouse. There are
# many single passengers on board.

# Distribution plot of the ticket fare.
train['Fare'].hist(color='green', bins=40, figsize=(8, 4))
# Most purchase prices lie between 0 and 50, which makes sense: tickets are
# skewed towards cheaper fares because most passengers travel in the
# cheaper third class.

#%%% Data Cleaning
# We want to fill in missing age data instead of dropping the rows where
# age is missing. One way is to fill in the mean age of all passengers, but
# a smarter way is to look at the typical age per passenger class.

# Box plot with age on the y axis and passenger class on the x axis.
plt.figure(figsize=(12, 7))
sns.boxplot(x='Pclass', y='Age', data=train, palette='winter');
# Wealthier passengers in the higher classes tend to be older, which makes
# sense. These per-class ages are used to impute Age based on Pclass.


def impute_age(cols):
    """Return the age, substituting a per-class value when it is missing.

    Parameters
    ----------
    cols : sequence
        Holds the age first and the passenger class second, e.g. one row
        of ``train[['Age', 'Pclass']]``.

    Returns
    -------
    The original age when it is not null; otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class.
    """
    # Unpack by position through iteration. Integer indexing (cols[0]) on a
    # row Series with string labels is deprecated in pandas.
    age, pclass, *_ = cols
    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24


# Now apply that function!
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)
# Now let's check that heatmap again!
def plot_NO2_by_district(df):
    """Save a box plot of 2017 NO2 levels per district to ``img/Figure4c.pdf``.

    Boxes are grouped by the ``district`` column in a fixed order and split
    by the ``tipus`` column (red / blue); the two legend entries are
    relabelled 'Traffic' and 'Background'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``district``, ``no2_2017`` and ``tipus``.
    """
    # The figure size is set at creation. It was previously passed to
    # savefig(), which has no such argument.
    fig, ax = plt.subplots(figsize=(10, 6))
    flierprops = dict(markerfacecolor='1', markersize=8, marker='o',
                      linestyle='none')
    colors = ['red', 'dodgerblue']
    bplot = sns.boxplot(
        x='district',  # vertical
        y='no2_2017',
        orient='v',
        hue='tipus',
        data=df,
        width=.5,
        palette=colors,
        linewidth=0.5,
        flierprops=flierprops,
        whis=[5, 95],
        order=[
            'Ciutat Vella', 'Eixample', 'Sants-Montjuïc', 'Les Corts',
            'Sarrià-Sant Gervasi', 'Gràcia', 'Horta-Guinardó', 'Nou Barris',
            'Sant Andreu', 'Sant Martí'
        ])

    # Restyle every box: semi-transparent fill, edge in the fill colour, and
    # the six lines that follow the box recoloured to match.
    lines_per_box = 6
    white_line_offset = 4  # presumably the median line -- TODO confirm
    for i, artist in enumerate(bplot.artists):
        r, g, b, a = artist.get_facecolor()
        artist.set_facecolor((r, g, b, .75))
        col = artist.get_facecolor()
        artist.set_edgecolor(col)
        for offset in range(lines_per_box):
            line = bplot.lines[i * lines_per_box + offset]
            line_col = '#ffffff' if offset == white_line_offset else col
            line.set_color(line_col)
            line.set_mfc(line_col)
            line.set_mec(line_col)

    ax.set_xlabel('', **label_style)
    ax.set_ylabel('NO2 levels', **label_style)
    plt.xticks(rotation=90, **ticks_style)
    plt.yticks(**ticks_style)

    # Remove the frame and the tick marks, keeping only the tick labels.
    for side in ("top", "bottom", "right", "left"):
        plt.gca().spines[side].set_visible(False)
    plt.grid(axis='x', alpha=.5, linewidth=.5, color='lightgrey')
    plt.tick_params(axis='x', which='both', bottom=False, top=False,
                    labelbottom=True)
    plt.tick_params(axis='y', which='both', left=False, right=False,
                    labelleft=True)

    plt.legend(frameon=False, prop=legend_style)
    for t, l in zip(ax.get_legend().texts, ['Traffic', 'Background']):
        t.set_text(l)

    plt.tight_layout()
    plt.savefig('img/Figure4c.pdf', dpi=300)
y_score = res.LLR.values y_true = np.array([x.upper() in all_markers_norm for x in res.index]).astype('int') fpr, tpr, thresholds = roc_curve(y_true, y_score) auc_i = roc_auc_score(y_true, y_score) aucs.append([auc_i, 'spatialDE']) plt.plot(fpr, tpr, color='green') plt.show() aucs = pd.DataFrame(aucs, columns=['AUC', 'Method']) sns.boxplot(x='Method', y='AUC', data=aucs) plt.ylim(0, 1) plt.show() # %% aucs = [] plt.figure() for ff in hs_files: res = pd.read_table(ff, index_col=0) y_score = res.Z.values y_true = np.array([x.upper() in all_markers_norm for x in res.index]).astype('int')
#f, ax = plt.subplots(figsize=(6, 8)) ax[1] = sns.countplot(x=" income", data=dataset, palette="Set1") ax[1].set_title("Frequency distribution of income variable") plt.show() # Distribution of age variable f, ax = plt.subplots(figsize=(10, 8)) x = dataset['age'] ax = sns.distplot(x, bins=10, color='blue') ax.set_title("Distribution of age variable") plt.show() # Detect outliers in age variable with boxplot f, ax = plt.subplots(figsize=(10, 8)) x = dataset['age'] ax = sns.boxplot(x) ax.set_title("Visualize outliers in age variable") plt.show() # Visualize income with respect to age variable f, ax = plt.subplots(figsize=(10, 8)) ax = sns.boxplot(x=" income", y="age", data=dataset) ax.set_title("Visualize income with respect to age variable") plt.show() # Visualize income with respect to age and sex variable plt.figure(figsize=(8, 6)) ax = sns.catplot(x=" income", y="age", col=" sex", data=dataset,
# One stacked panel per row; the canvas widens with the number of compared
# dataframes and grows taller with the number of rows. All panels share x.
fig_width = 1.2 * len(dataframes) + 2
fig_height = 4.0 * n_rows
fig, axs = plt.subplots(nrows=n_rows,
                        ncols=1,
                        figsize=[fig_width, fig_height],
                        sharex=True)
fig.subplots_adjust(hspace=0.5)
fig.suptitle("DBScan_Clustering result" + titleext,
             size="xx-large",
             weight="black")

# boxplots
sns.set_style("whitegrid")
# The box layer and the strip layer show the same data on the first panel,
# so they share one set of keyword arguments.
purity_recall_kwargs = dict(ax=axs[0],
                            x="Technique",
                            y="PuRicall",
                            hue="Metric",
                            data=redundancy_frame,
                            palette="Set2")
sns.boxplot(**purity_recall_kwargs)
sns.stripplot(dodge=True,
              edgecolor="black",
              linewidth=0.3,
              **purity_recall_kwargs)
# Legend handles/labels are collected so that only half of the legend needs to
# be shown; the code doing that lies outside this excerpt.
handles, labels = axs[0].get_legend_handles_labels()
axs[0].set_ylabel("Purity & Recall")
# Total number of missing cells in the frame. The bare expressions in this
# excerpt discard their result; presumably they are meant for interactive or
# notebook display -- confirm.
df.isnull().values.sum()
# Fraction of missing values per column, listed in ascending order with the
# first ten entries skipped.
missing_ratio = df.isnull().sum() / len(df)
missing_ratio.sort_values(ascending=True)[10:]

# check UA_T: rows whose 'UA_0.5W_T' value is missing
null_data = df[df['UA_0.5W_T'].isnull()]

# drop the four rows with these MVID values
#df = df[(df["MVID"] != 47553) & (df["MVID"] != 46899) & (df["MVID"] != 67598) & (df["MVID"] != 45798)]
df = df[~df["MVID"].isin([47553, 46899, 67598, 45798])]

# check FCO: rows whose 'FCO_0.5W_T' value is missing
null_data = df[df['FCO_0.5W_T'].isnull()]

# fill missing values with group median
# Box plot of 'FCO_0.5W_T' per CATEGORY.
sns.boxplot(x="CATEGORY", y="FCO_0.5W_T", data=df)

# get median of each FCO column per CATEGORY
df[[
    'FCO_0.5W_T', 'FCO_0.5W_M24', 'FCO_0.5W_M26', 'FCO_0.5W_F24',
    'FCO_0.5W_F26', 'CATEGORY'
]].groupby('CATEGORY').median()


# FCO_0.5W_T
# NOTE: the body of this function continues past the end of the visible
# excerpt; only its first lines are shown. It reads two positional entries
# from `cols` and branches on whether the first one is null.
def impute_FCO(cols):
    FCO = cols[0]
    GROUP = cols[1]
    if pd.isnull(FCO):
# Combine the frames collected in `list_tmp` into the rcp85 table.
final_rcp85 = pd.concat(list_tmp)

# Tag each table with a numeric code so that the three of them can be told
# apart (and used as the hue) after being stacked into one frame.
final_histo = final_histo.assign(Location=1)
final_rcp45 = final_rcp45.assign(Location=2)
final_rcp85 = final_rcp85.assign(Location=3)
cdf = pd.concat([final_histo, final_rcp45, final_rcp85])

# Long form: one row per (Location, original column, value).
# `var_name` is documented as a scalar; a one-element list is rejected by
# recent pandas releases for single-level columns.
mdf = pd.melt(cdf, id_vars=['Location'], var_name='temp_bins')

ax = sns.boxplot(
    x="temp_bins",
    y="value",
    hue="Location",
    data=mdf,
    showfliers=False,
    palette=[
        sns.xkcd_rgb["medium green"], sns.xkcd_rgb["medium blue"],
        sns.xkcd_rgb["pale red"]
    ],
)  # https://xkcd.com/color/rgb/
#ax.set(ylim=(0, 50))
#plt.legend(title='Smoker', loc='upper left', labels=['RCMs historical', 'RCMs rcp45 scenario', 'RCMs rcp85 scenario'])

# Keep the handles seaborn created but replace the numeric Location labels
# with the entries of `legends` (defined outside this excerpt).
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, legends, prop={'size': 15})
ax.set_title('Outaouais Watershed: November to May',
             fontdict={
                 'fontsize': 20,
                 'fontweight': 'bold'
             })