def plot_accuracy_with_random_by_category(true, predicted, sort=True):
    """Draw per-category model accuracy over the random-chance baseline.

    The table returned by ``calcuate_accuracy_above_random_chance`` is drawn
    as two bar series on the same axes: the ``Model`` column first (light
    bars) and the ``Random`` column on top of it (dark bars), both keyed by
    the ``Category`` column.

    Parameters:
        true (array): the observed values.
        predicted (array): the values compared against ``true``.
        sort (bool): forwarded unchanged to
            ``calcuate_accuracy_above_random_chance``.

    Returns:
        tuple: ``(ax, res)`` -- the axes that were drawn on, followed by the
        accuracy table (note the order: axes first).
    """
    # compute the values
    accuracy = calcuate_accuracy_above_random_chance(true, predicted, sort)

    # set styles
    sns.set_style('white')
    sns.set_context('talk')

    # The first call lets seaborn pick the current axes (ax=None); the
    # second call reuses the axes returned by the first.
    axes = None
    for column, shade in (('Model', '#c0cdf3'), ('Random', '#4f73dd')):
        axes = sns.barplot(x=column, y='Category', data=accuracy,
                           color=shade, ax=axes)

    # configure the plot details
    plt.xlim(0, 100)
    plt.xlabel('Accuracy (%)', size=24)
    plt.ylabel('')
    axes.tick_params(axis='both', labelsize=22)
    sns.despine()

    return axes, accuracy
def _plot_categorical_var_dist(var_data, ax, show):
    """Plot the relative frequency of each category as a bar chart.

    This is a helper function called from plot_var_dist. It'll be used in
    the case that categorical data is passed in.

    Args:
        var_data: object exposing ``value_counts()`` (e.g. a pandas Series).
        ax: matplotlib.pyplot.Axes object or None.
            When None, seaborn draws on the current axes.
        show: bool
            When true, ``plt.show()`` is called after drawing.
    """
    var_data_counts = var_data.value_counts()
    var_data_percs = var_data_counts / var_data_counts.sum()

    # x/y are passed by keyword: seaborn >= 0.12 rejects positional data
    # vectors. ``ax=None`` is seaborn's default, so one call covers both the
    # "axes supplied" and "no axes supplied" cases.
    ax = sns.barplot(x=var_data_percs.index, y=var_data_percs.values,
                     palette="BuGn_d", ax=ax)

    # Label every bar with the fraction it represents.
    _add_bar_text(ax, ax.patches, var_data_percs.values)

    if show:
        plt.show()
def sentimentAccuracy(tickeName):
    """Plot the 1-, 2- and 3-day log returns against sentiment.

    Loads the frame for ``tickeName`` from the ``resultsMKII`` path via
    ``call_data`` and draws three stacked bar charts, one per column
    ``'1 day'``, ``'2 day'`` and ``'3 day'``, each grouped by the
    ``'Sentiment'`` column. The figure is shown; nothing is returned.

    Args:
        tickeName: identifier forwarded to ``call_data``.
    """
    path2 = 'resultsMKII'
    # Fixed: the body used the undefined name ``tick_Name`` instead of the
    # parameter, which raised NameError (or silently read a global).
    frame2 = call_data(tickeName, path2)

    # Re-index every column positionally (0..n-1), as the original
    # element-by-element copy did.
    index = list(range(len(frame2)))
    days = (1, 2, 3)
    result = {'Sentiment': pd.Series(frame2['Sentiment'].values, index=index)}
    for day in days:
        result['logReturn%d' % day] = pd.Series(
            frame2['%d day' % day].values, index=index)

    # NOTE(review): ``sns.plt`` is an alias that recent seaborn releases no
    # longer expose; switch to ``matplotlib.pyplot`` if this file imports it.
    for day in days:
        sns.plt.subplot(len(days), 1, day)
        axis = sns.barplot(x="Sentiment", y='logReturn%d' % day, ci=None,
                           data=result)
        axis.set(xlabel='Sentiment', ylabel='Day %d' % day)
    sns.plt.show()
def p6(data):
    """Show age spread and survival per title in two stacked panels.

    The upper panel is a box plot of ``Age`` by ``Title`` (rows ordered by
    ``Age`` first); the lower panel is a bar plot of ``Survived`` by
    ``Title``. Both panels share the x axis. The figure is displayed.

    Args:
        data: table with ``Title``, ``Age`` and ``Survived`` columns.
    """
    figure, panels = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    age_panel, survival_panel = panels

    rows_by_age = data.sort_values("Age")
    sns.boxplot(data=rows_by_age, x="Title", y="Age", ax=age_panel)
    sns.barplot(data=data, x="Title", y="Survived", ax=survival_panel)

    plt.show()
def historyEffectOnSentiment(tickeName):
    """Plot the log returns of the five preceding days against sentiment.

    Loads the frame for ``tickeName`` from the ``resultsMKII`` path via
    ``call_data`` and draws five stacked bar charts, one per column
    ``'-1 day'`` .. ``'-5 day'``, each grouped by the ``'Sentiment'``
    column. The figure is shown; nothing is returned.

    Args:
        tickeName: identifier forwarded to ``call_data``.
    """
    path2 = 'resultsMKII'
    # Fixed: the body used the undefined name ``tick_Name`` instead of the
    # parameter, which raised NameError (or silently read a global).
    frame2 = call_data(tickeName, path2)

    # Re-index every column positionally (0..n-1), as the original
    # element-by-element copy did.
    index = list(range(len(frame2)))
    days = (1, 2, 3, 4, 5)
    result = {'Sentiment': pd.Series(frame2['Sentiment'].values, index=index)}
    for day in days:
        result['logReturn%d' % day] = pd.Series(
            frame2['-%d day' % day].values, index=index)

    # NOTE(review): ``sns.plt`` is an alias that recent seaborn releases no
    # longer expose; switch to ``matplotlib.pyplot`` if this file imports it.
    for day in days:
        sns.plt.subplot(len(days), 1, day)
        axis = sns.barplot(x="Sentiment", y='logReturn%d' % day, data=result)
        axis.set(xlabel='Sentiment', ylabel='Day -%d' % day)
    sns.plt.show()
def get_xgb_feature_importance_plot(best_param_, experiment_, png_folder,
                                    png_fname, score_threshold=0.8):
    """Fit an XGBoost classifier and save a bar chart of its f-scores.

    Args:
        best_param_: dict of parameters passed to ``XGBClassifier.set_params``.
            A ``'model_type'`` entry, if present, is removed from this dict
            in place before use.
        experiment_: object whose ``get_train_data()`` returns
            ``(train_X, train_y)``.
        png_folder: unused; the output directory is built from
            ``Config.get_string('data.path')`` plus ``'graph'``.
        png_fname: file name of the image to write.
        score_threshold: quantile of the positive scores used as the cut-off
            when more than 20 features have a score. With 20 or fewer
            features no filtering is applied.

    Returns:
        True, after the image has been written and the figure closed.
    """
    # 1. train the model
    train_X, train_y = experiment_.get_train_data()

    # 'model_type' is not an XGBoost parameter. ``pop`` with a default
    # replaces the former bare ``try/del/except: pass``.
    best_param_.pop('model_type', None)

    clf = XGBClassifier()
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)

    # Newer xgboost releases expose the booster through ``get_booster()``;
    # older ones used ``booster()``.
    booster = clf.get_booster() if hasattr(clf, 'get_booster') else clf.booster()
    index2feature = booster.get_fscore()

    # Materialise the dict views so DataFrame construction also works on
    # Python 3, and use sort_values (DataFrame.sort no longer exists).
    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    fis = fis.sort_values('score', ascending=False)

    if len(fis.index) > 20:
        cutoff = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        # A boolean mask avoids the precision loss of formatting the cut-off
        # with '%f' into a query string.
        fis = fis[fis['score'] >= cutoff]

    # 2. plot the feature importance
    sns.barplot(x='score', y='name', data=fis, color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
def clust_stability(log2_expdf_gene, iterations=16):
    """Measure how stable the clustering order is as more genes are used.

    For an increasing number of genes (starting at 100, below 1000) the data
    is passed through ``plot_PCA`` and ``clust_heatmap``. The column order
    returned by each step is compared with ``difflib.SequenceMatcher``
    against the order of the previous step and against the order of the
    first (100-gene) step. Both ratio series are drawn as bar charts, saved
    as ``clustering_stability.pdf`` and shown.

    The output directory is read from the module-level name ``filename``.

    Args:
        log2_expdf_gene: expression table; indexed by the gene list that
            ``plot_PCA`` returns.
        iterations: controls the step between gene counts
            (``round(1000 / iterations)``).

    Returns:
        list of ``(ratio_to_previous, ratio_to_first)`` tuples, one for every
        step after the first.
    """
    sns.set(context='poster', font_scale=1)
    sns.set_palette("RdBu_r")

    stability_ratio = []
    end_num = 1000
    iter_list = range(100, int(round(end_num)), int(round(end_num/iterations)))

    for gene_number in iter_list:
        title = str(gene_number) + ' genes plot.'
        top_pca = plot_PCA(log2_expdf_gene, num_genes=gene_number, title=title)
        top_pca_by_gene = log2_expdf_gene[top_pca]
        cell_linkage, plotted_df_by_gene, col_order = clust_heatmap(
            top_pca, top_pca_by_gene, num_to_plot=gene_number, title=title)

        if gene_number == 100:
            # First step: nothing to compare against yet.
            s1 = col_order
            s0 = col_order
        else:
            s2 = col_order
            sm_running = difflib.SequenceMatcher(None, s1, s2)
            sm_first = difflib.SequenceMatcher(None, s0, s2)
            stability_ratio.append((sm_running.ratio(), sm_first.ratio()))
            s1 = col_order
        # Close the figure left current by the plotting helpers.
        plt.close()

    # One bar per step after the first; a concrete list (not a range slice)
    # is what seaborn expects as a data vector.
    x = list(iter_list[1:])
    y1 = [m[0] for m in stability_ratio]
    y2 = [m[1] for m in stability_ratio]

    f, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
    # x/y by keyword: seaborn >= 0.12 rejects positional data vectors.
    sns.barplot(x=x, y=y1, palette="RdBu_r", ax=ax1)
    ax1.set_ylabel('Running ratio (new/last)')
    sns.barplot(x=x, y=y2, palette="RdBu_r", ax=ax2)
    ax2.set_ylabel('Ratio to 100')

    plt.savefig(os.path.join(filename, 'clustering_stability.pdf'),
                bbox_inches='tight')
    plt.show()
    plt.close()
    return stability_ratio
def createSubOverviewPage(self):
    """Build the layout holding the 'Liability Overview' bar chart.

    Queries the LIABILITY table for rows whose EXP_DATE is on or after
    ``self.sysdate``, grouped by EXP_DATE, and draws two overlaid horizontal
    bar series per date: the summed amount including the expected return
    (labelled 'Interest') and, on top of it, the summed amount alone
    (labelled 'Principal').

    Returns:
        The ``QtGui.QGridLayout`` containing the widget that parents the
        figure canvas.
    """
    layout = QtGui.QGridLayout()
    w = QtGui.QWidget()

    sns.set(style="whitegrid")
    f, ax = plt.subplots(figsize=(20, 12))
    canvas = figureCanvas(f)
    canvas.setParent(w)

    # NOTE(review): the date is interpolated into the SQL text. It comes from
    # ``self.sysdate`` rather than user input, but a bound parameter would be
    # the safer form.
    q = QtSql.QSqlQuery(
        "SELECT EXP_DATE, SUM(AMOUNT), "
        "SUM(AMOUNT*(1+EXP_RETURN*(datediff(EXP_DATE, SETTLE_DATE)+1)/36500.0)) "
        "FROM LIABILITY WHERE EXP_DATE>='%s' GROUP BY EXP_DATE ORDER BY EXP_DATE"
        % self.sysdate.date().toPyDate())

    dates, vals = [], []
    while q.next():
        dates.append(q.value(0).toDate().toPyDate().isoformat())
        vals.append((q.value(1).toDouble()[0], q.value(2).toDouble()[0]))
    data = pd.DataFrame(vals, index=dates, columns=['Amount', 'Total Return'])

    # Amount including expected return, drawn first so the principal bar
    # drawn afterwards covers its lower part.
    sns.set_color_codes("pastel")
    sns.barplot(x='Total Return', y=dates, data=data, label='Interest',
                color="b", ax=ax)

    # Principal only, drawn on top.
    sns.set_color_codes("muted")
    sns.barplot(x='Amount', y=dates, data=data, label="Principal",
                color="b", ax=ax)

    # Add a legend and informative axis label
    ax.legend(ncol=2, loc="upper right", frameon=True)
    ax.set(ylabel="Maturity Date", title='Liability Overview')
    sns.despine(left=True, bottom=True)

    layout.addWidget(w, 0, 0, 1, 1)
    return layout
def plot_simdf_summary(simdf):
    """Draw a 2x2 summary figure of a simulation table.

    Top row: bar plots of ``rt`` and ``switch`` by ``choice`` for the
    targets A-D. Bottom row: time series over ``trial`` (unit ``agent``) of
    the columns ``vd<target>`` (left) and ``vi<target>`` (right), one line
    per target in the target's colour.

    Args:
        simdf: table with columns ``choice``, ``rt``, ``switch``, ``trial``,
            ``agent`` and ``vdA``..``vdD`` / ``viA``..``viD``.
    """
    f, axes = plt.subplots(2, 2, figsize=(12, 8))
    a1, a2, a3, a4 = axes.flatten()

    targets = ['A', 'B', 'C', 'D']
    clrs = ['#3572C6', '#c44e52', '#8172b2', '#83a83b']
    targetColors = dict(zip(targets, clrs))

    sns.barplot(x='choice', y='rt', data=simdf, ax=a1, order=targets,
                palette=targetColors)
    sns.barplot(x='choice', y='switch', data=simdf, ax=a2, order=targets,
                palette=targetColors)
    a1.set_ylabel('Response Time (ms)', fontsize=13)
    a2.set_ylabel('P(Switch)', fontsize=13)

    # Average only the two columns that are used; averaging every column
    # fails on non-numeric columns with recent pandas.
    choice_means = simdf.groupby('choice')[['rt', 'switch']].mean()
    rts = choice_means['rt'].values
    sw = choice_means['switch'].values
    a1.set_ylim(rts.min()*.85, rts.max()*1.15)
    a2.set_ylim(sw.min()*.50, sw.max()*1.20)

    # (Removed an unused per-target frame that filtered on the literal
    # string 'target' instead of the loop variable.)
    # NOTE(review): ``sns.timeseries.tsplot`` is not available in recent
    # seaborn releases.
    for target in targets:
        tcolor = targetColors[target]
        sns.timeseries.tsplot(data=simdf, time='trial', unit='agent',
                              value='vd'+target, ax=a3, color=tcolor)
        sns.timeseries.tsplot(data=simdf, time='trial', unit='agent',
                              value='vi'+target, ax=a4, color=tcolor)

    a3.legend(loc=0)
    f.subplots_adjust(hspace=.35, wspace=.4)
    a3.set_ylabel('$v^G_t$', fontsize=16)
    a4.set_ylabel('$v^N_t$', fontsize=16)
    a3.set_xlabel('Trial ( $t$ )', fontsize=13)
    a4.set_xlabel('Trial ( $t$ )', fontsize=13)
    plt.subplots_adjust(wspace=.4)
    sns.despine()
def barplot_top_n_functions(df, n, sort_criterium='tot_time', show_std=True):
    '''
    Barplot of the n most time consuming functions (sorted by sort_criterium)

    df: panda dataframe (e.g. via get_df_from_stats()) with two-level
        columns such as ('tot_time', 'mean') and ('tot_time', 'std'),
        plus a 'flf' column used as the bar label
    n: number of functions to show
    sort_criterium: first-level column name to sort by and to plot
    show_std: draw the (sort_criterium, 'std') column as error bars;
        set to False if you don't want them

    returns: the matplotlib Axes the bars were drawn on
    '''
    tt = ('tot_time', 'mean')
    s_c = (sort_criterium, 'mean')  # sort criterium including mean

    total_time = df[tt].sum()
    # sort_values replaces DataFrame.sort(columns=...), which no longer
    # exists in pandas.
    data = df.sort_values(by=[s_c], ascending=False).iloc[0:n]
    topn_time = data[tt].sum()
    frac_time = topn_time / total_time

    errs = data[(sort_criterium, 'std')] if show_std else None

    f, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(data=data, x=s_c, y='flf', color='b', xerr=errs, ax=ax)
    sns.despine(left=True, bottom=True)
    ax.set(ylabel="", xlabel=sort_criterium + " [s]")

    # write the fraction of total time spent in these n functions, placed
    # relative to the first bar; skipped when nothing was drawn (n == 0)
    if ax.patches:
        rect = ax.patches[0]
        txt = str(int(100*frac_time)) + "% of total runtime"
        ax.text(rect.get_width()*0.7, rect.get_height()*1.5, txt,
                ha="center", va="center")

    plt.tight_layout()
    return ax
def gen_abnormalities_bar(good, bad):
    """Bar-plot the most common abnormalities of good vs. bad alignments.

    Both inputs are parsed with ``gen_abnormalities_data``; the results are
    used as counters (``most_common``). Each counter is reduced in place to
    the union of the 25 most common keys of either one, then both are
    plotted side by side with ``alignment`` as the hue.

    Args:
        good: input forwarded to ``gen_abnormalities_data`` for the good set.
        bad: input forwarded to ``gen_abnormalities_data`` for the bad set.
    """
    def _as_frame(counter, alignment):
        # One row per abnormality, most frequent first.
        rows = counter.most_common()
        return pd.DataFrame({'alignment': alignment,
                             'abnormality': [name for name, _ in rows],
                             'count': [count for _, count in rows]})

    # Single-argument print() calls produce the same text on Python 2 and 3.
    print('Parsing good json.')
    g = gen_abnormalities_data(good)
    print('Total Good: {}'.format(len(g)))

    print('Parsing bad json.')
    b = gen_abnormalities_data(bad)
    print('Total Bad: {}'.format(len(b)))

    most_common = {name for name, _ in g.most_common(25)}
    most_common.update(name for name, _ in b.most_common(25))

    # Iterate over a snapshot of the keys: deleting while iterating the live
    # key view raises on Python 3.
    for k in list(g.keys()):
        if k not in most_common:
            del g[k]
    print('Filtered Good: {}'.format(len(g)))

    for k in list(b.keys()):
        if k not in most_common:
            del b[k]
    print('Filtered Bad: {}'.format(len(b)))

    # pd.concat replaces DataFrame.append, which recent pandas removed.
    data = pd.concat([_as_frame(g, 'good'), _as_frame(b, 'bad')])
    data = data.sort_values('count', ascending=False)
    print(data)

    sns.barplot(x='abnormality', y='count', hue='alignment', data=data)
def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, df, console):
    """Create the frame and embed a bar chart of per-feature scores.

    The scores come from ``SelectKBest(f_classif, k=3)`` fitted on the
    training data. Feature names are the columns of ``df`` except ``"NSP"``.

    Args:
        master: parent Tk widget.
        x_train, y_train: training data used to fit the selector.
        x_test, y_test: stored on the instance; not used here.
        evaluator: stored on the instance; not used here.
        df: table whose column names label the features.
        console: stored on the instance; not used here.
    """
    Tk.Frame.__init__(self, master)
    self.x_train = x_train
    self.y_train = y_train
    self.x_test = x_test
    self.y_test = y_test
    self.evaluator = evaluator
    self.df = df
    self.console = console

    frame_train = Tk.Frame(self)
    frame_train.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)

    # A single figure is created. The previous code first opened a 12x20
    # figure that was never drawn on or closed.
    plt.figure(figsize=(12, 8))
    plt.subplot(111)

    # Score every feature.
    selection = SelectKBest(f_classif, k=3)
    selection.fit(self.x_train, self.y_train)
    feature_scores = selection.scores_
    feature_names = df.columns.values
    feature_names = feature_names[feature_names != "NSP"]

    # Store as a DataFrame (a list, since zip is lazy on Python 3).
    rec = list(zip(feature_scores, feature_names))
    data = pd.DataFrame(rec, columns=["Score", "Feature"])

    sns.barplot(x="Feature", y="Score", data=data)
    plt.xticks(rotation=-90)
    plt.title("Cardiotocography Feature Scores Ranking")

    self.attach_figure(plt.gcf(), frame_train)
def plot_number_of_user_ratings_per_context():
    """Plot how many rating groups exist for each group size.

    Ratings are grouped by ``(user, context_name, term_type)`` and the rows
    of each group are counted (``num``). The groups are then counted per
    ``num`` value (``count``). The first 20 rows of that table are drawn as
    a bar chart and saved under the name ``number_of_ratings``.
    """
    ratings = load_ratings_with_contexts()

    # Size of every (user, context, term type) group.
    group_sizes = ratings.groupby(['user', 'context_name', 'term_type']).apply(len)
    group_sizes = group_sizes.reset_index().rename(columns={0: 'num'})

    # How often each group size occurs.
    size_counts = group_sizes.groupby('num').apply(len)
    size_counts = size_counts.reset_index().rename(columns={0: 'count'})

    shown = size_counts.head(n=20)
    bar_colour = output.palette()[0]
    sns.barplot(x='num', y='count', data=shown, color=bar_colour)

    plt.ylabel('Number of users')
    plt.xlabel('Number of ratings per context')
    output.savefig('number_of_ratings')
def age_histogram(df_age):
    """Show a bar chart of the number of postings per age.

    Bars are drawn in ascending age order. Tick labels and the in-plot
    annotations assume the first bar is age 18 and that ages are contiguous.

    Args:
        df_age: table with an ``age`` column; one row per posting.
    """
    first_age = 18  # age represented by the bar at position 0

    age_counts = df_age.groupby('age').age.count()
    y = age_counts.values
    x = [int(age) for age in age_counts.index]

    f, ax = plt.subplots(1, 1, figsize=(12, 8))
    # x/y by keyword: seaborn >= 0.12 rejects positional data vectors.
    sns.barplot(x=x, y=y,
                palette=sns.dark_palette('#008080', reverse=True, n_colors=60),
                linewidth=0, ax=ax)
    ax.set_ylabel('Postings')
    ax.set_xlabel('')
    ax.set_title('Histogram of Postings by Age')

    # One tick per labelled age, so positions and labels have the same
    # length (previously 20 positions vs. 16 labels, which current
    # matplotlib rejects). Position = age - first_age.
    labelled_ages = list(range(20, 95, 5))
    x_ticks = [0] + [age - first_age for age in labelled_ages]
    x_ticklabels = [''] + labelled_ages
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels)

    sns.despine(bottom=True, right=True)
    # ``plt`` is already used by this function; ``sns.plt`` is no longer
    # exposed by recent seaborn releases.
    plt.xlim(-1, 90)

    # Annotate every fifth age below 70 above its bar.
    for i, p in enumerate(ax.patches):
        height = p.get_height()
        age = i + first_age
        if (age % 5 == 0) and (age < 70):
            ax.text(p.get_x()-1, height + 4, age, fontsize=18)

    plt.show()
def plot_heldout_prediction(input_vals, probs, fname, n=10, title=""):
    """Save a PNG plot visualizing posterior uncertainty on heldout data.

    Args:
      input_vals: A `float`-like Numpy `array` of shape
        `[num_heldout] + IMAGE_SHAPE`, containing heldout input images.
      probs: A `float`-like Numpy array of shape `[num_monte_carlo,
        num_heldout, num_classes]` containing Monte Carlo samples of
        class probabilities for each heldout sample.
      fname: Python `str` filename to save the plot to.
      n: Python `int` number of datapoints to visualize.
      title: Python `str` title for the plot.
    """
    # Take the class count from the data instead of assuming 10 classes.
    class_ids = np.arange(probs.shape[-1])

    fig = figure.Figure(figsize=(9, 3*n))
    canvas = backend_agg.FigureCanvasAgg(fig)
    for i in range(n):
        # Column 1: the input image.
        ax = fig.add_subplot(n, 3, 3*i + 1)
        ax.imshow(input_vals[i, :].reshape(IMAGE_SHAPE[:-1]),
                  interpolation="None")

        # Column 2: every Monte Carlo sample, overlaid semi-transparently.
        ax = fig.add_subplot(n, 3, 3*i + 2)
        for prob_sample in probs:
            # x/y by keyword: seaborn >= 0.12 rejects positional vectors.
            sns.barplot(x=class_ids, y=prob_sample[i, :], alpha=0.1, ax=ax)
        ax.set_ylim([0, 1])
        ax.set_title("posterior samples")

        # Column 3: mean over the Monte Carlo samples.
        ax = fig.add_subplot(n, 3, 3*i + 3)
        sns.barplot(x=class_ids, y=np.mean(probs[:, i, :], axis=0), ax=ax)
        ax.set_ylim([0, 1])
        ax.set_title("predictive probs")

    fig.suptitle(title)
    fig.tight_layout()

    canvas.print_figure(fname, format="png")
    print("saved {}".format(fname))
def make_plots(groups):
    """Write the four summary images for a table of shot groups.

    Files written to the working directory:
      * ``points.png``  - jittered strip plot of ``moa`` by ``ammo``
      * ``boxplot.png`` - box plot of ``moa`` by ``ammo``
      * ``avg_moa.png`` - bar plot of ``mean`` by ``ammo``
      * ``qqplot.png``  - distribution and probability plot of the non-null
        ``standard`` values

    ``postprocess()`` is called on each of the first three plots before it
    is saved.

    Args:
        groups: table with ``ammo``, ``moa``, ``mean`` and ``standard``
            columns.
    """
    # x/y are passed by keyword throughout: seaborn >= 0.12 rejects
    # positional data arguments.
    sns.stripplot(x="ammo", y="moa", data=groups, jitter=True)
    postprocess()
    plt.savefig("points.png")
    plt.clf()

    sns.boxplot(x="ammo", y="moa", data=groups)
    postprocess()
    plt.savefig("boxplot.png")
    plt.clf()

    sns.barplot(x="ammo", y="mean", data=groups, ci=None)
    plt.title("mean moa for best 9 of 10 five shot groups")
    plt.ylabel("moa")
    postprocess()
    plt.savefig("avg_moa.png")
    plt.clf()

    std = groups["standard"]
    std = std[std.notnull()]
    fig, axes = plt.subplots(ncols=2)
    sns.distplot(std, ax=axes[0])
    stats.probplot(std, plot=axes[1])
    fig.set_size_inches(6, 4)
    fig.tight_layout()
    plt.savefig("qqplot.png")
def hbars(colrow, colcol, groupedData, tempcolors, title, ylab, xlab):
    """Draw a horizontal bar plot with a value label at the end of each bar.

    Input:
        colrow: name of the column plotted along the x axis (bar length)
        colcol: name of the column plotted along the y axis (one bar each)
        groupedData: pandas DataFrame
        tempcolors: bar colour
        title: plot title
        ylab: y-axis label
        xlab: x-axis label

    Output:
        The matplotlib Figure holding the plot.
    """
    fig, ax = plt.subplots()  # plot figure and axes handles
    fig.set_size_inches(14, 14)
    sns.despine()

    # The bars are drawn once, on the new axes. The previous code also drew
    # a throw-away vertical plot on whatever axes was current beforehand,
    # just to read the bar values back.
    sns.barplot(x=colrow, y=colcol, data=groupedData,
                order=groupedData[colcol], color=tempcolors, ax=ax)
    plt.setp(ax.patches, linewidth=0)
    ax.set_title(title, fontsize=16)
    ax.set_ylabel(ylab, fontsize=15)
    ax.set_xlabel(xlab, fontsize=15)

    for p in ax.patches:
        value = p.get_width()  # bar length of a horizontal bar
        bar_edge = p.get_y()   # lower edge of the bar along the y axis
        # Relative gap between the bar end and its label.
        # NOTE(review): an ``elif value > 15`` branch (gap .001) used to
        # follow ``value > 8`` and could never run; it was dropped without
        # changing behaviour. Confirm whether a separate gap was intended.
        if value > 50:
            t = .01
        elif value > 8:
            t = .008
        else:
            t = .1
        ax.text(value + t*value, bar_edge + .5, '%1.1f' % (value))

    return fig
def plot_avg_rank_all_models(P, split_type='balancedavg1', saveout=True):
    '''
    Bar plot of the mean rank of the correct sketch category for every
    model ("adaptor") on one split, with standard-deviation error bars.

    The per-model tables come from get_avg_rank_all_models and are stacked
    before plotting. When saveout is true the figure is written to
    ./plots/avg_rank_all_models_<split_type>.pdf.
    '''
    bar_order = ['human_combined_cost',
                 'human_S0_cost',
                 'human_combined_nocost',
                 'multimodal_fc6_combined_cost',
                 'multimodal_fc6_S0_cost',
                 'multimodal_fc6_combined_nocost',
                 'multimodal_conv42_combined_cost',
                 'multimodal_pool1_combined_cost']
    # Display names, in the same order as bar_order.
    xticklabels = ['Context Cost Human',
                   'NoContext Cost Human',
                   'Context NoCost Human',
                   'Context Cost HighAdaptor',
                   'NoContext Cost HighAdaptor',
                   'Context NoCost HighAdaptor',
                   'Context Cost MidAdaptor',
                   'Context Cost LowAdaptor']

    per_model = get_avg_rank_all_models(P, split_type=split_type)

    sns.set_context('talk')
    sns.set_style("ticks")
    fig = plt.figure(figsize=(4, 8))
    ax = fig.add_subplot(111)

    U = pd.concat(list(per_model), axis=0)
    sns.barplot(data=U, x='adaptor', y='target_rank', ci='sd',
                order=bar_order)

    plt.ylabel('mean rank of congruent sketch')
    plt.ylim([1, 32])
    plt.xlabel('')
    ax.set_xticklabels(xticklabels, rotation=90, ha="left")
    plt.tight_layout()

    if not saveout:
        return
    plt.savefig('./plots/avg_rank_all_models_{}.pdf'.format(split_type))
def plot_prop_congruent_all_models(P, split_type='balancedavg1', saveout=True):
    '''
    Bar plot, per model ("adaptor"), of the proportion of trials on which
    the context-congruent sketch was preferred over the incongruent one,
    with standard-deviation error bars and a dashed reference line at 0.5.

    The per-model tables come from get_prop_congruent_all_models and are
    stacked before plotting. When saveout is true the figure is written to
    ./plots/prop_congruent_all_models_<split_type>.pdf.
    '''
    # NOTE(review): no explicit bar order is passed to seaborn, so these
    # labels are only correct if the adaptors appear in the stacked table
    # in this order -- confirm against get_prop_congruent_all_models.
    xticklabels = ['Context Cost Human',
                   'NoContext Cost Human',
                   'Context NoCost Human',
                   'Context Cost HighAdaptor',
                   'NoContext Cost HighAdaptor',
                   'Context NoCost HighAdaptor',
                   'Context Cost MidAdaptor',
                   'Context Cost LowAdaptor']

    per_model = get_prop_congruent_all_models(P, split_type=split_type)

    sns.set_context('talk')
    fig = plt.figure(figsize=(4, 8))
    ax = fig.add_subplot(111)

    D = pd.concat(list(per_model), axis=0)
    sns.barplot(data=D, x='adaptor', y='sign_diff_rank', ci='sd')

    plt.axhline(y=0.5, linestyle='dashed', color='k')
    plt.ylim([0, 1])
    plt.ylabel('proportion context-congruent sketch preferred')
    plt.xlabel('')
    ax.set_xticklabels(xticklabels, rotation=90, ha="left")
    plt.tight_layout()

    if not saveout:
        return
    plt.savefig('./plots/prop_congruent_all_models_{}.pdf'.format(split_type))
def main(argv):
    """Compare a MEME motif's per-position counts with PhyloP scores.

    Reads motif number ``--motif`` (1-based) from the MEME file and the
    third tab-separated column of the PhyloP file, prints both lengths and
    the Pearson correlation, and writes two images next to the PhyloP file:
    ``<phylo>_motif<N>_scatter.png`` and ``<phylo>_motif<N>_trend.png``.

    Args:
        argv: list of command-line arguments (without the program name).
    """
    parser = argparse.ArgumentParser(description='Process meme files')
    parser.add_argument('-i', '--meme', metavar='<meme_out>',
                        help='Meme input file', required=True)
    parser.add_argument('-m', '--motif', metavar='<motif_no>',
                        help='Motif number', required=True, type=int)
    parser.add_argument('-c', '--phylo', metavar='<phylo_out>',
                        help='PhyloP conservation scores', required=True)
    parsed = parser.parse_args(argv)

    # Both input files are now closed deterministically.
    with open(parsed.meme) as handle:
        records = motifs.parse(handle, 'meme')
        record = records[parsed.motif - 1]

    with open(parsed.phylo, 'r') as phylo_file:
        phylo_scores = [float(line[2])
                        for line in csv.reader(phylo_file, delimiter='\t')]

    # Single-argument print() calls produce the same text on Python 2 and 3.
    print('Motif length {}'.format(record.length))
    print('phylo length {}'.format(len(phylo_scores)))

    profile = position_wise_profile(record.counts, record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    # Count of the most frequent nucleotide at every position.
    motif_scores = [position[0][1] for position in max_occur]

    motif_array = np.array(motif_scores)
    phylo_array = np.array(phylo_scores)
    pr = pearsonr(motif_array, phylo_array)
    print('Pearson correlation: {}'.format(pr))

    fig, ax = plt.subplots()
    ax = sns.regplot(y=motif_array, x=phylo_array, scatter=True, ax=ax)
    ax.set(ylabel="Count of most freq nucleotide", xlabel="PhyloP scores",
           title='CTCF | pearsonr = {}, p-val={}'.format(pr[0], pr[1]))
    fig.savefig('{}_motif{}_scatter.png'.format(parsed.phylo, parsed.motif))

    # Positions 1..len(phylo_scores).
    x = np.linspace(1, len(phylo_scores)+1, num=len(phylo_scores),
                    endpoint=False)
    f, (ax1, ax2) = plt.subplots(2, 1)
    # x by keyword: seaborn >= 0.12 rejects positional data vectors.
    x1 = sns.barplot(x=x, y=motif_array, ax=ax1)
    x2 = sns.barplot(x=x, y=phylo_array, ax=ax2)
    x1.set(ylabel='Counts of most freq nucleotide', xlabel='Position in motif')
    x2.set(ylabel='Phylop Score', xlabel='Position in motif')
    f.tight_layout()
    f.savefig('{}_motif{}_trend.png'.format(parsed.phylo, parsed.motif))
def animate(i):
    """Redraw the shared axes for frame ``i``.

    Takes the group ``keys[i]`` from ``grouped``, orders it by ``hour``,
    prints its first rows, clears ``ax`` and draws ``lenMsgs`` per ``hour``
    split by ``sender``. The frame index is used as the title.
    """
    frame = grouped.get_group(keys[i])
    frame = frame.sort_values("hour")
    print(frame.head())

    ax.clear()
    sns.barplot(data=frame, x="hour", y="lenMsgs", hue="sender", ax=ax)
    ax.set_title(i)
def compare_more_models(experiments):
    """Plot pairwise model comparisons and per-model RMSE.

    Every pair of experiments is passed to ``compare_models``; the three
    values it returns are shown as heat maps titled difficulties, skills
    and predictions. A fourth panel shows the ``rmse`` entry of each
    experiment's ``Evaluator`` report.

    Args:
        experiments: dict mapping a label to a pair of callables. Each
            callable is invoked with the label; the results of the first
            and second callable are passed to ``compare_models`` and
            ``Evaluator``.
    """
    labels = sorted(experiments.keys())

    results_d = pd.DataFrame(index=labels, columns=labels, dtype=float)
    results_s = pd.DataFrame(index=labels, columns=labels, dtype=float)
    results_p = pd.DataFrame(index=labels, columns=labels, dtype=float)
    for label1 in labels:
        for label2 in labels:
            d, s, p = compare_models(experiments[label1][0](label1),
                                     experiments[label2][0](label2),
                                     experiments[label1][1](label1),
                                     experiments[label2][1](label2),
                                     plot=False)
            # .loc[row, column] writes into the frame itself. The former
            # chained form ``frame[label1][label2] = value`` may write to a
            # temporary copy and is ignored under pandas copy-on-write.
            results_d.loc[label2, label1] = d
            results_s.loc[label2, label1] = s
            results_p.loc[label2, label1] = p

    df = pd.DataFrame(columns=["labels", "rmse"])
    for label in labels:
        r = Evaluator(experiments[label][0](label),
                      experiments[label][1](label)).get_report()
        df.loc[len(df)] = (label, r["rmse"])

    plt.subplot(221)
    plt.title("Correlations of difficulties")
    sns.heatmap(results_d)
    plt.subplot(222)
    plt.title("Correlations of skills")
    sns.heatmap(results_s)
    plt.subplot(223)
    plt.title("Correlations of predictions")
    sns.heatmap(results_p)
    plt.subplot(224)
    sns.barplot(x="labels", y="rmse", data=df)
def plot_answer_frequency_all(wrong_only=True, contexts=20, show_names=False, normalize=True, top=5):
    """Plot, per context, the cumulative frequency of the top answers.

    One subplot is drawn per context (``group_name``). Within a subplot
    each asked term gets a stack of overlaid bars built from the cumulative
    sum of its first ``top`` answer frequencies. The figure is saved as
    ``answer_frequencies_all``.

    Args:
        wrong_only: keep only rows where the answered term differs from the
            asked term.
        contexts: number of contexts to plot, chosen by descending answer
            count.
        show_names: unused.
        normalize: divide each answer frequency by the total of its
            (context, asked term) group.
        top: number of answers per asked term that are drawn.
    """
    plot_cols = 4 if contexts >= 20 else 2
    plot_rows = math.ceil(contexts / plot_cols)

    context_answers = get_context_answers()['count'].to_dict()
    data_all = prepare_answer_frequency_all()
    plot_contexts = sorted(data_all['group_name'].unique(),
                           key=lambda c: -context_answers[c])[:contexts]
    data_all = data_all[data_all['group_name'].isin(plot_contexts)]

    if wrong_only:
        data_all = data_all[data_all['term_name_asked'] != data_all['term_name_answered']]

    if normalize:
        def _normalize(group):
            group['answer_frequency'] = group['answer_frequency'] / group['answer_frequency'].sum()
            return group
        data_all = data_all.groupby(['group_name', 'term_name_asked']).apply(_normalize)

    rcParams['figure.figsize'] = 7.5 * plot_cols, 5 * plot_rows

    for plot_index, (group_name, data) in enumerate(data_all.groupby('group_name')):
        plt.subplot(plot_rows, plot_cols, plot_index + 1)

        # Per asked term: cumulative frequencies, largest first.
        to_plot = {}
        for term, term_data in data.groupby('term_name_asked'):
            to_plot[term] = list(
                term_data['answer_frequency'].head(top).cumsum()
                .sort_values(ascending=False, inplace=False))

        # Order the terms by their smallest cumulative value, descending.
        terms, terms_data = zip(*sorted(to_plot.items(),
                                        key=lambda x: x[1][-1], reverse=True))
        plt.title(group_name[:30])

        positions = list(range(len(terms)))
        # A distinct name for the layer index: it used to reuse ``i`` and
        # shadow the subplot counter.
        for layer in range(top):
            # Left-pad short lists with zeros so every term has ``top``
            # values, then take the value for this layer.
            heights = [([0] * (top - len(values)) + values)[layer]
                       for values in terms_data]
            # x/y by keyword: seaborn >= 0.12 rejects positional vectors.
            sns.barplot(x=positions, y=heights, color=output.palette()[layer])
        plt.xticks(plt.xticks()[0], terms, rotation=90)

    output.savefig(filename='answer_frequencies_all')
def ModelsSummary(self, df1, df2, scoring_metric):
    """Save a side-by-side bar chart of AUC and a second metric per model.

    Both tables are reshaped with ``self.reshapeDf`` and sorted by their
    metric in descending order. The chart is written to
    ``self.out_df_path + "SB_models_performance_summary_both.png"``.

    Args:
        df1: table reshaped for the ``"AUC"`` panel.
        df2: table reshaped for the ``scoring_metric`` panel.
        scoring_metric: name of the second metric;
            ``"balanced_accuracy"`` is plotted under the name ``"CCR"``.

    Returns:
        ``(res1, res2)``: the reshaped and sorted tables.
    """
    if scoring_metric == "balanced_accuracy":
        scoring_metric = "CCR"

    res1 = self.reshapeDf(df1, "AUC")
    res2 = self.reshapeDf(df2, scoring_metric)
    res1 = res1.sort_values(by=['AUC'], ascending=False)
    res2 = res2.sort_values(by=[scoring_metric], ascending=False)

    sns.set(style="whitegrid")
    fig, ax = plt.subplots(nrows=1, ncols=2, squeeze=False, sharex=False,
                           sharey=True)
    fig.suptitle("Models Performance", fontsize=20)
    fig.tight_layout()
    fig.subplots_adjust(top=0.85)
    fig.set_figheight(6)
    fig.set_figwidth(14)
    ax[0, 0].set_title("AUC", fontsize=15)
    ax[0, 1].set_title(scoring_metric, fontsize=15)

    sns.set_context("paper", font_scale=1.6)
    sns.barplot(x="Model", y="AUC", hue="Descriptor", data=res1, ax=ax[0, 0])
    sns.barplot(x="Model", y=scoring_metric, hue="Descriptor", data=res2,
                ax=ax[0, 1])

    fig.savefig(fname=self.out_df_path + "SB_models_performance_summary_both.png",
                dpi=400, format="png")
    fig.clf()
    # Clearing alone leaves the figure registered with pyplot; close it so
    # repeated calls do not accumulate open figures.
    plt.close(fig)

    return res1, res2
def plot_bar_counts(data):
    """Draw one bar per entry of ``data`` on a new 20x10 figure.

    Args:
        data: object whose ``keys().values`` supply the bar labels and whose
            ``values`` supply the bar heights (e.g. a pandas Series).
    """
    plt.figure(figsize=(20, 10))
    plt.yticks(fontsize=8)
    plt.xticks(rotation=90)

    bar_labels = data.keys().values
    bar_heights = data.values
    sns.barplot(x=bar_labels, y=bar_heights)

    plt.xlabel('')
    plt.ylabel('Number of jobs', fontsize=10)
def backline_esc_by_region(data, component_chart_dir, component):
    """Save a three-panel chart of monthly BL escalations per regional COE.

    ``data`` is split into EMEA, Americas and APJ by its ``region`` column,
    summed per ``month`` and drawn as one bar chart per region (skipped for
    a region without rows). The image is written to
    ``<component_chart_dir>/total_regional_bl_esc.png``.

    Args:
        data: table with ``region``, ``month`` and ``bl_esc_count`` columns.
        component_chart_dir: directory the image is written to.
        component: component name used in the title.
    """
    textcolor = 'black'
    palette = ['#aad962', '#fbbf45', '#ef6a32']
    # Panel name -> values of the ``region`` column it covers.
    region_groups = (
        ("EMEA", ['Africa', 'Europe', 'GMT', '']),
        ("Americas", ['America', 'Atlantic']),
        ("APJ", ['Australia', 'Asia', 'Pacific']),
    )

    plt.clf()
    plt.rcParams['figure.figsize'] = (25, 15)
    f, ax = plt.subplots(3)
    ax[0].set_title("Total " + component + " BL Escalations by Regional COEs",
                    fontsize=40)
    f.subplots_adjust(hspace=0.4)

    for idx, (comp, regions) in enumerate(region_groups):
        panel = ax[idx]
        in_region = data['region'].isin(regions)
        regional_data = data[in_region].groupby(['month']).sum().reset_index()

        if not regional_data.empty:
            sns.barplot(x="month", y="bl_esc_count", data=regional_data,
                        color=palette[idx], ax=panel, errcolor='None')

        panel.set_ylabel(comp, fontsize=25, color=textcolor)
        panel.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
        panel.set_xlabel("")
        for label in panel.get_xticklabels():
            label.set_rotation(15)
            label.set_color(textcolor)
        for label in panel.get_yticklabels():
            label.set_color(textcolor)

    plt.savefig(component_chart_dir + '/total_regional_bl_esc.png')
    plt.close()
    return
def plot_parts(self, x, groups=None):
    """
    Plots individual parameter importance

    The per-feature parts returned by ``self.predict`` plus the bias are
    summed per group and drawn as horizontal bars, largest absolute value
    first.

    Parameters
    ----------
    x: (p,) array
        input variables (p features)
    groups: dict
        maps a feature name to a common group name; features without an
        entry form their own group
    """
    group_of = {} if groups is None else groups

    p, feature_parts, bias = self.predict(x)

    # Feature names placed at the positions given by feat_map, with the
    # bias appended last.
    names = [''] * (len(self.feat_map) + 1)
    for name, position in self.feat_map.items():
        names[position] = name
    names[-1] = 'bias'

    df = pd.DataFrame({'participation': np.r_[feature_parts, bias],
                       'feature': names})
    df['group'] = df.feature.apply(lambda name: group_of.get(name, name))
    df = df.groupby('group', as_index=False).sum()
    df['abs_participation'] = df['participation'].abs()

    ranked = df.sort_values('abs_participation', ascending=False)
    sns.barplot(x='participation', y='group', data=ranked)
def simple_barplot(xlabel, ylabels, stype, df, filename, exts):
    """Draw a bar plot of one or several columns and save it per extension.

    With more than one entry in ``ylabels`` the table is reshaped by
    ``tidy_df`` and drawn as grouped bars of ``value`` with ``stype`` as
    hue; otherwise the single column is drawn directly (in green when
    ``stype`` is ``"coverage"``). The figure is saved once per extension via
    ``save_plot`` using the module-level name ``outpath``.

    Args:
        xlabel: column for the x axis; tick labels are rotated for "year".
        ylabels: list of column names to plot.
        stype: statistic type; "coverage" and "nas" fix the y range to
            0-100.
        df: input table, first passed through ``complete_df``.
        filename: base name handed to ``save_plot``.
        exts: iterable of file extensions without the leading dot.
    """
    with sns.axes_style('ticks'):
        fig = plt.figure()
        df = complete_df(df, xlabel)

        if len(ylabels) > 1:
            df = tidy_df(df, xlabel, ylabels, stype)
            ylabel = 'missing percentage'
            bar_options = {'y': 'value', 'hue': stype}
        else:
            ylabel = ylabels[0]
            bar_options = {'y': ylabel}
            if stype == "coverage":
                bar_options['color'] = "green"
        plot = sns.barplot(x=xlabel, data=df, **bar_options)

        if stype in ("coverage", "nas"):
            plot.set_ylim([0, 100])

        sns.despine()
        plt.ylabel(ylabel)
        if xlabel == "year":
            plt.setp(plot.get_xticklabels(), rotation=45)
        plt.xlabel(xlabel)
        fig.add_axes(plot)
        fig.tight_layout()

        for ext in exts:
            save_plot(fig, outpath, filename, '.' + ext)
        plt.close()
    return
def owner_barcharts(data, component_chart_dir, multi_month=False):
    """Show a three-panel chart of ``total_count`` per ``name`` by region.

    ``data`` is split into EMEA, Americas and APJ by its ``region`` column
    and drawn as one bar chart per region. The figure is shown, not saved.

    Args:
        data: table with ``region``, ``name`` and ``total_count`` columns
            (and ``month`` when ``multi_month`` is true).
        component_chart_dir: unused.
        multi_month: when true, bars are additionally split by ``month``,
            with the months in sorted order.
    """
    textcolor = 'black'
    palette = ['#aad962', '#fbbf45', '#ef6a32']
    # Panel name -> values of the ``region`` column it covers.
    region_groups = (
        ("EMEA", ['Africa', 'Europe', 'GMT', '']),
        ("Americas", ['America', 'Atlantic']),
        ("APJ", ['Australia', 'Asia', 'Pacific']),
    )

    plt.clf()
    plt.rcParams['figure.figsize'] = (25, 15)
    f, ax = plt.subplots(3)

    for idx, (comp, regions) in enumerate(region_groups):
        panel = ax[idx]
        regional_data = data[data['region'].isin(regions)]

        bar_options = dict(x="name", y="total_count", data=regional_data,
                           color=palette[idx], ax=panel)
        if multi_month:
            bar_options['hue'] = "month"
            bar_options['hue_order'] = sorted(regional_data.month.unique())
        sns.barplot(**bar_options)

        panel.set_ylabel(comp, fontsize=25, color=textcolor)
        panel.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
        panel.set_xlabel("")
        for label in panel.get_xticklabels():
            label.set_rotation(15)
            label.set_color(textcolor)
        for label in panel.get_yticklabels():
            label.set_color(textcolor)

    plt.show()
    plt.close()
    return
def plotMICHist():
    """Show a bar chart of the MIC values in `vs` against the labels in `ks`.

    `ks` and `vs` are read from the enclosing module scope.
    """
    f, ax = plt.subplots(figsize=(12, 6))
    # x/y must be passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    sns.barplot(x=ks, y=vs, palette="BuGn_d", ax=ax)
    ax.set_ylabel("MIC")
    plt.xticks(rotation=90)
    # Leave room for the rotated tick labels.
    f.subplots_adjust(bottom=0.2)
    plt.show()
axis=0) df_anova['choice'] = 1 - df_anova['choice'] df_anova['qn'] = qn df_anova['choice'] = pd.to_numeric(df_anova['choice']) print('####### question: ' + q1 + ' #######') F, p = stats.f_oneway(df_anova['choice'][df_anova['experiment'] == 0], df_anova['choice'][df_anova['experiment'] == 1], df_anova['choice'][df_anova['experiment'] == 2]) print('ANOVA: %.2f, %.4f' % (F, p)) if p < .05: res = pairwise_tukeyhsd(df_anova['choice'], df_anova['experiment']) print(res) df_anova_qn = pd.concat((df_anova_qn, df_anova), axis=0) sns.barplot(data=df_anova_qn, x='qn', y='choice', hue='experiment') questions_df_multilinear = pd.melt(questions_df, id_vars=['q'], value_vars=questions_df.columns[:-1], var_name='experiment', value_name='percentage') questions_df_multilinear.to_csv('data/paper/00questions_df_multilinear.csv') ### plotting the regression lines for all the frequency figure, ax = plt.subplots(1, 1) for x in questions_df.columns: if x != 'q': sns.regplot(x='q', y=x, data=questions_df, label=x, ax=ax) ax.set_ylabel('Prefer towards rational') plt.legend()
HT_regular = hashtag_extract(train['tweet'][train['label'] == 0])

# extracting hashtags from racist/sexist tweets
HT_negative = hashtag_extract(train['tweet'][train['label'] == 1])

# unnesting list (a comprehension is linear; sum(lists, []) is quadratic)
HT_regular = [tag for tags in HT_regular for tag in tags]
HT_negative = [tag for tags in HT_negative for tag in tags]

# One bar chart per class: the 20 most frequent hashtags.
for hashtags in (HT_regular, HT_negative):
    a = nltk.FreqDist(hashtags)
    d = pd.DataFrame({'Hashtag': list(a.keys()), 'Count': list(a.values())})
    # selecting top 20 most frequent hashtags
    d = d.nlargest(columns="Count", n=20)
    plt.figure(figsize=(16, 5))
    ax = sns.barplot(data=d, x="Hashtag", y="Count")
    ax.set(ylabel='Count')
    plt.show()

# tokenizing the words present in the training set
tokenized_tweet = train['tweet'].apply(lambda x: x.split())
label[label.isin(['WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS'])], 'subject' ])['duration'].count() * 1.28) duration_df = pd.DataFrame(duration_df) # Sort the values of duration plot_data = duration_df.reset_index().sort_values('duration', ascending=False) plot_data['Activity'] = plot_data['Activity'].map({ 'WALKING_UPSTAIRS': 'Upstairs', 'WALKING_DOWNSTAIRS': 'Downstairs' }) # Plot the durations for staircase use plt.figure(figsize=(15, 5)) sns.barplot(data=plot_data, x='subject', y='duration', hue='Activity') plt.title('Participants Compared By Their Staircase Walking Duration') plt.xlabel('Participants') plt.ylabel('Total Duration [s]') plt.show() # -------------- #exclude the Activity column and the subject column feature_cols = data.columns[:-2] #Calculate the correlation values correlated_values = data[feature_cols].corr() #stack the data and convert to a dataframe correlated_values = (correlated_values.stack().to_frame().reset_index().rename( columns={
print(diabetes_df.head())
diabetes_df.info()
diabetes_df.isnull().sum()

# Pairwise correlations, shown as a heatmap.
corr = diabetes_df.corr()
print(corr)
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

# plt.figure (not plt.subplots) so no full-size Axes is left behind the grid.
plt.figure(figsize=(18, 15))
plt.subplot(4, 3, 1)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
sns.countplot(x='Outcome', data=diabetes_df)
plt.subplot(4, 3, 2)
plt.subplots_adjust(wspace=0.2, hspace=0.5)
sns.barplot(x='Outcome', y='Age', data=diabetes_df)
plt.show()

#data analysis
columns = diabetes_df.columns[:8]
# print(columns)
plt.figure(figsize=(18, 15))
length = len(columns)
for j, i in enumerate(columns):
    # Row count must be an int; `length / 2` is a float on Python 3.
    plt.subplot(length // 2, 3, j + 1)
    plt.subplots_adjust(wspace=0.2, hspace=0.5)
    diabetes_df[i].hist(bins=20, edgecolor='black')
    plt.title(i)
plt.show()

#analysis of diabetic classes
# drop Parch & SibSp data_train = data_train.drop(['SibSp', 'Parch'], axis=1) data_test = data_test.drop(['SibSp', 'Parch'], axis=1) import seaborn as sns sns.set_style('whitegrid') get_ipython().magic(u'matplotlib inline') # plot fig, (axis1, axis2) = plt.subplots(1, 2, sharex=True, figsize=(10, 5)) sns.countplot(x='Family', data=data_train, order=[1, 0], ax=axis1) family_perc = data_train[["Family", "Survived"]].groupby(['Family'], as_index=False).mean() sns.barplot(x='Family', y='Survived', data=family_perc, order=[1, 0], ax=axis2) axis1.set_xticklabels(["With Family", "Alone"], rotation=0) # In[ ]: #cabin #看看这个值的有无,对于survival的分布状况,影响如何 fig = plt.figure(figsize=(13, 7)) fig.set(alpha=0.5) # 设定图表颜色alpha参数 Survived_cabin = data_train.Survived[pd.notnull( data_train.Cabin)].value_counts() Survived_nocabin = data_train.Survived[pd.isnull( data_train.Cabin)].value_counts() df = pd.DataFrame({
if name in acc_dict: acc_dict[name] += acc else: acc_dict[name] = acc # print '{0}: {1}'.format(name, acc * 100) for clf in acc_dict: acc_dict[clf] = acc_dict[clf] / 10.0 log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols) log = log.append(log_entry) plt.xlabel('Accuracy') plt.title('Classifier Accuracy') # plt.show() sns.set_color_codes("muted") sns.barplot(x='Accuracy', y='Classifier', data=log, color="b") from operator import itemgetter sorted_dict = sorted(acc_dict.items(), key=itemgetter(1), reverse=True) for k, v in sorted_dict: print "{0}-{1:.2%}".format(k, v) ntrain = X_train.shape[0] ntest = y_test.shape[0] SEED = 0 # for reproducibility NFOLDS = 5 # set folds for out-of-fold prediction kf = KFold(ntrain, n_folds=NFOLDS, random_state=SEED) class SklearnHelper(object): def __init__(self, clf, seed=0, params=None):
# ax=sns.barplot(x="smoothing_window(hour)", y="RMSE", hue="weather_data_used", data=tmp, ax=ax )
# ax.set_title("PPA data")
#
# ax = fig.add_subplot(122)
# tmp=df[df["test_set"]=="EPA"]
# ax=sns.barplot(x="smoothing_window(hour)", y="RMSE", hue="weather_data_used", data=tmp, ax=ax )
# ax.set_title("EPA data")
#
# plt.show()

# Side-by-side RMSE panels: runs with weather data (left) and without (right),
# each split by test set.
fig = plt.figure(figsize=(20, 10))
panel_specs = (
    (121, True, "with weather data"),
    (122, False, "no weather data"),
)
for position, weather_flag, panel_title in panel_specs:
    ax = fig.add_subplot(position)
    tmp = df[df["weather_data_used"] == weather_flag]
    ax = sns.barplot(x="smoothing_window(hour)",
                     y="RMSE",
                     hue="test_set",
                     data=tmp,
                     ax=ax)
    ax.set_title(panel_title)
plt.show()
df.isnull().sum() # # Visualization # In[91]: import matplotlib.pyplot as plt import seaborn as sns get_ipython().run_line_magic('matplotlib', 'inline') # In[92]: sns.barplot(x=df['Gender'], y=df['Loan_Status'], data=df, label="Relationship among Gender and Loan Approval Status", ci=None) # In[93]: sns.barplot(x=df["Married"], y=df['Loan_Status'], data=df, label="Relationship among Gender and Loan Approval Status", ci=None) # In[94]: sns.catplot(x="Married", y="ApplicantIncome",
# %% import graphviz with open("tree.dot") as f: dot_graph = f.read() graphviz.Source(dot_graph) # %% import seaborn as sns import numpy as np print("Feature importances:\n{0}".format( np.round(dt_clf.feature_importances_, 3))) for name, value in zip(iris_data.feature_names, dt_clf.feature_importances_): print("{0} : {1:.3f}".format(name, value)) sns.barplot(x=dt_clf.feature_importances_, y=iris_data.feature_names) sns.despine() # %% import pandas as pd import matplotlib.pyplot as plt feature_name_df = pd.read_csv('./human_activity/features.txt', sep='\s+', header=None, names=['column_index', 'column_name']) def get_new_feature_name_df(old_feature_name_df): feature_dup_df = pd.DataFrame( data=old_feature_name_df.groupby('column_name').cumcount(),
train_predict = cl.predict(a_test) acct = accuracy_score(b_test, train_predict) if name in acct_dict: acct_dict[name] += acct else : acct_dict[name] = acct for cl in acct_dict: acct_dict[cl] = acct_dict[cl]/10.0 log_entry = pd.DataFrame([[cl, acct_dict[cl]]], columns=["Classifier", "Accuracy"]) log= log.append(log_entry) plt.xlabel('Accuracy') plt.title('Classifier Accuracy') sns.set_color_codes('muted') sns.barplot(x="Accuracy", y ="Classifier", data=log, color="b")
def train_model_classification(X, X_test, y, params, folds, model_type='lgb', eval_metric='auc', columns=None,
                               plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    Train a classification model with cross-validation.

    Returns a dictionary with out-of-fold (oof) class probabilities, fold-averaged
    test probabilities, per-fold scores and, if requested, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - model hyper-parameters forwarded to the chosen library
    :params: folds - splitter exposing ``split(X)`` and ``n_splits``
    :params: model_type - one of 'lgb', 'xgb', 'sklearn', 'cat'
    :params: eval_metric - key into the internal metrics table (only 'auc' is defined)
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - verbosity forwarded to lgb / xgb training
    :params: early_stopping_rounds - early stopping patience for lgb / xgb
    :params: n_estimators - maximum number of boosting rounds / iterations
    """
    # `is None`: comparing an Index/array with `== None` is element-wise and
    # cannot be used as a condition.
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {'auc': {'lgb_metric_name': eval_auc,
                            'catboost_metric_name': 'AUC',
                            'sklearn_scoring_function': metrics.roc_auc_score},
                    }
    scoring_function = metrics_dict[eval_metric]['sklearn_scoring_function']

    result_dict = {}

    # out-of-fold predictions on train data, one column per class
    oof = np.zeros((len(X), len(set(y.values))))

    # averaged predictions on test data
    prediction = np.zeros((len(X_test), oof.shape[1]))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        # NOTE(review): the ndarray path still indexes with `columns`, so the
        # caller must supply `columns` that are valid ndarray indices - confirm.
        if isinstance(X, np.ndarray):
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose, early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=n_estimators, evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose, params=params)
            # NOTE(review): the shared code below indexes column 1, so `params`
            # presumably requests per-class probabilities - confirm.
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)

            # Probabilities (not labels) are needed: they are stored in the
            # 2-D `oof` array and column 1 is scored below.
            y_pred_valid = model.predict_proba(X_valid)
            score = scoring_function(y_valid, y_pred_valid[:, 1])
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict_proba(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=n_estimators,
                                       eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                       **params,
                                       loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True,
                      verbose=False)

            # Same reason as the sklearn branch: class probabilities are required.
            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test)

        oof[valid_index] = y_pred_valid
        scores.append(scoring_function(y_valid, y_pred_valid[:, 1]))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            # Keep the 50 features with the highest mean importance.
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance

    return result_dict
#Algorithm Random Forest
#Visualize important features
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
pd.set_option('display.max_rows', 350)
pd.set_option('display.max_columns', 350)
plt.style.use('ggplot')

# Importances sorted from most to least important.
feature_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Creating a bar plot, displaying only the top k features
k = 20
top_features = feature_imp.head(k)
sns.barplot(x=top_features, y=top_features.index)

# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# No plt.legend(): the plot has no labelled artists, so it would only warn.
plt.show()

# List top k important features (feature_imp is already sorted)
feature_imp.head(k)

#Algorithm Random Forest
#Select the top important features, set the threshold
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.03
# In the previous operation the date and the channel ended up in the index.
dateChannel_data = dateChannel_data.reset_index()  # recreate the date and channel columns
dateChannel_data['Année'] = dateChannel_data['date'].astype(str).str[:4]  # add the year
dateChannel_data.info()
# sort_values returns a new frame; keep it, otherwise the sort is discarded.
dateChannel_data = dateChannel_data.sort_values(by=['date', 'channel'])
dateChannel_data.head(20)

##########################################################################
# Overall bar chart: distribution of traffic across channels.
##########################################################################
sns.set()  # aesthetic defaults, looks like ggplot
fig, ax = plt.subplots()  # a single plot
# NOTE(review): the bar order comes from dfPVChannel, not dateChannel_data -
# presumably both hold the same channels; confirm.
sns.barplot(x='channel',
            y='pageviews',
            data=dateChannel_data,
            estimator=sum,
            order=sorted(dfPVChannel['channel'].unique()),
            ax=ax)
fig.suptitle(
    "Le canal 'search' est le premier contributeur en termes de trafic.",
    fontsize=14,
    fontweight='bold')
ax.set(
    xlabel="Canal",
    ylabel="Pages vues",
    title="Le canal 'direct' (fourre tout) est malheureusement important aussi."
)
fig.text(.35,
         -.03,
         "Trafic Global - Pages vues selon les canaux depuis 2011",
         fontsize=9)
def plot(data, columns, measureName, nrows, ncols, order=None):
    """Show one horizontal bar plot per column on an nrows x ncols grid.

    Args:
        data: frame holding `measureName` and every entry of `columns`.
        columns: column names drawn on the y axis, one subplot each.
        measureName: column drawn on the x axis of every subplot.
        nrows, ncols: grid shape; must provide at least len(columns) axes.
        order: optional category order forwarded to seaborn.
    """
    # squeeze=False always yields a 2-D array, so reshape also works for a
    # 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    f, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
    axes = axes.reshape(-1)
    for i, c in enumerate(columns):
        sns.barplot(x=measureName, y=c, data=data, order=order, ax=axes[i])
    # `sns.plt` was removed from seaborn; use pyplot directly.
    plt.show()
print(countries_df.head())

# Per-column summary: non-null count, dtype, null count, distinct values.
analysis_dict = {'variables': list(countries_df.columns.values),
                 'count': list(countries_df.count().values),
                 'v_types': list(countries_df.dtypes.values),
                 'n_null': list(countries_df.isnull().sum().values),
                 'n_uniques': list(countries_df.nunique().values)}
analysis = pd.DataFrame(analysis_dict)
print(analysis)

# Brazil: land-type areas over the years.
brazil_land_type_df = countries_df.loc[countries_df['countryName'] == 'Brazil',
                                       ['year', 'cropLand', 'grazingLand', 'forestLand']].set_index('year')
ax1 = sns.lineplot(data=brazil_land_type_df, ci=None, legend='brief')
ax1.set_title('Brazil ground types areas evolution over the years')
ax1.set_ylabel('Areas (gha)')
ax1.set_xlabel('Years')
plt.show()

# 2016: the 40 countries with the largest fishing ground.
fishingGround_df = countries_df.loc[(countries_df['year'] == 2016) & (countries_df['record'] == 'AreaTotHA'),
                                    ['countryName', 'fishingGround']].nlargest(40, 'fishingGround')
ax2 = sns.barplot(x='fishingGround', y='countryName', data=fishingGround_df)
ax2.set_title('2016 world top40 fishing ground')
ax2.set_xlabel('Fishing Ground (gha)')
plt.show()

# Portugal 2016: share of each land type. After the transpose the single
# matching record becomes the only column, so select it by position instead
# of hard-coding its row label.
portugal_area_df = countries_df.loc[(countries_df['countryName'] == 'Portugal') &
                                    (countries_df['year'] == 2016) &
                                    (countries_df['record'] == 'AreaTotHA'),
                                    ['cropLand', 'grazingLand', 'forestLand']].T
ax3 = portugal_area_df.plot.pie(y=portugal_area_df.columns[0],
                                title='Portugal 2016 ground types area',
                                legend=False)
ax3.set_ylabel('')
plt.show()
# In[8]:

# Count the nulls per column and keep only the columns that have any.
null_counts = train.isnull().sum()
data_na = null_counts[null_counts > 0].to_frame('count').rename_axis('name')
data_na['name'] = data_na.index

# In[9]:

# Bar plot of the number of nulls in each affected column.
plt.figure(figsize=(25, 8))
sns.set(style='ticks')
sns.barplot(x='name', y='count', data=data_na)
plt.show()

# In[10]:

# Numerical features of the data. The median of each will be used, grouping
# movies by production company.
train.select_dtypes(include=[np.number]).columns

# In[11]:

# Keep only the columns that have at least 80% non-null values.
train = train.dropna(thresh=0.80 * len(train), axis=1)

# In[12]:

#Check now to see which columns still have null values
# Predict once and reuse the result for both metrics.
y_pred_6 = model.predict(X_test)
cm_6 = confusion_matrix(y_true=y_test, y_pred=y_pred_6)
acc_6 = accuracy_score(y_test, y_pred_6)
dic["XG Boost"] = acc_6

#Analysing the results: accuracy (in percent) per estimator
Estimators = list(dic)
Accuracy = [dic[name] * 100 for name in Estimators]
d = {'Estimators': Estimators, "Accuracy": Accuracy}
df = pd.DataFrame(data=d)
plt.figure(num=3)
plt.ylim(0, 100)
plt.title("All classification estimators with accuracy score")
sns.barplot(x='Estimators', y='Accuracy', data=df)
plt.show()

#Finalizing the model
model = tree_final
print("Enter the Value of X acceleration: ")
x = int(input())
print("Enter the Value of Y acceleration: ")
y = int(input())
# Apply the scaler `sc` to the input before predicting.
f = sc.transform(np.array([[x, y]]))
x = f[0][0]
y = f[0][1]
t = model.predict(np.array([[x, y]]))
if t == 0:
    print("The phone has fallen")
Y=c1.iloc[:,10] from sklearn.model_selection import train_test_split X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.5,random_state=0) from sklearn.preprocessing import StandardScaler sc=StandardScaler() X_train=sc.fit_transform(X_train) X_test=sc.fit_transform(X_test) from sklearn.ensemble import RandomForestClassifier classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy', random_state = 0) classifier.fit(X_train,Y_train) from sklearn.metrics import confusion_matrix,classification_report,accuracy_score pred=classifier.predict(X_test) accuracy=accuracy_score(Y_test,pred) print("Accuracy:",accuracy*100) cm=confusion_matrix(Y_test,pred) cm cls1=classification_report(Y_test,pred) print(cls1) sns.pairplot(c1) classifier.estimators_ sns.heatmap(cm, annot=True) import graphviz dot_data = tree.export_graphviz(clf, out_file='tree.dot') classifier.feature_importances_ featureimp=pd.Series(classifier.feature_importances_).sort_values(ascending=True) print(featureimp) sns.barplot(x=round(featureimp,4),y=featureimp) plt.xlabel("Feature importance") plt.show() i
def rdm_compare(rdms, models, comp=None, plot=None):
    '''Compare target RDMs with model RDMs and estimate a noise ceiling.

    Parameters:
        rdms: path to a pickle holding a dict with 'rdm' (list of DataFrames)
            and 'id' (participant ids).
        models: path to a pickle holding a dict with 'rdm' (model DataFrames)
            and 'id' (model ids).
        comp: 'spearman' (default when None), 'kendalltaua' or 'pearson'.
        plot: None (no plot), 'bar' or 'violin'.

    Returns:
        rdm_avg: DataFrame, the group-average target RDM.
        model_comp: DataFrame with one row per (model, participant) pair and
            columns 'participant', 'models', 'cor'.
        rdms_dist: DataFrame of pairwise correlations between the group
            average and every model RDM.

    NOTE(review): only the path form of `rdms` / `models` works end to end;
    the non-string branches leave dict_rdms / dict_models / model_ids
    undefined, which are needed further down.
    NOTE(review): pickle.load must only be used on trusted files.
    '''
    import pandas as pd
    from scipy.spatial import distance
    from nilearn.connectome import sym_matrix_to_vec, vec_to_sym_matrix
    from scipy.stats import rankdata, spearmanr, kendalltau, pearsonr, mstats
    import numpy as np
    from itertools import combinations
    import pickle
    import seaborn as sns
    import matplotlib.pyplot as plt
    import copy

    if isinstance(rdms, str) is True:
        with open(rdms, 'rb') as f:
            dict_rdms = pickle.load(f)
        target_rdms = copy.deepcopy(dict_rdms['rdm'])
        target_conds = target_rdms[0].keys()
    else:
        target_rdms = rdms
        target_conds = rdms[0].keys()

    if isinstance(models, str) is True:
        with open(models, 'rb') as f:
            dict_models = pickle.load(f)
        models = dict_models['rdm']
        model_ids = dict_models['id']
    else:
        models = models

    # Drop the index column left over from a CSV round trip, if present.
    for rdm in dict_models['rdm']:
        if 'Unnamed: 0' in rdm:
            del rdm['Unnamed: 0']

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    for index, rdm in enumerate(target_rdms):
        target_rdms[index] = target_rdms[index].values

    list_cor_rdm = list(range(0, len(target_rdms)))
    list_p = list(range(0, len(target_rdms)))
    target_rdms_trans = list(range(0, len(target_rdms)))

    if comp is None or comp == 'spearman':
        # Rank-transform each RDM, then correlate with the group average.
        for index, rdm in enumerate(target_rdms):
            target_rdms_trans[index] = vec_to_sym_matrix(rankdata(sym_matrix_to_vec(rdm)))
        rdm_avg = pd.DataFrame(np.mean(target_rdms_trans, axis=0), columns=target_conds)

        for index, part_rdm in enumerate(target_rdms_trans):
            list_cor_rdm[index], list_p[index] = spearmanr(part_rdm.flatten(), rdm_avg.values.flatten())

        list_cor_sub = list()
        list_cor_rdm_sub = list()
        list_p_sub = list()

        # Leave-one-out: correlate each participant with the average of the rest.
        for index, part in enumerate(target_rdms_trans):
            tmp_rdms = target_rdms_trans.copy()
            tmp_part = target_rdms_trans[index]
            tmp_rdms.pop(index)
            tmp_rdm_avg = np.mean(tmp_rdms, axis=0)
            list_cor_sub.append(spearmanr(tmp_part.flatten(), tmp_rdm_avg.flatten()))

        for i, cor in enumerate(list_cor_sub):
            list_cor_rdm_sub.append(cor.correlation)
            list_p_sub.append(cor.pvalue)

    elif comp == 'kendalltaua':
        for index, rdm in enumerate(target_rdms):
            target_rdms_trans[index] = vec_to_sym_matrix(rankdata(sym_matrix_to_vec(rdm)))
        # This branch correlates the untransformed RDMs.
        rdm_avg = pd.DataFrame(np.mean(target_rdms, axis=0), columns=target_conds)

        for index, part_rdm in enumerate(target_rdms):
            list_cor_rdm[index], list_p[index] = kendalltau(part_rdm.flatten(), rdm_avg.values.flatten())

        list_cor_sub = list()
        list_cor_rdm_sub = list()
        list_p_sub = list()

        for index, part in enumerate(target_rdms):
            tmp_rdms = target_rdms.copy()
            tmp_part = target_rdms[index]
            tmp_rdms.pop(index)
            tmp_rdm_avg = np.mean(tmp_rdms, axis=0)
            list_cor_sub.append(kendalltau(tmp_part.flatten(), tmp_rdm_avg.flatten()))

        for i, cor in enumerate(list_cor_sub):
            list_cor_rdm_sub.append(cor.correlation)
            list_p_sub.append(cor.pvalue)

    elif comp == 'pearson':
        # z-score each RDM, then correlate with the group average.
        for index, rdm in enumerate(target_rdms):
            target_rdms_trans[index] = vec_to_sym_matrix(mstats.zscore(sym_matrix_to_vec(rdm)))
        rdm_avg = pd.DataFrame(np.mean(target_rdms_trans, axis=0), columns=target_conds)

        for index, part_rdm in enumerate(target_rdms_trans):
            list_cor_rdm[index], list_p[index] = pearsonr(part_rdm.flatten(), rdm_avg.values.flatten())

        list_cor_sub = list()
        list_cor_rdm_sub = list()
        list_p_sub = list()

        for index, part in enumerate(target_rdms_trans):
            tmp_rdms = target_rdms_trans.copy()
            tmp_part = target_rdms_trans[index]
            tmp_rdms.pop(index)
            tmp_rdm_avg = np.mean(tmp_rdms, axis=0)
            list_cor_sub.append(pearsonr(tmp_part.flatten(), tmp_rdm_avg.flatten()))

        for i, cor in enumerate(list_cor_sub):
            list_cor_rdm_sub.append(cor[0])
            list_p_sub.append(cor[1])

    upper_noise_ceiling = np.mean(list_cor_rdm)
    lower_noise_ceiling = np.mean(list_cor_rdm_sub)

    n_models = len(dict_models['id'])
    n_participants = len(dict_rdms['id'])
    model_comp = pd.DataFrame(columns=['participant', 'models', 'cor'],
                              index=np.arange(n_models * n_participants))
    model_comp['participant'] = dict_rdms['id'] * n_models
    # Correlations below are appended model by model in dict_models['id']
    # order, so the labels must follow that order too. Sorting the labels
    # would mislabel rows whenever the ids are not already sorted.
    model_comp['models'] = [model_id for model_id in dict_models['id']
                            for _ in range(n_participants)]

    list_cor_models = list()

    # Group average first, then every model RDM, for the pairwise matrix.
    snd_rdms = list()
    snd_rdms.append(rdm_avg.values)
    for mod_rdm in models:
        snd_rdms.append(mod_rdm.values)

    ids_rdms = list()
    ids_rdms.append('group average')
    for mod_ids in model_ids:
        ids_rdms.append(mod_ids)

    if comp is None or comp == 'spearman':
        for index, model_rdm in enumerate(dict_models['rdm']):
            for i, sub_rdm in enumerate(target_rdms_trans):
                list_cor_models.append(spearmanr(sub_rdm.flatten(), model_rdm.values.flatten()).correlation)
        rdms_dist = [spearmanr(x.flatten(), y.flatten()).correlation for x, y in combinations(snd_rdms, 2)]
        rdms_dist = pd.DataFrame(distance.squareform(rdms_dist), columns=ids_rdms)
    elif comp == 'kendalltaua':
        for index, model_rdm in enumerate(dict_models['rdm']):
            for i, sub_rdm in enumerate(target_rdms):
                list_cor_models.append(kendalltau(sub_rdm.flatten(), model_rdm.values.flatten()).correlation)
        rdms_dist = [kendalltau(x.flatten(), y.flatten()).correlation for x, y in combinations(snd_rdms, 2)]
        rdms_dist = pd.DataFrame(distance.squareform(rdms_dist), columns=ids_rdms)
    elif comp == 'pearson':
        for index, model_rdm in enumerate(dict_models['rdm']):
            for i, sub_rdm in enumerate(target_rdms_trans):
                list_cor_models.append(pearsonr(sub_rdm.flatten(), model_rdm.values.flatten())[0])
        rdms_dist = [pearsonr(x.flatten(), y.flatten())[0] for x, y in combinations(snd_rdms, 2)]
        rdms_dist = pd.DataFrame(distance.squareform(rdms_dist), columns=ids_rdms)

    model_comp['cor'] = list_cor_models

    if plot is None:
        print('results will not be plotted')
    elif plot in ('bar', 'violin'):
        # Both plot kinds share the same noise-ceiling decoration.
        draw = sns.barplot if plot == 'bar' else sns.violinplot
        ax = draw(x=model_comp['models'], y=model_comp['cor'], data=model_comp)
        plt.plot(np.linspace(-20, 120, 1000), [upper_noise_ceiling] * 1000, 'r', alpha=0.1)
        plt.plot(np.linspace(-20, 120, 1000), [lower_noise_ceiling] * 1000, 'r', alpha=0.1)
        # Shaded band between the lower and upper noise ceiling.
        rect = plt.Rectangle((-20, lower_noise_ceiling), 10000,
                             (upper_noise_ceiling - lower_noise_ceiling), color='r', alpha=0.5)
        ax.set_xticklabels(labels=list(dict_models['id']))
        if comp is None or comp == 'spearman':
            ax.set(ylabel='spearman correlation with target RDM')
        if comp == 'pearson':
            ax.set(ylabel='pearson correlation with target RDM')
        if comp == 'kendalltaua':
            ax.set(ylabel='kendall tau a correlation with target RDM')
        ax.add_patch(rect)
        plt.tight_layout()

    return rdm_avg, model_comp, rdms_dist
# Two-sided Mann-Whitney U test for every unordered pair of groups.
pair_rows = []
for g1 in grps:
    for g2 in grps[grps.index(g1) + 1:]:
        if g1 != g2:
            keys.append(str(g1 + '_' + g2))
            x = list(df.loc[g1, :])
            y = list(df.loc[g2, :])
            res = mannwhitneyu(x, y, alternative='two-sided')
            pair_rows.append({'statistic': res[0], 'p-value': res[1]})
# DataFrame.append was removed in pandas 2.0; add all rows with one concat.
res_df = pd.concat(
    [res_df, pd.DataFrame(pair_rows, columns=['statistic', 'p-value'])],
    ignore_index=True)

# The second positional argument of pd.Index is dtype, so the index name
# must be given by keyword.
res_df = res_df.set_index(pd.Index(keys, name='Tissue'))
corrected_p_values = multipletests(res_df['p-value'])[1]
res_df['cor_p-value'] = pd.Series(corrected_p_values, index=keys)
res_df = res_df.sort_values(by='cor_p-value')

# Compute both statistics before adding either column, so the standard
# deviation is not contaminated by the freshly added mean column.
row_mean = df.mean(axis=1)
row_sdev = df.std(axis=1)
df['Average Accuracy'] = row_mean
df['sdev'] = row_sdev

plt.figure(figsize=(10, 6))
ax = sns.barplot(x=df.index.values,
                 y=df['Average Accuracy'],
                 yerr=df['sdev'] * 1,
                 capsize=.2)
# Assigning to x suppresses the notebook echo of each call.
x = ax.set_title("Average Model Accuracies")
x = ax.set_xlabel("Tissues")
x = ax.set_ylabel("Average Accuracy - 1 SD")
x = ax.set_xticklabels(labels=df.index.values, rotation=38)
fig = ax.get_figure()
fig.savefig("plots/all_model_accuracy.png", dpi=100, bbox_inches="tight")

from IPython.display import display, HTML
display(res_df.head(10))
# In[ ]: m = sns.distplot(dataset["Fare"], color="r", label="Skewness : %.2f" % (dataset["Fare"].skew())) m = m.legend(loc="best") # skewness is reduced # ### 3.2 Categorical values # #### Sex # In[ ]: g = sns.barplot(x="Sex", y="Survived", data=train) g = g.set_ylabel("Survival Probability") # Females have a high rate of Survival # In[ ]: # See the two groups data ratio train[["Sex", "Survived"]].groupby('Sex').mean() # It shows clearly that Female have more chance to survive than Male. # So Sex, will play an important role in the prediction of the survival. # #### Pclass # In[ ]:
####################################
# Price against each categorical feature; one panel and one colour per feature.
panel_features = ('cut', 'color', 'clarity')
panel_colors = ('r', 'y', 'g')

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))
for axis, feature, shade in zip(ax, panel_features, panel_colors):
    sns.scatterplot(x=diamond[feature], y=diamond['price'], color=shade, ax=axis)
plt.tight_layout()
plt.show()

# %% [markdown]
# ### 3.2.3 Bar plot

# %%
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))
for axis, feature, shade in zip(ax, panel_features, panel_colors):
    sns.barplot(x=diamond[feature], y=diamond['price'], color=shade, ax=axis)

# %% [markdown]
# Insight:<br>
# 1. Cut level 4 has the best price;<br>
# 2. Color 1 has the best average price, which is odd because the worst color gets the best price.<br>
# Most customers probably cannot tell which color is better;<br>
# 3. Clarity 2 has the best price, which is odd too, as clarity 2 is not a good level.

# %%
color_five = diamond['color'] == 5
print(diamond['price'][color_five].mean())
print(diamond['carat'][color_five].mean())

# %%
# Add a grid for better readability
plt.grid()

#%%
# Visualization - 2
sns.set(style="white", context="talk")

# Three stacked panels sharing the x axis
f2, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(7, 5), sharex=True)

# Panel 1: pasture
x = [60, 90, 120]
y1 = y_past_1
pasture_axes = sns.barplot(x=x, y=y1, palette="Reds", ax=ax1)
pasture_axes.set_title('Corridor 1')
ax1.axhline(0, color="k", clip_on=False)
ax1.set_ylabel("Pasture")

# Panel 2: agriculture
x = [60, 90, 120]
y1 = y_agri_1
sns.barplot(x=x, y=y1, palette="Reds", ax=ax2)
ax2.axhline(0, color="k", clip_on=False)
ax2.set_ylabel("Agriculture")

# Data for the third panel
x = [60, 90, 120]
y1 = y_exp_s_1
zf = pd.DataFrame(zf, columns=columns)

# Take another look at the dataset
display(zf.head(n=2))

# Group by region to compare the number of listings and the price per
# square metre.
by_region = zf.groupby('Region')
listing_counts = by_region['price'].count().sort_values(ascending=False)
mean_unit_price = by_region['perPrice'].mean().sort_values(ascending=False)
df_house_count = listing_counts.to_frame().reset_index()
df_house_mean = mean_unit_price.to_frame().reset_index()

# ax3 (top panel) is created here but not drawn on in this section.
f, (ax3, ax1, ax2) = plt.subplots(3, 1, figsize=(20, 15))

# Middle panel: mean unit price per region
sns.barplot(x='Region',
            y='perPrice',
            palette="Blues_d",
            data=df_house_mean,
            ax=ax1)
ax1.set_title('深圳各个区域的每平方米的租金对比', fontsize=15)
ax1.set_xlabel('region', rotation=80, fontsize=1)
ax1.set_ylabel('unit price')

# Bottom panel: number of listings per region
sns.barplot(x='Region',
            y='price',
            palette="Greens_d",
            data=df_house_count,
            ax=ax2)
ax2.set_title('深圳各个区域的出租房数量对比', fontsize=15)
ax2.set_xlabel('region')
ax2.set_ylabel('quantity')
# One line per job category, sharing the same axes.
job_series = (
    (job_admin, "admin"),
    (job_technician, "technician"),
    (job_blue, "blue-collar"),
    (job_entrepreneur, "entrepreneur"),
    (job_management, "management"),
    (job_retired, "retired"),
)
for series, job_label in job_series:
    plt.plot(series.keys(), series.values, label=job_label)
plt.ylabel("No of employees")
plt.xlabel("Age range")
plt.title("Jobs VS age (Fig3)")
plt.legend()
plt.show()

# Salary per job, split by marital status (seaborn bar plot)
sns.barplot(x="job", y="salary", hue="marital", data=data)
plt.xticks(rotation=45)
plt.title("Jobs and salaries (Fig4)")
plt.show()

# Share of each job as a pie chart
job_counts = data["job"].value_counts()
plt.pie(job_counts.values, autopct='%1.2f%%', labels=job_counts.keys())
plt.title("Pie chart of Job (Fig 5)")
plt.show()

# Salary against age as a scatter plot
plt.scatter(data["salary"], data["age"], color="red", alpha=0.5)
plt.xlabel("salary")
plt.ylabel("age")
# Fill missing Embarked values with the most frequent value. Assign the
# result back: fillna(inplace=True) on a column selection is deprecated
# chained assignment and does not modify data_train under copy-on-write.
data_train["Embarked"] = data_train["Embarked"].fillna(
    data_train["Embarked"].mode().iloc[0])
print("==================Feture Fill End===================")
print(data_train.info())

# Check whether the survival probability is related to each feature
plt.figure(figsize=(15, 10))
view_feature = ["Pclass", "Sex", "SibSp", "Parch", "Cabin", "Embarked"]
for i, feature_name in enumerate(view_feature):
    plt.subplot(2, 3, (i + 1))
    # Group by the feature and take the mean of Survived (Survived is 0/1,
    # so the mean is the survival probability)
    sns.barplot(
        x=feature_name,
        y="Survived",
        # hue="Survived",
        data=data_train[[feature_name,
                         "Survived"]].groupby([feature_name],
                                              as_index=False).mean())
    # Grouped count of survivors / non-survivors per feature value
    # (not very readable in my opinion)
    # sns.countplot(x=feature_name, hue="Survived", data=data_train)

# Relationship between age and survival rate
data_train["Age_int"] = data_train["Age"].astype("int")
plt.subplots(1, 1, figsize=(18, 4))
sns.barplot(x="Age_int",
            y="Survived",
            data=data_train[["Age_int",
                             "Survived"]].groupby(["Age_int"],
                                                  as_index=False).mean())
# plt.show()
mask = (DatosOrdenadosPorFecha_df['dateRep'] >= start_date) & (DatosOrdenadosPorFecha_df['dateRep'] <= end_date) fechasfiltradas_df = DatosOrdenadosPorFecha_df.loc[mask] #Creamos el dataframe con los datos que se van a utilizar en el informe grafico_df = fechasfiltradas_df[['dateRep', 'cases', 'moving14', 'moving7']] #Formateamos la fecha de nuevo a formato cadena para que se muestre correctamente en el gráfico grafico_df['dateRep'] = grafico_df['dateRep'].astype(str) #Dibujamos el gráfico fig, ax = plt.subplots(1, 1) grafico = sns.barplot(ax=ax, x="dateRep", y="cases", data=grafico_df, label="Nuevos Casos Diarios") ###Esto es para los índices de un gráfico catplot #grafico.set_titles("Nuevos Casos en España", fontsize=30) #grafico.set_xlabels("Fecha",fontsize=20) #grafico.set_ylabels("España",fontsize=20) #grafico.set_yticklabels(fontsize=10) #grafico.set_xticklabels(fontsize=5) #Gráfico de líneas media móvil de los últimos 14 días graficomv14 = sns.lineplot(ax=ax, x="dateRep", y="moving14", data=grafico_df,
####
# Make a bar graph
fig1 = plt.figure(1)
plt.bar(np.arange(4),
        mean_impf,
        yerr=sem_impf,
        ecolor='black',
        tick_label=['I', 'II', 'III', 'IV'],
        align='center')
# Unit is mN, matching the other two figures (the label used to say 'nM').
plt.ylabel('impact force (mN)')
fig1.show()

# Easier plot with Seaborn
fig2 = plt.figure(2)
sns.barplot(data=df, x='ID', y='impf')
plt.xlabel('')
plt.ylabel('impact force (mN)')
fig2.show()

###
# Message: do not make bar graphs.
###

# Bee swarm plot
fig3 = plt.figure(3)
sns.swarmplot(data=df, x='ID', y='impf')
plt.margins(0.02)
plt.xlabel('')
plt.ylabel('impact force (mN)')
fig3.show()
svcscore = (model.score(X_test_array, y_test1)) * 100

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train_array, y_train)
clf_pred = clf.predict(X_test_array)
clfscore = (clf.score(X_test_array, y_test1)) * 100

knn = KNeighborsClassifier(n_neighbors=11, metric='minkowski', p=2).fit(X_train_array, y_train)
knnscore = (knn.score(X_test_array, y_test1)) * 100

# Accuracy (in percent) of every classifier, in the same order as `algorithms`.
scores = [gnbscore, naivescore, svcscore, knnscore, dtscore, clfscore]
algorithms = ["Gaussian naive bayes", "Bernoulli naive bayes", "Support Vector Machine",
              "K-Nearest Neighbors", "Decision Tree", "Random Forest"]
sns.set(rc={'figure.figsize': (15, 8)})
plt.xlabel("Algorithms")
plt.ylabel("Accuracy score")
# x/y by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=algorithms, y=scores)

final_model = naive_bayes
# save the model to disk (context manager so the file is flushed and closed)
with open(save_model, 'wb') as model_file:
    pickle.dump(final_model, model_file)


def make_prediction(resumeNo):
    """Predict the class of resume number `resumeNo` (zero-based).

    Loads the pickled model and vectorizer, reads the first page of the
    matching PDF (file `c<resumeNo + 1>.pdf`), vectorizes it and returns the
    model's prediction for it.
    """
    resume = 'C:/Users/Muskaan Ratra/Desktop/CVs/CVs/c' + str(resumeNo+1) + '.pdf'
    # NOTE(review): pickle.load must only be used on trusted files.
    with open(save_model, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    with open(save_vector, 'rb') as vector_file:
        loaded_vector = pickle.load(vector_file)
    # NOTE(review): assumes slate.PDF reads the whole file in its constructor,
    # so the handle can be closed right after - confirm.
    with open(resume, 'rb') as resumeFile:
        sample_resume = slate.PDF(resumeFile)
    sample_resume = sample_resume[0]
    sample_resume = loaded_vector.transform([sample_resume])
    return loaded_model.predict(sample_resume)[0]
import codeacademylib3_seaborn
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv('WorldCupMatches.csv')
print(df.head())

# Total goals per match ('Home Team Gaols' was a typo for the home column).
df['Total Goals'] = df['Home Team Goals'] + df['Away Team Goals']
print(df.head())

sns.set_style('whitegrid')
# set_context has no font_size argument; the font size goes through rc.
sns.set_context('poster', rc={'font.size': 10})

f, ax = plt.subplots(figsize=(10, 25))
# y must match the column created above ('Total Goals', not 'total goals').
# NOTE(review): the year column is presumably named 'Year' in
# WorldCupMatches.csv - confirm against the file header.
ax = sns.barplot(data=df, x='Year', y='Total Goals', ax=ax)
ax.set_title('Year vs. Av Goals')

df_goals = pd.read_csv('goals.csv')
#print(df_goals.head())

# set_context returns None, so it must not be assigned to the axes variable.
sns.set_context('notebook', font_scale=1.25)
f, ax2 = plt.subplots(figsize=(12, 7))
ax2 = sns.boxplot(data=df_goals, x='year', y='goals', palette='Spectral', ax=ax2)
ax2.set_title('Boxplot')
plt.show()