def do_plot(flair_stats, filename):
    """Plot the monthly number of bug reports and save the figure.

    Two series are drawn on the same axes: the total number of submissions
    per month and the number of submissions carrying the "new" flair.

    Args:
        flair_stats: Mapping of month to a mapping of flair name to
            submission count. Months are plotted in sorted key order.
        filename: Destination passed to ``savefig``.
    """
    # ``seaborn.plt`` is no longer exposed by current seaborn releases, so
    # pyplot is imported directly instead of being reached through ``sns``.
    import matplotlib.pyplot as plt

    months = []
    total = []
    new_flaired = []
    for month, month_stats in sorted(flair_stats.items()):
        months.append(month)
        total.append(sum(month_stats.values()))
        # A month without any "new"-flaired submission contributes zero
        # instead of raising KeyError.
        new_flaired.append(month_stats.get('new', 0))

    sns.set_style('whitegrid')
    total_plot_color = sns.xkcd_rgb["denim blue"]
    ignored_plot_color = sns.xkcd_rgb["orange red"]

    total_plot = sns.pointplot(x=months, y=total, color=total_plot_color)
    sns.pointplot(x=months, y=new_flaired, color=ignored_plot_color)

    # The two series carry no labels of their own, so the legend is built
    # from hand-made colour patches.
    total_patch = mpatches.Patch(color=total_plot_color)
    ignored_patch = mpatches.Patch(color=ignored_plot_color)

    total_plot.set(ylabel="Number of bugreports", xlabel="Month")
    total_plot.set_title(
        '/r/bugs statistics by month:\n'
        'Reddit admins consistently ignore half of bugreports',
        y=1.02)
    plt.legend(
        [total_patch, ignored_patch],
        ['Total number of bugreports',
         'Number of ignored bugreports (submissions with "new" flair)'],
        loc="lower left")
    plt.savefig(filename)
def plot_topk_all_models():
    '''
    Draw a line plot showing, for each value of k, the proportion of trials in
    which the model ranked the correct sketch category at k or better. One
    line is drawn per adaptor.
    '''
    line_colors = [
        (0.2, 0.2, 0.2),
        (0.8, 0.3, 0.3),
        (0.3, 0.3, 0.8),
        (0.5, 0.5, 0.5),
        (0.6, 0.2, 0.6),
    ]
    topk = load_all_topk_predictions()

    sns.set_context('poster')
    plt.figure(figsize=(8, 8))
    sns.pointplot(data=topk, x='k', y='prop', hue='adaptor',
                  palette=line_colors, markers='.', ci='sd', join=True)

    plt.title('% correct within top k')
    plt.xlabel('k', fontsize=24)
    plt.ylabel('proportion', fontsize=24)
    plt.ylim([0, 1.1])
    # The x axis is cut at 18 rather than spanning the full range of k.
    plt.xlim([0, 18])
    plt.tight_layout()
    plt.legend(bbox_to_anchor=(1.0, 0.9))
def classifier_selection_time(data, filename):
    """Plot the training and testing time of every classifier.

    Args:
        data: DataFrame with one row per classifier. Columns are picked by
            regex: one matching "classifier" plus one matching
            "training time" and one matching "testing time". The classifier
            column is assumed to come before the time columns, because the
            selected columns are renamed positionally.
        filename: Destination passed to ``savefig``.
    """
    def _long_format(pattern, dataset_label):
        # Copy first: assigning into the result of ``filter`` would write to
        # a slice of the caller's frame.
        subset = data.filter(regex="classifier|" + pattern, axis=1).copy()
        subset.columns = ["classifier", "time"]
        # A scalar broadcasts to every row. A pd.Series of labels would be
        # aligned on a fresh 0..n-1 index and yield NaN for any row whose
        # index label is not in that range.
        subset["Dataset"] = dataset_label
        return subset

    training_time = _long_format("training time", "Training")
    testing_time = _long_format("testing time", "Testing")
    time_data = pd.concat(
        (testing_time, training_time), ignore_index=True, axis=0)

    with sns.axes_style("ticks"):
        fig, ax = plt.subplots()
        sns.pointplot(x="classifier", y="time", hue="Dataset",
                      data=time_data, palette="Set1", dodge=True, ax=ax)
        ax.set_xticklabels(ax.xaxis.get_ticklabels(),
                           rotation=params.X_TICK_ROTATION)
        ax.set_xlabel("Classifier")
        ax.set_ylabel("Average Time (msec / example)")
        sns.despine()
        fig.set_size_inches(params.FIGSIZE)
        plt.tight_layout()
        plt.savefig(filename)
def classifier_selection_error(data, filename):
    """Plot the training and testing error of every classifier.

    Errors are rounded to three decimals for plotting. The rounding is done
    on a copy, so the caller's ``data`` is left untouched.

    Args:
        data: DataFrame with one row per classifier. Columns are picked by
            regex: one matching "classifier" plus one matching
            "training error" and one matching "testing error". The classifier
            column is assumed to come before the error columns, because the
            selected columns are renamed positionally.
        filename: Destination passed to ``savefig``.
    """
    def _long_format(pattern, dataset_label):
        # Copy first: assigning into the result of ``filter`` would write to
        # a slice of the caller's frame.
        subset = data.filter(regex="classifier|" + pattern, axis=1).copy()
        subset.columns = ["classifier", "error"]
        subset["error"] = subset["error"].round(decimals=3)
        # A scalar broadcasts to every row. A pd.Series of labels would be
        # aligned on a fresh 0..n-1 index and yield NaN for any row whose
        # index label is not in that range.
        subset["Dataset"] = dataset_label
        return subset

    training_error = _long_format("training error", "Training")
    testing_error = _long_format("testing error", "Testing")
    error_data = pd.concat(
        (testing_error, training_error), ignore_index=True, axis=0)

    with sns.axes_style("ticks"):
        fig, ax = plt.subplots()
        sns.pointplot(x="classifier", y="error", hue="Dataset",
                      data=error_data, palette="Set1", dodge=True, ax=ax)
        ax.set_xticklabels(ax.xaxis.get_ticklabels(),
                           rotation=params.X_TICK_ROTATION)
        ax.set_xlabel("Classifier")
        ax.set_ylabel("Mean Error")
        sns.despine()
        fig.set_size_inches(params.FIGSIZE)
        plt.tight_layout()
        plt.savefig(filename)
def skill_correlations(runs=50, n_clusters=5):
    """Find how many students each clustering needs to recover the concepts.

    For every run, skill correlation and clustering function, data sets with
    an increasing number of students are generated until the Rand index
    between the true concepts and the predicted clusters reaches 0.9. That
    first sufficient student count is recorded; combinations that never reach
    0.9 record nothing. The recorded results are printed and drawn on a
    log-scaled point plot.

    Args:
        runs: Number of repetitions of the whole experiment.
        n_clusters: Number of concepts to generate and clusters to request.
    """
    student_counts = [10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000,
                      5000, 10000, 20000, 50000, 100000, 200000, 500000,
                      1000000]
    correlations = list(np.arange(0, 0.9, 0.1)) + [0.85]

    results = []
    for run in range(runs):
        for skill_correlation in correlations:
            for clustering in clusterings:
                for students in student_counts:
                    answers, items = data(n_students=students, n_items=20,
                                          n_concepts=n_clusters,
                                          skill_correlation=skill_correlation)
                    # Map each concept name to its position once instead of
                    # calling list.index for every item.
                    concept_ids = {
                        name: position
                        for position, name in enumerate(items['concept'].unique())
                    }
                    X = similarity(answers)
                    # ``DataFrame.get_value`` no longer exists in pandas;
                    # ``.at`` is the scalar accessor that replaces it.
                    ground_truth = np.array(
                        [concept_ids[items.at[item, 'concept']] for item in X.index])
                    labels = clustering(X, n_clusters, euclid=euclid)
                    rand = rand_index(ground_truth, labels)
                    print(run, skill_correlation, clustering.__name__, students, '===', rand)
                    if rand >= 0.9:
                        results.append([students, clustering.__name__, rand, skill_correlation])
                        # Smallest sufficient sample found; larger ones are
                        # not tried.
                        break

    results = pd.DataFrame(
        results,
        columns=['students', 'clustering', 'rand_index', 'skill_correlation'])
    print(results)

    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
    sns.pointplot(data=results, x='skill_correlation', y='students', hue='clustering', ax=ax)
def main(argv):
    """Plot write bandwidth per writer count from a results CSV and exit.

    The CSV path is taken from ``sys.argv[1]``; ``argv`` is accepted but not
    consulted. The plot is written to ``max_bandwidth_stats.png`` and the
    process then exits with status 0.

    Args:
        argv: Command-line arguments (unused).
    """
    # Lists of marker styles and line styles
    markers = 10 * ['o', '^', 'x']
    lines = 10 * ['-', '--', '-.']

    infile = sys.argv[1]
    resframe = pd.read_csv(infile)

    # print() with a single argument behaves the same on Python 2 and 3; the
    # former print statements were a syntax error on Python 3.
    print("Summary of all results found:")
    print(resframe)

    fig, ax = plt.subplots()
    sns.pointplot(x='Writers', y='Write Bandwidth (MiB/s)', data=resframe,
                  hue='Scheme', scale=0.75, markers=markers,
                  linestyles=lines, ax=ax)
    ax.set_ylim(ymin=0)
    plt.ylabel('Max. Write Bandwidth / MiB/s')
    plt.xlabel('Writers')
    plt.legend()
    plt.savefig('max_bandwidth_stats.png')
    plt.clf()

    sys.exit(0)
def cumm_plot(data, **kwargs):
    """Draw a per-group cumulative-rank plot of ``Opt1Value``.

    ``data`` is modified in place: an ``order`` column is added that holds,
    within each ``new_type`` group, the rank of ``Opt1Value`` rescaled to
    [0, 1]; the frame is then sorted by ``new_type``.

    Args:
        data: DataFrame with ``Opt1Value`` and ``new_type`` columns.
        **kwargs: Extra keyword arguments forwarded to ``pointplot``.
    """
    # ``DataFrame.sort`` no longer exists in pandas; ``sort_values`` replaces it.
    data['order'] = (
        data.sort_values('Opt1Value')
        .groupby(['new_type'])['Opt1Value']
        .transform(lambda score: np.linspace(0, 1, len(score)))
    )
    data.sort_values('new_type', inplace=True)
    # x and y are passed by keyword; recent seaborn rejects them positionally.
    sb.pointplot(x='Opt1Value', y='order', data=data, hue='new_type', **kwargs)
def rolling_success_diff(answers, last_count=4, filters=None, only_last=True):
    """Compare each user's rolling success rate with their overall success rate.

    For every filter value and every user with at least ``last_count``
    answers, the rolling mean of ``correct`` over ``last_count`` answers is
    computed. With ``only_last`` only the final window's difference from the
    user's overall mean is recorded (``leave`` = 1). Otherwise every window's
    difference, rounded to one decimal, is recorded with ``leave`` = 0 and
    the last recorded row is flagged with ``leave`` = 1.

    Args:
        answers: Answer data handed to ``filter_users``.
        last_count: Size of the rolling window.
        filters: Values passed as ``min_answer_count`` to ``filter_users``;
            defaults to a single ``None``.
        only_last: Selects the recording mode described above and the plot
            type (distribution plot if true, point plot otherwise).

    Returns:
        DataFrame with columns ``rolling_success_diff``, ``min_answers`` and
        ``leave``.
    """
    if filters is None:
        filters = [None]

    rows = []
    for min_answers in filters:
        filtered = filter_users(answers, min_answer_count=min_answers)
        for _, user_answers in filtered.groupby('user'):
            overall = user_answers['correct'].mean()
            if len(user_answers) < last_count:
                continue
            window_means = user_answers['correct'].rolling(last_count, last_count).mean()
            for window_mean in window_means:
                if np.isnan(window_mean):
                    continue
                if not only_last:
                    rows.append([np.round(window_mean - overall, 1), min_answers, 0])
            if only_last:
                # ``window_mean`` still holds the final rolling value here.
                rows.append([window_mean - overall, min_answers, 1])
            else:
                rows[-1][-1] = 1

    result = pd.DataFrame(rows, columns=['rolling_success_diff', 'min_answers', 'leave'])
    if only_last:
        for min_answers in filters:
            selected = result.loc[result['min_answers'] == min_answers, 'rolling_success_diff']
            sns.distplot(selected, label=str(min_answers))
    else:
        axes = sns.pointplot(data=result, x='rolling_success_diff', y='leave', hue='min_answers')
        axes.set(ylim=(0, 0.2))
    plt.legend(loc=1)
    return result
def _make_all_pred_plots(big_d, time_d, fig_name):
    """Draw a 2x3 grid of point plots, one per field, with a shared legend.

    The first five panels plot ``big_d`` against ``n_vehicles``; the sixth
    plots ``time_d`` against ``vehicles``. Per-panel legends are removed and
    one figure-level legend is added before the figure is saved to
    ``fig_dir + fig_name``.
    """
    fig, grid = plt.subplots(2, 3, figsize=(11, 5))
    panels = np.ravel(grid)
    letters = "abcdef"

    for i, (field, panel) in enumerate(zip(fields, panels)):
        if i < 5:
            x_column, frame = "n_vehicles", big_d
        else:
            x_column, frame = "vehicles", time_d
        panel = sns.pointplot(x=x_column, y=field, hue="predictions",
                              data=frame, palette=clrs, ax=panel)

        y_label = prettify(field)
        panel.set_xticklabels(vehicles)
        # The panel title is placed below the axes (negative y offset).
        panel.set_title("({}) {}".format(letters[i], titles[i]),
                        fontsize=13, y=-0.45)
        panel.set_ylabel(y_label)
        panel.set_xlabel("Number of Vehicles")
        if "%" in y_label:
            # Show fractions as whole percentages.
            panel.set_ylim([0, 1])
            ticks = panel.get_yticks()
            panel.set_yticklabels(['{:3.0f}%'.format(t * 100) for t in ticks])

        # Keep the handles for the shared legend, then drop the panel's own.
        handles, _ = panel.get_legend_handles_labels()
        panel.legend().remove()

    fig.subplots_adjust(wspace=0.4, hspace=0.58)
    lgd = fig.legend(
        handles, fancy_preds,
        loc="lower center", fancybox=True, bbox_to_anchor=(0.46, 0.96),
        title="Number of Samples", markerscale=2.5, ncol=4)
    fig.savefig(
        fig_dir + fig_name,
        bbox_inches='tight', bbox_extra_artists=(lgd,))
    plt.close()
def calibration_curve_plotter(y_test, prob_pos, n_bins=10):
    """Plot a calibration curve and a histogram of predicted probabilities.

    The upper panel shows the mean of ``y_test`` per probability bin next to
    the diagonal of a perfectly calibrated model. The lower panel shows the
    histogram of ``prob_pos`` over the same ``n_bins`` bins. The Brier score
    is included in the model's legend entry.

    Args:
        y_test: True binary labels.
        prob_pos: Predicted probabilities of the positive class, in [0, 1].
        n_bins: Number of equal-width probability bins.
    """
    brier = brier_score_loss(y_test, prob_pos, pos_label=1)

    fig = plt.figure(0, figsize=(10, 10))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    # One center per real bin; the last edge (1.0) does not start a bin.
    centers = bins[:-1] + 0.5 / n_bins
    # ``digitize`` assigns a probability of exactly 1.0 to an extra bin past
    # the last edge; clip so it is counted in the final bin instead.
    binids = np.clip(np.digitize(prob_pos, bins) - 1, 0, n_bins - 1)

    df = pd.DataFrame({"true": y_test})
    df["Bin center"] = centers[binids]
    df[""] = "Model calibration: (%1.5f)" % brier

    df2 = pd.DataFrame({"true": centers, "Bin center": centers})
    df2[""] = "Perfect calibration"
    df = pd.concat([df, df2])

    sns.pointplot(x="Bin center", y="true", data=df, order=centers, hue="", ax=ax1)
    # Use the same number of bins as the calibration curve.
    ax2.hist(prob_pos, range=(0, 1), bins=n_bins, label="Model",
             histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.set_title("Calibration plots")

    ax2.set_xlabel("Predicted Probability")
    ax2.set_ylabel("Count")

    plt.tight_layout()
def make_interval_comp_plots(df):
    """Plot ``comp_time`` against ``interval`` and save it under ``figs/``."""
    _, axes = plt.subplots(1, 1, figsize=aux_fig_size)
    point_color = sns.xkcd_rgb["bright red"]
    sns.pointplot(data=df, x="interval", y="comp_time",
                  color=point_color, ax=axes)

    axes.set_xlabel("Step Size [s]")
    axes.set_ylabel(prettify("comp_time"))
    axes.set_xticklabels(intervals)

    plt.savefig("figs/interval-comp_time.png", bbox_inches="tight")
    plt.close()
def test_activity_timecourse_with_inlay():
    """Plot the per-session activity timecourse with an atlas-label inlay.

    Reads ``drs_activity.csv`` from the test data directory, draws the mean
    t-statistic per session and treatment, adds a small inlay axes showing an
    atlas label, and saves the result as a PNG.
    """
    import pandas as pd
    import matplotlib.pyplot as plt
    import samri.plotting.maps as maps
    import seaborn as sns
    from os import path

    # Style elements
    palette = ["#56B4E9", "#E69F00"]

    # Session codes in the CSV and the display names used on the x axis.
    session_names = {
        'ofM': 'naïve',
        'ofMaF': 'acute',
        'ofMcF1': 'chronic/2w',
        'ofMcF2': 'chronic/4w',
        'ofMpF': 'post',
    }
    session_order = ['naïve', 'acute', 'chronic/2w', 'chronic/4w', 'post']

    here = path.dirname(path.realpath(__file__))
    data_dir = path.join(here, "../../tests/data")
    df = pd.read_csv(path.join(data_dir, 'drs_activity.csv'))
    df = df.rename(columns={'t': 'Mean t-Statistic'})
    df['Session'] = df['Session'].map(session_names)

    # Axes rectangles as [left, bottom, width, height] in figure coordinates.
    left, bottom = 0.06, 0.06
    width, height = 0.9, 0.9
    session_coordinates = [left, bottom, width, height]
    roi_coordinates = [left + 0.02, bottom + 0.7, 0.3, 0.2]

    plt.figure(1)
    main_axes = plt.axes(session_coordinates)
    sns.pointplot(
        data=df,
        x='Session',
        y='Mean t-Statistic',
        hue='treatment',
        units='subject',
        order=session_order,
        palette=palette,
        dodge=True,
        ci=95,
        ax=main_axes,
    )

    inlay_axes = plt.axes(roi_coordinates)
    maps.atlas_label(
        "/usr/share/mouse-brain-atlases/dsurqec_200micron_roi-dr.nii",
        scale=0.3,
        color="#E69F00",
        ax=inlay_axes,
        annotate=False,
        alpha=0.8,
    )

    plt.savefig('_activity_timecourse_with_inlay.png')
def plot(self):
    """
    Draw the pointwise Pareto-k importance-sampling indices.

    The Pareto-k tail index of every observation unit is shown on the y axis
    against the unit's index on the x axis, as unconnected points.
    """
    pareto_k = self.pointwise.pareto_k
    units = self.pointwise.index
    seaborn.pointplot(x=units, y=pareto_k, join=False)
def make_demand_comp_plots(df):
    """Plot ``comp_time`` against ``demand`` per capacity and save it under ``figs/``."""
    _, axes = plt.subplots(1, 1, figsize=aux_fig_size)
    sns.pointplot(data=df, x="demand", y="comp_time", hue="capacity",
                  palette=dem_clrs, ax=axes)

    axes.set_xlabel("Nominal Number of Requests")
    axes.set_ylabel(prettify("comp_time"))
    axes.set_xticklabels(["x0.5", "x1", "x2"])

    # Rebuild the legend with fixed capacity labels.
    legend_handles = axes.get_legend_handles_labels()[0]
    axes.legend(legend_handles, [1, 4], title="Capacity")

    plt.savefig("figs/demand-comp_time.png", bbox_inches="tight")
    plt.close()
def individual_graph(transparencies_1, transparencies_2, condition, subject_number, display_graph=True):
    """Plot two per-trial contrast series for one subject and save the figure.

    The first series is drawn in red, the second in the default colour. The
    figure is saved to ``Subject <subject_number>/<condition>.png``.

    Args:
        transparencies_1: Sequence of contrast values, one per trial.
        transparencies_2: Second sequence of contrast values, one per trial.
        condition: Condition name, used in the title and the file name.
        subject_number: Subject identifier, used in the output directory name.
        display_graph: If true, show the figure before saving it.
    """
    # Trial numbers follow the data length instead of a fixed 1..80, so runs
    # with a different number of trials are plotted as well.
    trials_1 = list(range(1, len(transparencies_1) + 1))
    trials_2 = list(range(1, len(transparencies_2) + 1))

    # x and y are passed by keyword; recent seaborn rejects them positionally.
    sns.pointplot(x=trials_1, y=transparencies_1, color='red')
    axes = sns.pointplot(x=trials_2, y=transparencies_2)
    axes.set(xlabel="Trial", ylabel="Contrast", title="{} Condition".format(condition))

    if display_graph:
        plt.show()

    figure = axes.get_figure()
    figure.savefig("Subject {}/{}.png".format(subject_number, condition))
    # Clear the axes so the next call starts from an empty plot.
    plt.cla()
def plotexp(res):
    """Save point plots of log evidence and runtime against the number of topics.

    Args:
        res: Nested mapping. ``res['res']['data']`` holds the data to plot;
            ``res['args']['filename']`` is the input file whose base name,
            with dots removed, prefixes the two output file names.
    """
    frame = res['res']['data']
    stem = os.path.split(res['args']['filename'])[1].replace('.', '')

    outputs = (
        ('Log evidence', '_nt_evidence.png'),
        ('Runtime', '_nt_runtime.png'),
    )
    for column, suffix in outputs:
        plt.figure(facecolor='white', tight_layout=True, figsize=(4.5, 3.5), dpi=300)
        sns.pointplot(x='Number of topics', y=column, data=frame)
        plt.savefig(os.path.join(DATA_DIR, stem + suffix), dpi=300)
def make_avg_plots_with_preds(big_d):
    """Save one point plot per field, split by number of predictions.

    Only rows with capacity 4 and waiting time 300 are plotted. Each field is
    written to ``figs/avg-with-preds-<field>.png``.

    Args:
        big_d: DataFrame with ``capacity``, ``waiting_time``, ``vehicles``,
            ``predictions`` and one column per plotted field.
    """
    cap, wt = 4, 300
    d = big_d.query("capacity == @cap and waiting_time == @wt")

    for field in fields + ["n_shared_per_passenger"]:
        # Each field gets its own 13x10 figure. A single figure created
        # before the loop is gone once the first iteration closes it, which
        # left every later plot on an implicit default-sized figure.
        fig = plt.figure()
        fig.set_size_inches(13, 10)

        ax = sns.pointplot(x="vehicles", y=field, hue="predictions", data=d)
        ax.set_xticklabels(vehicles)

        y_label = prettify(field)
        plt.ylabel(y_label)
        if "%" in y_label:
            # Show fractions as whole percentages.
            ax.set_ylim([0, 1])
            vals = ax.get_yticks()
            ax.set_yticklabels(['{:3.0f}%'.format(x * 100) for x in vals])
        plt.xlabel("Num Vehicles")

        handles, _ = ax.get_legend_handles_labels()
        plt.legend(
            handles, ["No R.B.", 0, 100, 200, 300, 400],
            loc="center left", fancybox=True, shadow=True,
            bbox_to_anchor=(1, 0.5), title="Predictions", markerscale=3)
        plt.title(make_pred_title(wt, cap))
        plt.savefig(
            "figs/avg-with-preds-{}.png".format(field),
            bbox_inches='tight')
        plt.close(fig)
def _plotWeekdayByMonthStats(stats, stat_name):
    """Plot ``stat_name`` per weekday with one line per month.

    Returns the axes returned by ``pointplot``.
    """
    axes = sns.pointplot(
        data=_prepareWeekdayByMonthStats(stats),
        x="day",
        y=stat_name,
        hue="month",
        order=dayOfWeekOrder,
    )
    axes.set(xlabel='')
    axes.set_ylabel(NAMES[stat_name])
    return axes
def draw_graph(plot_data, rankingSystem, numberOfUv, hue):
    """Plot world rank per year and save it to ``resources/images/topuv.png``.

    The ``world_rank`` column of ``plot_data`` is converted to int in place.

    Args:
        plot_data: DataFrame with ``year`` and ``world_rank`` columns.
        rankingSystem: Name of the ranking system, used in the title.
        numberOfUv: Number of universities, used in the title.
        hue: Column name used to split the lines.
    """
    plot_data['world_rank'] = plot_data['world_rank'].astype(int)
    sns.pointplot(data=plot_data, x='year', y='world_rank', hue=hue)

    heading = "Top " + str(numberOfUv) + " university by " + rankingSystem
    pylab.title(heading, fontsize=26)
    pylab.xticks(fontsize=20)
    pylab.yticks(fontsize=20)
    pylab.ylabel("World Rank", fontsize=26)
    pylab.xlabel("Year", fontsize=26)
    pylab.savefig('resources/images/topuv.png')

    # Reset pyplot state so the next figure starts clean.
    pylab.cla()
    pylab.clf()
    pylab.close()
def compare_groups(df, x, mpu = False, order = None, hue = None, plot = True, table = True):
    """Compare mean aggression and recipient scores across groups.

    Args:
        df: DataFrame with ``pred_aggression_score`` and
            ``pred_recipient_score`` columns plus the grouping columns.
        x: Column to group by and to put on the x axis.
        mpu: If true, plot the output of ``mpg(df, score, ['user_text', x])``
            instead of ``df`` itself.
        order: Category order forwarded to ``pointplot``.
        hue: Optional second grouping column.
        plot: If true, draw the two scores side by side.
        table: If true, print the table of group means.
    """
    agg = 'pred_aggression_score'
    rec = 'pred_recipient_score'
    scores = [agg, rec]

    if table:
        group_columns = [x, hue] if hue else [x]
        # Columns are selected with a list; selecting several columns from a
        # groupby with a bare tuple is rejected by current pandas.
        print(df.groupby(group_columns)[scores].mean())

    if plot:
        fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False, figsize = (12,6))
        # Both plots draw onto the subplots above, so no further figures are
        # opened (extra plt.figure() calls only produced empty figures).
        for score, axis in zip(scores, (ax1, ax2)):
            plot_data = mpg(df, score, ['user_text', x]) if mpu else df
            sns.pointplot(x=x, y=score, data=plot_data, order=order, hue=hue, ax=axis)
def qualitative_times(df,
    ax=None,
    x="relative_date",
    y="weight",
    unit="Animal_id",
    condition="treatment",
    err_style="unit_traces",
    order=None,
    bp_style=True,
    save_as='',
    legend_title=False,
    palette=QUALITATIVE_COLORSET,
    renames={},
    model='',
    print_model=False,
    print_anova=False,
    anova_type=3,
    groups=None,
    ci=95,
    ):
    """Plot a timecourse over qualitative time points (e.g. sessions).

    Draws ``y`` against ``x`` with one line per ``condition`` value. If
    ``renames`` is given as ``{column: {old: new}}``, matching values in
    ``df`` are replaced in place before plotting. The figure is saved when
    ``save_as`` is non-empty.
    """
    if bp_style:
        for style_name in (u'seaborn-darkgrid', 'ggplot'):
            plt.style.use(style_name)

    if renames:
        for column in renames:
            for old_value in renames[column]:
                df.loc[df[column] == old_value, column] = renames[column][old_value]

    ax = sns.pointplot(
        data=df,
        x=x,
        y=y,
        hue=condition,
        units=unit,
        order=order,
        palette=sns.color_palette(palette),
        dodge=True,
        ci=ci,
        ax=ax,
        )
    ax.set_ylabel(y)

    if not legend_title:
        # No title requested: blank out the legend title.
        ax.legend().set_title('')

    if save_as:
        target = path.abspath(path.expanduser(save_as))
        plt.savefig(target, bbox_inches='tight')
def line_plot(data ,title = "", x_title ="", y_title="", legend_label="",group_labels=None):
    """Show a point plot of a three-column array, grouped by the third column.

    Args:
        data: 2-D array; column 0 holds y values, column 1 holds x values
            (converted to int) and column 2 holds the group of each row.
        title: Plot title.
        x_title: Label of the x axis.
        y_title: Label of the y axis.
        legend_label: Name given to the group column.
        group_labels: Unused.
    """
    groups = data[:, 2]
    frame = DataFrame()
    frame['x'] = data[:, 1].astype(int)
    frame['y'] = data[:, 0].astype(float)
    frame[legend_label] = groups

    sns.set(style="whitegrid")
    sns.pointplot(data=frame, x="x", y="y", hue=legend_label,
                  hue_order=np.unique(frame[legend_label]))

    plt.title(title, fontsize=25)
    plt.xlabel(x_title, fontsize=12)
    plt.ylabel(y_title, fontsize=12)
    plt.show()
def students(runs=15):
    """Measure clustering quality per similarity measure across difficulty shifts.

    For every run and difficulty shift a data set is generated, each
    similarity measure is applied, the items are clustered, and the Rand
    index against the true concepts is recorded. The results are printed and
    drawn as a point plot of Rand index against difficulty shift.

    The generation settings (``n_students``, ``n_items``, ``n_clusters``,
    ``skill_correlation``, ``missing``), the ``clustering`` function and the
    ``similarities`` list are taken from the enclosing module.

    Args:
        runs: Number of repetitions of the whole experiment.
    """
    results = []
    for run in range(runs):
        for difficulty_shift in np.arange(-1, 1.1, 0.2):
            answers, items = data(n_students=n_students, n_items=n_items,
                                  n_concepts=n_clusters,
                                  skill_correlation=skill_correlation,
                                  difficulty_shift=difficulty_shift,
                                  missing=missing)
            # Map each concept name to its position once instead of calling
            # list.index for every item.
            concept_ids = {
                name: position
                for position, name in enumerate(items['concept'].unique())
            }

            for similarity, euclid, similarity_name in similarities:
                X = similarity(answers)
                # ``DataFrame.get_value`` no longer exists in pandas;
                # ``.at`` is the scalar accessor that replaces it.
                ground_truth = np.array(
                    [concept_ids[items.at[item, 'concept']] for item in X.index])

                labels = clustering(X, n_clusters, euclid=euclid)
                rand = rand_index(ground_truth, labels)
                results.append([n_students, clustering.__name__, rand,
                                skill_correlation, difficulty_shift,
                                similarity_name])
                print(run, n_students, similarity_name, rand)

    results = pd.DataFrame(
        results,
        columns=['students', 'clustering', 'rand_index', 'skill_correlation',
                 'difficulty_shift', 'similarity'])
    print(results)

    plt.figure(figsize=(16, 24))
    sns.pointplot(data=results, x='difficulty_shift', y='rand_index', hue='similarity')
def make_interval_plots(df):
    """Save one point plot per field against ``interval`` under ``figs/``."""
    point_color = sns.xkcd_rgb["bright red"]
    for field in tqdm(fields + ["n_shared_per_passenger"]):
        _, axes = plt.subplots(1, 1, figsize=aux_fig_size)
        axes = sns.pointplot(data=df, x="interval", y=field,
                             color=point_color, ax=axes)

        y_label = prettify(field)
        axes.set_xlabel("Step Size [s]")
        axes.set_ylabel(y_label)
        if "%" in y_label:
            # Show fractions as whole percentages.
            axes.set_ylim(0, 1)
            axes.set_yticklabels(
                ['{:3.0f}%'.format(tick * 100) for tick in axes.get_yticks()])

        plt.savefig("figs/interval-{}.png".format(field), bbox_inches="tight")
        plt.close()
def process_graph(university):
    """Plot the yearly world rank of the given universities per ranking source.

    Ranks from the Times, Shanghai and CWUR data sets are combined and saved
    as ``resources/images/university.png``. Rank ranges such as "101-150" in
    the Times and Shanghai data are reduced to their first number.

    Args:
        university: Sequence of university names to select; the first name is
            used in the plot title.
    """
    import pandas as pd

    def _select(frame, name_column, source):
        # Copy the filtered rows so the new column is not written onto a
        # slice of the source frame.
        subset = frame[frame[name_column].isin(university)][['world_rank', 'year']].copy()
        subset['source'] = source
        return subset

    times_plot_data = _select(Data.get_time_data(), 'university_name', 'Times')
    shanghai_plot_data = _select(Data.get_shanghai_data(), 'university_name', 'Shanghai')
    cwur_plot_data = _select(Data.get_cwur_data(), 'institution', 'CWUR')

    # Parse the first number of a rank range.
    for ranged in (times_plot_data, shanghai_plot_data):
        ranged['world_rank'] = ranged['world_rank'].str.split('-').str[0]

    # ``DataFrame.append`` no longer exists in pandas; ``concat`` keeps the
    # same row order and index labels.
    plot_data = pd.concat([times_plot_data, shanghai_plot_data, cwur_plot_data])
    plot_data['world_rank'] = plot_data['world_rank'].astype(int)

    sns.set(style="ticks", color_codes=True)
    plt.rcParams['figure.figsize'] = 16, 12
    ax = sns.pointplot(x='year', y='world_rank', hue='source', data=plot_data)

    # Styling
    plt.title(university[0] + " Ranking", fontsize=26)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.ylabel("World Rank", fontsize=26)
    plt.xlabel("Year", fontsize=26)
    plt.tight_layout()
    plt.legend(loc='upper left', fontsize=20)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

    # Save File
    plt.savefig('resources/images/university.png')
    plt.cla()
    plt.clf()
    plt.close()
def compare2(y, y_hat1, y_hat2):
    """Plot the MAE of a two-model blend as a function of the blend weight.

    ``theta`` runs over 101 evenly spaced values in [0, 1]; the first
    prediction is weighted by ``theta`` and the second by ``1 - theta``. The
    best weight is marked with a vertical line and reported in the x label.

    Args:
        y: True values.
        y_hat1: Predictions of the first model.
        y_hat2: Predictions of the second model.
    """
    # ``seaborn.plt`` is no longer exposed by current seaborn releases.
    import matplotlib.pyplot as plt

    thetas = np.linspace(0, 1, num=101)
    maes = np.array([
        mae(y, blend2([y_hat1, y_hat2], [theta, 1 - theta]))
        for theta in thetas
    ])
    min_i = np.argmin(maes)

    df = pd.DataFrame({'mae': maes, 'theta': thetas})
    sns.set_style('darkgrid')
    ax = sns.pointplot(x='theta', y='mae', data=df)
    ax.set_xticks([])
    # The point plot's x axis is categorical, so positions are row indices;
    # the index of the minimum therefore marks the best theta.
    ax.axvline(x=min_i)
    lab = 'BEST (theta: ' + str(thetas[min_i]) + ', MAE: ' + str(maes[min_i]) + ')'
    ax.set(xlabel=lab, ylabel='MAE')
    plt.show()
def make_demand_plots(df):
    """Save one point plot per field against ``demand``, split by capacity."""
    for field in tqdm(fields + ["n_shared_per_passenger"]):
        _, axes = plt.subplots(1, 1, figsize=aux_fig_size)
        axes = sns.pointplot(data=df, x="demand", y=field, hue="capacity",
                             palette=dem_clrs, ax=axes)

        y_label = prettify(field)
        axes.set_xlabel("Nominal Number of Requests")
        axes.set_ylabel(y_label)
        axes.set_xticklabels(["x0.5", "x1", "x2"])
        if "%" in y_label:
            # Show fractions as whole percentages.
            axes.set_ylim(0, 1)
            axes.set_yticklabels(
                ['{:3.0f}%'.format(tick * 100) for tick in axes.get_yticks()])

        # Rebuild the legend with fixed capacity labels.
        legend_handles = axes.get_legend_handles_labels()[0]
        axes.legend(legend_handles, [1, 4], title="Capacity")

        plt.savefig("figs/demand-{}.png".format(field), bbox_inches="tight")
        plt.close()
def compare_multiway(y, y_hat):
    """Plot the MAE of a multi-model blend as a function of the first weight.

    ``theta`` runs over 101 evenly spaced values in [0, 1]. The first
    prediction is weighted by ``theta``; the remaining weight ``1 - theta``
    is split evenly among the other predictions. The best weight is marked
    with a vertical line and reported in the x label.

    Args:
        y: True values.
        y_hat: Sequence of prediction sets, one per model.
    """
    # ``seaborn.plt`` is no longer exposed by current seaborn releases.
    import matplotlib.pyplot as plt

    n_others = len(y_hat) - 1
    thetas = np.linspace(0, 1, num=101)
    maes = []
    for theta in thetas:
        theta2 = (1 - theta) / n_others
        weights = [theta] + list(np.repeat(theta2, n_others))
        maes.append(mae(y, blend(y_hat, weights)))
    maes = np.array(maes)
    min_i = np.argmin(maes)

    df = pd.DataFrame({'mae': maes, 'theta': thetas})
    sns.set_style('darkgrid')
    ax = sns.pointplot(x='theta', y='mae', data=df)
    ax.set_xticks([])
    # The point plot's x axis is categorical, so positions are row indices;
    # the index of the minimum therefore marks the best theta.
    ax.axvline(x=min_i)
    lab = 'BEST (theta: ' + str(thetas[min_i]) + ', MAE: ' + str(maes[min_i]) + ')'
    ax.set(xlabel=lab, ylabel='MAE')
    plt.show()
def save_solvers_cmp(is_power_point = False):
    """Plot the test error per epoch of several solvers and save the figure.

    Reads every ``../output/cifar10classifier_resnet32_<solver>.csv`` whose
    solver is one of adadelta, adagrad, adam, momentum or rmsprop, and writes
    the comparison to ``../figures/resnet.solvers.png``.

    Args:
        is_power_point: If true, use a larger legend suited to slides.

    Raises:
        ValueError: If no matching CSV file is found.
    """
    # ``seaborn.plt`` is no longer exposed by current seaborn releases.
    import matplotlib.pyplot as plt

    solvers = {"adadelta", "adagrad", "adam", "momentum", "rmsprop"}
    dfs = []
    for filename in glob.glob("../output/cifar10classifier_resnet32_*.csv"):
        target = filename.split("_")[-1].split(".csv")[0]
        if target in solvers:
            df = pd.read_csv(filename)
            df["train_error"] = 1 - df["train_accuracy"]
            df["test_error"] = 1 - df["test_accuracy"]
            dfs.append(df)
    if not dfs:
        # pd.concat would raise the same exception type with a message that
        # does not say which files were expected.
        raise ValueError(
            "no solver result files found under "
            "../output/cifar10classifier_resnet32_*.csv")

    total_df = pd.concat(dfs)
    total_df["name"] = (
        total_df["name"].str.split("_").str.get(-1)
        .str.replace("Momentum", "Nesterov(Original Paper)")
    )

    ax = sns.pointplot(x="epoch", y="test_error", hue="name", data=total_df, scale=0.2)
    if is_power_point:
        ax.legend(loc="lower left", markerscale=9.0, fontsize=20)
    else:
        ax.legend(loc="lower left", markerscale=3.0)
    ax.set(ylim=(0, 0.2))
    # Label every tenth tick. The label count follows the number of ticks
    # actually drawn rather than assuming exactly 200 epochs.
    tick_count = len(ax.get_xticks())
    ax.set_xticklabels([i if i % 10 == 0 else "" for i in range(tick_count)])
    ax.set(xlabel='epoch', ylabel='error(%)')
    ax.get_figure().savefig("../figures/resnet.solvers.png")
    plt.close()
def twentyMins(ultilive, nattys):
    """Plot tweets per twenty minutes for both sources and save the figure.

    The window runs from 10-16 00:00 to 10-20 00:00, i.e. four days of 72
    twenty-minute slots each (288 slots). Tweets falling outside that window
    are not counted. The figure is written to ``ByTwenty.png``.

    Args:
        ultilive: DataFrame of tweets with ``day``, ``hour`` and ``minute``
            columns.
        nattys: DataFrame of tweets with the same columns.
    """
    n_slots = 288

    def _tweets_per_slot(tweets):
        # Slot index = day offset * 72 + hour * 3 + 20-minute block in hour.
        # The block is ``minute // 20``; ``minute % 3`` would scatter tweets
        # over the hour's three slots regardless of when they were sent.
        day = tweets['day'].astype(int).values
        hour = tweets['hour'].astype(int).values
        minute = tweets['minute'].astype(int).values
        slot = (day - 16) * 72 + hour * 3 + minute // 20
        in_window = slot[(slot >= 0) & (slot < n_slots)]
        return np.bincount(in_window, minlength=n_slots).astype(float)

    ulti_twentyMins = _tweets_per_slot(ultilive)
    natty_twentyMins = _tweets_per_slot(nattys)

    sns.set(style='darkgrid', context='poster')
    plt.figure(figsize=(20,15))

    dfTime = pd.DataFrame({
        'Time': np.tile(np.arange(n_slots), 2),
        'Tweets': np.concatenate([ulti_twentyMins, natty_twentyMins]),
        'Source': ["Ultiworld Live"] * n_slots + ["#NationalsTX"] * n_slots,
    })

    # Arguments are passed by keyword; recent seaborn rejects positional
    # x, y and hue.
    s = sns.pointplot(x='Time', y='Tweets', hue='Source', data=dfTime, palette="Paired")
    s.set_title("Tweets per Twenty Minutes")
    s.set_xticks([72, 144, 216, 287])
    s.set_xticklabels(['17th', '18th', '19th', '20th'])
    s.set_xlabel("Day")
    s.axis([0,288,0,50])
    s.figure.savefig("ByTwenty.png")
# Plot the results. The left panel overlays the individual scores (strip
# plot) with the average performance of each pipeline across sessions and
# subjects (point plot).
fig, axes = plt.subplots(1, 2, figsize=[8, 4], sharey=True)

sns.stripplot(
    x="pipeline",
    y="score",
    data=results,
    palette="Set1",
    jitter=True,
    alpha=0.5,
    zorder=1,
    ax=axes[0],
)
sns.pointplot(x="pipeline", y="score", data=results, palette="Set1", zorder=1, ax=axes[0])

axes[0].set_ylim(0.5, 1)
axes[0].set_ylabel("ROC AUC")

##############################################################################
# The second plot is a paired scatter plot. Each point represents the score
# of a single session. An algorithm outperforms the other if most of the
# points lie in its quadrant.

paired = results.pivot_table(
    index=["subject", "session"], columns="pipeline", values="score"
).reset_index()

sns.regplot(x="CSP+LDA", y="RG+LR", data=paired, fit_reg=False, ax=axes[1])
def plot_performance_graph(metric='AUROC',
                           evaluation_set='dev',
                           title="",
                           file_name="",
                           data=None,
                           color_map=None):
    """
    Plot the graphs onto a multi-subplot grid using seaborn and save the
    figure as a PNG.

    Args:
        metric - the metric to plot for the y axis
        evaluation_set - whether to plot the dev set or test set
        title - the main title of the large graph
        file_name - the name of the file to save the graph
        data - the dataframe tree to plot the large graph; indexed as
            data[outer key][inner key][evaluation_set]
        color_map - mapping from the keys of data to the colour used for
            their points, titles and axis labels
    """
    # NOTE(review): the grid size comes from the module-level ``file_tree``,
    # not from ``data`` -- presumably both have the same shape; confirm.
    fig, axes = plt.subplots(len(file_tree),
                             len(file_tree["DaG"]),
                             figsize=(25, 15),
                             sharey='row')

    # ``col`` is the outer key of ``data`` (one grid row each) and ``row``
    # the inner key (one grid column each).
    for row_ind, col in enumerate(data):
        for col_ind, row in enumerate(data[col]):

            # Fixed y ranges per metric.
            if metric == "AUROC":
                axes[row_ind][col_ind].set_ylim([0.5, 1])

            if metric == "AUPR":
                axes[row_ind][col_ind].set_ylim([0, 0.7])

            # Data Not Available Yet
            if len(data[col][row]) == 0:
                lower, upper = axes[row_ind][col_ind].get_ylim()
                axes[row_ind][col_ind].annotate("Coming Soon!!",
                                                (0.2, (lower + upper) / 2),
                                                color="red",
                                                fontsize=20)

            else:
                # Any metric other than "AUROC" reads the "AUPRC" column.
                sns.pointplot(x="num_lfs",
                              y=metric if metric == "AUROC" else "AUPRC",
                              data=data[col][row][evaluation_set],
                              ax=axes[row_ind][col_ind],
                              hue="label",
                              ci="sd",
                              scale=1.2,
                              markers=["^", "o"])

                # remove x axis labels
                axes[row_ind][col_ind].set_xlabel('')
                axes[row_ind][col_ind].get_legend().remove()

                # unstable code
                # if order of error bars
                # change then this code will not work
                for idx, item in enumerate(
                        axes[row_ind][col_ind].get_children()):

                    # if the points in graph
                    # change color map accordingly
                    # (children 0 and 1 are treated as the point collections:
                    # the first point takes the outer key's colour, all other
                    # points the inner key's colour)
                    if idx == 0 or idx == 1:
                        item.set_edgecolor([
                            color_map[col] if index == 0 else color_map[row]
                            for index in range(
                                len(data[col][row]
                                    [evaluation_set].num_lfs.unique()))
                        ])
                        item.set_facecolor([
                            color_map[col] if index == 0 else color_map[row]
                            for index in range(
                                len(data[col][row]
                                    [evaluation_set].num_lfs.unique()))
                        ])

                    #if error bars change accordingly
                    elif isinstance(item, plt.Line2D):
                        # Lines at child positions 2 and 9 are drawn dashed,
                        # black and faint; every other line takes the inner
                        # key's colour.
                        if idx == 2:
                            item.set_linestyle('dashed')
                            item.set_color("black")
                            item.set_alpha(0.25)
                        elif idx == 9:
                            item.set_linestyle('dashed')
                            item.set_color("black")
                            item.set_alpha(0.25)
                        else:
                            item.set_color(color_map[row])

            # only set first column and first row titles
            if row_ind == 0:
                axes[row_ind][col_ind].set_title(row, color=color_map[row])

            if col_ind == 0:
                axes[row_ind][col_ind].set_ylabel(col, color=color_map[col])
            else:
                axes[row_ind][col_ind].set_ylabel('')

    # Enlarge titles, axis labels and tick labels on every subplot.
    for item in axes.flat:
        item.title.set_fontsize(30)
        item.yaxis.label.set_fontsize(24)
        item.xaxis.label.set_fontsize(24)
        for tick in item.get_yticklabels() + item.get_xticklabels():
            tick.set_fontsize(23)

    # The legend is attached to the fourth subplot (flat index 3) and its
    # first two handles are redrawn as white markers with black edges.
    if "label" in data["DaG"]["DaG"]["dev"].columns:
        axes.flatten()[3].legend(loc='upper center',
                                 bbox_to_anchor=(2.54, 0.8),
                                 fontsize=20)
        leg = axes.flatten()[3].get_legend()
        leg.legendHandles[0].set_edgecolor('black')
        leg.legendHandles[0].set_facecolor('white')
        leg.legendHandles[1].set_edgecolor('black')
        leg.legendHandles[1].set_facecolor('white')

    # Figure-level captions.
    fig.text(0.5, 0.89, 'Label Sources', ha='center', fontsize=30)
    fig.text(0.5,
             0.04,
             'Number of Additional Label Functions',
             ha='center',
             fontsize=30)
    fig.text(0.04,
             0.5,
             f'Predicted Relations ({metric})',
             va='center',
             rotation='vertical',
             fontsize=25)
    fig.suptitle(title, fontsize=30)
    fig.text(0.69,
             0.02,
             '0-Only Uses Relation Specific Databases.',
             fontsize=27)
    plt.subplots_adjust(top=0.85)
    plt.savefig(file_name, format='png')
def plot_roi_per_session(df,
    x='Session',
    y='Mean t-Statistic',
    condition='treatment',
    unit='subject',
    ci=90,
    palette=["#56B4E9", "#E69F00"],
    dodge=True,
    order=[],
    feature_map=True,
    roi_left=0.02,
    roi_bottom=0.74,
    roi_width=0.3,
    roi_height=0.2,
    roi_anat='/usr/share/mouse-brain-atlases/dsurqec_40micron_masked.nii',
    roi_threshold=None,
    cut_coords=None,
    samri_style=True,
    renames=[],
    save_as='',
    ax=None,
    fig=None,
    ):
    """Plot ROI t-values over the session timecourse, with an optional ROI inlay.

    Parameters
    ----------
    df : pandas.DataFrame or str
        Data to plot, or the path of a CSV file containing it.
    x, y, condition, unit : str
        Column names passed to ``seaborn.pointplot`` as ``x``, ``y``, ``hue``
        and ``units`` respectively.
    ci, palette, dodge, order
        Passed on to ``seaborn.pointplot``.
    feature_map : bool or str
        If a string, it is drawn in the inlay. If otherwise true, the first
        entry of the ``feature`` column of ``df`` is drawn: as a file if it
        is an existing path, else as a label of the atlas named in the
        ``atlas`` and ``mapping`` columns. No inlay is drawn if false, or if
        ``df`` has no ``feature`` column.
    roi_left, roi_bottom, roi_width, roi_height : float
        Offset (relative to the main axes origin) and size of the inlay axes.
    roi_anat : str
        Template passed to ``maps.stat``.
    roi_threshold, cut_coords
        If both are set, the inlay is drawn with ``maps.stat``; otherwise
        with ``maps.atlas_label``.
    samri_style : bool
        Whether to apply the matplotlib styles before plotting.
    renames : dict
        ``{column: {old: new}}`` replacements applied to ``df`` in place.
    save_as : str
        If non-empty, path the figure is saved to.
    ax, fig
        Existing axes and figure to draw on; created if not given.

    Returns
    -------
    tuple
        The figure and the axes returned by ``seaborn.pointplot``.
    """
    import pandas as pd

    if samri_style:
        plt.style.use(u'seaborn-darkgrid')
        plt.style.use('ggplot')

    # A path is loaded into a DataFrame; a DataFrame is used as is. Expanding
    # the input unconditionally raised an uncaught TypeError for DataFrames
    # on Python 3 and left path inputs as plain strings.
    if isinstance(df, str):
        df = pd.read_csv(path.abspath(path.expanduser(df)))

    # definitions for the axes
    # NOTE(review): the right/top subplot parameters are used directly as
    # width/height -- confirm this is the intended layout.
    height = rcParams['figure.subplot.top']
    bottom = rcParams['figure.subplot.bottom']
    left = rcParams['figure.subplot.left']
    width = rcParams['figure.subplot.right']

    session_coordinates = [left, bottom, width, height]
    roi_coordinates = [left + roi_left, bottom + roi_bottom, roi_width, roi_height]

    if not fig:
        fig = plt.figure(1)

    if renames:
        for key in renames:
            for subkey in renames[key]:
                df.loc[df[key] == subkey, key] = renames[key][subkey]

    if not ax:
        ax1 = plt.axes(session_coordinates)
    else:
        ax1 = ax
    ax = sns.pointplot(
        x=x,
        y=y,
        units=unit,
        data=df,
        hue=condition,
        dodge=dodge,
        palette=sns.color_palette(palette),
        order=order,
        ax=ax1,
        ci=ci,
        )
    ax.set_ylabel(y)

    if isinstance(feature_map, str):
        ax2 = plt.axes(roi_coordinates)
        if roi_threshold and cut_coords:
            # The map to draw is ``feature_map``; the name ``feature`` is not
            # defined in this branch.
            maps.stat(feature_map,
                cut_coords=cut_coords,
                template=roi_anat,
                annotate=False,
                scale=0.3,
                show_plot=False,
                interpolation=None,
                threshold=roi_threshold,
                draw_colorbar=False,
                ax=ax2,
                )
        else:
            maps.atlas_label(feature_map,
                scale=0.3,
                color="#E69F00",
                ax=ax2,
                annotate=False,
                alpha=0.8,
                )
    elif feature_map:
        try:
            features = df['feature'].unique()
        except KeyError:
            # No feature column: nothing to draw in the inlay.
            pass
        else:
            if len(features) > 1:
                print('WARNING: The features list contains more than one feature. We will highlight the first one in the list. This may be incorrect.')
            feature = features[0]
            ax2 = plt.axes(roi_coordinates)
            if path.isfile(feature):
                if roi_threshold and cut_coords:
                    maps.stat(stat_maps=feature,
                        cut_coords=cut_coords,
                        template=roi_anat,
                        annotate=False,
                        scale=0.3,
                        show_plot=False,
                        interpolation=None,
                        threshold=roi_threshold,
                        draw_colorbar=False,
                        ax=ax2,
                        )
                else:
                    maps.atlas_label(feature,
                        scale=0.3,
                        color="#E69F00",
                        ax=ax2,
                        annotate=False,
                        alpha=0.8,
                        )
            else:
                # Not a file: treat the feature as a label name in the atlas.
                atlas = df['atlas'].unique()[0]
                mapping = df['mapping'].unique()[0]
                if isinstance(feature, str):
                    feature = [feature]
                maps.atlas_label(atlas,
                    scale=0.3,
                    color="#E69F00",
                    ax=ax2,
                    mapping=mapping,
                    label_names=feature,
                    alpha=0.8,
                    annotate=False,
                    )

    if save_as:
        plt.savefig(path.abspath(path.expanduser(save_as)), bbox_inches='tight')

    return fig, ax
# Draw every individual observation as a faint, jittered point
sns.stripplot(data=iris, x="measurement", y="value", hue="species",
              jitter=True, dodge=True, alpha=.25, zorder=1)

# Overlay the conditional mean of each species, without connecting lines
sns.pointplot(data=iris, x="measurement", y="value", hue="species",
              join=False, dodge=.532, ci=None,
              markers="d", scale=.75, palette="dark")

# Tidy the legend: the first three entries are skipped so that only the
# remaining handles are shown
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[3:], labels[3:],
          title="species", loc="lower right", ncol=3, frameon=True,
          handletextpad=0, columnspacing=1)
import seaborn as sb
from matplotlib import pyplot as plt

# Load the 'titanic' example dataset through seaborn.
df = sb.load_dataset('titanic')
# Point plot of "survived" against "sex", with one series per "class".
sb.pointplot(x="sex", y="survived", hue="class", data=df)
plt.show()
############################################################################## # Plot Results # ------------ # # Here we plot the results. fig, ax = plt.subplots(facecolor="white", figsize=[8, 4]) n_subs = len(dataset.subject_list) if n_subs > 1: r = results.groupby(["pipeline", "subject", "data_size"]).mean().reset_index() else: r = results sns.pointplot(data=r, x="data_size", y="score", hue="pipeline", ax=ax, palette="Set1") errbar_meaning = "subjects" if n_subs > 1 else "permutations" title_str = f"Errorbar shows Mean-CI across {errbar_meaning}" ax.set_xlabel("Amount of training samples") ax.set_ylabel("ROC AUC") ax.set_title(title_str) fig.tight_layout() plt.show()
for time in times: data_dicts.append({ "Number of samples": num_samples, "Convergence time (s)": time, "Inference method": "DP/VI" }) for num_samples, times in SciClone_times.iteritems(): for time in times: data_dicts.append({ "Number of samples": num_samples, "Convergence time (s)": time, "Inference method": "SciClone (VI)" }) for num_samples, times in PyClone_times.iteritems(): for time in times: data_dicts.append({ "Number of samples": num_samples, "Convergence time (s)": time, "Inference method": "PyClone (DP/MCMC)" }) data = pd.DataFrame(data_dicts) ax = sns.pointplot(x="Number of samples", y="Convergence time (s)", hue="Inference method", data=data, capsize=0.1, markers=['x', 'o', '^'], linestyles=['--', '--', '--']) sns.plt.savefig('time_comparisons.png')
df_M.head(2)

# <a id='eda'></a>
# ## Exploratory Data Analysis
#
# > After trimming and cleaning the data, we now move on to exploration: compute statistics and create visualizations to find patterns in the data and answer the research questions.

# ### Research Question 1 (Top 20 movies based on its Profit)

# In[51]:

# Sort movies by revenue (descending) and attach their titles; the title
# column is aligned to the sorted frame by index.
info = pd.DataFrame(df_M['revenue'].sort_values(ascending=False))
info['original_title'] = df_M['original_title']
data = list(map(str, (info['original_title'])))
x = list(data[:20])
y = list(info['revenue'][:20])

# Figure size and style are rc settings that only apply to figures created
# afterwards, so they must be set before the plot is drawn (the original set
# them after plotting, where they could not affect this figure).
sns.set(rc={'figure.figsize': (10, 10)})
sns.set_style("darkgrid")

# Revenue on the horizontal axis, movie title on the vertical axis.
ax = sns.pointplot(x=y, y=x)
ax.set_title("Top 20 Movies has high Profit", fontsize=15)
ax.set_xlabel("revenue", fontsize=15)
# The movie Avatar has the highest revenue in the dataset

# ### Research Question 2 (Which movie Has Highest / Lowest Profit and budget)

# In[52]:

# Calculate the profit of each movie and store it in a new 'Profit' column.
df_M['Profit'] = df_M['revenue'] - df_M['budget']
# Produces Pandas Series plant_growth.groupby('group')['weight'].mean() #%% # Produces Pandas DataFrame plant_growth.groupby('group')[['weight']].mean() #%% # Easy and flexible plant_growth.groupby(['group']).agg({'weight': ['mean', 'std']}) #%% # plot the data: sns.boxplot(x="group", y="weight", data=plant_growth) sns.catplot(x="group", y="weight", data=plant_growth) sns.pointplot(x="group", y="weight", data=plant_growth, join=False) sns.catplot(x="group", y="weight", data=plant_growth, kind="point") #%% # base R plotting functions: """ boxplot() hist() plot() plot(density()) """ #%% # specify the model """ import statsmodels.api as sm from statsmodels.formula.api import ols """ model = ols("weight ~ group", data=plant_growth) results = model.fit()
# In[6]: train, test = pd.read_csv("../input/train.csv"), pd.read_csv( "../input/test.csv") test_ids = test[["PassengerId"]] # In[7]: train.head() # We will plot various features with their relation to survival rate to have an idea of correlations # In[8]: fig, axs = plt.subplots(ncols=3, figsize=(16, 5)) sns.pointplot(x="Embarked", y="Survived", hue="Sex", data=train, ax=axs[0]) sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=train, ax=axs[1]) sns.violinplot(x="Survived", y="Age", hue="Sex", data=train, ax=axs[2]) # We can already see some (strong) correlation between sex, age, Pclass, embarked and survival rate # In[9]: data_age = [ train[train.Survived == 1].Age.dropna(), train[train.Survived == 0].Age.dropna() ] labels = ["Survived", "Not survived"] fig = ff.create_distplot(data_age, labels, bin_size=2, show_rug=False) pyo.iplot(fig)
'Number of articles': articles_count, 'Recall std error': Micro_Recall_Values_std_error }) # stop after certain number of articles if (articles_count > max_count): break # visualize Recall values Articles_Recall_Values.boxplot() # visualize standard errors values Recall_std_errors = pd.DataFrame(Macro_Recall_st_errors) Recall_std_errors['Recall Type'] = 'Macro Recall' temp_df = pd.DataFrame(Micro_Recall_st_errors) temp_df['Recall Type'] = 'Micro Recall' Recall_std_errors = Recall_std_errors.append(temp_df) plt.figure(figsize=(8, 6)) fig = sns.pointplot(x='Number of articles', y='Recall std error', hue='Recall Type', data=Recall_std_errors, col='Recall Type') fig.set(xlabel="Number of Articles") fig.set(ylabel="Recall Standard Error")
# just by eyeballing, normal distrib does not capture tails # nbinom seems like best description, but params are not very intuitive... import scipy.stats as stats fig, axes = plt.subplots(len(strains2plot)) fit = [] for ax, (strain, group) in zip(axes.ravel(), parts[(parts.mass_norm>=2)&(parts.corrwideal>=0.5)&(parts.strain.isin(strains2plot))].groupby('strain')): values = group.mass_norm.values fit.append((strain, stats.probplot(values, dist="geom", sparams=(0.3), plot=ax)[1])) # Summary plot df2plot = parts[(parts.mass_norm>=8)&(parts.corrwideal>=0.5)&(parts.strain.isin(strains2plot))] order = sorted(df2plot.CTDr.unique()) with sns.axes_style(*style): fig, ax = plt.subplots(figsize=(9, 6)) # pooled sns.pointplot(x='strain', y='mass_norm', order=strains2plot, data=df2plot, join=False, estimator=np.median, ci=99, ax=ax) # each image sns.stripplot(x='strain', y='mass_norm', order=strains2plot, data=parts[(parts.mass_norm>=7)&(parts.corrwideal>=0.5)&(parts.strain.isin(strains2plot))].groupby(['mov_name', 'strain']).median().reset_index(), ax=ax, size=12, alpha=0.25) ax.set(ylabel='Median TS intensity (a.u.)', xlabel='CTD repeats', ylim=(ax.get_yticks()[0]-0.2, ax.get_yticks()[-1])) sns.despine(left=False, bottom=False) plt.tight_layout() # check peaks found from skimage import io import trackpy as tp imname = '03052019_yQC21_255u100%int480_150msExp_30-45minPosGal_6_w2GFPlow' imname = '03272019_TL47pQC99_255u100%int480_150msExp_30-50minPosGal_13_w2GFPlow'
# Set the axis tick spacing.
# `ax` is the current Axes instance.
ax = plt.gca()
# Locator that places major ticks at multiples of 1 on the x-axis.
x_major_locator = MultipleLocator(1)
# Apply the locator to the x-axis major ticks.
ax.xaxis.set_major_locator(x_major_locator)
sns.set_context(context="poster", font_scale=0.1)
# plt.bar(np.arange(len(df["high"].value_counts()))+0.5,df["high"].value_counts(),width=0.8)
plt.plot(df["date"], df["high"], c='green')
# NOTE(review): the labels are the value *counts* of "high", positioned by
# the number of distinct dates - confirm this is the intended annotation.
for x, y in zip(np.arange(len(df["date"].value_counts())),
                df["high"].value_counts()):
    plt.text(x, y, y, ha="center", va="bottom")
plt.savefig("weather.jpg")
plt.show()

# Pass x/y by keyword: recent seaborn releases accept only `data` positionally.
sns.pointplot(x=df["date"], y=df["high"])
sns.set_context(context="poster", font_scale=0.1)
plt.savefig("2.jpg")
plt.show()

# `symbol_size` was misspelled `ymbol_size`, so the marker size was presumably
# never applied (assumes `line` is a pyecharts Line chart - confirm).
# NOTE(review): `is_label_show` is given a Series rather than a boolean.
line.add("气温",
         df["date"],
         df["high"],
         symbol_size=2,
         is_step=False,
         is_label_show=df['date'])
line.render("zhexian.html")  # write the chart to an HTML file
# print(df_new.head(10))
# <li>linestyles : string or list of strings, optional</li>
# <li>color : matplotlib color, optional</li>
# <li>palette : palette name, list, or dict, optional</li>
# <li>ax : matplotlib Axes, optional</li>
# </ul>
#
#

# In[ ]:

# Point plot of math and reading scores for race/ethnicity 'group B'
data['Race/Ethnicity'].unique()
len(data[(data['Race/Ethnicity'] == 'group B')].Math_Score)

# Select the group once and derive the 1-based x positions from its actual
# size instead of the hard-coded 1..190 range, so the plot keeps working if
# the number of 'group B' rows changes.
group_b = data[(data['Race/Ethnicity'] == 'group B')]
group_b_index = np.arange(1, len(group_b) + 1)

f, ax1 = plt.subplots(figsize=(25, 10))
sns.pointplot(x=group_b_index,
              y=group_b.Math_Score,
              color='lime',
              alpha=0.8)
sns.pointplot(x=group_b_index,
              y=group_b.Reading_Score,
              color='red',
              alpha=0.5)
#sns.pointplot(x=np.arange(1,191),y=data[(data['Race/Ethnicity']=='group B')].Math_Score,color='lime',alpha=0.8)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Math Score & Reading_Score')
plt.xticks(rotation=90)
plt.grid()
plt.show()

# In[ ]:
#graph individual features by survival fig, saxis = plt.subplots(2, 3, figsize=(16, 12)) sns.barplot(x='Embarked', y='Survived', data=data1, ax=saxis[0, 0]) sns.barplot(x='Pclass', y='Survived', order=[1, 2, 3], data=data1, ax=saxis[0, 1]) sns.barplot(x='Isalone', y='Survived', order=[1, 0], data=data1, ax=saxis[0, 2]) sns.pointplot(x='FareBin', y='Survived', data=data1, ax=saxis[1, 0]) sns.pointplot(x='AgeBin', y='Survived', data=data1, ax=saxis[1, 1]) sns.pointplot(x='FamilyMembers', y='Survived', data=data1, ax=saxis[1, 2]) # In[ ]: #graph distribution of qualitative data: Pclass #we know class mattered in survival, now let's compare class and a 2nd feature fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(14, 12)) sns.boxplot(x='Pclass', y='Fare', hue='Survived', data=data1, ax=axis1) axis1.set_title('Pclass vs Fare Survival Comparison') sns.violinplot(x='Pclass', y='Age', hue='Survived',
# NOTE: x/y are passed by keyword throughout; recent seaborn releases accept
# only `data` positionally, so the positional form fails there.

# Number of male athletes per year (`part` comes from the preceding code).
part.loc[:,'M'].plot()
plt.title('Variation of Male Athletes over time')

# Number of female athletes per year.
part = WomenOverTime.groupby('Year')['Sex'].value_counts()
plt.figure(figsize=(20, 10))
part.loc[:,'F'].plot()
plt.title('Variation of Female Athletes over time')

# Age distribution per year.
plt.figure(figsize=(20, 10))
sns.boxplot(x='Year', y='Age', data=MenOverTime)
plt.title('Variation of Age for Male Athletes over time')

MenOverTime.loc[MenOverTime['Age'] > 80].head(10)

plt.figure(figsize=(20, 10))
sns.boxplot(x='Year', y='Age', data=WomenOverTime)
plt.title('Variation of Age for Female Athletes over time')

WomenOverTime.loc[WomenOverTime['Year'] == 1904]

# Weight per year.
plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Weight', data=MenOverTime)
plt.title('Variation of Weight for Male Athletes over time')

plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Weight', data=WomenOverTime)
plt.title('Variation of Weight for Female Athletes over time')

womenInOlympics.loc[womenInOlympics['Year'] < 1924].head(20)

# Height per year.
plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Height', data=MenOverTime, palette='Set2')
plt.title('Variation of Height for Male Athletes over time')

plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Height', data=WomenOverTime, palette='Set2')
plt.title('Variation of Height for Female Athletes over time')

WomenOverTime.loc[(WomenOverTime['Year'] > 1924)
                  & (WomenOverTime['Year'] < 1952)].head(10)

MenOverTime.head(5)

# Restrict the men's data to Italy.
itMenOverTime = MenOverTime.loc[MenOverTime['region'] == 'Italy']
itMenOverTime.head(5)
means = dataTarLoc.groupby(['cond_tarLocation'])['responseTime'].mean().values dataTarLocER.accuracy = (1-dataTarLocER.accuracy)*100 # make accuracy error rate #mobs = dataTarLoc['cond_tarLocation'].value_counts().values #pos = range(len(mobs)) # Plotting fig1 = plt.figure(figsize=(3.25, 6), dpi=100) ax1 = plt.subplot2grid((6, 1), (0, 0), rowspan=5, colspan=1) sns.violinplot(x='cond_tarLocation', y='responseTime', data=dataTarLoc, cut=vioCut, saturation=vioSat, linewidth=vioLw, palette=pTarLoc) sns.swarmplot(x="cond_tarLocation", y="responseTime", data=dataTarLoc, color=swaCol, alpha=swaAlp, linewidth=swaLwE, edgecolor=swaColE) ax1.plot(range(len(means)), [means[0], means[1], means[2]], color=lpColor, marker=lpMarker, markersize=lpMarkerS, markeredgecolor=lpMarkerEC, markeredgewidth=lpMarkerEW, lw=lpLw, ls=lpLs, zorder=3)#, dashes=(0.75, 0.75)) ax1.set_xlabel('') ax1.set_ylabel("Response Time [in ms]") ax2 = plt.subplot2grid((6, 1), (5, 0), rowspan=1, colspan=1, sharex=ax1) sns.pointplot(x='cond_tarLocation', y='accuracy', data=dataTarLocER, color=lpColor, markers=lpMarker, ci=ci) ax2.axes.get_xaxis().set_visible(False) ax2.axes.get_xaxis().set_ticks([]) ax2.yaxis.set_ticks(np.arange(3, 11, 2)) plt.ylabel('Error Rate\n [in %]') sns.despine(offset=10, trim=True) plt.show() ### Figure 2 ### # Data selection: RT dataTarLocGrad = pd.pivot_table(data[(data.cond_disPresent == 'absent') & (data.RTquicker200 == 0)], values='responseTime', index='subject_nr', columns='TarDistanceFromColor') dataTarLocGrad = pd.melt( dataTarLocGrad.reset_index(), id_vars='subject_nr',
def model_(x_train, y_train, x_test, y_test, boost_type='lgb'):
    """Train a gradient-boosting classifier and tune its decision threshold.

    The model is fitted on the training data, then every threshold in
    0.00, 0.01, ..., 0.99 is applied to the predicted positive-class
    probabilities. F1 and accuracy are recorded for both sets and plotted,
    and the threshold with the best F1 on the test/validation set is
    selected. Confusion matrices and the top-10 feature importances are
    displayed for that threshold.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : held-out features and labels used to pick the threshold.
    boost_type : {'lgb', 'xgb'}
        Which classifier to build.

    Returns
    -------
    (model, best_prob) : the fitted classifier and the selected threshold.

    Raises
    ------
    ValueError
        If `boost_type` is neither 'lgb' nor 'xgb' (previously this surfaced
        later as an unbound-variable error at `model.fit`).
    """
    tStart = time.time()
    if boost_type == 'lgb':
        model = lgb.LGBMClassifier(
            boosting_type='gbdt',
            objective='binary',
            learning_rate=0.01,
            n_estimators=9000,
            max_depth=8,
            min_child_weight=5,
            scale_pos_weight=9,  # refer: 70
            subsample=0.7,
            colsample_bytree=0.7,
            subsample_freq=1,
            n_jobs=-1)
    elif boost_type == 'xgb':
        model = XGBClassifier(learning_rate=0.025,
                              tree_method='gpu_hist',
                              n_estimators=6000,
                              max_depth=9,
                              min_child_weight=1,
                              gamma=0,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              objective='binary:logistic',
                              nthread=-1,
                              scale_pos_weight=11,
                              seed=27)
    else:
        raise ValueError(
            f"boost_type must be 'lgb' or 'xgb', got {boost_type!r}")

    print('--' * 25)
    print('Start training ...')
    model.fit(x_train, y_train)
    # Probability of the positive class for each sample.
    yp_train = model.predict_proba(x_train)[:, 1]
    yp_valid = model.predict_proba(x_test)[:, 1]
    print(
        f'Use time: { np.int_((time.time()-tStart)/60) } mins\nCaluate prob ...'
    )

    ## probability tune
    # Rows of `mat`: 0 threshold, 1 train F1, 2 valid F1,
    # 3 train accuracy, 4 valid accuracy.
    mat = np.zeros([5, 100])
    for threshold in range(100):
        y_pred_train = np.int_(yp_train > threshold * 0.01)
        y_pred_valid = np.int_(yp_valid > threshold * 0.01)
        mat[0, threshold] = round(threshold * 0.01, 2)
        mat[1, threshold] = f1_score(y_train, y_pred_train)
        mat[2, threshold] = f1_score(y_test, y_pred_valid)
        mat[3, threshold] = (y_train == y_pred_train).mean()
        mat[4, threshold] = (y_test == y_pred_valid).mean()

    # Fig1 for F1 (red: train, blue: valid)
    sns.pointplot(x=mat[0, :], y=mat[1, :], color='r')
    sns.pointplot(x=mat[0, :], y=mat[2, :], color='b')
    plt.title(f'{boost_type} F1 performance', color='r')
    plt.show()

    # Fig2 for acc (thresholds below 0.10 are left out of this figure)
    sns.pointplot(x=mat[0, 10:], y=mat[3, 10:], color='r')
    sns.pointplot(x=mat[0, 10:], y=mat[4, 10:], color='b')
    plt.title(f'{boost_type} Acc performance', color='r')
    plt.show()
    print('--' * 20)

    # result for best probability: the threshold maximising validation F1
    best_idx = np.argmax(mat[2, :])
    best_prob = round(best_idx * 0.01, 2)
    print('Valid Result:\nprob: {}, F1 : {}, acc : {}'.\
          format(best_prob, mat[2, best_idx].round(3), mat[4, best_idx].round(3)))
    print('--' * 20)

    # confusion matrix at the selected threshold
    y_pred_train = np.int_(yp_train > best_prob)
    y_pred_valid = np.int_(yp_valid > best_prob)
    print('Train confusion matrix')
    display(
        pd.crosstab(y_train, y_pred_train, margins=True, margins_name="Total"))
    print('--' * 20)
    print('Valid confusion matrix')
    display(
        pd.crosstab(y_test, y_pred_valid, margins=True, margins_name="Total"))
    print('--' * 20)
    print('Feature Importance (Top 10)')
    # NOTE(review): `feature` is not a parameter; it is read from the
    # enclosing module and presumably lists the training column names in
    # model order - confirm.
    display(pd.DataFrame({'feature': feature, 'gain': model.feature_importances_}).\
            sort_values(by='gain', ascending=False).iloc[0:10, :])
    print('--' * 25)
    return model, best_prob
pd.cut(data_train['Age'], bins=5)
#[(0.34, 16.336] < (16.336, 32.252] < (32.252, 48.168] < (48.168, 64.084] < (64.084, 80.0]]

pd.cut(data_test['Age'], bins=5)
#[(0.0942, 15.336] < (15.336, 30.502] < (30.502, 45.668] < (45.668, 60.834] < (60.834, 76.0]]

# Now we will create age bins for the full dataset
# Two sentinel ages (0 and 80) are appended so that the equal-width bins
# computed by pd.cut cover at least the range 0-80; the sentinels are
# dropped again with [:-2] before the result is stored.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
ages = data_full['Age']
ages = pd.concat([ages, pd.Series([0, 80])])
bins = pd.cut(ages, bins=8, labels=[1, 2, 3, 4, 5, 6, 7, 8])
data_full['AgeBin'] = bins[:-2].astype(float)

fig = plt.figure(figsize=(15, 8))
sns.pointplot(x='AgeBin',
              y='Survived',
              ci=95.0,
              hue='Sex',
              data=data_full,
              dodge=True)

# From all this it looks like it would be good to create a dummy for kids - since agebin does not really provide a whole lot of differentiation except in the agebin=1 category

# In[ ]:

# Flag passengers in the first age bin as kids.
data_full['IsKid'] = 0
data_full.loc[data_full['AgeBin'] == 1, 'IsKid'] = 1

# ** Dealing with missing values in the age feature **
#
# Age is the feature that has the most missing values after the cabin feature. In this case I want to fill the agebin with the mode of the agebin based on the title of the person. This especially seems to make sense for Master/Mr and Mrs/Miss where the title allows us to extract information about age based instead of just picking at random.

# In[ ]:
t_p1 = (start_hrf + t_p) * x_bins / max_val_x r_t1 = (start_hrf + r_t) * x_bins / max_val_x # d_p2 = d_p1 + sec_hdrf * x_bins / max_val_x t_p2 = t_p1 + sec_hdrf * x_bins / max_val_x r_t2 = r_t1 + sec_hdrf * x_bins / max_val_x y_vl_min = df_all_by_subj.Decoding_error.min() y_vl_max = df_all_by_subj.Decoding_error.max() range_hrf = [float(5) / x_bins, float(6) / x_bins] # paper_rc = {'lines.linewidth': 2, 'lines.markersize': 2} sns.set_context("paper", rc=paper_rc) sns.pointplot(x='timepoint', y='Decoding_error', hue='ROI', data=df_all_by_subj, size=5, aspect=1.5) ##all subj visual paper_rc = {'lines.linewidth': 0.25, 'lines.markersize': 0.5} sns.set_context("paper", rc=paper_rc) for a in ['visual', 'ips']: if a == 'visual': c = 'b' elif a == 'ips': c = 'darkorange' for s in df_all_by_subj.subj.unique(): sns.pointplot( x='timepoint', y='Decoding_error', data=df_all_by_subj.loc[(df_all_by_subj['ROI'] == a)
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

if __name__ == '__main__':
    exp_name = 'output/exp_p3_w4_s4_deadline'

    # Experiment-wide per-epoch summary.
    general_file = f'{exp_name}-general_data.csv'
    print(f'Loading data file: {general_file}')
    df = pd.read_csv(general_file)
    print(df)

    # One figure per metric, each plotted against the epoch number.
    # (To skip the first epoch for duration, plot df[df['epoch'] > 1].)
    for column, heading in (('accuracy', 'Accuracy per epoch'),
                            ('duration', 'Train time per epoch')):
        plt.figure()
        sns.pointplot(data=df, x='epoch', y=column)
        plt.title(heading)
        plt.show()

    # Per-client epoch logs for clients 1-4, stacked into a single frame.
    dfs = [
        pd.read_csv(f'{exp_name}_client{client_id}_epochs.csv')
        for client_id in range(1, 5)
    ]
    client_df = pd.concat(dfs, ignore_index=True)
# Number of TADs per gamma value, split by segmentation method.
sns.countplot(x='Gamma', hue='Segmentation', data=s2_tads_all)
plt.xticks(rotation=90)
plt.show()

# Let's compare the lengths of TADs depending on gamma with these two segmentation methods.

# In[16]:

# TAD length is End minus Start, stored as an integer.
s2_tads_all['Length'] = s2_tads_all['End']-s2_tads_all['Start']
s2_tads_all['Length'] = s2_tads_all['Length'].astype(int)

# In[17]:

# Point estimates (zorder=15, drawn on top) over the individual values
# (zorder=1, semi-transparent).
sns.pointplot(x='Gamma', y='Length', hue='Segmentation', data=s2_tads_all, zorder=15)
sns.stripplot(x='Gamma', y='Length', hue='Segmentation', data=s2_tads_all, jitter=True, zorder=1, alpha=0.5)
handles, labels = plt.gca().get_legend_handles_labels()
#Have to do this because of a weird legend behavior otherwise...
# Both plots add legend entries for the same hue; keep only the first two.
plt.legend(handles[:2], labels[:2], loc='upper right')
plt.xticks(rotation=90)
plt.yscale('log')
plt.show()

# And again, let's look at the Score in the same way.

# In[18]:

sns.pointplot(x='Gamma', y='Score', hue='Segmentation', data=s2_tads_all, zorder=15)
sns.stripplot(x='Gamma', y='Score', hue='Segmentation', data=s2_tads_all, jitter=True, zorder=1, alpha=0.5)
handles, labels = plt.gca().get_legend_handles_labels()
'world_rank', 'year' ]] times_plot_data['source'] = 'Times' shanghai_plot_data['source'] = 'Shanghai' cwur_plot_data['source'] = 'CWUR' # parse the first number in rank for data ranges times_plot_data['world_rank'] = times_plot_data['world_rank'].str.split( '-').str[0] shanghai_plot_data['world_rank'] = shanghai_plot_data['world_rank'].str.split( '-').str[0] plot_data = times_plot_data.append(shanghai_plot_data).append(cwur_plot_data) plot_data['world_rank'] = plot_data['world_rank'].astype(int) ax = sns.pointplot(x='year', y='world_rank', hue='source', data=plot_data) # Styling plt.title(my_university_name[0] + " Ranking", fontsize=26) plt.xticks(fontsize=20) plt.yticks(fontsize=20) plt.ylabel("World Rank", fontsize=26) plt.xlabel("Year", fontsize=26) plt.tight_layout() plt.legend(loc='upper left', fontsize=20) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left()
plt.figure(figsize=(12, 8)) sns.countplot(x="pickup_hour", data=train_df) plt.ylabel('Count', fontsize=12) plt.xlabel('pick up hour', fontsize=12) plt.xticks(rotation='vertical') plt.show() # The distribution shows the car demand with pick up hour time. After mid night less number's of trips are taken. Now let us see how the trip duration changes with respect to trip time. # In[ ]: grouped_df = train_df.groupby('pickup_hour')['trip_duration'].aggregate( np.median).reset_index() plt.figure(figsize=(12, 8)) sns.pointplot(grouped_df.pickup_hour.values, grouped_df.trip_duration.values, alpha=0.8, color=color[3]) plt.ylabel('median trip duration', fontsize=12) plt.xlabel('pick up hour', fontsize=12) plt.xticks(rotation='vertical') plt.show() # In[ ]: # Group by day grouped_df = train_df.groupby('day_week')['trip_duration'].aggregate( np.median).reset_index() plt.figure(figsize=(12, 8)) sns.pointplot(grouped_df.day_week.values, grouped_df.trip_duration.values, alpha=0.8,
temp_series = order_products_prior_df['department'].value_counts() labels = (np.array(temp_series.index)) sizes = (np.array((temp_series / temp_series.sum()) * 100)) plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=200) plt.title("Departments distribution", fontsize=15) plt.show() print 'press any key to continue' raw_input('') grouped_df = order_products_prior_df.groupby( ["department"])["reordered"].aggregate("mean").reset_index() plt.figure(figsize=(12, 8)) sns.pointplot(grouped_df['department'].values, grouped_df['reordered'].values, alpha=0.8, color=color[2]) plt.ylabel('Reorder ratio', fontsize=12) plt.xlabel('Department', fontsize=12) plt.title("Department wise reorder ratio", fontsize=15) plt.xticks(rotation='vertical') plt.show() print 'press any key to continue' raw_input('') grouped_df = order_products_prior_df.groupby( ["department_id", "aisle"])["reordered"].aggregate("mean").reset_index() fig, ax = plt.subplots(figsize=(12, 20)) ax.scatter(grouped_df.reordered.values, grouped_df.department_id.values) for i, txt in enumerate(grouped_df.aisle.values):
# sys.exit(0) # s = '1_fs__k' if '1_fs__k' in parameters else 'jmi__k_feat' # s = 'jmi__k_feat' s = 'clf__n_neighbors' # s = 'nb__alpha' # s = 'clf3__gr__group' # s = 'clf1__fs__k' # s = 'fs__k' # s = 'clf3__fs__k' # s = 'clf3__xgb__n_estimators' for p in parameters.keys(): plt.figure() # sns.pointplot(x=p, hue='max_depth', y='result', data=df_results[df_results['folds'] == 2], estimator=np.median) sns.pointplot(x=p, hue=s, y='result', data=df_results, estimator=np.median) plt.ylim(0.6, 0.8) if '1_clf__C' in parameters: plt.figure() plt.subplot(311) sns.pointplot(x='1_clf__C', y='result_1', data=df_results, estimator=np.median, color='r') plt.legend('patient 1') plt.grid() plt.subplot(312) sns.pointplot(x='2_clf__C', y='result_2',
# d = rs.normal(size=1000)
# f, axes = plt.subplots(2, 2, figsize=(7, 10), sharex=False)
# sns.distplot(d, kde=False, color="b", ax=axes[0, 0])
# sns.distplot(d, hist=False, rug=True, color="r", ax=axes[0, 1])
# sns.distplot(d, hist=False, color="g", kde_kws={"shade": True}, ax=axes[1, 0])
# sns.distplot(d, color="m", ax=axes[1, 1])
# plt.show()

#eg4 box plot
# iris = sns.load_dataset("data",data_home="/Users/fangchi/PycharmProjects/python_project/marchineLearning/ch3/3.4")
# sns.boxplot(x = iris['萼片_长度'],y = iris['品种'])
# plt.show()

#eg5
# iris = sns.load_dataset("data",data_home="/Users/fangchi/PycharmProjects/python_project/marchineLearning/ch3/3.4")
# sns.jointplot("萼片_长度", "花瓣_长度", iris)
# plt.show()

#eg6
iris = sns.load_dataset(
    "data",
    data_home=
    "/Users/fangchi/PycharmProjects/python_project/marchineLearning/ch3/3.4")
plt.figure(figsize=(12, 8))
# x/y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form fails there.
sns.pointplot(x=iris.萼片_长度.values,
              y=iris.品种.values,
              alpha=0.8,
              color='blue')
plt.ylabel('品种', fontsize=12)
plt.xlabel('萼片_长度', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
# Long-format labels: one 'fs' label per edge for each of the three methods,
# and the single reference label repeated for every row.
n_edges = fs_ref.shape[1]
fs = [
    method for method in ('fc', 'fs_5dis', 'fs_6dis')
    for _ in range(n_edges)
]
ref = ['Rep_50'] * (3 * n_edges)

data = pd.DataFrame({
    'icc': icc,
    'icc_msr': icc_msr,
    'icc_mse': icc_mse,
    'fs': fs,
    'ref': ref,
})

# Arguments shared by the layers of each figure.
grouping = dict(x="ref", data=data, hue='fs')
mean_style = dict(dodge=0.53, join=False, palette="dark", markers="d",
                  scale=.75, ci='sd', capsize=0.07)
cloud_style = dict(scale="area", inner=None, offset=0.03, saturation=0.5)

# Figure 1: edge-wise ICC MSr (means, raw points, half violins).
plt.figure(figsize=(20, 10))
sns.pointplot(y="icc_msr", **grouping, **mean_style)
sns.stripplot(y="icc_msr", size=3, dodge=0.45, alpha=0.05,
              **grouping).set_title('Edge-wise ICC MSr')
pt.half_violinplot(y="icc_msr", **grouping, **cloud_style)
plt.legend(ncol=2)
plt.savefig(plotd + 'icc_msr.png')
plt.close()

# Figure 2: edge-wise ICC MSe, same layers.
plt.figure(figsize=(20, 10))
sns.pointplot(y="icc_mse", **grouping, **mean_style)
sns.stripplot(y="icc_mse", size=3, dodge=0.45, alpha=0.05,
              **grouping).set_title('Edge-wise ICC MSe')
pt.half_violinplot(y="icc_mse", **grouping, **cloud_style)

# Welch t-tests of the first third of icc_mse against the second and the
# last third; NaNs are omitted.
first_cut = int(len(icc_mse) / 3)
second_cut = int(len(icc_mse) / 3 * 2)
t1, p1 = stats.ttest_ind(icc_mse[0:first_cut],
                         icc_mse[first_cut:second_cut],
                         nan_policy='omit',
                         equal_var=False)
t2, p2 = stats.ttest_ind(icc_mse[0:first_cut],
                         icc_mse[second_cut:],
                         nan_policy='omit',
                         equal_var=False)
plt.text(-0.15, 0,
         'T: ' + str(round(t1, 5)) + '\n' + 'P: ' + str(round(p1, 5)),
         fontsize=18)
def numerical_pca_egv(df, conf_dict, col1, col2, col3):
    """Run a PCA on the numerical columns and show three diagnostic plots.

    The columns listed in ``conf_dict['NumericalColumns']`` are standardised
    and decomposed with as many components as there are columns. Three
    figures are then shown: a scatter of two principal components coloured
    by `col1`, a variable factor map for the same two components, and a
    scree plot of the explained variance ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified (the original implementation reset
        its index in place when `col1` was not a numerical column).
    conf_dict : dict
        Must contain the key 'NumericalColumns' (list of column names).
    col1 : str
        Column used as hue in the scatter plot. If it is not one of the
        numerical columns it is copied over from `df` row by row.
    col2, col3 : int or str
        1-based numbers of the principal components for the x and y axes.

    Returns
    -------
    None
    """
    numerical_columns = conf_dict['NumericalColumns']

    # Do the PCA.
    n_components = len(numerical_columns)
    scaled = StandardScaler().fit_transform(df[numerical_columns])
    df2 = pd.DataFrame(scaled, columns=numerical_columns)
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(df2)

    # Append the principal components for each entry to the dataframe
    for i in range(n_components):
        df2['PC' + str(i + 1)] = reduced[:, i]

    if col1 not in numerical_columns:
        # df2 has a fresh 0..n-1 index, so copy the hue column positionally
        # instead of resetting the caller's index in place.
        df2[col1] = df[col1].values

    # Show the points in terms of the two selected PCs.
    # x/y are passed by keyword because recent seaborn releases accept only
    # `data` positionally.
    # NOTE(review): `size` was renamed `height` in newer seaborn releases;
    # kept unchanged to match the seaborn version this file targets.
    sns.lmplot(x='PC' + str(col2),
               y='PC' + str(col3),
               hue=col1,
               data=df2,
               fit_reg=False,
               scatter=True,
               size=7)
    plt.show()

    # Plot a variable factor map for the two selected dimensions.
    x_loadings = pca.components_[int(col2) - 1]
    y_loadings = pca.components_[int(col3) - 1]
    (fig, ax) = plt.subplots(figsize=(8, 8))
    for i in range(len(pca.components_)):
        ax.arrow(0,
                 0,  # Start the arrow at the origin
                 x_loadings[i],
                 y_loadings[i],
                 head_width=0.05,
                 head_length=0.08)
        # The first n_components columns of df2 are the numerical columns.
        plt.text(x_loadings[i] + 0.05,
                 y_loadings[i] + 0.05,
                 df2.columns.values[i])

    an = np.linspace(0, 2 * np.pi, 100)
    plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale
    plt.axis('equal')
    ax.set_title('Variable factor compass')
    plt.show()

    # Do a scree plot
    ind = np.arange(0, n_components)
    (fig, ax) = plt.subplots(figsize=(8, 6))
    sns.pointplot(x=ind, y=pca.explained_variance_ratio_)
    ax.set_title('Scree plot')
    ax.set_xticks(ind)
    ax.set_xticklabels(ind)
    ax.set_xlabel('Component Number')
    ax.set_ylabel('Explained Variance')
    plt.show()
import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns ### matplotlib inline data_train = pd.read_csv('../input/train.csv') data_test = pd.read_csv('../input/test.csv') data_train.sample(3) sns.barplot(x="Embarked", y="Survived", hue="Sex", data=data_train) sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=data_train, palette={ "male": "blue", "female": "pink" }, markers=["*", "o"], linestyles=["-", "--"]) def simplify_ages(df): df.Age = df.Age.fillna(-0.5) bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120) group_names = [ 'Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior' ] categories = pd.cut(df.Age, bins, labels=group_names) df.Age = categories