def visualize_importance(self):
    """Plot per-fold feature importances of ``self.models`` as a boxen plot.

    One importance vector is read from every model through
    ``get_feature_importance()``. The 50 features with the largest summed
    importance are drawn, most important first, on the upper of two stacked
    axes. The lower axes is created but left empty.

    Side effects:
        ``self.importance`` is set to the importance vector of the last
        model in ``self.models``.

    Returns:
        tuple: ``(fig, ax)`` where ``ax`` is the array holding both axes.

    Raises:
        ValueError: If ``self.models`` is empty.
    """
    columns = self.feature_cols.tolist()
    fold_frames = []
    for fold, model in enumerate(self.models, start=1):
        # Query the model once and reuse the vector for both the attribute
        # and the plotted frame (the original asked the model twice).
        self.importance = model.get_feature_importance()
        fold_frames.append(pd.DataFrame({
            'feature_importance': self.importance,
            'column': columns,
            'fold': fold,
        }))
    if not fold_frames:
        raise ValueError('self.models is empty; nothing to plot')

    # A single concat instead of growing the frame inside the loop.
    feature_importance_df = pd.concat(fold_frames, axis=0, ignore_index=True)

    order = (feature_importance_df.groupby('column')['feature_importance']
             .sum()
             .sort_values(ascending=False)
             .index[:50])

    fig, ax = plt.subplots(2, 1, figsize=(max(6, len(order) * .4), 14))
    sns.boxenplot(data=feature_importance_df,
                  x='column',
                  y='feature_importance',
                  order=order,
                  ax=ax[0],
                  palette='viridis')
    ax[0].tick_params(axis='x', rotation=90)
    ax[0].grid()
    fig.tight_layout()
    return fig, ax
def plot_entanglement_boxes(data, plot_vars, save=True):
    """Draw a boxen plot of 'transitions' for each 'entanglement' level.

    Does nothing when ``data`` is empty. The y-axis limits and tick spacing
    come from ``plot_vars.ylim`` and ``plot_vars.tick_size``; font size,
    figure size and the output location come from the module-level ``cfg``.
    When ``save`` is true the current figure is written below
    ``results/plots/``.
    """
    if data.empty:
        return

    plt.rcParams.update({'font.size': cfg.FONTSIZE})
    _fig, axis = plt.subplots(figsize=cfg.FIGSIZE)
    sns.boxenplot(data=data,
                  x='entanglement',
                  y='transitions',
                  color='C0',
                  showfliers=False,
                  ax=axis)

    # The first row's 'n_values' entry is used in the output file name.
    n_values = next(iter(data['n_values']))

    axis.set_ylim(plot_vars.ylim)
    plt.xlabel('Effect size')
    axis.set_ylabel('Generated states')
    axis.set_yscale('linear')

    y_axis = axis.yaxis
    y_axis.set_major_formatter(ticker.FormatStrFormatter('%2.0f'))
    y_axis.set_major_locator(ticker.MultipleLocator(plot_vars.tick_size))
    # autoscale_yticks returns a suffix that is appended to the axis label.
    axis.set_ylabel('Generated states' + autoscale_yticks(axis, dtype=int))

    plt.tight_layout()
    plt.subplots_adjust(top=.95, bottom=.2, right=.95, left=0.25,
                        hspace=0, wspace=0)
    plt.margins(0, 0)

    if not save:
        return
    target = 'results/plots/{}/{}_{}ary.png'.format(
        cfg.DIR, cfg.NAME, n_values)
    plt.savefig(target, dpi=100)
def visualize_importance(models, feat_train_df):
    """Plot LightGBM feature importances as a boxen plot.

    The spread of each feature's importance across the given models (for
    example one model per cross-validation fold) is shown as one boxen per
    feature, limited to the 50 features with the largest summed importance.

    Args:
        models: Either a single model exposing ``feature_importance()`` or
            an iterable of such models.
        feat_train_df: DataFrame used for training; its columns label the
            entries of each importance vector.

    Returns:
        tuple: ``(fig, ax)`` of the created figure and axes.

    Raises:
        ValueError: If ``models`` is an empty iterable.
    """
    # The original called ``models.feature_importance()`` directly although
    # a list was documented; accept both call styles.
    if hasattr(models, 'feature_importance'):
        fold_models = [models]
    else:
        fold_models = list(models)
    if not fold_models:
        raise ValueError('models is empty; nothing to plot')

    columns = list(feat_train_df.columns)
    feature_importance_df = pd.concat(
        [pd.DataFrame({'feature_importance': model.feature_importance(),
                       'column': columns})
         for model in fold_models],
        axis=0, ignore_index=True)

    order = (feature_importance_df.groupby('column')['feature_importance']
             .sum()
             .sort_values(ascending=False)
             .index[:50])

    # Keep a sane minimum width so a handful of features still renders.
    fig, ax = plt.subplots(figsize=(max(6, len(order) * .4), 7))
    sns.boxenplot(data=feature_importance_df,
                  x='column',
                  y='feature_importance',
                  order=order,
                  ax=ax,
                  palette='viridis')
    ax.tick_params(axis='x', rotation=90)
    fig.tight_layout()
    return fig, ax
def sns_boxenplot(self, dataframe, x_name=None, y_name=None, hue_str=None):
    """
    Draw a seaborn boxen plot (letter-value plot) with the 'deep' palette.

    A boxen plot resembles a box plot but draws more quantiles, which makes
    it a good fit for large datasets.

    Parameters
    ----------
    dataframe : dataframe
        Data container handed to seaborn as ``data``.
    x_name : str, optional
        Column name for the x-axis. The default is None.
    y_name : str, optional
        Column name for the y-axis. The default is None.
    hue_str : str, optional
        Name of the categorical column to colour by. The default is None.

    Returns
    -------
    None.

    Example
    -------
    diamonds = sns.load_dataset('diamonds').sort_values('color')
    myplot.sns_boxenplot(diamonds, 'color', 'price')
    """
    plot_args = dict(data=dataframe,
                     x=x_name,
                     y=y_name,
                     hue=hue_str,
                     palette='deep')
    sns.boxenplot(**plot_args)
def graph_univar_pred(df_data, list_var, col_predict, var_type):
    """Plot each variable in ``list_var`` against the prediction column.

    One subplot row is created per variable. Rows where either the variable
    or ``col_predict`` is missing are excluded from that variable's plot.

    Args:
        df_data: DataFrame holding the variables and ``col_predict``.
        list_var: Column names to plot, one subplot each.
        col_predict: Name of the prediction / label column.
        var_type: ``"num"`` draws a boxen plot of the variable per
            prediction value; ``"cat"`` draws a count plot split by
            prediction value and annotates every bar with its share of the
            plotted rows. Any other value leaves the subplots empty.
    """
    n_vars = len(list_var)
    # squeeze=False keeps a 2-D axes array even for a single variable, so
    # indexing works for every list length.
    fig, axs = plt.subplots(n_vars, 1, squeeze=False,
                            figsize=set_size(plt_wd, n_vars, 1))
    fig.subplots_adjust(hspace=0.3)  # Adjust space between rows

    for ax, item in zip(axs[:, 0], list_var):
        # ``x != np.nan`` is always True, so the original never dropped
        # missing targets; use notna() for both columns.
        present = df_data[item].notna() & df_data[col_predict].notna()
        subset = df_data.loc[present]

        if var_type == "num":
            sns.boxenplot(x=col_predict, y=item, data=subset, ax=ax)
        elif var_type == "cat":
            sns.countplot(x=item, hue=col_predict, data=subset, ax=ax)
            # Add the percentage next to each bar
            total = subset.shape[0]
            for p in ax.patches:
                percentage = '{:.0f}%'.format(100 * p.get_height() / total)
                x = p.get_x() + p.get_width() + 0.02
                y = p.get_y() + p.get_height() / 2
                ax.annotate(percentage, (x, y))
    return
def detect_outliers(df, target):
    """Open one diagnostic figure per column of ``df`` to eyeball outliers.

    Object-dtype columns get a box plot overlaid with a boxen plot of
    ``target`` per category. All other columns get three stacked panels: a
    density estimate with rug, a scatter against ``target``, and a boxen
    plot overlaid with a box plot of the column itself.

    Args:
        df: DataFrame to inspect; every column (including ``target``) gets
            its own figure.
        target: Name of the target column in ``df``.
    """
    for col in df:
        plt.figure(figsize=(14, 11))
        plt.title(col)
        plt.suptitle('Detect Outliers')
        if df[col].dtypes == object:
            # Vectors are passed by keyword: a positional vector is ``x`` in
            # older seaborn releases but ``data`` in newer ones.
            sb.boxplot(x=df[col], y=df[target], color='gray')
            sb.boxenplot(x=df[col], y=df[target])
        else:
            plt.subplot(311)
            try:
                sb.distplot(df[col], hist=False, rug=True)
            except Exception:
                # Best-effort fallback with a fixed bandwidth when the
                # default density estimate fails. ``Exception`` instead of a
                # bare ``except`` so Ctrl-C / SystemExit still propagate.
                sb.kdeplot(df[col], bw=0.3)
            plt.subplot(312)
            sb.scatterplot(x=df[col], y=df[target])
            plt.subplot(313)
            sb.boxenplot(x=df[col])
            sb.boxplot(x=df[col], color='gray')
def plot_grouped_boxplot(first_level_tests, attn_rois):
    """Boxen plot of phase-phase coupling per connection, split by session.

    Column names of ``first_level_tests`` are expected to look like
    ``"<session> <connection> ..."``: the first whitespace-separated token
    becomes the session and the second the connection. Connections reported
    as mirrors by ``mirror_strfind(attn_rois)`` are dropped before plotting.
    The long-format table is printed for inspection.

    Args:
        first_level_tests: Wide DataFrame, one column per session/connection.
        attn_rois: Passed unchanged to ``mirror_strfind``.
    """
    _connections, mirrors = mirror_strfind(attn_rois)

    melty = first_level_tests.melt(var_name='Old Columns',
                                   value_name='Phase-phase coupling')
    tokens = [old_col.split() for old_col in melty['Old Columns'].values]
    melty['Connection'] = [parts[1] for parts in tokens]
    melty['Session'] = [parts[0] for parts in tokens]

    # One boolean filter replaces the per-mirror drop(inplace=True) loop.
    melty = melty[~melty['Connection'].isin(list(mirrors))]
    print(melty)

    sns.set(style='darkgrid')
    _fig, ax = plt.subplots(figsize=(16, 9))
    sns.boxenplot(x='Connection', y='Phase-phase coupling', hue='Session',
                  data=melty, ax=ax)
def _summarize_stats_in_epochs(self, ax_auc: plt.Axes, ax_spikes: plt.Axes):
    """
    Add axes to the main plot showing the dF/F statistics in the
    different epochs.

    For every epoch in ``self.epochs_to_display`` the FOV data is filtered
    to that epoch, spikes are located, and two per-component statistics are
    collected: the total AUC around spikes and the mean spike number. Each
    statistic is drawn as one boxen per epoch. Epochs without data keep an
    all-NaN column.

    :param ax_auc: Axes receiving the AUC boxen plot.
    :param ax_spikes: Axes receiving the spike-rate boxen plot.
    """
    df_auc = pd.DataFrame(
        # 1000 is max number of components per FOV
        np.full((1000, len(self.epochs_to_display)), np.nan),
        columns=self.epochs_to_display,
    )
    df_spikes = df_auc.copy()

    for col_idx, epoch in enumerate(self.epochs_to_display):
        cur_data = filter_da(self.fov.fluo_analyzed, epoch=epoch)
        if cur_data.shape[0] == 0:
            continue
        spikes = dff_tools.locate_spikes_scipy(cur_data,
                                               self.fov.metadata.fps,
                                               thresh=0.7)
        auc = dff_tools.calc_total_auc_around_spikes(
            spikes, cur_data, self.fov.metadata.fps)
        # Write through a single .iloc call: the chained form
        # ``df[epoch][:n] = ...`` assigns into a temporary and is silently
        # lost when pandas Copy-on-Write is active.
        df_auc.iloc[:len(auc), col_idx] = np.asarray(auc)

        spike_rate = dff_tools.calc_mean_spike_num(
            spikes, cur_data, fps=self.fov.metadata.fps)
        df_spikes.iloc[:len(spike_rate), col_idx] = np.asarray(spike_rate)

    sns.boxenplot(data=df_auc, ax=ax_auc)
    sns.boxenplot(data=df_spikes, ax=ax_spikes)
    for ax in [ax_auc, ax_spikes]:
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_xlabel("Epoch")
    ax_auc.set_ylabel("AUC")
    ax_spikes.set_ylabel("Spikes per second")
def get_feature_importance(self, train_feat_df: pd.DataFrame,
                           is_save=False, filepath=None):
    """Plot per-fold feature importances and optionally save them.

    ``feature_importances_`` is read from every model in ``self.models``
    (a mapping; its values are used in iteration order as folds 1..n). The
    50 features with the largest summed importance are drawn as a
    horizontal boxen plot.

    Args:
        train_feat_df: Training features; its columns label each
            importance vector.
        is_save: When true, write ``lgbm_feature_importance.png`` and
            ``lgbm_feature_importance.csv``.
        filepath: String prefix prepended to both file names; required
            when ``is_save`` is true.

    Raises:
        ValueError: If ``self.models`` is empty, or if ``is_save`` is true
            and ``filepath`` is None.
    """
    if is_save and filepath is None:
        raise ValueError('filepath is required when is_save=True')

    columns = list(train_feat_df.columns)
    fold_frames = [
        pd.DataFrame({'feature_importance': model.feature_importances_,
                      'column': columns,
                      'fold': fold})
        for fold, model in enumerate(self.models.values(), start=1)
    ]
    if not fold_frames:
        raise ValueError('self.models is empty; nothing to plot')
    feature_importance_df = pd.concat(fold_frames, axis=0,
                                      ignore_index=True)

    order = (feature_importance_df.groupby('column')['feature_importance']
             .sum()
             .sort_values(ascending=False)
             .index[:50])

    fig, ax = plt.subplots(figsize=(8, max(6, len(order) * .25)))
    sns.boxenplot(data=feature_importance_df,
                  x='feature_importance',
                  y='column',
                  order=order,
                  ax=ax,
                  palette='viridis',
                  orient='h')
    ax.tick_params(axis='x', rotation=90)
    ax.set_title('Lightgbm Feature Importance')
    ax.grid()

    if is_save:
        # Save only after drawing (the original wrote an empty figure) and
        # export every fold, not just the last fold's frame.
        fig.savefig(filepath + "lgbm_feature_importance.png")
        feature_importance_df.to_csv(
            filepath + "lgbm_feature_importance.csv", index=False)
def plot_distribution(process_df, fig=None):
    """Plot vesicle release offsets and counts per release duration.

    The upper panel overlays a swarm plot and a boxen plot of ``offset`` per
    ``release_duration``; its top tick labels show how many result rows
    exist for each duration. The lower panel shows ``num_released`` per
    ``release_duration`` as a box plot with a point plot on top.

    Args:
        process_df: Input table with a ``release_duration`` column; passed
            to ``vesicle_release_distribution`` to obtain the plotted rows.
        fig: Figure to draw into. A new one is created when None.
    """
    results_df = vesicle_release_distribution(process_df)
    release_durations = np.unique(process_df.release_duration)

    if fig is None:
        fig = plt.figure()

    grid_shape = (8, 1)
    # Number of result rows per release duration, in sorted duration order.
    totals = [
        int((results_df.release_duration == duration).sum())
        for duration in release_durations
    ]

    ax_1 = plt.subplot2grid(shape=grid_shape, loc=(0, 0), rowspan=4, fig=fig)
    sns.swarmplot(x='release_duration', y='offset', data=results_df,
                  ax=ax_1)
    sns.boxenplot(x='release_duration', y='offset', data=results_df,
                  ax=ax_1)
    # Re-purpose the x-axis, moved to the top, to display the totals.
    ax_1.xaxis.tick_top()
    ax_1.xaxis.set_label_position('top')
    ax_1.xaxis.set_ticklabels(totals)
    ax_1.xaxis.set_tick_params(length=0)
    ax_1.set_xlabel('Total released')
    ax_1.set_title('Release offset distribution')

    ax_2 = plt.subplot2grid(shape=grid_shape, loc=(4, 0), rowspan=4, fig=fig)
    sns.boxplot(x='release_duration', y='num_released', data=results_df,
                ax=ax_2)
    sns.pointplot(x='release_duration', y='num_released', data=results_df,
                  ax=ax_2, color='black')
    ax_2.set_xlabel('Release Duration')
    ax_2.set_ylabel('# of vesicles released')
    ax_2.set_title('# Vesicles per spike distribution')
def information_loss_plot(df, params):
    """
    Plot information loss as boxen plot

    :param df: Dataframe
    :param params: Parameters; reads ``params["x"]`` (hue column),
        ``params["name"]`` (title) and ``params["save_plot"]``
    :return: None
    """
    plt.rcParams.update({"font.size": 20})
    new_df, save_string = filter_dataframe(df, params, ignore_sampling=True)
    df_li = create_li_df(new_df).sort_values("li_type")
    hue_levels = sorted(new_df[params["x"]].drop_duplicates().tolist())

    plt.figure(figsize=(15, 8))
    sb.boxenplot(
        x="li_type",
        y="li",
        hue=params["x"],
        hue_order=hue_levels,
        data=df_li,
        dodge=True,
    )
    plt.title(params["name"], fontsize=16)
    plt.xlabel("Type of Lost Information")
    plt.ylabel("Lost Information")
    plt.ylim(-0.5, 1.)

    if params["save_plot"]:
        out_dir = Path(os.getcwd()) / "figures" / "data_analysis"
        out_dir.mkdir(parents=True, exist_ok=True)
        plt.savefig(out_dir / ("%s_lost_information.png" % save_string))
        plt.close()
    else:
        # NOTE(review): the source had an empty ``else:`` branch here (a
        # syntax error); displaying the figure is the presumed intent.
        plt.show()
def draw_box_plots(self, data, xaxis, plot_name, xlabel, ylabel,
                   plot_saved_path, plot_format):
    """
    Draw a boxen plot of ``data`` and either show it or save it.

    :param data: pandas.Dataframe
        The data used to draw the graph.
    :param xaxis: list
        The values of X-axis (used as the plotting order).
    :param plot_name: string
        The name of this graph.
    :param xlabel: string
        The label of X-axis.
    :param ylabel: string
        The label of Y-axis.
    :param plot_saved_path: string
        The file path to save the graph, appended to
        ``config.path_for_thesis``.
    :param plot_format: string
        png or other formats.
    :return: None
    """
    sns.boxenplot(data=data, order=xaxis)  # Draw the box plot.
    plt.title(plot_name, config.new_ft)
    plt.xlabel(xlabel, config.new_ft)
    plt.ylabel(ylabel, config.new_ft)

    if self.is_show:
        # NOTE(review): the source had an empty ``if`` body here (a syntax
        # error); displaying the figure is the presumed intent.
        plt.show()
    else:
        plt.savefig(config.path_for_thesis + plot_saved_path,
                    format=plot_format,
                    dpi=self.dpi)
        plt.close()
def target_distribution_over_binary_groups(df, binary_cols, target_col,
                                           plot_type='boxenplot',
                                           **plot_kwargs):
    """Plot the target distribution for both groups of each binary column.

    For use during feature engineering. For every column in ``binary_cols``
    the distribution of ``target_col`` is drawn per group on the current
    axes, with one horizontal line per group mean.

    Requires seaborn >= 0.9.0.

    Args:
        df: DataFrame holding ``binary_cols`` and ``target_col``.
        binary_cols: Names of columns that are binary categories.
        target_col: Name of the variable being modelled.
        plot_type: ``'boxenplot'`` or ``'violinplot'``; any other value
            falls back to a box plot.
        **plot_kwargs: Forwarded to the seaborn plotting function.

    Raises:
        ValueError: If a column does not split the data into exactly two
            groups.
    """
    if plot_type in ('boxenplot', 'violinplot'):
        plot_name = plot_type
    else:
        plot_name = 'boxplot'

    for col in binary_cols:
        getattr(sns, plot_name)(y=df[target_col], x=df[col], **plot_kwargs)
        ax = plt.gca()

        group_means = df[target_col].groupby(df[col]).mean()
        if len(group_means) != 2:
            raise ValueError(
                f'column {col!r} must define exactly two groups, '
                f'found {len(group_means)}')
        # Groups come back sorted by label, so mu0 is the lower label.
        mu0, mu1 = group_means
        ncol = int((df[col] == 1).sum())

        ax.axhline(mu0, label=f'mean = {round(mu0, 2)}|{col} = 0',
                   color='blue', linestyle=':')
        ax.axhline(mu1,
                   label=f'mean = {round(mu1, 2)}|{col} = 1 with {ncol} observations',
                   color='orange', linestyle='-.')
        ax.grid(alpha=.4)
        ax.set_title(col)
        sns.despine()
        ax.legend(loc='best')
def make_boxenplot_chem(low_col, high_col, xlabel_low, xlabel_high, ylabel,
                        low_color, high_color, out_name):
    """Save two side-by-side vertical boxen plots sharing both axes.

    Args:
        low_col: Values for the left panel.
        high_col: Values for the right panel.
        xlabel_low: x-axis label of the left panel.
        xlabel_high: x-axis label of the right panel.
        ylabel: y-axis label, shown on the left panel only.
        low_color: Colour of the left boxen plot.
        high_color: Colour of the right boxen plot.
        out_name: Output path without extension; ``.png`` is appended.
    """
    # Only the figure that is actually drawn on is created; the original
    # also opened an extra empty figure that was never used or closed.
    fig, (ax_low, ax_high) = plt.subplots(1, 2, sharey=True, sharex=True)
    sns.boxenplot(y=low_col, orient='vertical', ax=ax_low,
                  color=low_color).set(xlabel=xlabel_low, ylabel=ylabel)
    sns.boxenplot(y=high_col, orient='vertical', ax=ax_high,
                  color=high_color).set(xlabel=xlabel_high, ylabel='')
    fig.savefig(out_name + '.png', bbox_inches='tight')
def visualize_importance(models, train_feat_df, importance_type="gain"):
    """Draw a horizontal boxen plot of per-fold feature importances.

    ``feature_importances_`` is read from each model (fold numbers start at
    1) and the 50 features with the largest summed importance are plotted,
    most important first. ``importance_type`` is accepted but not used by
    this function.

    Returns:
        tuple: ``(fig, ax)`` of the created figure and axes.
    """

    def _fold_frame(fold_no, fitted):
        # One long-format frame per fold: importance, feature name, fold.
        frame = pd.DataFrame()
        frame["feature_importance"] = fitted.feature_importances_
        frame["column"] = train_feat_df.columns
        frame["fold"] = fold_no
        return frame

    per_fold = [_fold_frame(number, fitted)
                for number, fitted in enumerate(models, start=1)]
    feature_importance_df = pd.concat([pd.DataFrame(), *per_fold],
                                      axis=0, ignore_index=True)

    totals = feature_importance_df.groupby("column").sum()[
        ["feature_importance"]]
    ranked = totals.sort_values("feature_importance", ascending=False)
    order = ranked.index[:50]

    height = max(6, len(order) * 0.25)
    fig, ax = plt.subplots(figsize=(8, height))
    sns.boxenplot(data=feature_importance_df,
                  x="feature_importance",
                  y="column",
                  order=order,
                  palette="viridis",
                  ax=ax)
    ax.tick_params(axis="x", rotation=90)
    ax.grid(True)
    fig.tight_layout()
    return fig, ax
def explore_data_catbin(to_explore, df, target, pred_type='cat'):
    """
    Generates visualizations to explore the relationship between
    predictors and a binary categorical target.

    For every predictor a two-panel figure is drawn: its distribution on
    the left and, on the right, either the target mean per category
    (``pred_type='cat'``) or a boxen plot of the predictor against the
    target (``pred_type='cont'``).

    This function assumes a binary target.

    Args:
        to_explore: Names of the predictor columns to plot.
        df: DataFrame holding the predictors and the target.
        target: Name of the binary target column.
        pred_type: ``'cat'`` for categorical predictors, ``'cont'`` for
            continuous ones. Any other value prints an error and returns
            without plotting.

    Returns:
        None
    """
    if pred_type not in ('cat', 'cont'):
        # Adjacent literals instead of backslash continuations inside the
        # string, which leaked stray characters into the printed message.
        print("Error: `pred_type` should be 'cat' for categorical "
              "predictors and 'cont' for continuous predictors. "
              "No other values accepted.")
        return None

    disc = pred_type == 'cat'

    # get mean of target. Since target is binary, mean is representative
    # of the proportion of 1 labels to 0 labels
    pop_mean = np.round(df[target].mean(), 4)

    # draw plots
    for col in to_explore:
        fig, (ax1, ax2) = plt.subplots(figsize=(10, 5), nrows=1, ncols=2)
        plt.tight_layout(pad=3)
        sns.histplot(data=df, x=col, ax=ax1, discrete=disc)
        ax1.set_title(f"Distribution of {col}")

        if pred_type == 'cat':
            sns.pointplot(data=df, x=col, y=target, ci=68, ax=ax2,
                          join=False, scale=1.5, capsize=0.05)
            ax2.set_title("Target Mean per Category")
            ax2.axhline(pop_mean, color='red', ls='dashed',
                        label='population mean')
            ax2.legend()
        else:
            sns.boxenplot(data=df, x=col, y=target, ax=ax2, orient='h',
                          width=1)
            ax2.set_title("Feature Distribution Per Target Class")

    return None
def draw_boxplots(data_frame_scaled):
    """Draw horizontal boxen plots of every column in the first 3x3 slot.

    Sets the global default figure size to 40x35 and switches seaborn to
    the "whitegrid" theme as side effects.
    """
    plt.rcParams.update({'figure.figsize': (40, 35)})
    panel = plt.subplot(3, 3, 1)
    # The theme is applied after the axes exists, as in the original order.
    sns.set_theme(style="whitegrid")
    sns.boxenplot(data=data_frame_scaled,
                  orient="h",
                  palette="Set3",
                  ax=panel)
    panel.set_title('box plots types', fontsize=10)
def plot_class_proba(model, x, y, show_graph=True, label_encoder=None):
    """Boxen plot of the predicted probability (column 1) per true class.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        x: Feature matrix passed to ``predict_proba``.
        y: Class labels aligned with ``x``.
        show_graph: When true, display the figure.
        label_encoder: Optional encoder; when given, ``y`` is mapped back
            through ``inverse_transform`` before plotting.
    """
    if label_encoder is not None:
        y = label_encoder.inverse_transform(y)
    df = pd.DataFrame({'Class': y,
                       'Probability': model.predict_proba(x)[:, 1]})
    sns.boxenplot(x='Class', y='Probability', data=df)
    if show_graph:
        # NOTE(review): the source ended on a bare ``if show_graph:`` (a
        # syntax error); displaying the figure is the presumed intent.
        plt.show()
def make_boxenplot_AH(holo_col, apo_col, xlabel, ylabel, title, out_name):
    """Save side-by-side vertical boxen plots labelled 'Holo' and 'Apo'.

    Args:
        holo_col: Values for the left ('Holo') panel.
        apo_col: Values for the right ('Apo') panel.
        xlabel: Currently unused; the panels are labelled 'Holo' / 'Apo'.
        ylabel: y-axis label, shown on the left panel only.
        title: Currently unused.
        out_name: Output path without extension; ``.png`` is appended.
    """
    # Only the figure that is drawn on is created; the original also opened
    # an extra empty figure that was never used or closed.
    fig, (ax_holo, ax_apo) = plt.subplots(1, 2, sharey=True, sharex=True)
    # ``y=`` is explicit: a positional vector is ``x`` in older seaborn
    # releases but ``data`` in newer ones.
    sns.boxenplot(y=holo_col, orient='v', ax=ax_holo).set(xlabel='Holo',
                                                          ylabel=ylabel)
    sns.boxenplot(y=apo_col, orient='v', ax=ax_apo).set(xlabel='Apo',
                                                        ylabel='')
    fig.savefig(out_name + '.png')
def subplot_draw1():
    """Draw vertical boxen plots of the first ten ``colnm`` columns of ``df``.

    Uses the module-level ``df``, ``colnm`` and ``color``. Panels are laid
    out on a 2x6 grid, so at most ten of the twelve slots are used.
    """
    plt.figure(figsize=(10, 6))
    # Slicing instead of range(10) avoids an IndexError when fewer than ten
    # column names are available.
    for slot, name in enumerate(colnm[:10], start=1):
        plt.subplot(2, 6, slot)  # one panel per column
        # ``y=`` is explicit: a positional vector is ``x`` in older seaborn
        # releases but ``data`` in newer ones.
        sns.boxenplot(y=df[name], orient="v", width=0.5, color=color[0])
        plt.ylabel(name, fontsize=12)
    # Automatically adjust subplot parameters so the panels fill the figure
    # area without overlapping.
    plt.tight_layout()
def make_boxenplot_chem(low_col, high_col, xlabel_low, xlabel_high, ylabel,
                        out_name):
    """Save two side-by-side vertical boxen plots sharing both axes.

    Args:
        low_col: Values for the left panel.
        high_col: Values for the right panel.
        xlabel_low: x-axis label of the left panel.
        xlabel_high: x-axis label of the right panel.
        ylabel: y-axis label, shown on the left panel only.
        out_name: Output path without extension; ``.png`` is appended.
    """
    # Only the figure that is drawn on is created; the original also opened
    # an extra empty figure that was never used or closed.
    fig, (ax_low, ax_high) = plt.subplots(1, 2, sharey=True, sharex=True)
    # ``y=`` is explicit: a positional vector is ``x`` in older seaborn
    # releases but ``data`` in newer ones.
    sns.boxenplot(y=low_col, orient='v', ax=ax_low).set(xlabel=xlabel_low,
                                                        ylabel=ylabel)
    sns.boxenplot(y=high_col, orient='v', ax=ax_high).set(xlabel=xlabel_high,
                                                          ylabel='')
    fig.savefig(out_name + '.png')
def four_rate_plot(property):
    """Plot ``property`` against each of the four rates on a 2x2 grid.

    Each panel overlays a grey boxen plot with a black strip plot of the
    module-level ``all_tests`` table. Panels are filled row by row in the
    order delta, gamma, beta, alpha and share their y-axis.
    """
    rate_names = ['delta', 'gamma', 'beta', 'alpha']
    fig, axs = plt.subplots(ncols=2, nrows=2, figsize=[20, 20], sharey=True)

    for rate, panel in zip(rate_names, axs.ravel()):
        shared = dict(x=rate, y=property, data=all_tests, ax=panel)
        sns.boxenplot(color='grey', **shared)
        sns.stripplot(marker='.', size=3, jitter=True, color='black',
                      **shared)

    sns.despine()
def wykres_9(x, y, nazwa_wykres, nazwa_x, nazwa_y):
    """Draw a boxen plot of two columns of the module-level ``selected_data``.

    Args:
        x: Column of ``selected_data`` plotted on the x-axis.
        y: Column of ``selected_data`` plotted on the y-axis.
        nazwa_wykres: Plot title.
        nazwa_x: x-axis label.
        nazwa_y: y-axis label.

    Returns:
        The created matplotlib figure.
    """
    f, ax = plt.subplots(figsize=(12, 8))
    sns.boxenplot(data=selected_data,
                  x=selected_data[x],
                  y=selected_data[y],
                  scale="linear",
                  ax=ax)
    # Labels are applied after plotting: seaborn releases that always write
    # their own axis labels would replace labels set beforehand.
    ax.set_title(nazwa_wykres, fontsize=16)
    ax.set_ylabel(nazwa_y, fontsize=14)
    ax.set_xlabel(nazwa_x, fontsize=14)
    return f
def plot_annotation_entropy(adata_map, annotation='cell_type'):
    """Boxen plot of per-observation entropy grouped by an annotation.

    The entropy of every row of ``adata_map.X`` is computed with the number
    of columns as logarithm base and stored in ``adata_map.obs['entropy']``
    (the object is modified in place). The y-axis is fixed to [0, 1].

    Args:
        adata_map: Object with a 2-D ``X`` matrix and an ``obs`` table.
            NOTE(review): presumably an AnnData mapping result; confirm.
        annotation: Column of ``adata_map.obs`` used for grouping.
    """
    # The original also allocated an unused dense array of ones with shape
    # (n_obs, n_vars); it is dropped to avoid the pointless memory cost.
    adata_map.obs['entropy'] = entropy(adata_map.X,
                                       base=adata_map.X.shape[1],
                                       axis=1)
    fig, ax = plt.subplots(1, 1, figsize=(10, 3))
    ax.set_ylim(0, 1)
    sns.boxenplot(x=annotation, y="entropy", data=adata_map.obs, ax=ax)
    plt.xticks(rotation=30)
def check_outlier(df):
    """Save one horizontal boxen plot per column of ``df``.

    The 'Year' and 'ShortName' columns are skipped. Every plot is written
    to ``<filepath>inspection/boxplot/boxplot <column>.png`` (``filepath``
    is a module-level prefix) and the shared figure is cleared afterwards.
    """
    skipped = ('Year', 'ShortName')
    sns.set_style("whitegrid")
    plt.figure(figsize=(24, 8))

    for feature in df.columns:
        if feature in skipped:
            continue
        sns.boxenplot(data=df, x=feature, orient='h')
        title = 'boxplot ' + feature
        plt.title(title)
        plt.savefig(filepath + 'inspection/boxplot/' + title + '.png')
        # Reuse the same figure for the next column.
        plt.clf()
def plot_finish(finish_dict, experiment):
    """Save a boxen plot of 'time' per 'method' for one experiment.

    The output directory ``<plot_folder>finish/`` is created if needed
    (``plot_folder`` is a module-level prefix). When ``finish_dict`` is
    empty a message is printed and nothing is plotted.

    Args:
        finish_dict: Mapping whose values are DataFrames with 'method' and
            'time' columns; they are concatenated row-wise.
        experiment: Name used as prefix of the output file.
    """
    folder = plot_folder + 'finish/'
    # makedirs(exist_ok=True) also creates missing parents and avoids the
    # check-then-create race of isdir() followed by mkdir().
    os.makedirs(folder, exist_ok=True)

    if not finish_dict:
        print('No finish Data available')
        return

    # boxen plot of finish time per method
    finish_df = pd.concat(finish_dict.values(), axis=0).sort_values('method')
    sns.boxenplot(data=finish_df, x='method', y='time')
    plt.savefig(folder + experiment + '_boxplot' + '.png')
    plt.close()
def plot_graphs(feature_1, feature_2, df):
    """Compare four seaborn views of ``feature_2`` grouped by ``feature_1``.

    From left to right: boxen plot, box plot, violin plot (with individual
    points inside) and bar plot, all on one 30x8 inch figure.
    """
    fig, axs = plt.subplots(ncols=4)
    fig.set_size_inches(30, 8)
    plt.suptitle(feature_1 + ' vs. ' + feature_2)

    panels = [
        (sns.boxenplot, {}),
        (sns.boxplot, {}),
        (sns.violinplot, {'inner': "points"}),
        (sns.barplot, {}),
    ]
    for axis, (draw, extra) in zip(axs, panels):
        draw(x=feature_1, y=feature_2, data=df, ax=axis, **extra)
def draw_boxplot(data: pd.DataFrame) -> None:
    """Boxen plot of 'total_purchase_amt' against every feature column.

    Columns listed in the module-level ``date_indexs`` and ``labels``, plus
    'date', are skipped. Panels fill a 7x4 grid row by row.
    """
    global date_indexs, labels
    f, axes = plt.subplots(7, 4, figsize=(18, 24))

    excluded = date_indexs + labels + ['date']
    features = [name for name in data.columns if name not in excluded]

    for slot, feature in enumerate(features):
        row, col = divmod(slot, 4)
        sns.boxenplot(x=feature,
                      y='total_purchase_amt',
                      data=data,
                      ax=axes[row][col])
def _BoxPlot(self):
    '''Plot boxen plots of the target for every low-cardinality categorical column.

    Only columns of ``self.cat_cols`` with fewer than 10 distinct values in
    ``self.train_df`` are drawn. Within each panel the categories are
    ordered by their mean target value.
    '''
    df = self.train_df.copy()
    plot_cols = [col for col in self.cat_cols
                 if len(self.train_df[col].unique()) < 10]

    # Slots are numbered over the plotted columns only. Numbering over all
    # of cat_cols left holes for skipped columns and raised ValueError as
    # soon as a plotted column sat at position 7 or later.
    n_rows = max(3, -(-len(plot_cols) // 2))

    fig = plt.figure(figsize=(14, 18))
    for slot, col in enumerate(plot_cols, start=1):
        df[col + '_mean'] = df.groupby(col)[self.target_col].transform('mean')
        fig.add_subplot(n_rows, 2, slot)
        sns.boxenplot(x=col, y=self.target_col,
                      data=df.sort_values(col + '_mean'))
        plt.title('Comparison of salaries as per {}'.format(col),
                  fontsize=14)
        plt.tight_layout()
        plt.xticks(rotation=45)
def line_integrals(self):
    """Save a boxen plot and a scatter plot of line integrals per compound.

    Runs once for ``self.lines`` (suffix ``all``) and once for
    ``self.lines_filtered`` (suffix ``flt``). For each table only the rows
    whose ``s_max`` equals the maximum of their ``unit`` group are kept.
    Two PDFs are written per table below ``out/graphs``:
    ``line_boxplot_<suffix>.pdf`` and ``lines_scatter_<suffix>.pdf``.
    """
    for a, kind in zip([self.lines, self.lines_filtered], ["all", "flt"]):
        # Optional: replace col_order with an explicit list of compounds to
        # restrict and order the plotted groups.
        col_order = a["Compound"].unique()
        print(col_order)
        a = a[a["Compound"].isin(col_order)]

        # get only one row per z-stack
        idx = a.groupby(["unit"])["s_max"].transform("max") == a["s_max"]
        a = a.loc[idx]

        fig = plt.figure(figsize=(8, 8), dpi=150)
        ax = fig.gca()
        sns.boxenplot(x="Compound", y="sum", order=col_order, data=a, ax=ax)
        # The scale is set first: set_yscale() installs the scale's default
        # formatter, so a formatter set earlier would be discarded.
        ax.set_yscale('log')
        ax.yaxis.set_major_formatter(self.formatter)
        ax.set_xticklabels(ax.xaxis.get_ticklabels(), rotation=45,
                           multialignment='right')
        # NOTE(review): the base-directory argument of os.path.join was
        # missing in the source (a syntax error); the path is now relative
        # to the working directory. Confirm the intended base directory.
        path = o.ensure_dir(
            os.path.join('out', 'graphs', 'line_boxplot_%s.pdf' % kind))
        fig.savefig(path)
        plt.close(fig)

        fig = plt.figure(figsize=(8, 8), dpi=150)
        ax = fig.gca()
        sns.scatterplot(x="v_width", y="sum", data=a, hue="Compound",
                        alpha=0.1, rasterized=True, ax=ax)
        ax.set_xlim((0, 16))
        ax.set_ylim((0, 350e3))
        ax.xaxis.set_major_formatter(self.formatter)
        ax.yaxis.set_major_formatter(self.formatter)
        # NOTE(review): same missing base directory as above.
        path = o.ensure_dir(
            os.path.join('out', 'graphs', 'lines_scatter_%s.pdf' % kind))
        fig.savefig(path)
        plt.close(fig)
"""
Plotting large distributions
============================

"""
import seaborn as sns

sns.set(style="whitegrid")

# Clarity grades in the order they should appear on the x-axis.
clarity_ranking = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
diamonds = sns.load_dataset("diamonds")

# One blue boxen per clarity grade, drawn with the "linear" box-width scale.
sns.boxenplot(
    data=diamonds,
    x="clarity",
    y="carat",
    order=clarity_ranking,
    color="b",
    scale="linear",
)