def eda_cat_target_cat_feat(self, feature, level_count_cap=50, color_map="viridis", legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a category
            feature in the context of a categorical target. Three summary tables are displayed
            (feature level counts, level counts by target class, target class percentages by
            level), followed by a three-panel figure (tree map, faceted bar chart, stacked
            bar chart). When the feature has exactly two levels, a two-sample proportions
            z-test comparing the rate of target == 1 between the two levels is displayed as well.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels is not
                below the cap, then no summary or visualization panel is produced.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            legend_labels : list, default=None
                Class labels displayed in plot legend. When provided, these also replace the
                raw class labels in the summary tables, so the list needs one entry per class.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.

        ---
        Returns:
            None. Tables are rendered through self.df_side_by_side and the figure is shown
            with plt.show().
    """
    # unique non-null levels of the feature and the number of observations in each level
    non_null_values = self.data.loc[self.data[feature].notnull(), feature]
    unique_vals, unique_counts = np.unique(non_null_values.values, return_counts=True)

    # nothing to summarize if the feature is entirely null, and nothing is drawn when the
    # number of levels is not below the cap
    if len(unique_vals) == 0 or len(unique_vals) >= level_count_cap:
        return

    ### data summaries
    ## feature summary
    # one row per level with its count and percent of all non-null observations
    uni_summ_df = pd.DataFrame({
        feature: unique_vals,
        "Count": unique_counts,
        "Proportion": unique_counts / unique_counts.sum() * 100,
    })

    # sort DataFrame by "Proportion", descending
    uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

    # set values to int dtype where applicable to optimize displayed DataFrame
    uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")
    if is_numeric_dtype(uni_summ_df[feature]):
        uni_summ_df[feature] = uni_summ_df[feature].astype("int64")

    ## feature vs. target summary
    target_name = self.target.name

    # combine feature column and target, and remove any rows where the feature is null
    bi_df = pd.concat([self.data[feature], self.target], axis=1)
    bi_df = bi_df[bi_df[feature].notnull()]

    # count the occurrences of each target class within each feature level. rows are
    # feature levels, columns are target classes, and NaN marks a combination that
    # never occurs
    class_counts = (
        bi_df.groupby([feature, target_name]).size().reset_index().pivot(
            columns=target_name, index=feature, values=0)
    )

    # overwrite class columns with actual class labels if provided
    bi_summ_df = class_counts.copy()
    bi_summ_df.columns = pd.Index(
        legend_labels if legend_labels is not None else bi_summ_df.columns.tolist())
    bi_summ_df = bi_summ_df.reset_index()

    # fill nan's with zero in every class count column. column 0 holds the feature
    # levels, so the class counts start at column 1
    count_columns = bi_summ_df.columns[1:]
    bi_summ_df[count_columns] = bi_summ_df[count_columns].fillna(0)

    # set values to int dtype where applicable to optimize displayed DataFrame
    for column in bi_summ_df.columns:
        try:
            bi_summ_df[column] = bi_summ_df[column].astype(int)
        except ValueError:
            # columns that cannot be cast, such as string feature levels, are left as they are
            pass

    ## proportion by category summary
    # percent of each target class within each feature level, derived from the counts
    # above. transposed so that rows are target classes and columns are feature levels
    filled_counts = class_counts.fillna(0)
    prop_df = (filled_counts.div(filled_counts.sum(axis=1), axis=0) * 100).T

    # raw class labels, captured before the index is reset
    class_values = prop_df.index.tolist()

    # use int column labels for feature levels where the level can be cast to int
    level_labels = []
    for level in prop_df.columns.tolist():
        try:
            level_labels.append(int(level))
        except (ValueError, TypeError):
            level_labels.append(level)
    prop_df.columns = level_labels
    prop_df = prop_df.reset_index(drop=True)

    # insert column to DataFrame with actual class labels if provided, otherwise use
    # raw class labels in target
    prop_df.insert(
        loc=0,
        column="Class",
        value=legend_labels if legend_labels is not None else class_values,
    )

    # fill nan's with zero
    prop_df = prop_df.fillna(0)

    # tables to display, in display order
    summary_tables = [uni_summ_df, bi_summ_df, prop_df]
    table_names = ["Feature summary", "Feature vs. target summary", "Target proportion"]

    # if the feature has exactly two levels, perform z-test comparing the share of
    # observations with target == 1 in each level
    if len(unique_vals) == 2:
        total_obs = []
        pos_obs = []
        for level in unique_vals:
            in_level = bi_df[feature] == level

            # total observations in level
            total_obs.append(int(in_level.sum()))

            # total positive observations in level
            pos_obs.append(int((in_level & (bi_df[target_name] == 1)).sum()))

        # perform z-test, return z-statistic and p-value
        z, p_val = proportions_ztest(count=tuple(pos_obs), nobs=tuple(total_obs))

        # add z-statistic and p-value to DataFrame
        stat_test_df = pd.DataFrame(
            data=[{"z-test statistic": z, "p-value": p_val}],
            columns=["z-test statistic", "p-value"],
            index=[feature],
        ).round(4)

        summary_tables.append(stat_test_df)
        table_names.append("Statistical test")

    # display summary tables
    self.df_side_by_side(dfs=tuple(summary_tables), names=table_names)

    if "percent_positive" in bi_summ_df:
        bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)

    ### visualizations
    # set label rotation angle based on the number of levels and the average number of
    # characters per level label
    len_unique_val = len(unique_vals)
    avg_len_unique_val = sum(len(str(val)) for val in unique_vals) / len_unique_val

    if len_unique_val <= 4 and avg_len_unique_val <= 12:
        rotation = 0
    elif 5 <= len_unique_val <= 8 and avg_len_unique_val <= 8:
        rotation = 0
    elif 9 <= len_unique_val <= 14 and avg_len_unique_val <= 4:
        rotation = 0
    else:
        rotation = 90

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts\n* {}".format(feature), position=131, title_scale=0.82)

    # add treemap to canvas
    p.tree_map(
        counts=uni_summ_df["Count"].values,
        labels=uni_summ_df[feature].values,
        colors=style.color_gen(name=color_map, num=len(uni_summ_df[feature].values)),
        alpha=0.8,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Category counts by target\n* {}".format(feature), position=132)

    # add faceted categorical plot to canvas
    p.facet_cat(
        df=bi_summ_df,
        feature=feature,
        label_rotate=rotation,
        color_map=color_map,
        bbox=(1.0, 1.15),
        alpha=0.8,
        legend_labels=legend_labels,
        x_units=None,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Target proportion by category\n* {}".format(feature), position=133)

    # add stacked bar chart to canvas
    p.stacked_bar_h(
        df=prop_df.drop("Class", axis=1),
        bbox=(1.0, 1.15),
        legend_labels=legend_labels,
        color_map=color_map,
        alpha=0.8,
        ax=ax,
    )
    plt.show()
def model_param_plot(self, bayes_optim_summary, estimator_class, estimator_parameter_space, n_iter, chart_scale=15,
                     color_map="viridis", title_scale=1.2, show_single_str_params=False):
    """
    Documentation:

        ---
        Description:
            Visualize hyperparameter optimization over all iterations. Compares theoretical
            distribution to the distribution of values that were actually chosen, and
            visualizes how parameter value selections change over time. One two-panel figure
            is produced per parameter in the estimator's parameter space.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            estimator_parameter_space : dictionary of dictionaries
                Dictionary of nested dictionaries. Outer key is an estimator, and the
                corresponding value is a dictionary. Each nested dictionary contains
                'parameter: value distribution' key/value pairs. The inner dictionary key
                specifies the parameter of the model to be tuned, and the value is a
                distribution of values from which trial values are drawn.
            n_iter : int
                Number of iterations to draw from theoretical distribution in order to
                visualize the theoretical distribution. Higher number leads to more robust
                distribution but can take considerably longer to create.
            chart_scale : float, default=15
                Controls proportions of visualizations. larger values scale visual up in size,
                smaller values scale visual down in size.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            title_scale : float, default=1.2
                Controls the scaling up (higher value) and scaling down (lower value) of the
                size of the main chart title, the x_axis title and the y_axis title.
            show_single_str_params : boolean, default=False
                Controls whether to display visuals for string attributes where there is only
                one unique value, i.e. there was only one choice for the optimization procedure
                to choose from during each iteration.

        ---
        Returns:
            None. Figures are shown with plt.show().
    """

    def unit_precision(max_value):
        # choose the axis unit format code based on the largest value on that axis. the
        # final return also covers maxima below -1.0 and NaN, so a format is always defined
        if max_value > 5.0:
            return "f"
        if max_value > 1.0:
            return "ff"
        if max_value >= -1.0:
            return "fff"
        return "f"

    def bool_label(value):
        # map True/False (and equal values such as 1/0) to display strings, leave
        # anything else untouched
        return {True: "TRUE", False: "FALSE"}.get(value, value)

    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # override None with string representation
    estimator_summary = estimator_summary.replace([None], "None")

    # subset estimator_parameter_space to space for the specified estimator_class
    estimator_space = estimator_parameter_space[estimator_class]

    print("*" * 100)
    print("* {}".format(estimator_class))
    print("*" * 100)

    # sample from the theoretical space n_iter times. each sample holds a value for every
    # parameter, so one set of samples serves all parameters
    space_samples = [sample(estimator_space) for _ in range(n_iter)]

    # iterate through each parameter
    for param in estimator_space:

        # theoretical distribution, with None overridden by the same string representation
        # used in estimator_summary
        theoretical_dist = np.array(
            ["None" if draw[param] is None else draw[param] for draw in space_samples]
        )

        # limit estimator_summary to "iteration" and current "param" columns. a copy is
        # used so that per-parameter changes never affect other parameters
        actual_iter_df = estimator_summary[["iteration", param]].copy()

        # if theoretical distribution is boolean, store string representations of
        # "TRUE" and "FALSE" in both the theoretical and the actual values
        if theoretical_dist.dtype == np.bool_:
            theoretical_dist = np.where(theoretical_dist, "TRUE", "FALSE")
            actual_iter_df[param] = actual_iter_df[param].map(bool_label)

        # actual distribution
        actual_dist = np.array(
            ["None" if v is None else v for v in actual_iter_df[param].tolist()]
        )

        # if theoretical distribution contains str data, then treat this as an
        # object/category parameter
        if any(isinstance(d, str) for d in theoretical_dist):

            # identify unique values and associated count in theoretical distribution
            unique_vals_theo, unique_counts_theo = np.unique(theoretical_dist, return_counts=True)

            # skip parameters with a single possible value unless show_single_str_params is True
            if len(unique_vals_theo) > 1 or show_single_str_params:

                # generate color list for stripplot
                stripplot_color_list = style.color_gen(
                    name=color_map, num=len(actual_iter_df[param].unique()) + 1)

                # generate color list for bar chart
                bar_color_list = style.color_gen(name=color_map, num=3)

                # identify unique values and associated count in actual distribution
                unique_vals_actual, unique_counts_actual = np.unique(actual_dist, return_counts=True)

                # align counts by value label. a value missing from either distribution
                # gets a count of zero rather than shifting the remaining counts
                theo_lookup = {
                    str(val): int(count) for val, count in zip(unique_vals_theo, unique_counts_theo)
                }
                actual_lookup = {
                    str(val): int(count) for val, count in zip(unique_vals_actual, unique_counts_actual)
                }
                value_labels = sorted(set(theo_lookup) | set(actual_lookup))

                # store data in DataFrame
                df = pd.DataFrame({
                    "param": value_labels,
                    "Theoretical": [theo_lookup.get(label, 0) for label in value_labels],
                    "Actual": [actual_lookup.get(label, 0) for label in value_labels],
                })

                # create prettierplot object
                p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.8,
                    position=121,
                    title_scale=title_scale,
                )

                # add faceted bar chart to canvas
                p.facet_cat(
                    df=df,
                    feature="param",
                    color_map=bar_color_list[:-1],
                    bbox=(1.0, 1.15),
                    alpha=1.0,
                    legend_labels=df.columns[1:].values,
                    x_units=None,
                    ax=ax,
                )

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.5,
                    position=122,
                    title_scale=title_scale,
                )

                # add stripplot to canvas
                sns.stripplot(
                    x="iteration",
                    y=param,
                    data=actual_iter_df,
                    jitter=0.3,
                    alpha=1.0,
                    size=0.7 * chart_scale,
                    palette=sns.color_palette(stripplot_color_list[:-1]),
                    ax=ax,
                ).set(xlabel=None, ylabel=None)

                # set tick label font size
                ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * chart_scale)

                plt.show()

        # otherwise treat it as a numeric parameter
        else:
            # cast "iteration" as an int and the param values as float
            actual_iter_df = actual_iter_df.astype({"iteration": int, param: float})

            # create color map
            color_list = style.color_gen(name=color_map, num=3)

            # create prettierplot object
            p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=121,
                title_scale=title_scale,
            )

            # dynamically set x-unit precision based on max value
            x_units = unit_precision(np.nanmax(theoretical_dist))

            # add kernel density plot for theoretical distribution to canvas
            p.kde_plot(
                theoretical_dist,
                color=color_list[0],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            # add kernel density plot for actual distribution to canvas
            p.kde_plot(
                actual_dist,
                color=color_list[1],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            ## create custom legend
            # create legend patches, pairing each label with its line color
            legend_labels = ["Theoretical", "Actual"]
            patches = [
                Patch(color=color, label=label, alpha=1.0)
                for label, color in zip(legend_labels, color_list)
            ]

            # draw legend
            leg = plt.legend(
                handles=patches,
                fontsize=1.1 * chart_scale,
                loc="upper right",
                markerscale=0.6 * chart_scale,
                ncol=1,
                bbox_to_anchor=(.95, 1.1),
            )

            # label font color
            for text in leg.get_texts():
                plt.setp(text, color="grey")

            # dynamically set y-unit precision based on max value
            y_units = unit_precision(np.nanmax(actual_iter_df[param]))

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=122,
                title_scale=title_scale,
            )

            # add regression plot to canvas
            p.reg_plot(
                x="iteration",
                y=param,
                data=actual_iter_df,
                y_units=y_units,
                x_units="f",
                line_color=color_list[0],
                line_width=0.4,
                dot_color=color_list[1],
                dot_size=10.0,
                alpha=0.6,
                ax=ax,
            )
            plt.show()