def eda_missing_summary(self, data=None, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates vertical bar chart visualizing the percent of values missing for each
            feature. Optionally displays the underlying Pandas DataFrame.

        ---
        Parameters:
            data : Pandas DataFrame, default=None
                Pandas DataFrame containing independent variables. If left as None, the
                feature dataset provided to Machine during instantiation is used.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether to display summary data in a Pandas DataFrame in addition
                to the chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    # use dataset provided during instantiation if data is None
    if data is None:
        data = self.data

    # return missingness summary
    percent_missing = self.missing_summary(data)

    # if missingness summary is not empty, create the visualization
    if not percent_missing.empty:

        # optionally display DataFrame summary
        if display_df:
            display(percent_missing)

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Percent missing by feature",
            y_shift=0.8,
            title_scale=0.8,
        )

        # add vertical bar chart to canvas
        p.bar_v(
            x=percent_missing.index,
            counts=percent_missing["Percent missing"],
            label_rotate=45 if len(percent_missing.index) <= 5 else 90,
            color=color,
            y_units="p",
            x_tick_wrap=False,
            ax=ax,
        )

    # if missingness summary is empty, just print "No nulls"
    else:
        print("No nulls")
def eda_missing_summary(self, training_data=True, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates vertical bar chart visualizing the percent of values missing for each
            feature. Optionally displays the underlying Pandas DataFrame.

        ---
        Parameters:
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether to display summary data in a Pandas DataFrame in addition
                to the chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    # dynamically choose training data objects or validation data objects
    data, _, mlm_dtypes = self.training_or_validation_dataset(training_data)

    # return missingness summary
    percent_missing = self.missing_summary(training_data)

    # if missingness summary is not empty, create the visualization
    if not percent_missing.empty:

        # optionally display DataFrame summary
        if display_df:
            display(percent_missing)

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Percent missing by feature",
            y_shift=0.8,
            title_scale=0.8,
        )

        # add vertical bar chart to canvas
        p.bar_v(
            x=percent_missing.index,
            counts=percent_missing["Percent missing"],
            label_rotate=45 if len(percent_missing.index) <= 5 else 90,
            color=color,
            y_units="p",
            x_tick_wrap=False,
            ax=ax,
        )
        ax.set_ylim([0, 100])

    # if missingness summary is empty, just print "No nulls"
    else:
        print("No nulls")
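# Illustrative usage sketch (hypothetical, not from the library source): `m`
# stands in for an instantiated object exposing the variants defined above.
# The first variant accepts an explicit DataFrame (or falls back to the dataset
# registered at instantiation); the second selects the training or validation
# dataset internally via the training_data flag.
m.eda_missing_summary(display_df=True)        # DataFrame-based variant
m.eda_missing_summary(training_data=False)    # training/validation variant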
def eda_skew_summary(self, data=None, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates vertical bar chart visualizing the skew for each feature. Optionally
            displays the underlying Pandas DataFrame.

        ---
        Parameters:
            data : Pandas DataFrame, default=None
                Pandas DataFrame containing independent variables. If left as None, the
                feature dataset provided to Machine during instantiation is used.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether to display summary data in a Pandas DataFrame along with
                the chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    # use dataset provided during instantiation if data is None
    if data is None:
        data = self.data

    # return skewness summary
    skew_summary = self.skew_summary(data)

    # optionally display DataFrame summary
    if display_df:
        display(skew_summary)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Skew by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # add vertical bar chart to canvas
    p.bar_v(
        x=skew_summary.index,
        counts=skew_summary["Skew"],
        label_rotate=45 if len(skew_summary.index) <= 5 else 90,
        color=color,
        y_units="fff",
        x_tick_wrap=False,
        ax=ax,
    )
def eda_skew_summary(self, training_data=True, color=style.style_grey, display_df=False, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates vertical bar chart visualizing the skew for each feature. Optionally
            displays the underlying Pandas DataFrame.

        ---
        Parameters:
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color : str or color code, default=style.style_grey
                Bar color.
            display_df : boolean, default=False
                Controls whether to display summary data in a Pandas DataFrame along with
                the chart.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    # dynamically choose training data objects or validation data objects
    data, _, mlm_dtypes = self.training_or_validation_dataset(training_data)

    # return skewness summary
    skew_summary = self.skew_summary(data)

    # optionally display DataFrame summary
    if display_df:
        display(skew_summary)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Skew by feature",
        y_shift=0.8,
        title_scale=0.8,
    )

    # add vertical bar chart to canvas
    p.bar_v(
        x=skew_summary.index,
        counts=skew_summary["Skew"],
        label_rotate=45 if len(skew_summary.index) <= 5 else 90,
        color=color,
        y_units="fff",
        x_tick_wrap=False,
        ax=ax,
    )
def sample_plot(self, sample_space, n_iter, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Visualizes a single hyperopt theoretical distribution. Useful for helping to
            determine a distribution to use when setting up hyperopt distribution objects
            for actual parameter tuning.

        ---
        Parameters:
            sample_space : dictionary
                Dictionary of 'param name: hyperopt distribution object' key/value pairs.
                The name can be arbitrarily chosen, and the value is a defined hyperopt
                distribution.
            n_iter : int
                Number of iterations to draw from the theoretical distribution in order to
                visualize it. A higher number leads to a more robust distribution but can
                take considerably longer to create.
            chart_scale : float, default=15
                Controls proportions of visualizations. Larger values scale the visual up
                in size, smaller values scale the visual down in size.
    """
    # iterate through each parameter
    for param in sample_space.keys():

        # sample from theoretical distribution for n_iter iterations
        theoretical_dist = []
        for _ in range(n_iter):
            theoretical_dist.append(sample(sample_space)[param])
        theoretical_dist = np.array(theoretical_dist)

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale)

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="actual vs. theoretical plot\n* {}".format(param),
            y_shift=0.8,
            position=111,
        )

        # add kernel density plot to canvas
        p.kde_plot(
            theoretical_dist,
            color=style.style_grey,
            y_units="p",
            x_units="fff" if np.nanmax(theoretical_dist) <= 5.0 else "ff",
            ax=ax,
        )
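# Minimal usage sketch for sample_plot, assuming hyperopt is installed and `m`
# is a hypothetical instantiated object exposing the method above. Each entry
# in sample_space is drawn n_iter times and plotted as its own panel.
import numpy as np
from hyperopt import hp

sample_space = {
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(0.3)),
    "max_depth": hp.quniform("max_depth", 2, 12, 1),
}
m.sample_plot(sample_space=sample_space, n_iter=1000)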
def eda_transform_log1(self, data, name, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates a two-panel visualization. The left plot is the log + 1 transformed
            distribution overlaid on a normal distribution. The right plot is a log + 1
            adjusted qqplot overlaid across a straight line.

        ---
        Parameters:
            data : Pandas Series
                Target variable data object.
            name : str
                Name of target variable.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="dist/kde - {} (log+1)".format(name),
        x_label="",
        y_label="",
        y_shift=0.8,
        position=223,
    )

    # add distribution / kernel density plot to canvas
    p.dist_plot(
        np.log1p(data),
        color=style.style_grey,
        fit=stats.norm,
        x_rotate=True,
        ax=ax,
    )

    # turn off x and y ticks
    plt.xticks([])
    plt.yticks([])

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="probability plot - {} (log+1)".format(name),
        x_label="",
        y_label="",
        y_shift=0.8,
        position=224,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(np.log1p(data), plot=ax)

    # turn off x and y ticks
    plt.xticks([])
    plt.yticks([])
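# Standalone sketch of the log + 1 transform that eda_transform_log1
# visualizes: np.log1p pulls a right-skewed distribution toward symmetry,
# which is what the overlaid normal fit and the qqplot assess.
import numpy as np
from scipy import stats

skewed = np.random.lognormal(mean=0.0, sigma=1.0, size=1_000)
print(stats.skew(skewed))            # strongly right-skewed
print(stats.skew(np.log1p(skewed)))  # roughly symmetric after log + 1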
def eda_cat_target_num_feat(self, feature, color_map="viridis", outliers_out_of_scope=None, legend_labels=None, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a
            numeric feature in the context of a categorical target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            outliers_out_of_scope : boolean, float or int, default=None
                Truncates the x-axis upper limit so that outliers are out of scope of the
                visualization. The x-axis upper limit is reset to the maximum non-outlier
                value. To identify outliers, the IQR is calculated, and values that are
                below the first quartile minus the IQR, or above the third quartile plus
                the IQR, are designated as outliers. If True is passed as a value, the IQR
                that is subtracted/added is multiplied by 5. If a float or int is passed,
                the IQR is multiplied by that value. Higher values increase how extreme
                values need to be to be identified as outliers.
            legend_labels : list, default=None
                Class labels displayed in plot legend.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    ### data summaries

    ## bivariate roll-up table
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # bivariate summary statistics
    bi_summ_stats_df = pd.DataFrame(
        columns=["Class", "Count", "Proportion", "Mean", "StdDev"])

    # for each unique class label
    for labl in np.unique(self.target):

        # get feature values associated with single class label
        feature_slice = bi_df[bi_df[self.target.name] == labl][feature]

        # append summary statistics for feature values associated with class label
        bi_summ_stats_df = bi_summ_stats_df.append(
            {
                "Class": labl,
                "Count": len(feature_slice),
                "Proportion": len(feature_slice) / len(bi_df[feature]) * 100,
                "Mean": np.mean(feature_slice),
                "StdDev": np.std(feature_slice),
            },
            ignore_index=True,
        )

    # apply custom legend labels, or set dtype to int if column values are numeric
    if legend_labels is not None:
        bi_summ_stats_df["Class"] = legend_labels
    elif is_numeric_dtype(bi_summ_stats_df["Class"]):
        bi_summ_stats_df["Class"] = bi_summ_stats_df["Class"].astype("int64")

    ## feature summary
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add missing percentage
    describe_df = describe_df.append(
        {
            "index": "missing",
            feature: np.round(self.data.shape[0] - bi_df[feature].shape[0], 5),
        },
        ignore_index=True,
    )

    # add skew
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: np.round(stats.skew(bi_df[feature].values, nan_policy="omit"), 5),
        },
        ignore_index=True,
    )

    # add kurtosis
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # execute z-test or t-test
    if len(np.unique(self.target)) == 2:
        s1 = bi_df[
            (bi_df[self.target.name] == bi_df[self.target.name].unique()[0])
        ][feature]
        s2 = bi_df[
            (bi_df[self.target.name] == bi_df[self.target.name].unique()[1])
        ][feature]
        if len(s1) > 30 and len(s2) > 30:

            # perform z-test, return z-statistic and p-value
            z, p_val = ztest(s1, s2)

            # add z-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{"z-test statistic": z, "p-value": p_val}],
                columns=["z-test statistic", "p-value"],
                index=[feature],
            ).round(4)
        else:
            # perform t-test, return t-statistic and p-value
            t, p_val = stats.ttest_ind(s1, s2)

            # add t-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{"t-test statistic": t, "p-value": p_val}],
                columns=["t-test statistic", "p-value"],
                index=[feature],
            ).round(4)

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df, stat_test_df),
            names=["Feature summary", "Feature vs. target summary", "Statistical test"],
        )
    else:
        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # if boolean is passed to outliers_out_of_scope
    if isinstance(outliers_out_of_scope, bool):
        # if outliers_out_of_scope = True
        if outliers_out_of_scope:

            # identify outliers using IQR method and an IQR step of 5
            outliers = self.outlier_IQR(self.data[feature], iqr_step=5)

            # reset x-axis minimum and maximum
            x_axis_min = self.data[feature].drop(index=outliers).min()
            x_axis_max = self.data[feature].drop(index=outliers).max()

    # if outliers_out_of_scope is a float or int
    elif isinstance(outliers_out_of_scope, float) or isinstance(outliers_out_of_scope, int):

        # identify outliers using IQR method and an IQR step equal to the float/int passed
        outliers = self.outlier_IQR(self.data[feature], iqr_step=outliers_out_of_scope)

        # reset x-axis minimum and maximum
        x_axis_min = self.data[feature].drop(index=outliers).min()
        x_axis_max = self.data[feature].drop(index=outliers).max()

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Feature distribution\n* {}".format(feature),
        title_scale=0.85,
        position=221,
    )

    ## dynamically determine precision of x-units
    # capture min and max feature values
    dist_min = bi_df[feature].values.min()
    dist_max = bi_df[feature].values.max()

    # determine x-units precision based on min and max values in feature;
    # bi_df is unchanged below, so this precision applies to all four panels
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
        x_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
        x_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
        x_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
        x_units = "ff"
    else:
        x_units = "f"

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units="f",
        x_units=x_units,
        ax=ax,
    )

    # optionally reset x-axis limits
    if outliers_out_of_scope is not None:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Probability plot\n* {}".format(feature),
        title_scale=0.85,
        position=222,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(
        x=bi_df[feature].values,
        plot=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Distribution by class\n* {}".format(feature),
        title_scale=0.85,
        position=223,
    )

    # generate color list
    color_list = style.color_gen(name=color_map, num=len(np.unique(self.target)))

    # add one distribution plot to canvas for each category class
    for ix, labl in enumerate(np.unique(bi_df[self.target.name].values)):
        p.dist_plot(
            bi_df[bi_df[self.target.name] == labl][feature].values,
            color=color_list[ix],
            y_units="f",
            x_units=x_units,
            legend_labels=legend_labels if legend_labels is not None
                else np.arange(len(np.unique(self.target))),
            alpha=0.4,
            bbox=(1.0, 1.0),
            ax=ax,
        )

    # optionally reset x-axis limits
    if outliers_out_of_scope is not None:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Boxplot by class\n* {}".format(feature),
        title_scale=0.85,
        position=224,
    )

    # add horizontal box plot to canvas
    p.box_plot_h(
        x=feature,
        y=self.target.name,
        data=bi_df,
        alpha=0.7,
        x_units=x_units,
        legend_labels=legend_labels,
        bbox=(1.2, 1.0),
        suppress_outliers=True,
        ax=ax,
    )

    # optionally reset x-axis limits
    if outliers_out_of_scope is not None:
        plt.xlim(x_axis_min - (x_axis_min * 0.1), x_axis_max)

    # apply position adjustment to subplots
    plt.subplots_adjust(bottom=-0.1)

    plt.show()
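# Standalone sketch of the IQR outlier rule described in the
# outliers_out_of_scope docstring above; `outlier_iqr` is a hypothetical
# helper for illustration only, with `iqr_step` playing the role of the
# multiplier applied to the IQR.
import pandas as pd

def outlier_iqr(series, iqr_step=1.5):
    # fences sit iqr_step * IQR below the first quartile and above the third
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    mask = (series < q1 - iqr_step * iqr) | (series > q3 + iqr_step * iqr)
    return series[mask].index

s = pd.Series([1, 2, 2, 3, 3, 4, 100])
print(outlier_iqr(s, iqr_step=1.5).tolist())  # [6] -- the value 100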
def eda_cat_target_cat_feat(self, feature, level_count_cap=50, color_map="viridis", legend_labels=None, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a
            categorical feature in the context of a categorical target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            level_count_cap : int, default=50
                Maximum number of unique levels in feature. If the number of levels
                exceeds the cap, then no visualization panel is produced.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            legend_labels : list, default=None
                Class labels displayed in plot legend.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    # if number of unique levels in feature is less than specified level_count_cap
    if len(np.unique(self.data[self.data[feature].notnull()][feature].values)) < level_count_cap:

        ### data summaries

        ## feature summary
        # create empty DataFrame
        uni_summ_df = pd.DataFrame(columns=[feature, "Count", "Proportion"])

        # capture unique values and count of those unique values
        unique_vals, unique_counts = np.unique(
            self.data[self.data[feature].notnull()][feature], return_counts=True)

        # append each unique value, count and proportion to DataFrame
        for i, j in zip(unique_vals, unique_counts):
            uni_summ_df = uni_summ_df.append(
                {
                    feature: i,
                    "Count": j,
                    "Proportion": j / np.sum(unique_counts) * 100,
                },
                ignore_index=True,
            )

        # sort DataFrame by "Proportion", descending
        uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

        # set values to int dtype where applicable to optimize displayed DataFrame
        uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")
        if is_numeric_dtype(uni_summ_df[feature]):
            uni_summ_df[feature] = uni_summ_df[feature].astype("int64")

        ## feature vs. target summary
        # combine feature column and target
        bi_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        bi_df = bi_df[bi_df[feature].notnull()]

        # group by category feature and count the occurrences of target classes
        # for each level in category
        bi_summ_df = (
            bi_df.groupby([feature] + [self.target.name])
            .size()
            .reset_index()
            .pivot(columns=self.target.name, index=feature, values=0)
        )

        # overwrite DataFrame columns with actual class labels if provided
        bi_summ_df.columns = (
            pd.Index(legend_labels)
            if legend_labels is not None
            else pd.Index([i for i in bi_summ_df.columns.tolist()])
        )
        bi_summ_df.reset_index(inplace=True)

        # fill nan's with zero
        fill_columns = bi_summ_df.iloc[:, 2:].columns
        bi_summ_df[fill_columns] = bi_summ_df[fill_columns].fillna(0)

        # set values to int dtype where applicable to optimize displayed DataFrame
        for column in bi_summ_df.columns:
            try:
                bi_summ_df[column] = bi_summ_df[column].astype("int64")
            except ValueError:
                bi_summ_df[column] = bi_summ_df[column]

        ## proportion by category summary
        # combine feature column and target
        prop_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        prop_df = prop_df[prop_df[feature].notnull()]

        # calculate percent of 100 by class label
        prop_df = prop_df.groupby([feature, self.target.name]).agg({self.target.name: {"count"}})
        prop_df = prop_df.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
        prop_df = prop_df.reset_index()

        # flatten the column MultiIndex to a single index
        multi_index = prop_df.columns
        single_index = [i[0] for i in multi_index.tolist()]
        single_index[-1] = "Count"
        prop_df.columns = single_index
        prop_df = prop_df.reset_index(drop=True)

        prop_df = pd.pivot_table(
            prop_df,
            values=["Count"],
            columns=[feature],
            index=[self.target.name],
            aggfunc={"Count": np.mean},
        )
        prop_df = prop_df.reset_index(drop=True)

        # flatten the pivoted column MultiIndex, casting levels to int where possible
        multi_index = prop_df.columns
        single_index = []
        for column in multi_index.tolist():
            try:
                single_index.append(int(column[1]))
            except ValueError:
                single_index.append(column[1])
        prop_df.columns = single_index
        prop_df = prop_df.reset_index(drop=True)

        # insert column to DataFrame with actual class labels if provided, otherwise
        # use raw class labels in target
        prop_df.insert(
            loc=0,
            column="Class",
            value=legend_labels if legend_labels is not None else np.unique(self.target),
        )

        # fill nan's with zero
        fill_columns = prop_df.iloc[:, :].columns
        prop_df[fill_columns] = prop_df[fill_columns].fillna(0)

        # if there are only two class labels, perform z-test/t-test
        if len(np.unique(bi_df[bi_df[feature].notnull()][feature])) == 2:

            # total observations
            total_obs1 = bi_df[(bi_df[feature] == np.unique(bi_df[feature])[0])][feature].shape[0]
            total_obs2 = bi_df[(bi_df[feature] == np.unique(bi_df[feature])[1])][feature].shape[0]

            # total positive observations
            pos_obs1 = bi_df[
                (bi_df[feature] == np.unique(bi_df[feature])[0])
                & (bi_df[self.target.name] == 1)
            ][feature].shape[0]
            pos_obs2 = bi_df[
                (bi_df[feature] == np.unique(bi_df[feature])[1])
                & (bi_df[self.target.name] == 1)
            ][feature].shape[0]

            # perform z-test, return z-statistic and p-value
            z, p_val = proportions_ztest(count=(pos_obs1, pos_obs2), nobs=(total_obs1, total_obs2))

            # add z-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{"z-test statistic": z, "p-value": p_val}],
                columns=["z-test statistic", "p-value"],
                index=[feature],
            ).round(4)

            # display summary tables
            self.df_side_by_side(
                dfs=(uni_summ_df, bi_summ_df, prop_df, stat_test_df),
                names=[
                    "Feature summary",
                    "Feature vs. target summary",
                    "Target proportion",
                    "Statistical test",
                ],
            )
            if "percent_positive" in bi_summ_df:
                bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)
        else:
            # display summary tables
            self.df_side_by_side(
                dfs=(uni_summ_df, bi_summ_df, prop_df),
                names=["Feature summary", "Feature vs. target summary", "Target proportion"],
            )
            if "percent_positive" in bi_summ_df:
                bi_summ_df = bi_summ_df.drop(["percent_positive"], axis=1)

        ### visualizations
        # set label rotation angle
        len_unique_val = len(unique_vals)
        avg_len_unique_val = sum(map(len, str(unique_vals))) / len(unique_vals)
        if len_unique_val <= 4 and avg_len_unique_val <= 12:
            rotation = 0
        elif len_unique_val >= 5 and len_unique_val <= 8 and avg_len_unique_val <= 8:
            rotation = 0
        elif len_unique_val >= 9 and len_unique_val <= 14 and avg_len_unique_val <= 4:
            rotation = 0
        else:
            rotation = 90

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Category counts\n* {}".format(feature),
            position=131,
            title_scale=0.82,
        )

        # add treemap to canvas
        p.tree_map(
            counts=uni_summ_df["Count"].values,
            labels=uni_summ_df[feature].values,
            colors=style.color_gen(name=color_map, num=len(uni_summ_df[feature].values)),
            alpha=0.8,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Category counts by target\n* {}".format(feature),
            position=132,
        )

        # add faceted categorical plot to canvas
        p.facet_cat(
            df=bi_summ_df,
            feature=feature,
            label_rotate=rotation,
            color_map=color_map,
            bbox=(1.0, 1.15),
            alpha=0.8,
            legend_labels=legend_labels,
            x_units=None,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Target proportion by category\n* {}".format(feature),
            position=133,
        )

        # add stacked bar chart to canvas
        p.stacked_bar_h(
            df=prop_df.drop("Class", axis=1),
            bbox=(1.0, 1.15),
            legend_labels=legend_labels,
            color_map=color_map,
            alpha=0.8,
            ax=ax,
        )

        plt.show()
def binary_classification_panel(self, model, labels=None, title_scale=1.0, color_map="viridis", random_state=1, chart_scale=15, save_objects=False):
    """
    Documentation:

        ---
        Description:
            Generate a panel of reports and visualizations summarizing the performance
            of a classification model.

        ---
        Parameters:
            model : model object
                Instantiated model object.
            labels : list, default=None
                Custom labels for confusion matrix axes. If left as None, will default
                to 0, 1, 2...
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            title_scale : float, default=1.0
                Controls the scaling up (higher value) and scaling down (lower value) of
                the size of the main chart title, the x-axis title and the y-axis title.
            random_state : int, default=1
                Random number seed.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
            save_objects : boolean, default=False
                Controls whether visualizations and summary table are saved to the
                experiment directory.
    """
    if not save_objects:
        print("*" * 55)
        print(f"* Estimator: {model.estimator_name}")
        print(f"* Parameter set: {model.model_iter}")
        print("*" * 55)

        print("\n" + "*" * 55)
        print("Training data evaluation\n")

    ## training data
    # fit model on training data and generate predictions using training data
    y_pred = model.fit(self.training_features, self.training_target).predict(self.training_features)

    # generate classification_report using training data
    report = classification_report(
        self.training_target,
        y_pred,
        target_names=labels if labels is not None else np.unique(self.training_target.values),
        output_dict=True,
    )
    df = pd.DataFrame(report).transpose()

    # save or display classification report
    if save_objects:
        csv_path = os.path.join(
            self.evaluation_classification_report_object_dir,
            f"{model.estimator_name}_train_classification_report.csv",
        )
        df.to_csv(csv_path, index=False)
    else:
        display(df)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Confusion matrix - training data\nModel: {model.estimator_name}\nParameter set: {model.model_iter}",
        y_shift=0.4,
        x_shift=0.25,
        position=121,
        title_scale=title_scale,
    )

    # add confusion matrix to canvas
    plot_confusion_matrix(
        estimator=model,
        X=self.training_features,
        y_true=self.training_target,
        display_labels=labels if labels is not None else np.unique(self.training_target.values),
        cmap=color_map,
        values_format=".0f",
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"ROC curve - training data\nModel: {model.estimator_name}\nParameter set: {model.model_iter}",
        x_label="False positive rate",
        y_label="True positive rate",
        y_shift=0.35,
        position=122,
        title_scale=title_scale,
    )

    # add ROC curve to canvas
    p.roc_curve_plot(
        model=model,
        X_train=self.training_features,
        y_train=self.training_target,
        linecolor=style.style_grey,
        ax=ax,
    )
    plt.subplots_adjust(wspace=0.3)

    # save plots or show
    if save_objects:
        plot_path = os.path.join(
            self.evaluation_plots_object_dir,
            f"{model.estimator_name}_train_visualization.jpg",
        )
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()

    ## validation data
    if not save_objects:
        print("\n" + "*" * 55)
        print("Validation data evaluation\n")

    # fit model on training data and generate predictions using validation data
    y_pred = model.fit(self.training_features, self.training_target).predict(self.validation_features)

    # generate classification_report using validation data
    report = classification_report(
        self.validation_target,
        y_pred,
        target_names=labels if labels is not None else np.unique(self.training_target.values),
        output_dict=True,
    )
    df = pd.DataFrame(report).transpose()

    # save or display classification report
    if save_objects:
        csv_path = os.path.join(
            self.evaluation_classification_report_object_dir,
            f"{model.estimator_name}_validation_classification_report.csv",
        )
        df.to_csv(csv_path, index=False)
    else:
        display(df)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Confusion matrix - validation data\nModel: {model.estimator_name}\nParameter set: {model.model_iter}",
        y_shift=0.4,
        x_shift=0.25,
        position=121,
        title_scale=title_scale,
    )

    # add confusion matrix to canvas
    plot_confusion_matrix(
        estimator=model,
        X=self.validation_features,
        y_true=self.validation_target,
        display_labels=labels if labels is not None else np.unique(self.training_target.values),
        cmap=color_map,
        values_format=".0f",
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"ROC curve - validation data\nModel: {model.estimator_name}\nParameter set: {model.model_iter}",
        x_label="False positive rate",
        y_label="True positive rate",
        y_shift=0.35,
        position=122,
        title_scale=title_scale,
    )

    # add ROC curve to canvas
    p.roc_curve_plot(
        model=model,
        X_train=self.training_features,
        y_train=self.training_target,
        X_valid=self.validation_features,
        y_valid=self.validation_target,
        linecolor=style.style_grey,
        ax=ax,
    )
    plt.subplots_adjust(wspace=0.3)

    # save plots or show
    if save_objects:
        plot_path = os.path.join(
            self.evaluation_plots_object_dir,
            f"{model.estimator_name}_validation_visualization.jpg",
        )
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()
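# Hypothetical usage sketch: `m` is an instantiated object whose training and
# validation datasets were registered at instantiation, and `model` is a
# wrapper exposing fit/predict plus estimator_name and model_iter, as the
# method above assumes.
m.binary_classification_panel(
    model=model,
    labels=["negative", "positive"],
    save_objects=True,  # write report CSVs and plot images to the experiment directory
)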
def regression_panel(self, model, X_train, y_train, X_valid=None, y_valid=None, n_folds=None, title_scale=1.0, color_map="viridis", random_state=1, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates a set of residual plots and Pandas DataFrames, where each row captures
            various summary statistics pertaining to a model's performance. Generates
            residual plots and captures performance data for training and validation
            datasets. If no validation set is provided, then cross-validation is performed
            on the training dataset.

        ---
        Parameters:
            model : model object
                Instantiated model object.
            X_train : Pandas DataFrame
                Training data observations.
            y_train : Pandas Series
                Training target data.
            X_valid : Pandas DataFrame, default=None
                Validation data observations.
            y_valid : Pandas Series, default=None
                Validation target data.
            n_folds : int, default=None
                Number of cross-validation folds to use. If validation data is provided
                through X_valid/y_valid, n_folds is ignored.
            title_scale : float, default=1.0
                Controls the scaling up (higher value) and scaling down (lower value) of
                the size of the main chart title, the x-axis title and the y-axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            random_state : int, default=1
                Random number seed.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    print("*" * 55)
    print("* Estimator: {}".format(model.estimator_name))
    print("* Parameter set: {}".format(model.model_iter))
    print("*" * 55)

    print("\n" + "*" * 55)
    print("Training data evaluation")

    # fit model on training data
    model.fit(X_train.values, y_train.values)

    ## training dataset
    # generate predictions using training data and calculate residuals
    y_pred = model.predict(X_train.values)
    residuals = y_pred - y_train.values

    # create prettierplot object
    p = PrettierPlot(plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Residual plot - training data\nModel: {}\nParameter set: {}".format(
            model.estimator_name,
            model.model_iter,
        ),
        x_label="Predicted values",
        y_label="Residuals",
        y_shift=0.55,
        title_scale=title_scale,
        position=121,
    )

    # dynamically size precision of x-units based on magnitude of maximum
    # predicted values
    if -1 <= np.nanmax(y_pred) <= 1:
        x_units = "fff"
    elif -100 <= np.nanmax(y_pred) <= 100:
        x_units = "ff"
    else:
        x_units = "f"

    # dynamically size precision of y-units based on magnitude of maximum
    # residual values
    if -0.1 <= np.nanmax(residuals) <= 0.1:
        y_units = "ffff"
    elif -1 <= np.nanmax(residuals) <= 1:
        y_units = "fff"
    elif -10 <= np.nanmax(residuals) <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    # x tick label rotation
    if -10000 < np.nanmax(y_pred) < 10000:
        x_rotate = 0
    else:
        x_rotate = 45

    # add 2-dimensional scatter plot to canvas
    p.scatter_2d(
        x=y_pred,
        y=residuals,
        size=7,
        color=style.style_grey,
        y_units=y_units,
        x_units=x_units,
        ax=ax,
    )

    # plot horizontal line at y=0
    plt.hlines(y=0, xmin=np.min(y_pred), xmax=np.max(y_pred),
               color=style.style_grey, lw=2)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Residual distribution - training data\nModel: {}\nParameter set: {}".format(
            model.estimator_name,
            model.model_iter,
        ),
        title_scale=title_scale,
        position=122,
    )

    # add distribution plot to canvas
    p.dist_plot(
        residuals,
        fit=stats.norm,
        color=style.style_grey,
        y_units="ff",
        x_units="fff",
        ax=ax,
    )
    plt.show()

    # generate regression_stats using training data and predictions
    results = self.regression_stats(
        model=model,
        y_true=y_train.values,
        y_pred=y_pred,
        feature_count=X_train.shape[1],
    )

    # create shell results DataFrame and append
    regression_results_summary = pd.DataFrame(columns=list(results.keys()))
    regression_results_summary = regression_results_summary.append(results, ignore_index=True)

    ## validation dataset
    # if validation data is provided...
    if X_valid is not None:
        print("\n" + "*" * 55)
        print("Validation data evaluation")

        # generate predictions with validation data and calculate residuals
        y_pred = model.predict(X_valid.values)
        residuals = y_pred - y_valid.values

        # create prettierplot object
        p = PrettierPlot(plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Residual plot - validation data\nModel: {}\nParameter set: {}".format(
                model.estimator_name,
                model.model_iter,
            ),
            x_label="Predicted values",
            y_label="Residuals",
            y_shift=0.55,
            title_scale=title_scale,
            position=121,
        )

        # add 2-dimensional scatter plot to canvas
        p.scatter_2d(
            x=y_pred,
            y=residuals,
            size=7,
            color=style.style_grey,
            y_units=y_units,
            x_units=x_units,
            ax=ax,
        )

        # plot horizontal line at y=0
        plt.hlines(y=0, xmin=np.min(y_pred), xmax=np.max(y_pred),
                   color=style.style_grey, lw=2)

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Residual distribution - validation data\nModel: {}\nParameter set: {}".format(
                model.estimator_name,
                model.model_iter,
            ),
            title_scale=title_scale,
            position=122,
        )

        # add distribution plot to canvas
        p.dist_plot(
            residuals,
            fit=stats.norm,
            color=style.style_grey,
            y_units="ff",
            x_units="fff",
            ax=ax,
        )
        plt.show()

        # generate regression_stats using validation data and predictions
        results = self.regression_stats(
            model=model,
            y_true=y_valid.values,
            y_pred=y_pred,
            feature_count=X_valid.shape[1],
            data_type="validation",
        )

        # append results to regression_results_summary
        regression_results_summary = regression_results_summary.append(results, ignore_index=True)
        display(regression_results_summary)

    # if n_folds is provided, indicating cross-validation
    elif isinstance(n_folds, int):

        # generate cross-validation indices
        cv = list(
            KFold(n_splits=n_folds, shuffle=True,
                  random_state=random_state).split(X_train, y_train))

        print("\n" + "*" * 55)
        print("Cross validation evaluation")

        # iterate through cross-validation indices
        for i, (train_ix, valid_ix) in enumerate(cv):
            X_train_cv = X_train.iloc[train_ix]
            y_train_cv = y_train.iloc[train_ix]
            X_valid_cv = X_train.iloc[valid_ix]
            y_valid_cv = y_train.iloc[valid_ix]

            # fit model on training data and generate predictions using holdout observations
            y_pred = model.fit(X_train_cv.values, y_train_cv.values).predict(X_valid_cv.values)

            # calculate residuals
            residuals = y_pred - y_valid_cv.values

            # create prettierplot object
            p = PrettierPlot(plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Residual plot - CV fold {}\nModel: {}\nParameter set: {}".format(
                    i + 1,
                    model.estimator_name,
                    model.model_iter,
                ),
                x_label="Predicted values",
                y_label="Residuals",
                y_shift=0.55,
                position=121,
                title_scale=title_scale,
            )

            # add 2-dimensional scatter plot to canvas
            p.scatter_2d(
                x=y_pred,
                y=residuals,
                size=7,
                color=style.style_grey,
                y_units=y_units,
                x_units=x_units,
                ax=ax,
            )

            # plot horizontal line at y=0
            plt.hlines(
                y=0,
                xmin=np.min(y_pred),
                xmax=np.max(y_pred),
                color=style.style_grey,
                lw=2,
            )

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Residual distribution - CV fold {}\nModel: {}\nParameter set: {}".format(
                    i + 1,
                    model.estimator_name,
                    model.model_iter,
                ),
                title_scale=title_scale,
                position=122,
            )

            # add distribution plot to canvas
            p.dist_plot(
                residuals,
                fit=stats.norm,
                color=style.style_grey,
                y_units="ff",
                x_units="fff",
                ax=ax,
            )
            plt.show()

            # generate regression_stats using holdout observations and predictions
            results = self.regression_stats(
                model=model,
                y_true=y_valid_cv,
                y_pred=y_pred,
                feature_count=X_valid_cv.shape[1],
                data_type="validation",
                fold=i + 1,
            )

            # append results to regression_results_summary
            regression_results_summary = regression_results_summary.append(results, ignore_index=True)

        print("\n" + "*" * 55)
        print("Summary")
        display(regression_results_summary)
    else:
        display(regression_results_summary)
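# Hypothetical usage sketch for regression_panel: evaluate against an explicit
# validation set, or fall back to k-fold cross-validation on the training data
# when only n_folds is given. `m` and `model` are placeholder objects.
m.regression_panel(model=model, X_train=X_train, y_train=y_train,
                   X_valid=X_valid, y_valid=y_valid)
m.regression_panel(model=model, X_train=X_train, y_train=y_train, n_folds=5)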
from prettierplot.plotter import PrettierPlot
from prettierplot import data
import numpy as np

df = data.attrition()

# capture unique EducationField values and frequency counts
unique_vals, unique_counts = np.unique(
    df[df["EducationField"].notnull()]["EducationField"], return_counts=True)

# create plotting instance
p = PrettierPlot(chart_scale=10)

# create Axes object and decorate
ax = p.make_canvas(title="Educational field category counts",
                   y_label="Category counts", y_shift=0.47)

# add plots
p.bar_v(x=unique_vals, counts=unique_counts, label_rotate=45, x_tick_wrap=True)
def eda_num_target_num_feat(self, feature, training_data=True, color_map="viridis", chart_scale=15, save_plots=False):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a
            numeric feature in the context of a numeric target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
            save_plots : boolean, default=False
                Controls whether plot images are saved to the experiment directory.
    """
    # dynamically choose training data objects or validation data objects
    data, target, mlm_dtypes = self.training_or_validation_dataset(training_data)

    ### data summaries

    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([data[feature], target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[target.name] = bi_df[target.name].astype(float)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: stats.skew(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Feature distribution\n* {feature}",
                       position=131, title_scale=1.2)

    # determine x-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        x_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # determine y-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        y_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    # x rotation
    if -10000 < np.nanmax(bi_df[feature].values) < 10000:
        x_rotate = 0
    else:
        x_rotate = 45

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Probability plot\n* {feature}", position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=bi_df[feature].values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Regression plot - feature vs. target\n* {feature}",
        position=133,
        title_scale=1.5,
    )

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )

    # save plots or show
    if save_plots:
        plot_path = os.path.join(
            self.eda_object_dir,
            f"{feature}.jpg".replace("/", ""),
        )
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()
def model_loss_plot(self, bayes_optim_summary, estimator_class, chart_scale=15, trim_outliers=True, outlier_control=1.5, title_scale=0.7, color_map="viridis"):
    """
    Documentation:

        ---
        Description:
            Visualize how the bayesian optimization loss changes over time across all
            iterations. Extremely poor results are removed from the visualized dataset
            by two filters:
                1) Loss values worse than [loss mean + (2 x loss standard deviation)]
                2) Loss values worse than [median x outlier_control], where
                   'outlier_control' is a parameter that can be set during function
                   execution.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            chart_scale : float, default=15
                Controls chart proportions. Higher values scale up size of chart objects,
                lower values scale down size of chart objects.
            trim_outliers : boolean, default=True
                Remove extremely high (poor) results by trimming values where the loss is
                greater than 2 standard deviations away from the mean.
            outlier_control : float, default=1.5
                Controls enforcement of outlier trimming. Value is multiplied by the
                median, and the resulting product is the cap placed on loss values.
                Values higher than this cap will be excluded. Lower values of
                outlier_control apply more extreme filtering to loss values.
            title_scale : float, default=0.7
                Controls the scaling up (higher value) and scaling down (lower value) of
                the size of the main chart title, the x-axis title and the y-axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
    """
    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # apply outlier trimming
    if trim_outliers:
        mean = estimator_summary["iter_loss"].mean()
        median = estimator_summary["iter_loss"].median()
        std = estimator_summary["iter_loss"].std()
        cap = mean + (2.0 * std)
        estimator_summary = estimator_summary[
            (estimator_summary["iter_loss"] < cap)
            & (estimator_summary["iter_loss"] < outlier_control * median)
        ]

    # create color list based on color_map
    color_list = style.color_gen(name=color_map, num=3)

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Loss by iteration - {}".format(estimator_class),
        y_shift=0.8,
        position=111,
        title_scale=title_scale,
    )

    # add regression plot to canvas
    p.reg_plot(
        x="iteration",
        y="iter_loss",
        data=estimator_summary,
        y_units="ffff",
        line_color=color_list[0],
        dot_color=color_list[1],
        alpha=0.6,
        line_width=0.4,
        dot_size=10.0,
        ax=ax,
    )
    plt.show()
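# Hypothetical usage sketch: `bayes_optim_summary` is the DataFrame produced by
# the bayesian optimization routine referenced above, and `m` is a placeholder
# instance. A tighter outlier_control trims more aggressively, since the cap is
# outlier_control * median loss.
m.model_loss_plot(
    bayes_optim_summary=bayes_optim_summary,
    estimator_class="XGBClassifier",
    outlier_control=1.2,
)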
def model_param_plot(self, bayes_optim_summary, estimator_class, estimator_parameter_space, n_iter, chart_scale=15, color_map="viridis", title_scale=1.2, show_single_str_params=False):
    """
    Documentation:

        ---
        Description:
            Visualize hyperparameter optimization over all iterations. Compares the
            theoretical distribution to the distribution of values that were actually
            chosen, and visualizes how parameter value selections change over time.

        ---
        Parameters:
            bayes_optim_summary : Pandas DataFrame
                Pandas DataFrame containing results from bayesian optimization process.
            estimator_class : str or sklearn api object
                Name of estimator to visualize.
            estimator_parameter_space : dictionary of dictionaries
                Dictionary of nested dictionaries. The outer key is an estimator, and the
                corresponding value is a dictionary. Each nested dictionary contains
                'parameter: value distribution' key/value pairs. The inner dictionary key
                specifies the parameter of the model to be tuned, and the value is a
                distribution of values from which trial values are drawn.
            n_iter : int
                Number of iterations to draw from the theoretical distribution in order to
                visualize it. A higher number leads to a more robust distribution but can
                take considerably longer to create.
            chart_scale : float, default=15
                Controls proportions of visualizations. Larger values scale the visual up
                in size, smaller values scale the visual down in size.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            title_scale : float, default=1.2
                Controls the scaling up (higher value) and scaling down (lower value) of
                the size of the main chart title, the x-axis title and the y-axis title.
            show_single_str_params : boolean, default=False
                Controls whether to display visuals for string attributes where there is
                only one unique value, i.e. there was only one choice for the optimization
                procedure to choose from during each iteration.
    """
    # unpack bayes_optim_summary parameters for an estimator_class
    estimator_summary = self.unpack_bayes_optim_summary(
        bayes_optim_summary=bayes_optim_summary, estimator_class=estimator_class
    )

    # override None with string representation
    estimator_summary = estimator_summary.replace([None], "None")

    # subset estimator_parameter_space to space for the specified estimator_class
    estimator_space = estimator_parameter_space[estimator_class]

    print("*" * 100)
    print("* {}".format(estimator_class))
    print("*" * 100)

    # iterate through each parameter
    for param in estimator_space.keys():

        # sample from theoretical distribution for n_iter iterations
        theoretical_dist = []
        for _ in range(n_iter):
            theoretical_dist.append(sample(estimator_space)[param])

        ## override None with string representation
        # theoretical distribution
        theoretical_dist = ["none" if v is None else v for v in theoretical_dist]
        theoretical_dist = np.array(theoretical_dist)

        # actual distribution
        actual_dist = estimator_summary[param].tolist()
        actual_dist = ["none" if v is None else v for v in actual_dist]
        actual_dist = np.array(actual_dist)

        # limit estimator_summary to "iteration" and current "param" columns
        actual_iter_df = estimator_summary[["iteration", param]]

        # identify how many values in param column are zero or one
        zeros_and_ones = (actual_iter_df[param].eq(True) | actual_iter_df[param].eq(False)).sum()

        # if param column only contains zeros and ones, store string representations
        # of "TRUE" and "FALSE"
        if zeros_and_ones == actual_iter_df.shape[0]:
            actual_iter_df = actual_iter_df.replace({True: "TRUE", False: "FALSE"})

        # if theoretical distribution has dtype np.bool_, store string representations
        # of "TRUE" and "FALSE"
        if isinstance(theoretical_dist[0], np.bool_):
            theoretical_dist = np.array(
                ["TRUE" if i == True else "FALSE" for i in theoretical_dist.tolist()])
            estimator_summary = estimator_summary.replace([True], "TRUE")
            estimator_summary = estimator_summary.replace([False], "FALSE")

        # if theoretical distribution contains str data, then treat this as an
        # object/category parameter
        if any(isinstance(d, str) for d in theoretical_dist):

            # generate color list for stripplot
            stripplot_color_list = style.color_gen(
                name=color_map, num=len(actual_iter_df[param].unique()) + 1)

            # generate color list for bar chart
            bar_color_list = style.color_gen(name=color_map, num=3)

            # identify unique values and associated count in theoretical distribution
            unique_vals_theo, unique_counts_theo = np.unique(theoretical_dist, return_counts=True)

            # if theoretical distribution has more than one unique value, or
            # show_single_str_params is set to True
            if len(unique_vals_theo) > 1 or show_single_str_params:

                # identify unique values and associated count in actual distribution
                unique_vals_actual, unique_counts_actual = np.unique(actual_dist, return_counts=True)

                # store data in DataFrame
                df = pd.DataFrame({
                    "param": unique_vals_actual,
                    "Theoretical": unique_counts_theo,
                    "Actual": unique_counts_actual,
                })

                # create prettierplot object
                p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.8,
                    position=121,
                    title_scale=title_scale,
                )

                # add faceted bar chart to canvas
                p.facet_cat(
                    df=df,
                    feature="param",
                    color_map=bar_color_list[:-1],
                    bbox=(1.0, 1.15),
                    alpha=1.0,
                    legend_labels=df.columns[1:].values,
                    x_units=None,
                    ax=ax,
                )

                # add canvas to prettierplot object
                ax = p.make_canvas(
                    title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                    y_shift=0.5,
                    position=122,
                    title_scale=title_scale,
                )

                # add stripplot to canvas
                sns.stripplot(
                    x="iteration",
                    y=param,
                    data=estimator_summary,
                    jitter=0.3,
                    alpha=1.0,
                    size=0.7 * chart_scale,
                    palette=sns.color_palette(stripplot_color_list[:-1]),
                    ax=ax,
                ).set(xlabel=None, ylabel=None)

                # set tick label font size
                ax.tick_params(axis="both", colors=style.style_grey, labelsize=1.2 * chart_scale)

                plt.show()

        # otherwise treat it as a numeric parameter
        else:
            # cast "iteration" as an int and the param values as float
            convert_dict = {"iteration": int, param: float}
            actual_iter_df = actual_iter_df.astype(convert_dict)

            # create color map
            color_list = style.color_gen(name=color_map, num=3)

            # create prettierplot object
            p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection vs. theoretical distribution\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=121,
                title_scale=title_scale,
            )

            # dynamically set x-unit precision based on max value
            if -1.0 <= np.nanmax(theoretical_dist) <= 1.0:
                x_units = "fff"
            elif 1.0 < np.nanmax(theoretical_dist) <= 5.0:
                x_units = "ff"
            elif np.nanmax(theoretical_dist) > 5.0:
                x_units = "f"

            # add kernel density plot for theoretical distribution to canvas
            p.kde_plot(
                theoretical_dist,
                color=color_list[0],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            # add kernel density plot for actual distribution to canvas
            p.kde_plot(
                actual_dist,
                color=color_list[1],
                y_units="ffff",
                x_units=x_units,
                line_width=0.4,
                bw=0.4,
                ax=ax,
            )

            ## create custom legend
            # create labels
            label_color = {}
            legend_labels = ["Theoretical", "Actual"]
            for ix, i in enumerate(legend_labels):
                label_color[i] = color_list[ix]

            # create legend patches
            patches = [Patch(color=v, label=k, alpha=1.0) for k, v in label_color.items()]

            # draw legend
            leg = plt.legend(
                handles=patches,
                fontsize=1.1 * chart_scale,
                loc="upper right",
                markerscale=0.6 * chart_scale,
                ncol=1,
                bbox_to_anchor=(0.95, 1.1),
            )

            # set label font color
            for text in leg.get_texts():
                plt.setp(text, color="grey")

            # dynamically set y-unit precision based on max value
            if -1.0 <= np.nanmax(actual_iter_df[param]) <= 1.0:
                y_units = "fff"
            elif 1.0 < np.nanmax(actual_iter_df[param]) <= 5.0:
                y_units = "ff"
            elif np.nanmax(actual_iter_df[param]) > 5.0:
                y_units = "f"

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Selection by iteration\n* {0} - {1}".format(estimator_class, param),
                y_shift=0.8,
                position=122,
                title_scale=title_scale,
            )

            # add regression plot to canvas
            p.reg_plot(
                x="iteration",
                y=param,
                data=actual_iter_df,
                y_units=y_units,
                x_units="f",
                line_color=color_list[0],
                line_width=0.4,
                dot_color=color_list[1],
                dot_size=10.0,
                alpha=0.6,
                ax=ax,
            )
            plt.show()
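# Hypothetical usage sketch: estimator_parameter_space mirrors the nested
# 'estimator -> {parameter: distribution}' structure described in the
# docstring above; `m` and `bayes_optim_summary` are placeholders.
m.model_param_plot(
    bayes_optim_summary=bayes_optim_summary,
    estimator_class="XGBClassifier",
    estimator_parameter_space=estimator_parameter_space,
    n_iter=500,
    show_single_str_params=False,
)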
def eda_num_target_num_feat(self, feature, color_map="viridis", chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a
            numeric feature in the context of a numeric target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value
                creates larger plots and increases visual elements proportionally.
    """
    ### data summaries

    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: stats.skew(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                       position=131, title_scale=1.2)

    # determine x-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        x_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # determine y-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        y_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    # x rotation
    if -10000 < np.nanmax(bi_df[feature].values) < 10000:
        x_rotate = 0
    else:
        x_rotate = 45

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Probability plot\n* {}".format(feature), position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=bi_df[feature].values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Regression plot - feature vs. target\n* {}".format(feature),
        position=133,
        title_scale=1.5,
    )

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=self.target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )
    plt.show()
def eda_num_target_cat_feat(self, feature, level_count_cap=50, color_map="viridis", chart_scale=15):
    """
    Documentation:

    ---
    Description:
        Produces exploratory data visualizations and statistical summaries for a
        categorical feature in the context of a numeric target.

    ---
    Parameters:
        feature : str
            Feature to visualize.
        level_count_cap : int, default=50
            Maximum number of unique levels in feature. If the number of levels
            exceeds the cap, the feature is skipped.
        color_map : str specifying built-in matplotlib colormap, default="viridis"
            Color map applied to plots.
        chart_scale : int or float, default=15
            Controls size and proportions of chart and chart elements. Higher value
            creates larger plots and increases visual elements proportionally.
    """
    # proceed only if number of unique levels in feature is less than specified level_count_cap
    if (
        len(np.unique(self.data[self.data[feature].notnull()][feature].values))
        < level_count_cap
    ):

        ### data summaries
        ## feature summary
        # create empty DataFrame
        uni_summ_df = pd.DataFrame(columns=[feature, "Count", "Proportion"])

        # capture unique values and count of those unique values
        unique_vals, unique_counts = np.unique(
            self.data[self.data[feature].notnull()][feature], return_counts=True
        )

        # append each unique value, count and proportion to DataFrame
        for i, j in zip(unique_vals, unique_counts):
            uni_summ_df = uni_summ_df.append(
                {
                    feature: i,
                    "Count": j,
                    "Proportion": j / np.sum(unique_counts) * 100,
                },
                ignore_index=True,
            )

        # sort DataFrame by "Proportion", descending
        uni_summ_df = uni_summ_df.sort_values(by=["Proportion"], ascending=False)

        # set values to int dtype where applicable to optimize displayed output
        if is_numeric_dtype(uni_summ_df[feature]):
            uni_summ_df[feature] = uni_summ_df[feature].astype("int64")
        uni_summ_df["Count"] = uni_summ_df["Count"].astype("int64")

        ## feature vs. target summary
        # combine feature column and target
        bi_df = pd.concat([self.data[feature], self.target], axis=1)

        # remove any rows with nulls
        bi_df = bi_df[bi_df[feature].notnull()]

        # cast target as float
        bi_df[self.target.name] = bi_df[self.target.name].astype(float)

        # create pivot table of target summary statistics, grouping by categorical feature
        bi_summ_piv_df = pd.pivot_table(
            bi_df,
            index=feature,
            aggfunc={
                self.target.name: [np.nanmin, np.nanmax, np.nanmean, np.nanmedian, np.nanstd]
            },
        )

        # flatten MultiIndex columns into a single level and rename
        multi_index = bi_summ_piv_df.columns
        single_index = pd.Index([i[1] for i in multi_index.tolist()])
        bi_summ_piv_df.columns = single_index
        bi_summ_piv_df.reset_index(inplace=True)
        bi_summ_piv_df = bi_summ_piv_df.rename(
            columns={
                "nanmin": "Min",
                "nanmax": "Max",
                "nanmean": "Mean",
                "nanmedian": "Median",
                "nanstd": "StdDev",
            }
        )

        # fill NaN's with zero
        fill_columns = bi_summ_piv_df.iloc[:, 1:].columns
        bi_summ_piv_df[fill_columns] = bi_summ_piv_df[fill_columns].fillna(0)

        # reorder columns
        bi_summ_piv_df = bi_summ_piv_df[
            [feature, "Mean", "Median", "StdDev", "Min", "Max"]
        ]

        # convert feature values to int dtype where applicable
        if is_numeric_dtype(bi_summ_piv_df[feature]):
            bi_summ_piv_df[feature] = bi_summ_piv_df[feature].astype("int64")

        # display summary tables
        self.df_side_by_side(
            dfs=(uni_summ_df, bi_summ_piv_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

        ### visualizations
        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Category counts\n* {}".format(feature), position=131, title_scale=1.0
        )

        # add treemap to canvas
        p.tree_map(
            counts=uni_summ_df["Count"].values,
            labels=uni_summ_df[feature].values,
            colors=style.color_gen(name=color_map, num=len(uni_summ_df[feature].values)),
            alpha=0.8,
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Feature distribution\n* {}".format(feature), position=132)

        # resort labels numerically when possible; otherwise keep the original order
        try:
            sorted(unique_vals, key=int)
        except ValueError:
            pass
        else:
            # sort unique_vals/unique_counts for bar chart
            new_ix = [
                sorted(list(unique_vals), key=int).index(i) for i in list(unique_vals)
            ]
            unique_vals = np.array(sorted(list(unique_vals), key=int))
            unique_counts = np.array(
                [y for x, y in sorted(zip(new_ix, unique_counts))]
            )

            # sort temporary DataFrame for box plot
            bi_df[feature] = bi_df[feature].astype(int)

        # dynamically set rotation angle based on number of unique values and average
        # length of category labels
        len_unique_val = len(unique_vals)
        avg_len_unique_val = sum(map(len, map(str, unique_vals))) / len(unique_vals)
        if len_unique_val <= 4 and avg_len_unique_val <= 12:
            rotation = 0
        elif 5 <= len_unique_val <= 8 and avg_len_unique_val <= 7.0:
            rotation = 0
        elif 9 <= len_unique_val <= 14 and avg_len_unique_val <= 6:
            rotation = 0
        else:
            rotation = 30

        # represent x-axis tick labels as integers rather than floats
        x_values = list(map(str, unique_vals.tolist()))
        try:
            x_values = [int(float(x)) for x in x_values]
        except ValueError:
            pass

        # add bar chart to canvas
        p.bar_v(
            x=x_values,
            counts=unique_counts,
            label_rotate=rotation,
            color=style.style_grey,
            y_units="f",
            x_tick_wrap=True,
            ax=ax,
        )

        # hide every other label if total number of levels is greater than 40
        if len_unique_val > 40:
            n = 2
            for i, l in enumerate(ax.xaxis.get_ticklabels()):
                if i % n != 0:
                    l.set_visible(False)

        # add canvas to prettierplot object
        ax = p.make_canvas(title="Boxplot by category\n* {}".format(feature), position=133)

        ## dynamically determine precision of y-units
        # capture min and max target values
        dist_min = bi_df[self.target.name].values.min()
        dist_max = bi_df[self.target.name].values.max()

        # determine y-units precision based on min and max values in target
        if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
            y_units = "fff"
        elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
            y_units = "fff"
        elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
            y_units = "ff"
        elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
            y_units = "ff"
        else:
            y_units = "f"

        # add vertical box plot to canvas
        p.box_plot_v(
            x=feature,
            y=self.target.name,
            data=bi_df.sort_values([feature]),
            color=matplotlib.cm.get_cmap(name=color_map),
            label_rotate=rotation,
            y_units=y_units,
            ax=ax,
        )

        # hide every other label if total number of levels is greater than 40
        if len_unique_val > 40:
            n = 2
            for i, l in enumerate(ax.xaxis.get_ticklabels()):
                if i % n != 0:
                    l.set_visible(False)
        plt.show()
def binary_classification_panel(self, model, X_train, y_train, X_valid=None, y_valid=None, labels=None,
                                n_folds=None, title_scale=1.0, color_map="viridis", random_state=1, chart_scale=15):
    """
    Documentation:

    ---
    Description:
        Generate a panel of reports and visualizations summarizing the performance
        of a classification model.

    ---
    Parameters:
        model : model object
            Instantiated model object.
        X_train : Pandas DataFrame
            Training data observations.
        y_train : Pandas Series
            Training target data.
        X_valid : Pandas DataFrame, default=None
            Validation data observations.
        y_valid : Pandas Series, default=None
            Validation target data.
        labels : list, default=None
            Custom labels for confusion matrix axes. If left as None, will default
            to 0, 1, 2...
        n_folds : int, default=None
            Number of cross-validation folds to use. If validation data is provided
            through X_valid/y_valid, n_folds is ignored.
        title_scale : float, default=1.0
            Controls the scaling up (higher value) and scaling down (lower value) of
            the size of the main chart title, the x-axis title and the y-axis title.
        color_map : str specifying built-in matplotlib colormap, default="viridis"
            Color map applied to plots.
        random_state : int, default=1
            Random number seed.
        chart_scale : int or float, default=15
            Controls size and proportions of chart and chart elements. Higher value
            creates larger plots and increases visual elements proportionally.
    """
    print("*" * 55)
    print("* Estimator: {}".format(model.estimator_name))
    print("* Parameter set: {}".format(model.model_iter))
    print("*" * 55)

    print("\n" + "*" * 55)
    print("Training data evaluation\n")

    ## training panel
    # fit model on training data and generate predictions using training data
    y_pred = model.fit(X_train, y_train).predict(X_train)

    # print and generate classification_report using training data
    print(
        classification_report(
            y_train,
            y_pred,
            target_names=labels if labels is not None else np.unique(y_train.values),
        )
    )

    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Confusion matrix - training data\nModel: {}\nParameter set: {}".format(
            model.estimator_name, model.model_iter
        ),
        y_shift=0.4,
        x_shift=0.25,
        position=121,
        title_scale=title_scale,
    )

    # add confusion matrix to canvas
    plot_confusion_matrix(
        estimator=model,
        X=X_train,
        y_true=y_train,
        display_labels=labels if labels is not None else np.unique(y_train.values),
        cmap=color_map,
        values_format=".0f",
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="ROC curve - training data\nModel: {}\nParameter set: {}".format(
            model.estimator_name, model.model_iter
        ),
        x_label="False positive rate",
        y_label="True positive rate",
        y_shift=0.35,
        position=122,
        title_scale=title_scale,
    )

    # add ROC curve to canvas
    p.roc_curve_plot(
        model=model,
        X_train=X_train,
        y_train=y_train,
        linecolor=style.style_grey,
        ax=ax,
    )
    plt.subplots_adjust(wspace=0.3)
    plt.show()

    # if validation data is provided
    if X_valid is not None:
        print("\n" + "*" * 55)
        print("Validation data evaluation\n")

        # fit model on training data and generate predictions using validation data
        y_pred = model.fit(X_train, y_train).predict(X_valid)

        # print and generate classification_report using validation data
        print(
            classification_report(
                y_valid,
                y_pred,
                target_names=labels if labels is not None else np.unique(y_train.values),
            )
        )

        # create prettierplot object
        p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Confusion matrix - validation data\nModel: {}\nParameter set: {}".format(
                model.estimator_name, model.model_iter
            ),
            y_shift=0.4,
            x_shift=0.25,
            position=121,
            title_scale=title_scale,
        )

        # add confusion matrix to canvas
        plot_confusion_matrix(
            estimator=model,
            X=X_valid,
            y_true=y_valid,
            display_labels=labels if labels is not None else np.unique(y_train.values),
            cmap=color_map,
            values_format=".0f",
            ax=ax,
        )

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="ROC curve - validation data\nModel: {}\nParameter set: {}".format(
                model.estimator_name, model.model_iter
            ),
            x_label="False positive rate",
            y_label="True positive rate",
            y_shift=0.35,
            position=122,
            title_scale=title_scale,
        )

        # add ROC curve to canvas
        p.roc_curve_plot(
            model=model,
            X_train=X_train,
            y_train=y_train,
            X_valid=X_valid,
            y_valid=y_valid,
            linecolor=style.style_grey,
            ax=ax,
        )
        plt.subplots_adjust(wspace=0.3)
        plt.show()

    # if n_folds is provided, perform cross-validation
    elif isinstance(n_folds, int):
        print("\n" + "*" * 55)
        print("Cross validation evaluation\n")

        # generate cross-validation indices
        cv = list(
            StratifiedKFold(
                n_splits=n_folds, shuffle=True, random_state=random_state
            ).split(X_train, y_train)
        )

        # generate colors
        color_list = style.color_gen(color_map, num=len(cv))

        # iterate through cross-validation indices
        for i, (train_ix, valid_ix) in enumerate(cv):
            print("\n" + "*" * 55)
            print("CV Fold {}\n".format(i + 1))

            X_train_cv = X_train.iloc[train_ix]
            y_train_cv = y_train.iloc[train_ix]
            X_valid_cv = X_train.iloc[valid_ix]
            y_valid_cv = y_train.iloc[valid_ix]

            # fit model on training data and generate predictions using holdout observations
            y_pred = model.fit(X_train_cv, y_train_cv).predict(X_valid_cv)

            # print and generate classification_report using holdout observations
            print(
                classification_report(
                    y_valid_cv,
                    y_pred,
                    target_names=labels if labels is not None else np.unique(y_train.values),
                )
            )

            # create prettierplot object
            p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="Confusion matrix - CV Fold {}\nModel: {}\nParameter set: {}".format(
                    i + 1, model.estimator_name, model.model_iter
                ),
                y_shift=0.4,
                x_shift=0.25,
                position=121,
                title_scale=title_scale,
            )

            # add confusion matrix to canvas
            plot_confusion_matrix(
                estimator=model,
                X=X_valid_cv,
                y_true=y_valid_cv,
                display_labels=labels if labels is not None else np.unique(y_train.values),
                cmap=color_map,
                values_format=".0f",
                ax=ax,
            )

            # add canvas to prettierplot object
            ax = p.make_canvas(
                title="ROC curve - CV Fold {}\nModel: {}\nParameter set: {}".format(
                    i + 1, model.estimator_name, model.model_iter
                ),
                x_label="False positive rate",
                y_label="True positive rate",
                y_shift=0.35,
                position=122,
                title_scale=title_scale,
            )

            # add ROC curve to canvas
            p.roc_curve_plot(
                model=model,
                X_train=X_train_cv,
                y_train=y_train_cv,
                X_valid=X_valid_cv,
                y_valid=y_valid_cv,
                linecolor=style.style_grey,
                ax=ax,
            )
            plt.subplots_adjust(wspace=0.3)
            plt.show()