def test_plot_bar_returns_a_bar_plot_with_correct_elements(self):
  """Checks the axis labels, tick labels, bar heights and title of plot_bar."""
  plot_data = TESTDATA_1
  x_var = 'snapshot_date'
  y_var = 'pos_count'
  title = '# positive examples over time'
  subplot_index = 0

  fig, axes = plt.subplots(nrows=2, ncols=1)
  # pyplot keeps every figure it creates alive until it is explicitly closed.
  # Register the close as a cleanup so the figure is released whether the
  # test passes or fails, instead of accumulating across the test suite.
  self.addCleanup(plt.close, fig)

  viz_utils.plot_bar(
      plot_data=plot_data,
      x_variable=x_var,
      y_variable=y_var,
      title=title,
      axes=axes,
      subplot_index=subplot_index)

  bar_plot = axes[subplot_index]
  x_data = list(plot_data[x_var])
  # Bar heights are compared as floats, so normalise the expected values.
  y_data = [float(y) for y in plot_data[y_var]]

  with self.subTest(name='test x axis variable is equal'):
    self.assertEqual(x_var, bar_plot.get_xlabel())

  with self.subTest(name='test x axis data is equal'):
    self.assertListEqual(x_data, [
        tick.get_text() for tick in bar_plot.get_xticklabels(which='major')
    ])

  with self.subTest(name='test y axis variable is equal'):
    self.assertEqual(y_var, bar_plot.get_ylabel())

  with self.subTest(name='test y axis data is equal'):
    self.assertListEqual(y_data, [h.get_height() for h in bar_plot.patches])

  with self.subTest(name='test title is equal'):
    self.assertEqual(title, bar_plot.get_title())
def _plot_categorical_fact(
    plot_data: pd.DataFrame, fact_name: str,
    plot_style_params: _FactPlotStyles) -> List[axes.Axes]:
  """Draws the summary plots of a categorical fact variable.

  Three subplots are filled, in this order:
    0. daily total record count (line plot),
    1. value distribution on the most recent date (bar plot),
    2. daily value distribution over time, split by 'category_value'
       (line plot).

  Args:
    plot_data: Data to plot. The columns read here are 'date',
      'total_record_count', 'category_value' and 'percentage'.
    fact_name: Name of the fact variable; used in the log message and in the
      plot titles.
    plot_style_params: Plot style parameters (figure size, font sizes and the
      record-count line color).

  Returns:
    plots: The Axes created by pyplot.subplots() holding the 3 plots.
  """
  logging.info('Plotting categorical fact %s ', fact_name)

  figure_size = (plot_style_params.fig_width, plot_style_params.fig_height)
  _, plots = pyplot.subplots(
      nrows=_ROWS_IN_SUBPLOTS_GRID,
      ncols=_COLS_IN_SUBPLOTS_GRID,
      figsize=figure_size)

  # Rows of the most recent date, largest share first.
  most_recent_date = max(plot_data['date'])
  is_most_recent = plot_data['date'] == most_recent_date
  most_recent_stats = plot_data[is_most_recent].sort_values(
      by=['percentage'], ascending=False)

  # Keyword arguments shared by both line plots.
  shared_line_kwargs = dict(
      axes=plots,
      title_fontsize=plot_style_params.lineplot_title_fontsize,
      xticklabels_fontsize=plot_style_params.lineplot_xticklabels_fontsize,
      yticklabels_fontsize=plot_style_params.lineplot_yticklabels_fontsize,
  )

  # Subplot 0: daily total fact count. The total is repeated on every row of
  # a date, so duplicates are dropped before plotting.
  daily_totals = plot_data[['date', 'total_record_count']].drop_duplicates()
  viz_utils.plot_line(
      plot_data=daily_totals,
      x_variable='date',
      y_variable='total_record_count',
      title=f'{fact_name} - Daily Fact Count',
      subplot_index=0,
      line_color=plot_style_params.line_color_record_count,
      **shared_line_kwargs)

  # Subplot 1: latest distribution of the fact levels.
  viz_utils.plot_bar(
      plot_data=most_recent_stats,
      x_variable='category_value',
      y_variable='percentage',
      title=f'{fact_name} - Latest Value Distribution (%)',
      axes=plots,
      subplot_index=1,
      title_fontsize=plot_style_params.barplot_title_fontsize,
      xticklabels_fontsize=plot_style_params.barplot_xticklabels_fontsize,
      yticklabels_fontsize=plot_style_params.barplot_yticklabels_fontsize)

  # Subplot 2: daily distribution of the fact levels over time.
  viz_utils.plot_line(
      plot_data=plot_data,
      x_variable='date',
      y_variable='percentage',
      title=f'{fact_name} - Daily value distribution (%)',
      subplot_index=2,
      category_variable='category_value',
      legend_fontsize=plot_style_params.lineplot_legend_fontsize,
      **shared_line_kwargs)

  return plots
def _plot_categorical_feature_numerical_label(
    label_stats_data: pd.DataFrame, feature_stats_data: pd.DataFrame,
    feature_name: str,
    plot_style_params: _FeaturePlotStyles) -> List[axes.Axes]:
  """Draws the plots of a categorical feature when the label is numerical.

  Two subplots are filled, in this order:
    0. distribution of the label (box plots) per category value of the
       feature,
    1. distribution of the feature values (stacked bar plots) by
       snapshot_date.

  Args:
    label_stats_data: Per-category label statistics handed to
      viz_utils.plot_box(); its 'value' column is used as the x variable.
      Presumably it also carries the box statistics plot_box() needs (e.g.
      med, q1, q3, whislo, whishi) - confirm against viz_utils.
    feature_stats_data: Snapshot-level feature statistics. The columns read
      here are 'snapshot_date', 'value' and 'percentage'.
    feature_name: Name of the feature; used in the log message and in the
      plot titles.
    plot_style_params: Plot style parameters (figure size and font sizes).

  Returns:
    plots: The Axes created by pyplot.subplots() holding the 2 plots.
  """
  logging.info('Plotting categorical feature %s', feature_name)

  figure_size = (plot_style_params.fig_width, plot_style_params.fig_height)
  _, plots = pyplot.subplots(
      nrows=_NO_PLOTS_CAT_FEATURE_NUM_LABEL,
      ncols=_COLS_IN_SUBPLOTS_GRID,
      figsize=figure_size)

  # Target axes and font sizes are the same for both subplots.
  shared_kwargs = dict(
      axes=plots,
      title_fontsize=plot_style_params.title_fontsize,
      xlabel_fontsize=plot_style_params.xlabel_fontsize,
      ylabel_fontsize=plot_style_params.ylabel_fontsize,
      xticklabels_fontsize=plot_style_params.xticklabels_fontsize,
      yticklabels_fontsize=plot_style_params.yticklabels_fontsize,
  )

  # Subplot 0: label distribution for each category value of the feature.
  logging.info('Plotting label distribution by feature category values.')
  viz_utils.plot_box(
      plot_data=label_stats_data,
      title=f'Label distribution by [{feature_name}] categories',
      subplot_index=0,
      x_variable='value',
      x_label='Category value',
      y_label='Label distribution',
      xticklabels_rotation=0,
      **shared_kwargs)

  # Subplot 1: snapshot-level distribution of the feature values.
  viz_utils.plot_bar(
      plot_data=feature_stats_data,
      x_variable='snapshot_date',
      y_variable='percentage',
      group_variable='value',
      stacked_bars=True,
      title=f'Snapshot-level distribution of [{feature_name}]',
      subplot_index=1,
      xticklabels_rotation=45,
      **shared_kwargs)

  return plots
def _plot_categorical_feature_binary_label(
    df_data: pd.DataFrame, feature_name: str, label_column: str,
    positive_class_label: LabelType, negative_class_label: LabelType,
    plot_style_params: _FeaturePlotStyles) -> List[axes.Axes]:
  """Draws the plots of a categorical feature when the label is binary.

  Three subplots are filled, in this order:
    0. distribution of the feature values (bar plots) by label,
    1. distribution of the values (stacked bar plots) for positive instances
       by snapshot_date,
    2. distribution of the values (stacked bar plots) for negative instances
       by snapshot_date.

  Args:
    df_data: Plot data. The columns read here are the one named by
      label_column plus 'snapshot_date', 'feature', 'value', 'count' and
      'percentage'.
    feature_name: Name of the feature; used in the log message and in the
      plot titles.
    label_column: Name of the label column.
    positive_class_label: Label value of the positive class.
    negative_class_label: Label value of the negative class.
    plot_style_params: Plot style parameters (figure size and font sizes).

  Returns:
    plots: The Axes created by pyplot.subplots() holding the 3 plots.
  """
  logging.info('Plotting categorical feature %s', feature_name)

  figure_size = (plot_style_params.fig_width, plot_style_params.fig_height)
  _, plots = pyplot.subplots(
      nrows=_NO_PLOTS_CAT_FEATURE_BINARY_LABEL,
      ncols=_COLS_IN_SUBPLOTS_GRID,
      figsize=figure_size)

  # Collapse the snapshot dimension: counts per (label, value) and per label.
  counts_per_value = df_data.groupby([label_column, 'value'])[['count']]
  counts_per_value = counts_per_value.sum().reset_index()
  counts_per_label = df_data.groupby(label_column)[['count']]
  counts_per_label = counts_per_label.sum().reset_index().rename(
      columns={'count': 'total_count'})

  # Attach the per-label total to every value row and derive its share (%).
  value_shares = counts_per_value.merge(counts_per_label, on=label_column)
  value_shares['percentage'] = (
      value_shares['count'] / value_shares['total_count']) * 100

  # Target axes and font sizes are the same for all three bar plots.
  bar_style_kwargs = dict(
      axes=plots,
      title_fontsize=plot_style_params.title_fontsize,
      xlabel_fontsize=plot_style_params.xlabel_fontsize,
      ylabel_fontsize=plot_style_params.ylabel_fontsize,
      xticklabels_fontsize=plot_style_params.xticklabels_fontsize,
      yticklabels_fontsize=plot_style_params.yticklabels_fontsize,
  )

  # Subplot 0: distribution of the feature values by label.
  viz_utils.plot_bar(
      plot_data=value_shares,
      x_variable='value',
      y_variable='percentage',
      group_variable=label_column,
      title=f'Distribution of [{feature_name}]',
      subplot_index=0,
      **bar_style_kwargs)

  # Subplots 1 and 2: snapshot-level distribution for the positive and then
  # the negative instances. Each entry is (class label, subplot index, extra
  # keyword arguments). The positive plot gets an empty x label to better
  # arrange the output plots.
  snapshot_plots = (
      (positive_class_label, 1, {'x_label': ''}),
      (negative_class_label, 2, {}),
  )
  for class_label, position, extra_kwargs in snapshot_plots:
    class_rows = df_data[df_data[label_column] == class_label]
    class_rows = class_rows.sort_values(
        ['snapshot_date', 'feature'], ascending=True)
    class_title = (f'Snapshot-level distribution of [{feature_name}] for '
                   f'label = {class_label}')
    viz_utils.plot_bar(
        plot_data=class_rows,
        x_variable='snapshot_date',
        y_variable='percentage',
        group_variable='value',
        stacked_bars=True,
        title=class_title,
        subplot_index=position,
        xticklabels_rotation=45,
        **extra_kwargs,
        **bar_style_kwargs)

  return plots