コード例 #1
0
    def test_plot_bar_returns_a_bar_plot_with_correct_elements(self):
        """Checks the labels, tick labels, bar heights and title set by plot_bar.

        Draws TESTDATA_1 into the first of two stacked subplots and compares
        what ends up on that Axes with the arguments handed to plot_bar.
        """
        plot_data = TESTDATA_1
        x_var = 'snapshot_date'
        y_var = 'pos_count'
        title = '# positive examples over time'
        subplot_index = 0

        fig, axes = plt.subplots(nrows=2, ncols=1)
        # pyplot keeps a figure alive until it is closed explicitly; release
        # it when the test finishes, whether it passes or fails, so figures do
        # not pile up across the test suite.
        self.addCleanup(plt.close, fig)
        viz_utils.plot_bar(plot_data=plot_data,
                           x_variable=x_var,
                           y_variable=y_var,
                           title=title,
                           axes=axes,
                           subplot_index=subplot_index)

        bar_plot = axes[subplot_index]
        x_data = list(plot_data[x_var])
        # Patch heights are compared as floats, so normalise the expected
        # values the same way.
        y_data = [float(y) for y in plot_data[y_var]]

        with self.subTest(name='test x axis variable is equal'):
            self.assertEqual(x_var, bar_plot.get_xlabel())
        with self.subTest(name='test x axis data is equal'):
            self.assertListEqual(x_data, [
                tick.get_text()
                for tick in bar_plot.get_xticklabels(which='major')
            ])
        with self.subTest(name='test y axis variable is equal'):
            self.assertEqual(y_var, bar_plot.get_ylabel())
        with self.subTest(name='test y axis data is equal'):
            self.assertListEqual(y_data,
                                 [h.get_height() for h in bar_plot.patches])
        with self.subTest(name='test title is equal'):
            self.assertEqual(title, bar_plot.get_title())
コード例 #2
0
def _plot_categorical_fact(
        plot_data: pd.DataFrame, fact_name: str,
        plot_style_params: _FactPlotStyles) -> List[axes.Axes]:
    """Draws the three summary plots of a categorical fact variable.

  The subplots, in index order, are:
  0. a line plot of total_record_count against date, drawn from the
     de-duplicated (date, total_record_count) pairs.
  1. a bar plot of percentage against category_value, restricted to the rows
     of the most recent date and sorted by descending percentage.
  2. a line plot of percentage against date, with category_value passed as
     the category variable.

  Args:
    plot_data: Data to plot. The columns read here are date,
      total_record_count, category_value and percentage.
    fact_name: Name of the fact variable; used in the log line and the titles.
    plot_style_params: Plot style parameters (figure size, font sizes and the
      record count line color).

  Returns:
    plots: The Axes returned by pyplot.subplots, holding the 3 plots.
  """
    logging.info('Plotting categorical fact %s ', fact_name)

    styles = plot_style_params
    _, plots = pyplot.subplots(nrows=_ROWS_IN_SUBPLOTS_GRID,
                               ncols=_COLS_IN_SUBPLOTS_GRID,
                               figsize=(styles.fig_width, styles.fig_height))

    # Rows belonging to the most recent date, largest share first.
    newest_date = max(plot_data['date'])
    is_newest = plot_data['date'] == newest_date
    newest_rows = plot_data[is_newest].sort_values(by=['percentage'],
                                                   ascending=False)

    # Styling shared by both line plots.
    shared_line_kwargs = dict(
        axes=plots,
        title_fontsize=styles.lineplot_title_fontsize,
        xticklabels_fontsize=styles.lineplot_xticklabels_fontsize,
        yticklabels_fontsize=styles.lineplot_yticklabels_fontsize,
    )

    # Subplot 0: daily total fact count. The total is repeated on every row of
    # a date, hence the de-duplication.
    daily_totals = plot_data[['date', 'total_record_count']].drop_duplicates()
    viz_utils.plot_line(plot_data=daily_totals,
                        x_variable='date',
                        y_variable='total_record_count',
                        title=f'{fact_name} - Daily Fact Count',
                        subplot_index=0,
                        line_color=styles.line_color_record_count,
                        **shared_line_kwargs)

    # Subplot 1: latest distribution of the top N fact levels.
    viz_utils.plot_bar(
        plot_data=newest_rows,
        x_variable='category_value',
        y_variable='percentage',
        title=f'{fact_name} - Latest Value Distribution (%)',
        axes=plots,
        subplot_index=1,
        title_fontsize=styles.barplot_title_fontsize,
        xticklabels_fontsize=styles.barplot_xticklabels_fontsize,
        yticklabels_fontsize=styles.barplot_yticklabels_fontsize)

    # Subplot 2: daily distribution of the top N fact levels over time.
    viz_utils.plot_line(
        plot_data=plot_data,
        x_variable='date',
        y_variable='percentage',
        title=f'{fact_name} - Daily value distribution (%)',
        subplot_index=2,
        category_variable='category_value',
        legend_fontsize=styles.lineplot_legend_fontsize,
        **shared_line_kwargs)

    return plots
コード例 #3
0
def _plot_categorical_feature_numerical_label(
        label_stats_data: pd.DataFrame, feature_stats_data: pd.DataFrame,
        feature_name: str,
        plot_style_params: _FeaturePlotStyles) -> List[axes.Axes]:
    """Draws the plots of a categorical feature when the label is numerical.

  The subplots, in index order, are:
  0. the distribution of the label (box plots) for the different category
     values of the feature.
  1. the distribution of feature values (stacked bar plots) by snapshot_date.

  Args:
    label_stats_data: Box plot data, expected to contain the following
      columns: feature, value, mean, stddev, med, q1, q3, whislo and whishi.
      The value column is used as the x variable.
    feature_stats_data: Bar plot data. The columns referenced here are
      snapshot_date (x), percentage (y) and value (group).
    feature_name: Name of the feature; used in the log line and the titles.
    plot_style_params: Plot style parameters (figure size and font sizes).

  Returns:
     plots: The Axes returned by pyplot.subplots, holding the 2 plots.
  """
    logging.info('Plotting categorical feature %s', feature_name)

    styles = plot_style_params
    _, plots = pyplot.subplots(nrows=_NO_PLOTS_CAT_FEATURE_NUM_LABEL,
                               ncols=_COLS_IN_SUBPLOTS_GRID,
                               figsize=(styles.fig_width, styles.fig_height))

    logging.info('Plotting label distribution by feature category values.')

    # Target axes and font sizes are identical for both subplots.
    shared_kwargs = {
        'axes': plots,
        'title_fontsize': styles.title_fontsize,
        'xlabel_fontsize': styles.xlabel_fontsize,
        'ylabel_fontsize': styles.ylabel_fontsize,
        'xticklabels_fontsize': styles.xticklabels_fontsize,
        'yticklabels_fontsize': styles.yticklabels_fontsize,
    }

    # Subplot 0: label distribution per feature value (category).
    viz_utils.plot_box(
        plot_data=label_stats_data,
        title=f'Label distribution by [{feature_name}] categories',
        subplot_index=0,
        x_variable='value',
        x_label='Category value',
        y_label='Label distribution',
        xticklabels_rotation=0,
        **shared_kwargs)

    # Subplot 1: snapshot-level feature distribution.
    viz_utils.plot_bar(
        plot_data=feature_stats_data,
        x_variable='snapshot_date',
        y_variable='percentage',
        group_variable='value',
        stacked_bars=True,
        title=f'Snapshot-level distribution of [{feature_name}]',
        subplot_index=1,
        xticklabels_rotation=45,
        **shared_kwargs)

    return plots
コード例 #4
0
def _plot_categorical_feature_binary_label(
        df_data: pd.DataFrame, feature_name: str, label_column: str,
        positive_class_label: LabelType, negative_class_label: LabelType,
        plot_style_params: _FeaturePlotStyles) -> List[axes.Axes]:
    """Draws the plots of a categorical feature when the label is binary.

  The subplots, in index order, are:
  0. the distribution of values (bar plots) by label, computed over all
     snapshots.
  1. the distribution of values (stacked bar plots) for positive instances by
     snapshot_date.
  2. the distribution of values (stacked bar plots) for negative instances by
     snapshot_date.

  Args:
    df_data: Plot data. The columns read here are the label column,
      snapshot_date, feature, value, count and percentage.
    feature_name: Name of the feature; used in the log line and the titles.
    label_column: Name of the label column.
    positive_class_label: Label for the positive class.
    negative_class_label: Label for the negative class.
    plot_style_params: Plot style parameters (figure size and font sizes).

  Returns:
     plots: The Axes returned by pyplot.subplots, holding the 3 plots.
  """
    logging.info('Plotting categorical feature %s', feature_name)

    styles = plot_style_params
    _, plots = pyplot.subplots(nrows=_NO_PLOTS_CAT_FEATURE_BINARY_LABEL,
                               ncols=_COLS_IN_SUBPLOTS_GRID,
                               figsize=(styles.fig_width, styles.fig_height))

    # Sum the counts over all snapshots: once per (label, value) pair and once
    # per label, the latter being the denominator of the percentage.
    counts_per_value = df_data.groupby([label_column, 'value'])[['count']].sum()
    counts_per_value = counts_per_value.reset_index()

    counts_per_label = df_data.groupby(label_column)[['count']].sum()
    counts_per_label = counts_per_label.reset_index().rename(
        columns={'count': 'total_count'})

    # Attach each label's total to its values and derive the share in percent.
    value_shares = counts_per_value.merge(counts_per_label, on=label_column)
    share_ratio = value_shares['count'] / value_shares['total_count']
    value_shares['percentage'] = share_ratio * 100

    # Target axes and font sizes are identical for all three bar plots.
    shared_bar_kwargs = dict(
        axes=plots,
        title_fontsize=styles.title_fontsize,
        xlabel_fontsize=styles.xlabel_fontsize,
        ylabel_fontsize=styles.ylabel_fontsize,
        xticklabels_fontsize=styles.xticklabels_fontsize,
        yticklabels_fontsize=styles.yticklabels_fontsize,
    )

    # Subplot 0: distribution of the feature values by label.
    viz_utils.plot_bar(plot_data=value_shares,
                       x_variable='value',
                       y_variable='percentage',
                       group_variable=label_column,
                       title=f'Distribution of [{feature_name}]',
                       subplot_index=0,
                       **shared_bar_kwargs)

    # Subplots 1 and 2: snapshot-level distribution of the feature, first for
    # the positive and then for the negative instances. Only the positive
    # panel gets an empty x label, to better arrange the output plots.
    snapshot_panels = (
        (positive_class_label, 1, {'x_label': ''}),
        (negative_class_label, 2, {}),
    )
    for class_label, panel_index, extra_kwargs in snapshot_panels:
        class_rows = df_data[df_data[label_column] == class_label]
        class_rows = class_rows.sort_values(['snapshot_date', 'feature'],
                                            ascending=True)

        panel_title = (f'Snapshot-level distribution of [{feature_name}] for '
                       f'label = {class_label}')
        viz_utils.plot_bar(
            plot_data=class_rows,
            x_variable='snapshot_date',
            y_variable='percentage',
            group_variable='value',
            stacked_bars=True,
            title=panel_title,
            subplot_index=panel_index,
            xticklabels_rotation=45,
            **extra_kwargs,
            **shared_bar_kwargs)

    return plots