Code example #1
def calc_performance_metrics(
        labels: np.ndarray,
        predictions: np.ndarray,
        decimal_points: Optional[int] = 4) -> _PerformanceMetrics:
    """Calculates performance metrics related to a regression model.

  Args:
    labels: An array of true labels containing numeric values.
    predictions: An array of predictions containing numeric values.
    decimal_points: Number of decimal points to use when outputting the
      calculated performance metrics.

  Returns:
    Object of _PerformanceMetrics class containing the regression diagnostics
      metrics.
  """
    utils.assert_label_and_prediction_length_match(labels, predictions)

    mse = metrics.mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    mae = metrics.mean_absolute_error(labels, predictions)
    mape = metrics.mean_absolute_percentage_error(labels, predictions)
    r2 = metrics.r2_score(labels, predictions)
    corr = sp.stats.pearsonr(labels, predictions)[0]

    return _PerformanceMetrics(
        mean_squared_error=round(mse, decimal_points),
        root_mean_squared_error=round(rmse, decimal_points),
        mean_absolute_error=round(mae, decimal_points),
        mean_absolute_percentage_error=round(mape, decimal_points),
        r_squared=round(r2, decimal_points),
        pearson_correlation=round(corr, decimal_points))
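A minimal usage sketch for calc_performance_metrics above; the arrays are illustrative, and the module-level imports (np, metrics, sp, utils) and the _PerformanceMetrics container from the original file are assumed to be available:

labels = np.array([10.0, 12.5, 9.0, 15.0, 11.2])
predictions = np.array([11.0, 12.0, 8.5, 14.0, 10.8])
regression_metrics = calc_performance_metrics(labels, predictions)
# Each metric is rounded to `decimal_points` (4 by default).
print(regression_metrics.root_mean_squared_error)
print(regression_metrics.mean_absolute_percentage_error)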
Code example #2
def plot_prediction_residuals(labels: np.ndarray,
                              predictions: np.ndarray,
                              fig_width: Optional[int] = 12,
                              fig_height: Optional[int] = 12,
                              title_fontsize: Optional[int] = 12,
                              axis_label_fontsize: Optional[int] = 10,
                              use_log: Optional[bool] = False) -> axes.Axes:
    """Plots scatter plots of true labels and residuals versus the predicted values.

  Args:
    labels: An array of true labels containing numeric values.
    predictions: An array of predictions containing numeric values.
    fig_width: Width of the figure.
    fig_height: Height of the figure.
    title_fontsize: Title font size of the plots.
    axis_label_fontsize: Axis label font size of the plots.
    use_log: Whether to take the logarithm of the actual and predicted values
      before plotting.

  Returns:
    plots: Scatter plots of true values and residuals versus the predicted
    values.
  """
    utils.assert_label_and_prediction_length_match(labels, predictions)

    _, plots = pyplot.subplots(nrows=2, figsize=(fig_width, fig_height))
    if use_log:
        plots[0].scatter(x=np.log1p(predictions), y=np.log1p(labels))
        plots[0].set_title(
            'Scatter plot of true label values versus predicted values with log transformation',
            fontsize=title_fontsize)
        plots[0].set_xlabel('Logarithm of predicted values',
                            fontsize=axis_label_fontsize)
        plots[0].set_ylabel('Logarithm of label values',
                            fontsize=axis_label_fontsize)
    else:
        plots[0].scatter(x=predictions, y=labels)
        plots[0].set_title(
            'Scatter plot of true label values versus predicted values',
            fontsize=title_fontsize)
        plots[0].set_xlabel('Predicted values', fontsize=axis_label_fontsize)
        plots[0].set_ylabel('Label values', fontsize=axis_label_fontsize)

    plots[1].scatter(x=predictions, y=labels - predictions)
    plots[1].set_title('Scatter plot of residuals versus predicted values',
                       fontsize=title_fontsize)
    plots[1].set_xlabel('Predicted values', fontsize=axis_label_fontsize)
    plots[1].set_ylabel('Residuals', fontsize=axis_label_fontsize)
    plots[1].axhline(0, linestyle='--')

    return plots
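A short usage sketch for plot_prediction_residuals, assuming pyplot is imported at module level as in the function body; the arrays are illustrative:

labels = np.array([10.0, 12.5, 9.0, 15.0, 11.2])
predictions = np.array([11.0, 12.0, 8.5, 14.0, 10.8])
# Returns two Axes: true labels vs. predictions and residuals vs. predictions.
residual_plots = plot_prediction_residuals(labels, predictions)
pyplot.show()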
Code example #3
def plot_predicted_probabilities(labels: np.ndarray,
                                 probability_predictions: np.ndarray,
                                 colors: Optional[Sequence[str]] = ('b', 'g'),
                                 print_stats: bool = True,
                                 fig_width: Optional[int] = 20,
                                 fig_height: Optional[int] = 15) -> axes.Axes:
    """Plots the distributions of predicted probabilities for each class.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0 and
      1.0.
    colors: Colors for the probability plots.
    print_stats: Whether to include the mean, standard deviation and median of
      the predicted probabilities for each class in the plot legend.
    fig_width: Width of the figure.
    fig_height: Height of the figure.

  Returns:
    plots: Class density plots of the predicted probabilities.
  """
    utils.assert_label_values_are_valid(labels)
    utils.assert_prediction_values_are_valid(probability_predictions)
    utils.assert_label_and_prediction_length_match(labels,
                                                   probability_predictions)
    assert len(np.unique(labels)) == len(colors),\
           'number of colors should be the same as number of unique labels.'

    unique_labels = np.sort(np.unique(labels))

    _, plots = pyplot.subplots(figsize=(fig_width, fig_height))
    for color, label in zip(colors, unique_labels):
        index_plot = np.where(labels == label)[0]
        preds_plot = probability_predictions[index_plot]
        label_plot = 'class[%s]' % (str(label))
        if print_stats:
            label_plot += ': mean=%.4f, std=%.4f, median=%.4f' % (
                np.mean(preds_plot), np.std(preds_plot), np.median(preds_plot))
        sns.kdeplot(preds_plot,
                    shade=True,
                    color=color,
                    label=label_plot,
                    ax=plots)

    pyplot.title('Distribution of predicted probabilities')
    pyplot.legend()
    pyplot.xlabel('Probability')
    pyplot.ylabel('Density')
    pyplot.xlim([0, 1])

    return plots
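A short usage sketch for plot_predicted_probabilities with illustrative binary labels and probabilities; the module-level imports (np, pyplot, sns, utils) are assumed to be available:

labels = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0])
probability_predictions = np.array([0.1, 0.3, 0.8, 0.6, 0.2, 0.9])
density_plot = plot_predicted_probabilities(labels, probability_predictions,
                                            colors=('b', 'g'))
pyplot.show()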
Code example #4
def plot_confusion_matrix(labels: np.ndarray,
                          predictions: np.ndarray,
                          class_label_names: Optional[Dict[Union[str, int],
                                                           Union[str,
                                                                 int]]] = None,
                          normalize: Optional[str] = None,
                          title_fontsize: Optional[int] = 12,
                          x_label_fontsize: Optional[int] = 12,
                          y_label_fontsize: Optional[int] = 12,
                          heatmap_color: Optional[str] = 'Greens') -> None:
    """Plot confusion matrix for a multiclass classification model.

  Args:
    labels: An array of true labels containing multiclass labels.
    predictions: An array of predictions containing multiclass labels.
    class_label_names: Dictionary of multiclass labels and corresponding target
      names. The type of both class label and target names can be either 'int'
      or 'str'. E.g. {0: 'low_value', 1: 'mid_value', 2: 'high_value'}.
    normalize: Whether and how to normalize the counts in the matrix; takes
      'true', 'pred', 'all' or None (no normalization).
    title_fontsize: Font size of the figure title.
    x_label_fontsize: Font size of the x axis labels.
    y_label_fontsize: Font size of the y axis labels.
    heatmap_color: Color of the heatmap plot.

  Returns:
    None. The confusion matrix heatmap is displayed with plt.show().
  """
    utils.assert_label_and_prediction_length_match(labels, predictions)

    if class_label_names is None:
        class_labels = list(set(labels))
        target_names = ['%s' % l for l in class_labels]
    else:
        class_labels = list(class_label_names.keys())
        target_names = list(class_label_names.values())

    plot = ConfusionMatrixDisplay.from_predictions(y_true=labels,
                                                   y_pred=predictions,
                                                   labels=class_labels,
                                                   display_labels=target_names,
                                                   normalize=normalize,
                                                   include_values=True,
                                                   cmap=heatmap_color)
    plot.ax_.set_title('Confusion matrix', fontsize=title_fontsize)
    plot.ax_.set_xlabel('Predicted label', fontsize=x_label_fontsize)
    plot.ax_.set_ylabel('Actual label', fontsize=y_label_fontsize)
    plt.show()
Code example #5
def plot_precision_recall_curve(
        labels: np.ndarray,
        probability_predictions: np.ndarray,
        print_stats: bool = True,
        fig_width: Optional[int] = 8,
        fig_height: Optional[int] = 8,
        curve_color: Optional[str] = 'blue') -> axes.Axes:
    """Plots the Precision-Recall curve for the predictions.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0 and
      1.0.
    print_stats: Whether to show the Average Precision in the plot title.
    fig_width: Width of the figure.
    fig_height: Height of the figure.
    curve_color: Color of the Precision-Recall curve.

  Returns:
      plots: Axes containing the Precision-Recall curve.
  """
    utils.assert_label_values_are_valid(labels)
    utils.assert_prediction_values_are_valid(probability_predictions)
    utils.assert_label_and_prediction_length_match(labels,
                                                   probability_predictions)

    precision, recall, _ = sklearn.metrics.precision_recall_curve(
        labels, probability_predictions)
    _, plots = pyplot.subplots(figsize=(fig_width, fig_height))
    pyplot.plot(recall,
                precision,
                marker='.',
                label='Precision-Recall',
                color=curve_color)

    pyplot.xlabel('Recall')
    pyplot.ylabel('Precision')

    pyplot.legend()
    if print_stats:
        pyplot.title('Average Precision=%.4f' %
                     sklearn.metrics.average_precision_score(
                         labels, probability_predictions))

    return plots
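A short usage sketch for plot_precision_recall_curve with illustrative data (np, pyplot, sklearn and utils are assumed to be imported at module level):

labels = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0])
probability_predictions = np.array([0.1, 0.3, 0.8, 0.6, 0.2, 0.9])
pr_plot = plot_precision_recall_curve(labels, probability_predictions)
pyplot.show()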
Code example #6
def plot_binned_preds_labels(labels: np.ndarray,
                             predictions: np.ndarray,
                             number_bins: Optional[int] = 10,
                             fig_width: Optional[int] = 10,
                             fig_height: Optional[int] = 7) -> axes.Axes:
    """Plots the actual label distributions (box plots) for prediction bins.

  Args:
    labels: An array of true labels containing numeric values.
    predictions: An array of predictions containing numeric values.
    number_bins: Number of bins that we want to divide the ranked predictions
      into. Default is deciles (10 bins) such that the 1st bin contains the
      highest 10% of the predictions, the 2nd bin contains the next 10% of the
      predictions and so on.
    fig_width: Width of the figure.
    fig_height: Height of the figure.

  Returns:
    plot: Box plots of the actual label distributions for prediction bins.
  """
    utils.assert_label_and_prediction_length_match(labels, predictions)

    # Separate the predictions into bins.
    data = pd.DataFrame(list(zip(labels, predictions)),
                        columns=['labels', 'predictions'])
    data = data.sort_values('predictions', ascending=False)
    data['prediction_rank'] = range(data.shape[0])
    # To avoid duplicate edges of bins use the index in the qcut function below.
    data['prediction_bin_number'] = pd.qcut(
        data['prediction_rank'], q=number_bins, labels=False) + 1

    _, plot = pyplot.subplots(figsize=(fig_width, fig_height))
    data.pivot(columns='prediction_bin_number', values='labels').boxplot()
    plot.set_title('Distribution of actual labels over prediction bins')
    plot.set_ylabel('Actual label distribution')
    plot.set_xlabel('Prediction bin [Highest to Lowest]')

    return plot
Code example #7
def plot_confusion_matrix_bin_heatmap(
        labels: np.ndarray,
        predictions: np.ndarray,
        number_bins: Optional[int] = 10,
        normalize: Optional[str] = 'true',
        fig_width: Optional[int] = 12,
        fig_height: Optional[int] = 12,
        title_fontsize: Optional[int] = 12,
        axis_label_fontsize: Optional[int] = 10,
        heatmap_color: Optional[str] = 'YlGnBu') -> axes.Axes:
    """Plots the heatmap of the bins of the actual and predicted values.

  Args:
    labels: An array of true labels containing numeric values.
    predictions: An array of predictions containing numeric values.
    number_bins: Number of bins that we want to divide the ranked predictions
      into. Default is deciles (10 bins) such that the 1st bin contains the
      highest 10% of the predictions, the 2nd bin contains the next 10% of the
      predictions and so on.
    normalize: Normalizes the confusion matrix over the true labels (rows), the
      predictions (columns), or the whole population. Takes the values 'true',
      'pred' and 'all' respectively.
    fig_width: Width of the figure.
    fig_height: Height of the figure.
    title_fontsize: Title font size of the plots.
    axis_label_fontsize: Axis label font size of the plots.
    heatmap_color: Color of the heatmap plot.

  Returns:
    plot: Heatmap of the bins of the actual and predicted values.
  """
    utils.assert_label_and_prediction_length_match(labels, predictions)

    assert str(normalize).lower() in [
        'true', 'pred', 'all'
    ], ("normalize parameter value should be either 'true', 'pred' or 'all'")

    data = pd.DataFrame(list(zip(labels, predictions)),
                        columns=['labels', 'predictions'])
    data = data.sort_values('labels', ascending=False)
    data['labels_rank'] = range(data.shape[0])
    data = data.sort_values('predictions', ascending=False)
    data['prediction_rank'] = range(data.shape[0])
    data['bin_number_label'] = pd.qcut(data['labels_rank'],
                                       q=number_bins,
                                       labels=False)
    data['bin_number_predictions'] = pd.qcut(data['prediction_rank'],
                                             q=number_bins,
                                             labels=False)

    conf_matrix = metrics.confusion_matrix(
        y_true=data['bin_number_label'],
        y_pred=data['bin_number_predictions'],
        normalize=normalize)

    tick_labels = range(1, number_bins + 1)
    _, plot = pyplot.subplots(figsize=(fig_width, fig_height))
    plot = sns.heatmap(conf_matrix,
                       cbar=False,
                       cmap=heatmap_color,
                       annot=True,
                       xticklabels=tick_labels,
                       yticklabels=tick_labels)
    plot.set_title('Heatmap of the bins of the actual and predicted values',
                   fontsize=title_fontsize)
    plot.set_xlabel('Prediction value bins [Highest to Lowest]',
                    fontsize=axis_label_fontsize)
    plot.set_ylabel('Actual value bins [Highest to Lowest]',
                    fontsize=axis_label_fontsize)

    return plot
Code example #8
def calc_reg_bin_metrics(labels: np.ndarray,
                         predictions: np.ndarray,
                         number_bins: Optional[int] = 10,
                         decimal_points: Optional[int] = 4) -> pd.DataFrame:
    """Calculates performance metrics for each bin of the predictions.

  Args:
    labels: An array of true labels containing numeric values.
    predictions: An array of predictions containing numeric values.
    number_bins: Number of bins that we want to divide the ranked predictions
      into. Default is deciles (10 bins) such that the 1st bin contains the
      highest 10% of the predictions, the 2nd bin contains the next 10% of the
      predictions and so on.
    decimal_points: Number of decimal points to use when outputting the
      calculated performance metrics.

  Returns:
    bin_metrics: Following metrics calculated for each bin:
      mean_label: Mean of actual values in the bin.
      mean_prediction: Mean of predictions in the bin.
      rmse: Root mean squared error.
      mape: Mean absolute percentage error.
      corr: Pearson correlation coefficient.
  """
    utils.assert_label_and_prediction_length_match(labels, predictions)

    # Separate the predictions into bins.
    binned_data = pd.DataFrame(list(zip(labels, predictions)),
                               columns=['label', 'prediction'])
    binned_data = binned_data.sort_values('prediction').reset_index()
    # To avoid duplicate edges of bins use the index in the qcut function below.
    binned_data['bin_number'] = pd.qcut(binned_data.index,
                                        q=number_bins,
                                        labels=False)

    bin_metrics = binned_data.groupby('bin_number', as_index=False).agg({
        'label':
        'mean',
        'prediction':
        'mean'
    }).rename(columns={
        'label': 'mean_label',
        'prediction': 'mean_prediction'
    })
    bin_metrics['mean_label'] = round(bin_metrics['mean_label'],
                                      decimal_points)
    bin_metrics['mean_prediction'] = round(bin_metrics['mean_prediction'],
                                           decimal_points)
    bin_metrics['rmse'] = 0
    bin_metrics['mape'] = 0
    bin_metrics['corr'] = 0

    for i in range(number_bins):
        (bin_labels, bin_predictions) = (
            binned_data[binned_data.bin_number == i]['label'].values,
            binned_data[binned_data.bin_number == i]['prediction'].values)
        bin_perf_metrics = calc_performance_metrics(bin_labels,
                                                    bin_predictions,
                                                    decimal_points)
        bin_metrics.loc[i, 'rmse'] = bin_perf_metrics.root_mean_squared_error
        bin_metrics.loc[
            i, 'mape'] = bin_perf_metrics.mean_absolute_percentage_error
        bin_metrics.loc[i, 'corr'] = bin_perf_metrics.pearson_correlation

    bin_metrics['bin_number'] = number_bins - bin_metrics['bin_number']
    bin_metrics = bin_metrics.sort_values(['bin_number'])

    return bin_metrics
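A minimal usage sketch for calc_reg_bin_metrics; the synthetic data below is illustrative and assumes np, pd and calc_performance_metrics from code example #1 are available:

rng = np.random.default_rng(seed=0)
labels = rng.normal(loc=100.0, scale=20.0, size=200)
predictions = labels + rng.normal(loc=0.0, scale=10.0, size=200)
bin_metrics = calc_reg_bin_metrics(labels, predictions, number_bins=5)
# Bin 1 contains the highest predictions.
print(bin_metrics[['bin_number', 'mean_label', 'mean_prediction', 'rmse']])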
Code example #9
def calc_chisquared_sample_sizes_for_bins(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    number_bins: int = 3,
    uplift_percentages: Sequence[np.float64] = (10, 20),
    power_percentages: Sequence[np.float64] = (80, 90),
    confidence_level_percentages: Sequence[np.float64] = (90, 95)
) -> pd.DataFrame:
    """Calculates statistical sample sizes for the bins defined on predictions.

  These sample sizes for the bins defined on the predicted probabilities are
    estimated using the Chi-squared test of proportions for each combination
    of uplift_percentage, power_percentage and confidence_level_percentage.
    These sizes could be used as the minimum required size for each Test or
    Control group when designing an experiment to target users from each of
    these bins of predictions.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0 and
      1.0.
    number_bins: Number of bins that we want to divide the ranked predictions
      into. Default is 3 bins such that the 1st bin contains the
      highest 1/3rd of the predictions (High Propensity group), the 2nd bin
      contains the next 1/3rd of the predictions (Medium Propensity group) and
      the last bin contains the lowest 1/3rd of the predictions (Lowest
      Propensity group).
    uplift_percentages: Sequence of different expected uplift percentages.
    power_percentages: Sequence of different statistical powers for the test.
    confidence_level_percentages: Sequence of different statistical confidence
      levels for the test.

  Returns:
    bin_metrics: Following metrics calculated for each bin of the predictions.
     bin_number: Bin number starting from 1.
     bin_size: Total number of instances in the bin.
     min_probability: Minimum predicted probability within the bin.
     conv_rate_percentage: Percentage of positive instances out of all the
       instances in the bin (precision).
     uplift_percentage: Expected uplift percentage.
     power_percentage: Statistical power of the test.
     confidence_level_percentage: Statistical confidence level of the test.
     required_sample_size: Statistical sample size required.
  """
    utils.assert_label_values_are_valid(labels)
    utils.assert_prediction_values_are_valid(probability_predictions)
    utils.assert_label_and_prediction_length_match(labels,
                                                   probability_predictions)

    # separate the probability_predictions into bins of equal size
    binned_data = pd.DataFrame(list(zip(labels, probability_predictions)),
                               columns=['label', 'prediction'])
    binned_data = binned_data.sort_values('prediction').reset_index()
    # To avoid duplicate edges of bins use the index in the qcut function below
    binned_data['bin_number'] = pd.qcut(binned_data.index,
                                        q=number_bins,
                                        labels=False)

    # calculate the conversion rate for each bin
    total_instances = (binned_data[['bin_number',
                                    'label']].groupby('bin_number').count())
    total_instances.columns = ['bin_size']
    total_instances = total_instances.reset_index()
    positive_instances = (binned_data.loc[binned_data['label'] > 0][[
        'bin_number', 'label'
    ]].groupby('bin_number').count())
    positive_instances.columns = ['positive_instances']
    positive_instances = positive_instances.reset_index()

    bin_conv_rate = pd.merge(total_instances,
                             positive_instances,
                             on='bin_number',
                             how='left')
    bin_conv_rate.fillna(0, inplace=True)
    bin_conv_rate['conversion_rate'] = round(
        (bin_conv_rate['positive_instances'] / bin_conv_rate['bin_size'] *
         100), 2)

    bin_metrics_list = list()
    for bin_number in bin_conv_rate['bin_number']:
        conv_rate = bin_conv_rate['conversion_rate'][bin_number]
        bin_size = bin_conv_rate['bin_size'][bin_number]
        min_prob = min(
            binned_data[binned_data['bin_number'] == bin_number]['prediction'])
        for uplift_percentage in uplift_percentages:
            for power_percentage in power_percentages:
                for confidence_level_percentage in confidence_level_percentages:
                    sample_size = calc_chisquared_sample_size(
                        conv_rate, uplift_percentage, power_percentage,
                        confidence_level_percentage)
                    bin_metrics_list.append(
                        (bin_number, bin_size, min_prob, conv_rate,
                         uplift_percentage, power_percentage,
                         confidence_level_percentage, sample_size))

    bin_metrics = pd.DataFrame(bin_metrics_list,
                               columns=[
                                   'bin_number', 'bin_size', 'min_probability',
                                   'conv_rate_percentage', 'uplift_percentage',
                                   'power_percentage',
                                   'confidence_level_percentage',
                                   'required_sample_size'
                               ])

    # reverse the order of bin numbers such that bin 1 has the highest
    # predicted probability
    bin_metrics['bin_number'] = number_bins - bin_metrics['bin_number']
    bin_metrics = bin_metrics.sort_values(['bin_number'
                                           ]).reset_index(drop=True)

    return bin_metrics
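The helper calc_chisquared_sample_size called above (and in code example #16) is not included in this listing. The sketch below shows one way such a per-group sample size for a Chi-squared test of two proportions could be computed with statsmodels; it is an illustrative assumption, not necessarily the original implementation:

import numpy as np
from statsmodels.stats import power as sm_power
from statsmodels.stats import proportion as sm_proportion


def calc_chisquared_sample_size(baseline_conv_rate_percentage,
                                uplift_percentage,
                                power_percentage,
                                confidence_level_percentage):
    """Illustrative per-group sample size for a test of two proportions."""
    p1 = baseline_conv_rate_percentage / 100
    p2 = p1 * (1 + uplift_percentage / 100)
    # Cohen's h effect size for the two proportions.
    # Note: a zero baseline rate yields a zero effect size and no finite size.
    effect_size = abs(sm_proportion.proportion_effectsize(p1, p2))
    sample_size = sm_power.NormalIndPower().solve_power(
        effect_size=effect_size,
        power=power_percentage / 100,
        alpha=1 - confidence_level_percentage / 100,
        ratio=1)
    return int(np.ceil(sample_size))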
Code example #10
def calc_t_sample_sizes_for_cumulative_bins(
    labels: np.ndarray,
    numeric_predictions: np.ndarray,
    number_bins: int = 10,
    uplift_percentages: Sequence[np.float64] = (10, 20),
    power_percentages: Sequence[np.float64] = (80, 90),
    confidence_level_percentages: Sequence[np.float64] = (90, 95)
) -> pd.DataFrame:
    """Calculates statistical sample sizes for the cumulative bins of predictions.

  These sample sizes are estimated using the T-test for each combination of
  uplift_percentage, power_percentage and confidence_level_percentage for the
  cumulative bins of numeric predictions from a regression model. These
  sizes could be used as the minimum required sizes for each Test or Control
  group when designing a media experiment to target users having the top X% of
  predicted values.

  Args:
    labels: An array of actual numeric label values.
    numeric_predictions: An array of numeric predictions.
    number_bins: Number of cumulative bins that we want to divide the ranked
      predictions into. Default is deciles (10 bins) such that the 1st bin
      contains the highest 10% of the predictions, the 2nd bin contains the
      highest 20% of the predictions and so on.
    uplift_percentages: Sequence of different expected uplift percentages.
    power_percentages: Sequence of different statistical power percentages.
    confidence_level_percentages: Sequence of different statistical confidence
      level percentages.

  Returns:
    bin_metrics: Following metrics calculated for each cumulative bin.
      cumulative_bin_number: Bin number starting from 1 for the bin having the
        largest predicted values.
      bin_size: Total number of instances in the bin.
      bin_size_percentage: Percentage of instances in the bin out of all the
        instances in the labels.
      min_predicted_val: Minimum predicted value of the bin.
      mean_actual_val: Average actual label value of the bin.
      stdev_actual_val: Standard deviation of actual label values of the bin.
      uplift_percentage: Expected uplift percentage of the test.
      power_percentage: Statistical power percentage of the test.
      confidence_level_percentage: Statistical confidence level percentage of
        the test.
      required_sample_size: Statistical sample size required.
  """
    utils.assert_label_and_prediction_length_match(labels, numeric_predictions)

    # Separate the numeric predictions into bins.
    label_predictions = pd.DataFrame(list(zip(labels, numeric_predictions)),
                                     columns=['label', 'prediction'])
    label_predictions = label_predictions.sort_values(by='prediction',
                                                      ascending=False)
    number_total_instances = label_predictions.shape[0]
    equal_bin_size = number_total_instances / number_bins

    # Calculate the stats for cumulative bins
    cumulative_bin_metrics_list = []

    for bin_number in range(1, (number_bins + 1)):
        current_bin_size = round(equal_bin_size * bin_number)
        bin_size_percentage = round(
            current_bin_size / number_total_instances * 100, 2)
        bin_instances = label_predictions.head(current_bin_size)
        # Bin statistics are computed on the actual label values.
        mean_actual_val = round(np.mean(bin_instances['label']), 2)
        stdev_actual_val = round(np.std(bin_instances['label']), 2)
        min_predicted_val = min(bin_instances['prediction'])

        for uplift_percentage in uplift_percentages:
            for power_percentage in power_percentages:
                for confidence_level_percentage in confidence_level_percentages:
                    sample_size = calc_t_sample_size(
                        mean_actual_val, stdev_actual_val, uplift_percentage,
                        power_percentage, confidence_level_percentage)
                    cumulative_bin_metrics_list.append(
                        (bin_number, current_bin_size, bin_size_percentage,
                         min_predicted_val, mean_actual_val, stdev_actual_val,
                         uplift_percentage, power_percentage,
                         confidence_level_percentage, sample_size))

    return pd.DataFrame(cumulative_bin_metrics_list,
                        columns=[
                            'cumulative_bin_number', 'bin_size',
                            'bin_size_percentage', 'min_predicted_val',
                            'mean_actual_val', 'stdev_actual_val',
                            'uplift_percentage', 'power_percentage',
                            'confidence_level_percentage',
                            'required_sample_size'
                        ])
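The helper calc_t_sample_size used above (and in code example #11) is likewise not part of this listing. A minimal sketch of how it could be implemented with statsmodels' TTestIndPower, again as an assumption rather than the original code:

import numpy as np
from statsmodels.stats import power as sm_power


def calc_t_sample_size(mean_value, stdev_value, uplift_percentage,
                       power_percentage, confidence_level_percentage):
    """Illustrative per-group sample size for a two-sample T-test."""
    # Standardized effect size implied by the expected uplift of the mean.
    effect_size = abs(mean_value * uplift_percentage / 100) / stdev_value
    sample_size = sm_power.TTestIndPower().solve_power(
        effect_size=effect_size,
        power=power_percentage / 100,
        alpha=1 - confidence_level_percentage / 100,
        ratio=1)
    return int(np.ceil(sample_size))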
Code example #11
def calc_t_sample_sizes_for_bins(
    labels: np.ndarray,
    numeric_predictions: np.ndarray,
    number_bins: int = 3,
    uplift_percentages: Sequence[np.float64] = (10, 20),
    power_percentages: Sequence[np.float64] = (80, 90),
    confidence_level_percentages: Sequence[np.float64] = (90, 95)
) -> pd.DataFrame:
    """Calculates statistical sample sizes for the bins of numeric predictions.

  These sample sizes (for the bins defined on the numeric predictions from a
    regression model) are estimated using the T-test for each combination of
    uplift_percentage, power_percentage and confidence_level_percentage.
    These sizes could be used as the minimum required size for each Test or
    Control group when designing a media experiment to target users from each
    prediction bin.

  Args:
    labels: An array of actual numeric label values.
    numeric_predictions: An array of numeric predictions.
    number_bins: Number of bins that we want to divide the ranked predictions
      into. Default is 3 bins such that the 1st bin contains the
      highest 1/3rd of the predictions (High value group), the 2nd bin
      contains the next 1/3rd of the predictions (Medium value group) and
      the last bin contains the lowest 1/3rd of the predictions (Low value
      group).
    uplift_percentages: Sequence of different expected uplift percentages.
    power_percentages: Sequence of different statistical power percentages.
    confidence_level_percentages: Sequence of different statistical confidence
      level percentages.

  Returns:
    bin_metrics: Following metrics calculated for each bin of the predictions.
     bin_number: Bin number starting from 1 for the bin with the highest values.
     bin_size: Total number of instances in the bin.
     min_predicted_val: Minimum predicted value within the bin.
     average_actual_val: Average actual label value of the bin.
     stdev_actual_val: Standard deviation of actual label value of the bin.
     uplift_percentage: Expected uplift percentage of the test.
     power_percentage: Statistical power percentage of the test.
     confidence_level_percentage: Statistical confidence level percentage
       of the test.
     required_sample_size: Statistical sample size required.
  """
    utils.assert_label_and_prediction_length_match(labels, numeric_predictions)

    # Separate the numeric predictions into bins of equal size
    binned_data = pd.DataFrame(list(zip(labels, numeric_predictions)),
                               columns=['label', 'prediction'])
    binned_data = binned_data.sort_values(
        'prediction',
        ascending=False,
    ).reset_index(drop=True)
    # To avoid duplicate edges of bins use the index in the qcut function below
    binned_data['bin_number'] = pd.qcut(binned_data.index,
                                        q=number_bins,
                                        labels=False)

    # Calculate the statistics for each bin
    bin_stats = (binned_data.groupby(['bin_number'
                                      ]).agg(['min', 'mean', 'std',
                                              'count']).reset_index())
    bin_stats.columns = [
        'bin_number', 'min_label', 'mean_label', 'stdev_label',
        'bin_label_size', 'min_pred', 'mean_pred', 'stdev_pred',
        'bin_pred_size'
    ]

    bin_metrics_list = list()
    for bin_number in bin_stats['bin_number']:
        mean_label = bin_stats['mean_label'][bin_number]
        stdev_label = bin_stats['stdev_label'][bin_number]
        bin_size = bin_stats['bin_label_size'][bin_number]
        min_pred_val = bin_stats['min_pred'][bin_number]
        for uplift_percentage in uplift_percentages:
            for power_percentage in power_percentages:
                for confidence_level_percentage in confidence_level_percentages:
                    sample_size = calc_t_sample_size(
                        mean_label, stdev_label, uplift_percentage,
                        power_percentage, confidence_level_percentage)
                    bin_metrics_list.append(
                        (bin_number, bin_size, min_pred_val, mean_label,
                         stdev_label, uplift_percentage, power_percentage,
                         confidence_level_percentage, sample_size))

    bin_metrics = pd.DataFrame(bin_metrics_list,
                               columns=[
                                   'bin_number', 'bin_size',
                                   'min_predicted_val', 'average_actual_val',
                                   'stdev_actual_val', 'uplift_percentage',
                                   'power_percentage',
                                   'confidence_level_percentage',
                                   'required_sample_size'
                               ])

    # Start the bin numbers from 1
    bin_metrics['bin_number'] = bin_metrics['bin_number'] + 1

    return bin_metrics
Code example #12
def calc_performance_metrics(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    binarize_threshold: Optional[float] = None,
    decimal_points: Optional[int] = 4,
) -> Dict[str, float]:
    """Calculates performance metrics related to a binary classification model.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0 and
      1.0.
    binarize_threshold: Probability threshold to be used to binarize the
      predicted probabilities. By default the proportion of positive instances
      in the labels is used.
    decimal_points: Number of decimal points to use when outputting the
      calculated performance metrics.

  Returns:
    metrics: Dictionary of the following performance metrics
      {prop_positives: Proportion of instances where label = 1.0,
       auc_roc: Area under the recall vs (1-specificity) (ROC) curve,
       auc_pr: Area under the precision vs recall (PR) curve,
       binarize_threshold: Probability threshold used to binarize the
         predictions.
       Following metrics are calculated after binarizing the predicted
       probabilities based on the given binarize_threshold,
       accuracy: Total accuracy of the predictions,
       true_positive_rate (recall or sensitivity): True positive rate,
       true_negative_rate (specificity): True negative rate,
       precision: Precision (confidence) of the true positive predictions,
       f1_score: F1 score (harmonic mean of precision and recall)}
  """
    utils.assert_label_values_are_valid(labels)
    utils.assert_prediction_values_are_valid(probability_predictions)
    utils.assert_label_and_prediction_length_match(labels,
                                                   probability_predictions)

    num_positives = labels.sum()
    prop_positives = float(num_positives) / len(labels)

    if binarize_threshold is None:
        binarize_threshold = prop_positives

    # Calculate auc metrics.
    auc_roc = round(
        sklearn.metrics.roc_auc_score(labels, probability_predictions),
        decimal_points)
    auc_pr = round(
        sklearn.metrics.average_precision_score(labels,
                                                probability_predictions),
        decimal_points)

    # Binarize the predictions.
    binarized_predictions = ((probability_predictions >
                              binarize_threshold).astype(int))

    # Calculate metrics based on binarized predictions.
    accuracy = sklearn.metrics.accuracy_score(labels, binarized_predictions)
    tp_rate = sklearn.metrics.recall_score(labels,
                                           binarized_predictions,
                                           pos_label=1)
    tn_rate = sklearn.metrics.recall_score(labels,
                                           binarized_predictions,
                                           pos_label=0)
    precision = sklearn.metrics.precision_score(labels, binarized_predictions)
    f1_score = sklearn.metrics.f1_score(labels, binarized_predictions)

    return {
        'prop_positives': round(prop_positives, decimal_points),
        'auc_roc': round(auc_roc, decimal_points),
        'auc_pr': round(auc_pr, decimal_points),
        'binarize_threshold': round(binarize_threshold, decimal_points),
        'accuracy': round(accuracy, decimal_points),
        'true_positive_rate': round(tp_rate, decimal_points),
        'true_negative_rate': round(tn_rate, decimal_points),
        'precision': round(precision, decimal_points),
        'f1_score': round(f1_score, decimal_points)
    }
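A minimal usage sketch for the binary-classification calc_performance_metrics with illustrative arrays (np, sklearn and utils are assumed to be imported at module level):

labels = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0])
probability_predictions = np.array([0.1, 0.3, 0.8, 0.6, 0.2, 0.9])
# With no explicit threshold, predictions are binarized at the proportion
# of positives in the labels (0.5 here).
binary_metrics = calc_performance_metrics(labels, probability_predictions)
print(binary_metrics['auc_roc'], binary_metrics['precision'])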
Code example #13
def calc_cumulative_bin_metrics(
        labels: np.ndarray,
        probability_predictions: np.ndarray,
        number_bins: int = 10,
        decimal_points: Optional[int] = 4) -> pd.DataFrame:
    """Calculates performance metrics for cumulative bins of the predictions.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0 and
      1.0.
    number_bins: Number of cumulative bins that we want to divide the ranked
      predictions into. Default is 10 bins such that the 1st bin contains the
      highest 10% of the predictions, 2nd bin contains the highest 20% of the
      predictions and so on.
    decimal_points: Number of decimal points to use when outputting the
      calculated performance metrics.

  Returns:
    bin_metrics: Following metrics calculated for each cumulative bin.
      cumulative_bin_number: Bin number starting from 1.
      bin_size: Total number of instances in the bin,
      bin_size_proportion: Proportion of instances in the bin out of all the
        instances in the labels.
      positive_instances: Number of positive instances in the bin,
      precision: Proportion of positive instances out of all the instances
        in the bin,
      coverage (recall): Proportion of positive instances in the bin out of
        all the positive instances in the labels,
      prop_label_positives: Proportion of positive instances in the labels,
      precision_uplift: Uplift of precision of the bin compared to the
        precision of the random prediction (prop_label_positives).
  """
    utils.assert_label_values_are_valid(labels)
    utils.assert_prediction_values_are_valid(probability_predictions)
    utils.assert_label_and_prediction_length_match(labels,
                                                   probability_predictions)

    # Separate the probability_predictions into bins.
    label_predictions = pd.DataFrame(list(zip(labels,
                                              probability_predictions)),
                                     columns=['label', 'prediction'])
    label_predictions = label_predictions.sort_values(by='prediction',
                                                      ascending=False)
    number_total_instances = label_predictions.shape[0]
    equal_bin_size = number_total_instances / number_bins
    number_total_positive_instances = label_predictions[
        label_predictions['label'] > 0].shape[0]
    prop_label_positives = round(
        number_total_positive_instances / number_total_instances,
        decimal_points)

    cumulative_bin_metrics_list = list()

    for i in range(1, (number_bins + 1)):
        current_bin_size = round(equal_bin_size * i)
        bin_size_proportion = round(current_bin_size / number_total_instances,
                                    decimal_points)
        bin_instances = label_predictions.head(current_bin_size)
        number_bin_positive_instances = bin_instances[
            bin_instances['label'] > 0].shape[0]
        bin_precision = round(number_bin_positive_instances / current_bin_size,
                              decimal_points)
        bin_recall = round(
            number_bin_positive_instances / number_total_positive_instances,
            decimal_points)
        bin_precision_uplift = round(bin_precision / prop_label_positives,
                                     decimal_points)

        cumulative_bin_metrics_list.append(
            (i, current_bin_size, bin_size_proportion,
             number_bin_positive_instances, bin_precision, bin_recall,
             prop_label_positives, bin_precision_uplift))

    return pd.DataFrame(cumulative_bin_metrics_list,
                        columns=[
                            'cumulative_bin_number', 'bin_size',
                            'bin_size_proportion', 'positive_instances',
                            'precision', 'coverage (recall)',
                            'prop_label_positives', 'precision_uplift'
                        ])
Code example #14
def calc_bin_metrics(labels: np.ndarray,
                     probability_predictions: np.ndarray,
                     number_bins: Optional[int] = 10,
                     decimal_points: Optional[int] = 4) -> pd.DataFrame:
    """Calculates performance metrics for each bin of the predictions.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0 and
      1.0.
    number_bins: Number of bins that we want to divide the ranked predictions
      into. Default is deciles (10 bins) such that the 1st bin contains the
      highest 10% of the predictions, the 2nd bin contains the next 10% of the
      predictions and so on.
    decimal_points: Number of decimal points to use when outputting the
      calculated performance metrics.

  Returns:
    bin_metrics: Following metrics calculated for each bin.
      bin_number: Bin number starting from 1.
      bin_size: Total number of instances in the bin,
      positive_instances: Number of positive instances in the bin,
      precision: Proportion of positive instances out of all the instances
        in the bin,
      coverage: Proportion of positive instances in the bin out of all the
        positive instances in the labels,
      prop_positives: Proportion of positive instances in the labels,
      precision_uplift: Uplift of precision compared to the precision
        of the random prediction (prop_positives).
  """
    utils.assert_label_values_are_valid(labels)
    utils.assert_prediction_values_are_valid(probability_predictions)
    utils.assert_label_and_prediction_length_match(labels,
                                                   probability_predictions)

    # Separate the probability_predictions into bins.
    bins = pd.qcut(probability_predictions, q=number_bins, labels=False)
    binned_data = pd.DataFrame(list(zip(labels, probability_predictions,
                                        bins)),
                               columns=['label', 'prediction', 'bin_number'])

    # Calculate the metrics for each bin.
    total_instances = (binned_data[['bin_number',
                                    'label']].groupby('bin_number').count())
    total_instances.columns = ['bin_size']
    total_instances = total_instances.reset_index()
    positive_instances = (binned_data.loc[binned_data['label'] > 0][[
        'bin_number', 'label'
    ]].groupby('bin_number').count())
    positive_instances.columns = ['positive_instances']
    positive_instances = positive_instances.reset_index()

    bin_metrics = pd.merge(total_instances,
                           positive_instances,
                           on='bin_number',
                           how='left')
    bin_metrics.fillna(0, inplace=True)
    bin_metrics['precision'] = (bin_metrics['positive_instances'] /
                                bin_metrics['bin_size'])
    bin_metrics['precision'] = [
        round(val, decimal_points) for val in bin_metrics['precision']
    ]
    prop_positives = round(labels[labels == 1.0].shape[0] / len(labels),
                           decimal_points)
    bin_metrics['prop_positives'] = prop_positives
    # Convert bin_number from zero-based offset to 1-based offset.
    bin_metrics['bin_number'] = bin_metrics['bin_number'] + 1
    bin_metrics['precision_uplift'] = bin_metrics['precision'] / prop_positives
    bin_metrics['precision_uplift'] = [
        round(val, decimal_points) for val in bin_metrics['precision_uplift']
    ]
    bin_metrics['coverage'] = (bin_metrics['positive_instances'] /
                               sum(bin_metrics['positive_instances']))
    bin_metrics['coverage'] = [
        round(val, decimal_points) for val in bin_metrics['coverage']
    ]

    # Reverse the order of bin numbers such that bin 1 has the highest
    # predicted probability.
    bin_metrics['bin_number'] = number_bins - bin_metrics['bin_number'] + 1
    bin_metrics = bin_metrics.sort_values(['bin_number'
                                           ]).reset_index(drop=True)

    return bin_metrics
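A minimal usage sketch for calc_bin_metrics on synthetic data (illustrative only; np, pd and utils are assumed to be importable as in the original module):

rng = np.random.default_rng(seed=0)
probability_predictions = rng.uniform(low=0.0, high=1.0, size=1000)
labels = (rng.uniform(size=1000) < probability_predictions).astype(float)
bin_metrics = calc_bin_metrics(labels, probability_predictions)
# Bin 1 holds the highest predicted probabilities and should show the
# largest precision_uplift.
print(bin_metrics[['bin_number', 'precision', 'coverage', 'precision_uplift']])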
Code example #15
def test_assert_label_and_prediction_length_match_right_error(self):
    with self.assertRaises(AssertionError):
        utils.assert_label_and_prediction_length_match(
            np.array([0.0, 0.0, 1.0, 1.0]), np.array([0.0, 0.5, 0.33]))
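The utils.assert_label_and_prediction_length_match helper exercised by this test method (and used throughout the listing) is not shown here. A minimal sketch of what such a check could look like, as an assumption rather than the original implementation:

def assert_label_and_prediction_length_match(labels, predictions):
    """Raises an AssertionError if the two arrays differ in length."""
    assert len(labels) == len(predictions), (
        'labels and predictions must have the same length: '
        f'{len(labels)} != {len(predictions)}')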
Code example #16
def calc_chisquared_sample_sizes_for_cumulative_bins(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    number_bins: int = 10,
    uplift_percentages: Sequence[np.float64] = (10, 20),
    power_percentages: Sequence[np.float64] = (80, 90),
    confidence_level_percentages: Sequence[np.float64] = (90, 95)
) -> pd.DataFrame:
    """Calculates statistical sample sizes for the cumulative bins of predictions.

  These sample sizes for the cumulative bins of predicted probabilities are
  estimated using the Chi-squared test of proportions for each combination of
  uplift_percentage, power_percentage and confidence_level_percentage. These
  sizes could be used as the minimum required sizes for each Test or Control
  group when designing an experiment to target users having the top X% of
  predicted probabilities.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0 and
      1.0.
    number_bins: Number of cumulative bins that we want to divide the ranked
      predictions into. Default is deciles (10 bins) such that the 1st bin
      contains the highest 10% of the predictions, the 2nd bin contains the
      highest 20% of the predictions and so on.
    uplift_percentages: Sequence of different expected uplift percentages.
    power_percentages: Sequence of different statistical powers for the test.
    confidence_level_percentages: Sequence of different statistical confidence
      levels for the test.

  Returns:
    bin_metrics: Following metrics calculated for each cumulative bin.
      cumulative_bin_number: Bin number starting from 1.
      bin_size: Total number of instances in the bin.
      bin_size_percentage: Percentage of instances in the bin out of all the
        instances in the labels.
      min_probability: Minimum predicted probability within the bin.
      conv_rate_percentage: Percentage of positive instances out of all the
        instances in the bin (precision).
      uplift_percentage: Expected uplift percentage.
      power_percentage: Statistical power of the test.
      confidence_level_percentage: Statistical confidence level of the test.
      required_sample_size: Statistical sample size required.
  """
    utils.assert_label_values_are_valid(labels)
    utils.assert_prediction_values_are_valid(probability_predictions)
    utils.assert_label_and_prediction_length_match(labels,
                                                   probability_predictions)

    # Separate the probability_predictions into bins.
    label_predictions = pd.DataFrame(list(zip(labels,
                                              probability_predictions)),
                                     columns=['label', 'prediction'])
    label_predictions = label_predictions.sort_values(by='prediction',
                                                      ascending=False)
    number_total_instances = label_predictions.shape[0]
    equal_bin_size = number_total_instances / number_bins

    cumulative_bin_metrics_list = []

    for bin_number in range(1, (number_bins + 1)):
        current_bin_size = round(equal_bin_size * bin_number)
        bin_size_percentage = round(
            current_bin_size / number_total_instances * 100, 2)
        bin_instances = label_predictions.head(current_bin_size)
        positive_instance_indices = bin_instances['label'] > 0.0
        number_bin_positive_instances = bin_instances[
            positive_instance_indices].shape[0]
        conv_rate = round(
            number_bin_positive_instances / current_bin_size * 100, 2)
        min_prob = min(bin_instances['prediction'])

        for uplift_percentage in uplift_percentages:
            for power_percentage in power_percentages:
                for confidence_level_percentage in confidence_level_percentages:
                    sample_size = calc_chisquared_sample_size(
                        conv_rate, uplift_percentage, power_percentage,
                        confidence_level_percentage)
                    cumulative_bin_metrics_list.append(
                        (bin_number, current_bin_size, bin_size_percentage,
                         min_prob, conv_rate, uplift_percentage,
                         power_percentage, confidence_level_percentage,
                         sample_size))

    return pd.DataFrame(cumulative_bin_metrics_list,
                        columns=[
                            'cumulative_bin_number', 'bin_size',
                            'bin_size_percentage', 'min_probability',
                            'conv_rate_percentage', 'uplift_percentage',
                            'power_percentage', 'confidence_level_percentage',
                            'required_sample_size'
                        ])
Code example #17
def calc_performance_metrics(
        labels: np.ndarray,
        predictions: np.ndarray,
        pred_probs: np.ndarray,
        class_label_names: Optional[Dict[Union[str, int], Union[str,
                                                                int]]] = None,
        decimal_points: Optional[int] = 3,
        average_type: Optional[str] = 'weighted',
        multi_class_type: Optional[str] = 'ovr') -> Dict[str, float]:
    """Calculates performance metrics for a multiclass classification model.

  Args:
    labels: An array of true labels containing multiclass labels.
    predictions: An array of predictions containing multiclass labels.
    pred_probs: An array of shape (n_samples, n_classes) of predicted
      probabilities.
    class_label_names: Optional. Dictionary of multiclass labels and
      corresponding target names. The type of both class label and target names
      can be either 'int' or 'str'. E.g. {0: 'low_value', 1: 'mid_value', 2:
      'high_value'}.
    decimal_points: Number of decimal points to use when outputting the
      calculated evaluation metrics.
    average_type: The averaging method applied to the data while calculating
      scores.
    multi_class_type: The method applied to AUC calculation. It can take 'ovr'
      or 'ovo'. 'ovr' stands for One-vs-rest. 'ovo' stands for One-vs-one.

  Returns:
    Dictionary of evaluation metrics of a multiclass classification model:
    {classification_report: Summary report of precision, recall, F1 score for
    each class.
    auc_roc_score: Area under the recall vs (1-specificity) (ROC) curve.
    confusion_matrix: Confusion matrix to evaluate the accuracy of each class.}
  """
    utils.assert_label_and_prediction_length_match(labels, predictions)
    assert len(labels) == pred_probs.shape[0], (
        'The true labels and prediction probability should have the same length.'
    )
    assert len(set(labels)) == pred_probs.shape[1], (
        'The number of classes of labels and prediction probability should be '
        'the same.')
    if class_label_names is None:
        class_labels = list(set(labels))
        target_names = ['%s' % l for l in class_labels]
    else:
        class_labels = list(class_label_names.keys())
        target_names = list(class_label_names.values())

    class_report = classification_report(y_true=labels,
                                         y_pred=predictions,
                                         labels=class_labels,
                                         target_names=target_names)
    auc_score = roc_auc_score(y_true=labels,
                              y_score=pred_probs,
                              average=average_type,
                              multi_class=multi_class_type)

    conf_matrix = confusion_matrix(y_true=labels,
                                   y_pred=predictions,
                                   labels=class_labels)

    return {
        'classification_report': class_report,
        'auc_roc_score': round(auc_score, decimal_points),
        'confusion_matrix': conf_matrix
    }
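A minimal usage sketch for the multiclass calc_performance_metrics with illustrative data; classification_report, roc_auc_score and confusion_matrix are assumed to be imported from sklearn.metrics at module level, as the function body implies:

labels = np.array([0, 1, 2, 1, 0, 2])
predictions = np.array([0, 1, 2, 2, 0, 2])
pred_probs = np.array([[0.8, 0.1, 0.1],
                       [0.2, 0.6, 0.2],
                       [0.1, 0.2, 0.7],
                       [0.2, 0.3, 0.5],
                       [0.7, 0.2, 0.1],
                       [0.1, 0.1, 0.8]])
multiclass_metrics = calc_performance_metrics(
    labels, predictions, pred_probs,
    class_label_names={0: 'low_value', 1: 'mid_value', 2: 'high_value'})
print(multiclass_metrics['classification_report'])
print(multiclass_metrics['auc_roc_score'])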