def plot_predicted_probabilities(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    colors: Optional[Sequence[str]] = ('b', 'g'),
    print_stats: bool = True,
    fig_width: Optional[int] = 20,
    fig_height: Optional[int] = 15) -> axes.Axes:
  """Plots the distributions of predicted probabilities for each class.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0
      and 1.0.
    colors: Colors for the probability plots.
    print_stats: Whether to add the mean, standard deviation and median of the
      predicted probabilities of each class to the plot legend.
    fig_width: Width of the figure.
    fig_height: Height of the figure.

  Returns:
    plots: Class density plots of the predicted probabilities.
  """
  utils.assert_label_values_are_valid(labels)
  utils.assert_prediction_values_are_valid(probability_predictions)
  utils.assert_label_and_prediction_length_match(labels,
                                                 probability_predictions)
  assert len(np.unique(labels)) == len(colors), (
      'Number of colors should be the same as number of unique labels.')

  unique_labels = np.sort(np.unique(labels))
  _, plots = pyplot.subplots(figsize=(fig_width, fig_height))
  for color, label in zip(colors, unique_labels):
    index_plot = np.where(labels == label)[0]
    preds_plot = probability_predictions[index_plot]
    label_plot = 'class[%s]' % str(label)
    if print_stats:
      label_plot += ': mean=%.4f, std=%.4f, median=%.4f' % (
          np.mean(preds_plot), np.std(preds_plot), np.median(preds_plot))
    sns.kdeplot(
        preds_plot, shade=True, color=color, label=label_plot, ax=plots)
  pyplot.title('Distribution of predicted probabilities')
  pyplot.legend()
  pyplot.xlabel('Probability')
  pyplot.ylabel('Density')
  pyplot.xlim([0, 1])

  return plots

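# Illustrative usage sketch (not part of the module): plots the class
# densities for synthetic labels and scores. The synthetic data and the helper
# name _example_plot_predicted_probabilities are made up for demonstration;
# the module-level imports (numpy as np, matplotlib's pyplot) are assumed.
def _example_plot_predicted_probabilities():
  rng = np.random.default_rng(seed=0)
  labels = rng.integers(0, 2, size=1000).astype(float)
  # Sigmoid of noisy scores: positives skew towards 1.0, negatives towards 0.0.
  predictions = 1.0 / (1.0 + np.exp(-rng.normal(loc=2.0 * labels - 1.0,
                                                scale=1.0)))
  ax = plot_predicted_probabilities(labels, predictions)
  pyplot.show()
  return ax
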
def plot_precision_recall_curve(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    print_stats: bool = True,
    fig_width: Optional[int] = 8,
    fig_height: Optional[int] = 8,
    curve_color: Optional[str] = 'blue') -> axes.Axes:
  """Plots the Precision-Recall curve for the predictions.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0
      and 1.0.
    print_stats: Whether to print the Average Precision in the plot title.
    fig_width: Width of the figure.
    fig_height: Height of the figure.
    curve_color: Color of the Precision-Recall curve.

  Returns:
    plots: Plot of the Precision-Recall curve.
  """
  utils.assert_label_values_are_valid(labels)
  utils.assert_prediction_values_are_valid(probability_predictions)
  utils.assert_label_and_prediction_length_match(labels,
                                                 probability_predictions)

  precision, recall, _ = sklearn.metrics.precision_recall_curve(
      labels, probability_predictions)
  _, plots = pyplot.subplots(figsize=(fig_width, fig_height))
  pyplot.plot(
      recall,
      precision,
      marker='.',
      label='Precision-Recall',
      color=curve_color)
  pyplot.xlabel('Recall')
  pyplot.ylabel('Precision')
  pyplot.legend()
  if print_stats:
    pyplot.title('Average Precision=%.4f' %
                 sklearn.metrics.average_precision_score(
                     labels, probability_predictions))

  return plots

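# Illustrative usage sketch (not part of the module): plots a Precision-Recall
# curve for synthetic data. The data and the helper name are made up for
# demonstration only.
def _example_plot_precision_recall_curve():
  rng = np.random.default_rng(seed=1)
  labels = rng.integers(0, 2, size=1000).astype(float)
  predictions = 1.0 / (1.0 + np.exp(-rng.normal(loc=2.0 * labels - 1.0,
                                                scale=1.0)))
  ax = plot_precision_recall_curve(labels, predictions)
  pyplot.show()
  return ax
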
def calc_chisquared_sample_sizes_for_bins(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    number_bins: int = 3,
    uplift_percentages: Sequence[np.float64] = (10, 20),
    power_percentages: Sequence[np.float64] = (80, 90),
    confidence_level_percentages: Sequence[np.float64] = (90, 95)
) -> pd.DataFrame:
  """Calculates statistical sample sizes for the bins defined on predictions.

  These sample sizes for the bins defined on the predicted probabilities are
  estimated using the Chi-squared test of proportions for each combination of
  uplift_percentage, power_percentage and confidence_level_percentage. These
  sizes could be used as the minimum required size for each Test or Control
  group when designing an experiment to target users from each of these bins
  of predictions.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0
      and 1.0.
    number_bins: Number of bins that we want to divide the ranked predictions
      into. Default is 3 bins such that the 1st bin contains the highest 1/3rd
      of the predictions (High Propensity group), the 2nd bin contains the
      next 1/3rd of the predictions (Medium Propensity group) and the last bin
      contains the lowest 1/3rd of the predictions (Low Propensity group).
    uplift_percentages: Sequence of different expected uplift percentages.
    power_percentages: Sequence of different statistical powers for the test.
    confidence_level_percentages: Sequence of different statistical confidence
      levels for the test.

  Returns:
    bin_metrics: Following metrics calculated for each bin of the predictions.
      bin_number: Bin number starting from 1, where bin 1 contains the highest
        predicted probabilities.
      bin_size: Total number of instances in the bin.
      min_probability: Minimum predicted probability within the bin.
      conv_rate_percentage: Proportion of positive instances out of all the
        instances in the bin (precision) as a percentage.
      uplift_percentage: Expected uplift percentage.
      power_percentage: Statistical power of the test.
      confidence_level_percentage: Statistical confidence level of the test.
      required_sample_size: Statistical sample size required.
  """
  utils.assert_label_values_are_valid(labels)
  utils.assert_prediction_values_are_valid(probability_predictions)
  utils.assert_label_and_prediction_length_match(labels,
                                                 probability_predictions)

  # Separate the probability_predictions into bins of equal size.
  binned_data = pd.DataFrame(
      list(zip(labels, probability_predictions)),
      columns=['label', 'prediction'])
  binned_data = binned_data.sort_values('prediction').reset_index()
  # To avoid duplicate bin edges, bin on the index in the qcut function below.
  binned_data['bin_number'] = pd.qcut(
      binned_data.index, q=number_bins, labels=False)

  # Calculate the conversion rate for each bin.
  total_instances = (
      binned_data[['bin_number', 'label']].groupby('bin_number').count())
  total_instances.columns = ['bin_size']
  total_instances = total_instances.reset_index()
  positive_instances = (
      binned_data.loc[binned_data['label'] > 0][['bin_number', 'label']]
      .groupby('bin_number').count())
  positive_instances.columns = ['positive_instances']
  positive_instances = positive_instances.reset_index()

  bin_conv_rate = pd.merge(
      total_instances, positive_instances, on='bin_number', how='left')
  bin_conv_rate.fillna(0, inplace=True)
  bin_conv_rate['conversion_rate'] = round(
      (bin_conv_rate['positive_instances'] / bin_conv_rate['bin_size'] * 100),
      2)

  bin_metrics_list = list()
  for bin_number in bin_conv_rate['bin_number']:
    conv_rate = bin_conv_rate['conversion_rate'][bin_number]
    bin_size = bin_conv_rate['bin_size'][bin_number]
    min_prob = min(
        binned_data[binned_data['bin_number'] == bin_number]['prediction'])
    for uplift_percentage in uplift_percentages:
      for power_percentage in power_percentages:
        for confidence_level_percentage in confidence_level_percentages:
          sample_size = calc_chisquared_sample_size(
              conv_rate, uplift_percentage, power_percentage,
              confidence_level_percentage)
          bin_metrics_list.append(
              (bin_number, bin_size, min_prob, conv_rate, uplift_percentage,
               power_percentage, confidence_level_percentage, sample_size))

  bin_metrics = pd.DataFrame(
      bin_metrics_list,
      columns=[
          'bin_number', 'bin_size', 'min_probability', 'conv_rate_percentage',
          'uplift_percentage', 'power_percentage',
          'confidence_level_percentage', 'required_sample_size'
      ])

  # Reverse the order of bin numbers such that bin 1 has the highest
  # predicted probability.
  bin_metrics['bin_number'] = number_bins - bin_metrics['bin_number']
  bin_metrics = bin_metrics.sort_values(['bin_number']).reset_index(drop=True)

  return bin_metrics

def calc_chisquared_sample_sizes_for_cumulative_bins(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    number_bins: int = 10,
    uplift_percentages: Sequence[np.float64] = (10, 20),
    power_percentages: Sequence[np.float64] = (80, 90),
    confidence_level_percentages: Sequence[np.float64] = (90, 95)
) -> pd.DataFrame:
  """Calculates statistical sample sizes for cumulative bins of predictions.

  These sample sizes for the cumulative bins of predicted probabilities are
  estimated using the Chi-squared test of proportions for each combination of
  uplift_percentage, power_percentage and confidence_level_percentage. These
  sizes could be used as the minimum required sizes for each Test or Control
  group when designing an experiment to target users having the top X% of
  predicted probabilities.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0
      and 1.0.
    number_bins: Number of cumulative bins that we want to divide the ranked
      predictions into. Default is deciles (10 bins) such that the 1st bin
      contains the highest 10% of the predictions, the 2nd bin contains the
      highest 20% of the predictions and so on.
    uplift_percentages: Sequence of different expected uplift percentages.
    power_percentages: Sequence of different statistical powers for the test.
    confidence_level_percentages: Sequence of different statistical confidence
      levels for the test.

  Returns:
    bin_metrics: Following metrics calculated for each cumulative bin.
      cumulative_bin_number: Bin number starting from 1.
      bin_size: Total number of instances in the bin.
      bin_size_percentage: Percentage of instances in the bin out of all the
        instances in the labels.
      min_probability: Minimum predicted probability within the bin.
      conv_rate_percentage: Proportion of positive instances out of all the
        instances in the bin (precision) as a percentage.
      uplift_percentage: Expected uplift percentage.
      power_percentage: Statistical power of the test.
      confidence_level_percentage: Statistical confidence level of the test.
      required_sample_size: Statistical sample size required.
  """
  utils.assert_label_values_are_valid(labels)
  utils.assert_prediction_values_are_valid(probability_predictions)
  utils.assert_label_and_prediction_length_match(labels,
                                                 probability_predictions)

  # Separate the probability_predictions into cumulative bins.
  label_predictions = pd.DataFrame(
      list(zip(labels, probability_predictions)),
      columns=['label', 'prediction'])
  label_predictions = label_predictions.sort_values(
      by='prediction', ascending=False)
  number_total_instances = label_predictions.shape[0]
  equal_bin_size = number_total_instances / number_bins

  cumulative_bin_metrics_list = []
  for bin_number in range(1, (number_bins + 1)):
    current_bin_size = round(equal_bin_size * bin_number)
    bin_size_percentage = round(
        current_bin_size / number_total_instances * 100, 2)
    bin_instances = label_predictions.head(current_bin_size)
    positive_instance_indices = bin_instances['label'] > 0.0
    number_bin_positive_instances = bin_instances[
        positive_instance_indices].shape[0]
    conv_rate = round(
        number_bin_positive_instances / current_bin_size * 100, 2)
    min_prob = min(bin_instances['prediction'])

    for uplift_percentage in uplift_percentages:
      for power_percentage in power_percentages:
        for confidence_level_percentage in confidence_level_percentages:
          sample_size = calc_chisquared_sample_size(
              conv_rate, uplift_percentage, power_percentage,
              confidence_level_percentage)
          cumulative_bin_metrics_list.append(
              (bin_number, current_bin_size, bin_size_percentage, min_prob,
               conv_rate, uplift_percentage, power_percentage,
               confidence_level_percentage, sample_size))

  return pd.DataFrame(
      cumulative_bin_metrics_list,
      columns=[
          'cumulative_bin_number', 'bin_size', 'bin_size_percentage',
          'min_probability', 'conv_rate_percentage', 'uplift_percentage',
          'power_percentage', 'confidence_level_percentage',
          'required_sample_size'
      ])

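# Illustrative usage sketch (not part of the module): computes required sample
# sizes when targeting the top X% of synthetic predictions. The data and the
# helper name are made up for demonstration only.
def _example_calc_chisquared_sample_sizes_for_cumulative_bins():
  rng = np.random.default_rng(seed=3)
  labels = rng.integers(0, 2, size=5000).astype(float)
  predictions = 1.0 / (1.0 + np.exp(-rng.normal(loc=2.0 * labels - 1.0,
                                                scale=1.0)))
  cumulative_metrics = calc_chisquared_sample_sizes_for_cumulative_bins(
      labels,
      predictions,
      number_bins=10,
      uplift_percentages=(15,),
      power_percentages=(80,),
      confidence_level_percentages=(95,))
  # Row for the top 30% of predictions (cumulative bin 3).
  return cumulative_metrics[
      cumulative_metrics['cumulative_bin_number'] == 3]
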
def calc_performance_metrics(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    binarize_threshold: Optional[float] = None,
    decimal_points: Optional[int] = 4,
) -> Dict[str, float]:
  """Calculates performance metrics related to a binary classification model.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0
      and 1.0.
    binarize_threshold: Probability threshold to be used to binarize the
      predicted probabilities. By default the proportion of positive instances
      in the labels is used.
    decimal_points: Number of decimal points to use when outputting the
      calculated performance metrics.

  Returns:
    metrics: Dictionary of the following performance metrics:
      prop_positives: Proportion of instances where label = 1.0.
      auc_roc: Area under the recall vs (1-specificity) (ROC) curve.
      auc_pr: Area under the precision vs recall (PR) curve.
      binarize_threshold: Probability threshold used to binarize the predicted
        probabilities.
      The following metrics are calculated after binarizing the predicted
      probabilities based on the binarize_threshold:
      accuracy: Total accuracy of the predictions.
      true_positive_rate: True positive rate (recall or sensitivity).
      true_negative_rate: True negative rate (specificity).
      precision: Precision (confidence) of the true positive predictions.
      f1_score: F1 score (harmonic mean of precision and recall).
  """
  utils.assert_label_values_are_valid(labels)
  utils.assert_prediction_values_are_valid(probability_predictions)
  utils.assert_label_and_prediction_length_match(labels,
                                                 probability_predictions)

  num_positives = labels.sum()
  prop_positives = float(num_positives) / len(labels)
  if binarize_threshold is None:
    binarize_threshold = prop_positives

  # Calculate AUC metrics.
  auc_roc = sklearn.metrics.roc_auc_score(labels, probability_predictions)
  auc_pr = sklearn.metrics.average_precision_score(labels,
                                                   probability_predictions)

  # Binarize the predictions.
  binarized_predictions = (
      probability_predictions > binarize_threshold).astype(int)

  # Calculate metrics based on binarized predictions.
  accuracy = sklearn.metrics.accuracy_score(labels, binarized_predictions)
  tp_rate = sklearn.metrics.recall_score(
      labels, binarized_predictions, pos_label=1)
  tn_rate = sklearn.metrics.recall_score(
      labels, binarized_predictions, pos_label=0)
  precision = sklearn.metrics.precision_score(labels, binarized_predictions)
  f1_score = sklearn.metrics.f1_score(labels, binarized_predictions)

  return {
      'prop_positives': round(prop_positives, decimal_points),
      'auc_roc': round(auc_roc, decimal_points),
      'auc_pr': round(auc_pr, decimal_points),
      'binarize_threshold': round(binarize_threshold, decimal_points),
      'accuracy': round(accuracy, decimal_points),
      'true_positive_rate': round(tp_rate, decimal_points),
      'true_negative_rate': round(tn_rate, decimal_points),
      'precision': round(precision, decimal_points),
      'f1_score': round(f1_score, decimal_points)
  }

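# Illustrative usage sketch (not part of the module): computes the metric
# dictionary for synthetic labels and scores. The data and the helper name are
# made up for demonstration only.
def _example_calc_performance_metrics():
  rng = np.random.default_rng(seed=4)
  labels = rng.integers(0, 2, size=1000).astype(float)
  predictions = 1.0 / (1.0 + np.exp(-rng.normal(loc=2.0 * labels - 1.0,
                                                scale=1.0)))
  metrics = calc_performance_metrics(labels, predictions)
  # e.g. metrics['auc_roc'], metrics['precision'], metrics['f1_score'].
  return metrics
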
def calc_cumulative_bin_metrics(
    labels: np.ndarray,
    probability_predictions: np.ndarray,
    number_bins: int = 10,
    decimal_points: Optional[int] = 4) -> pd.DataFrame:
  """Calculates performance metrics for cumulative bins of the predictions.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0
      and 1.0.
    number_bins: Number of cumulative bins that we want to divide the ranked
      predictions into. Default is 10 bins such that the 1st bin contains the
      highest 10% of the predictions, the 2nd bin contains the highest 20% of
      the predictions and so on.
    decimal_points: Number of decimal points to use when outputting the
      calculated performance metrics.

  Returns:
    bin_metrics: Following metrics calculated for each cumulative bin.
      cumulative_bin_number: Bin number starting from 1.
      bin_size: Total number of instances in the bin.
      bin_size_proportion: Proportion of instances in the bin out of all the
        instances in the labels.
      positive_instances: Number of positive instances in the bin.
      precision: Proportion of positive instances out of all the instances in
        the bin.
      coverage (recall): Proportion of positive instances in the bin out of
        all the positive instances in the labels.
      prop_label_positives: Proportion of positive instances in the labels.
      precision_uplift: Uplift of precision of the bin compared to the
        precision of the random prediction (prop_label_positives).
  """
  utils.assert_label_values_are_valid(labels)
  utils.assert_prediction_values_are_valid(probability_predictions)
  utils.assert_label_and_prediction_length_match(labels,
                                                 probability_predictions)

  # Separate the probability_predictions into cumulative bins.
  label_predictions = pd.DataFrame(
      list(zip(labels, probability_predictions)),
      columns=['label', 'prediction'])
  label_predictions = label_predictions.sort_values(
      by='prediction', ascending=False)
  number_total_instances = label_predictions.shape[0]
  equal_bin_size = number_total_instances / number_bins
  number_total_positive_instances = label_predictions[
      label_predictions['label'] > 0].shape[0]
  prop_label_positives = round(
      number_total_positive_instances / number_total_instances,
      decimal_points)

  cumulative_bin_metrics_list = list()
  for i in range(1, (number_bins + 1)):
    current_bin_size = round(equal_bin_size * i)
    bin_size_proportion = round(current_bin_size / number_total_instances,
                                decimal_points)
    bin_instances = label_predictions.head(current_bin_size)
    number_bin_positive_instances = bin_instances[
        bin_instances['label'] > 0].shape[0]
    bin_precision = round(number_bin_positive_instances / current_bin_size,
                          decimal_points)
    bin_recall = round(
        number_bin_positive_instances / number_total_positive_instances,
        decimal_points)
    bin_precision_uplift = round(bin_precision / prop_label_positives,
                                 decimal_points)
    cumulative_bin_metrics_list.append(
        (i, current_bin_size, bin_size_proportion,
         number_bin_positive_instances, bin_precision, bin_recall,
         prop_label_positives, bin_precision_uplift))

  return pd.DataFrame(
      cumulative_bin_metrics_list,
      columns=[
          'cumulative_bin_number', 'bin_size', 'bin_size_proportion',
          'positive_instances', 'precision', 'coverage (recall)',
          'prop_label_positives', 'precision_uplift'
      ])

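# Illustrative usage sketch (not part of the module): computes cumulative bin
# metrics for synthetic data and reads off the precision of the top decile.
# The data and the helper name are made up for demonstration only.
def _example_calc_cumulative_bin_metrics():
  rng = np.random.default_rng(seed=5)
  labels = rng.integers(0, 2, size=2000).astype(float)
  predictions = 1.0 / (1.0 + np.exp(-rng.normal(loc=2.0 * labels - 1.0,
                                                scale=1.0)))
  bin_metrics = calc_cumulative_bin_metrics(labels, predictions)
  top_decile = bin_metrics[bin_metrics['cumulative_bin_number'] == 1]
  return top_decile['precision'].iloc[0]
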
def calc_bin_metrics(labels: np.ndarray,
                     probability_predictions: np.ndarray,
                     number_bins: Optional[int] = 10,
                     decimal_points: Optional[int] = 4) -> pd.DataFrame:
  """Calculates performance metrics for each bin of the predictions.

  Args:
    labels: An array of true binary labels represented by 1.0 and 0.0.
    probability_predictions: An array of predicted probabilities between 0.0
      and 1.0.
    number_bins: Number of bins that we want to divide the ranked predictions
      into. Default is deciles (10 bins) such that the 1st bin contains the
      highest 10% of the predictions, the 2nd bin contains the next 10% of the
      predictions and so on.
    decimal_points: Number of decimal points to use when outputting the
      calculated performance metrics.

  Returns:
    bin_metrics: Following metrics calculated for each bin.
      bin_number: Bin number starting from 1, where bin 1 contains the highest
        predicted probabilities.
      bin_size: Total number of instances in the bin.
      positive_instances: Number of positive instances in the bin.
      precision: Proportion of positive instances out of all the instances in
        the bin.
      coverage: Proportion of positive instances in the bin out of all the
        positive instances in the dataset.
      prop_positives: Proportion of positive instances in the labels.
      precision_uplift: Uplift of precision compared to the precision of the
        random prediction (prop_positives).
  """
  utils.assert_label_values_are_valid(labels)
  utils.assert_prediction_values_are_valid(probability_predictions)
  utils.assert_label_and_prediction_length_match(labels,
                                                 probability_predictions)

  # Separate the probability_predictions into bins.
  bins = pd.qcut(probability_predictions, q=number_bins, labels=False)
  binned_data = pd.DataFrame(
      list(zip(labels, probability_predictions, bins)),
      columns=['label', 'prediction', 'bin_number'])

  # Calculate the metrics for each bin.
  total_instances = (
      binned_data[['bin_number', 'label']].groupby('bin_number').count())
  total_instances.columns = ['bin_size']
  total_instances = total_instances.reset_index()
  positive_instances = (
      binned_data.loc[binned_data['label'] > 0][['bin_number', 'label']]
      .groupby('bin_number').count())
  positive_instances.columns = ['positive_instances']
  positive_instances = positive_instances.reset_index()

  bin_metrics = pd.merge(
      total_instances, positive_instances, on='bin_number', how='left')
  bin_metrics.fillna(0, inplace=True)
  bin_metrics['precision'] = (
      bin_metrics['positive_instances'] / bin_metrics['bin_size'])
  bin_metrics['precision'] = [
      round(val, decimal_points) for val in bin_metrics['precision']
  ]
  prop_positives = round(labels[labels == 1.0].shape[0] / len(labels),
                         decimal_points)
  bin_metrics['prop_positives'] = prop_positives
  # Convert bin_number from zero-based offset to 1-based offset.
  bin_metrics['bin_number'] = bin_metrics['bin_number'] + 1
  bin_metrics['precision_uplift'] = bin_metrics['precision'] / prop_positives
  bin_metrics['precision_uplift'] = [
      round(val, decimal_points) for val in bin_metrics['precision_uplift']
  ]
  bin_metrics['coverage'] = (
      bin_metrics['positive_instances'] /
      sum(bin_metrics['positive_instances']))
  bin_metrics['coverage'] = [
      round(val, decimal_points) for val in bin_metrics['coverage']
  ]

  # Reverse the order of bin numbers such that bin 1 has the highest
  # predicted probability.
  bin_metrics['bin_number'] = number_bins - bin_metrics['bin_number'] + 1
  bin_metrics = bin_metrics.sort_values(['bin_number']).reset_index(drop=True)

  return bin_metrics

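# Illustrative usage sketch (not part of the module): computes per-decile
# metrics for synthetic data; for a useful model, precision should decrease as
# bin_number increases. The data and the helper name are made up for
# demonstration only.
def _example_calc_bin_metrics():
  rng = np.random.default_rng(seed=6)
  labels = rng.integers(0, 2, size=2000).astype(float)
  predictions = 1.0 / (1.0 + np.exp(-rng.normal(loc=2.0 * labels - 1.0,
                                                scale=1.0)))
  bin_metrics = calc_bin_metrics(labels, predictions, number_bins=10)
  return bin_metrics[['bin_number', 'precision', 'coverage',
                      'precision_uplift']]
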
  def test_assert_label_values_are_valid_raises_right_error(self):
    with self.assertRaises(AssertionError):
      utils.assert_label_values_are_valid(
          np.array([1.0, 1.0, 1.0, 0.0, 0.0, 4.0]))