Example #1
0
    def test_get_contingency_table(self):
        """Ensures correct output from get_contingency_table.

        Builds the contingency table from the half-threshold forecast labels
        and the observed labels, then compares it with the expected table.
        """

        this_contingency_table = model_eval.get_contingency_table(
            FORECAST_LABELS_THRESHOLD_HALF, OBSERVED_LABELS)

        # assertEqual (rather than assertTrue on an `==` expression) reports
        # both operands when the comparison fails.
        self.assertEqual(
            this_contingency_table, CONTINGENCY_TABLE_THRESHOLD_HALF)
Example #2
0
def run_evaluation(class_probability_matrix, observed_labels, output_dir_name):
    """Evaluates a set of multiclass probabilistic predictions.

    E = number of examples
    K = number of classes

    This method does the following, in order:

    - finds the binarization threshold that maximizes the Gerrity score
    - determinizes the probabilities and builds the multiclass contingency
      table, then computes accuracy and Peirce, Heidke and Gerrity scores
    - collapses labels to binary (label > 0 vs. label == 0) and computes POD,
      POFD, success ratio, FOCN, accuracy, CSI and frequency bias
    - calls the ROC-curve, performance-diagram and attributes-diagram helpers
    - writes all results to "<output_dir_name>/model_evaluation.p"

    :param class_probability_matrix: E-by-K numpy array, where
        class_probability_matrix[i, k] = probability that the [i]th example
        belongs to the [k]th class.  Classes should be mutually exclusive and
        collectively exhaustive, so that the sum across each row is 1.0.
    :param observed_labels: length-E numpy array of observed labels.  Each label
        must be an integer from 0...(K - 1).
    :param output_dir_name: Name of output directory.  Results will be saved
        here.
    """

    # NOTE: every `print` below takes exactly one parenthesized argument, so
    # the output is the same whether `print` is a statement (Python 2) or a
    # function (Python 3).

    file_system_utils.mkdir_recursive_if_necessary(
        directory_name=output_dir_name)

    print('Finding best binarization threshold (front vs. no front)...')

    binarization_threshold, best_gerrity_score = (
        eval_utils.find_best_binarization_threshold(
            class_probability_matrix=class_probability_matrix,
            observed_labels=observed_labels,
            threshold_arg=model_eval.THRESHOLD_ARG_FOR_UNIQUE_FORECASTS,
            criterion_function=eval_utils.get_gerrity_score,
            optimization_direction=eval_utils.MAX_OPTIMIZATION_DIRECTION,
            forecast_precision_for_thresholds=FORECAST_PRECISION_FOR_THRESHOLDS
        ))

    print((
        'Best binarization threshold = {0:.4f} ... corresponding Gerrity score '
        '= {1:.4f}'
    ).format(binarization_threshold, best_gerrity_score))

    print('Determinizing multiclass probabilities...')
    predicted_labels = eval_utils.determinize_probabilities(
        class_probability_matrix=class_probability_matrix,
        binarization_threshold=binarization_threshold)

    contingency_matrix = eval_utils.get_contingency_table(
        predicted_labels=predicted_labels,
        observed_labels=observed_labels,
        num_classes=class_probability_matrix.shape[1])

    print('Multiclass contingency table is shown below:\n{0:s}'.format(
        str(contingency_matrix)))
    print(SEPARATOR_STRING)

    # Multiclass scores, all derived from the K-class contingency table.
    accuracy = eval_utils.get_accuracy(contingency_matrix)
    peirce_score = eval_utils.get_peirce_score(contingency_matrix)
    heidke_score = eval_utils.get_heidke_score(contingency_matrix)
    gerrity_score = eval_utils.get_gerrity_score(contingency_matrix)

    print((
        'Multiclass accuracy = {0:.4f} ... Peirce score = {1:.4f} ... '
        'Heidke score = {2:.4f} ... Gerrity score = {3:.4f}\n'
    ).format(accuracy, peirce_score, heidke_score, gerrity_score))

    # Collapse to a two-class problem: class 0 vs. any class > 0.
    binary_contingency_dict = model_eval.get_contingency_table(
        forecast_labels=(predicted_labels > 0).astype(int),
        observed_labels=(observed_labels > 0).astype(int))

    print('Binary contingency table is shown below:\n{0:s}'.format(
        str(binary_contingency_dict)))
    print(SEPARATOR_STRING)

    binary_pod = model_eval.get_pod(binary_contingency_dict)
    binary_pofd = model_eval.get_pofd(binary_contingency_dict)
    binary_success_ratio = model_eval.get_success_ratio(
        binary_contingency_dict)
    binary_focn = model_eval.get_focn(binary_contingency_dict)
    binary_accuracy = model_eval.get_accuracy(binary_contingency_dict)
    binary_csi = model_eval.get_csi(binary_contingency_dict)
    binary_frequency_bias = model_eval.get_frequency_bias(
        binary_contingency_dict)

    print((
        'Binary POD = {0:.4f} ... POFD = {1:.4f} ... success ratio = {2:.4f} '
        '... FOCN = {3:.4f} ... accuracy = {4:.4f} ... CSI = {5:.4f} ... '
        'frequency bias = {6:.4f}\n'
    ).format(
        binary_pod, binary_pofd, binary_success_ratio, binary_focn,
        binary_accuracy, binary_csi, binary_frequency_bias))

    auc_by_class, sklearn_auc_by_class = _plot_roc_curves(
        class_probability_matrix=class_probability_matrix,
        observed_labels=observed_labels,
        output_dir_name=output_dir_name)
    print('\n')

    aupd_by_class = _plot_performance_diagrams(
        class_probability_matrix=class_probability_matrix,
        observed_labels=observed_labels,
        output_dir_name=output_dir_name)
    print('\n')

    reliability_by_class, bss_by_class = _plot_attributes_diagrams(
        class_probability_matrix=class_probability_matrix,
        observed_labels=observed_labels,
        output_dir_name=output_dir_name)
    print('\n')

    evaluation_file_name = '{0:s}/model_evaluation.p'.format(output_dir_name)
    print('Writing results to: "{0:s}"...\n'.format(evaluation_file_name))

    eval_utils.write_evaluation_results(
        class_probability_matrix=class_probability_matrix,
        observed_labels=observed_labels,
        binarization_threshold=binarization_threshold,
        accuracy=accuracy,
        peirce_score=peirce_score,
        heidke_score=heidke_score,
        gerrity_score=gerrity_score,
        binary_pod=binary_pod,
        binary_pofd=binary_pofd,
        binary_success_ratio=binary_success_ratio,
        binary_focn=binary_focn,
        binary_accuracy=binary_accuracy,
        binary_csi=binary_csi,
        binary_frequency_bias=binary_frequency_bias,
        auc_by_class=auc_by_class,
        scikit_learn_auc_by_class=sklearn_auc_by_class,
        aupd_by_class=aupd_by_class,
        reliability_by_class=reliability_by_class,
        bss_by_class=bss_by_class,
        pickle_file_name=evaluation_file_name)
def _select_example_indices(observed_labels, downsampling_dict):
    """Returns indices of the examples to use, with or without downsampling.

    E = number of examples

    :param observed_labels: length-E numpy array of observations.
    :param downsampling_dict: See doc for `_compute_scores`.
    :return: example_indices: 1-D numpy array of integer indices into
        `observed_labels`.  If `downsampling_dict is None`, this is
        0...(E - 1); otherwise it is the output of
        `deep_learning_utils.sample_by_class`.
    """

    num_examples = len(observed_labels)

    if downsampling_dict is None:
        return numpy.arange(num_examples, dtype=int)

    return dl_utils.sample_by_class(
        sampling_fraction_by_class_dict=downsampling_dict,
        target_name=DUMMY_TARGET_NAME,
        target_values=observed_labels,
        num_examples_total=num_examples)


def _draw_bootstrap_sample(example_indices):
    """Draws one bootstrap sample from a set of example indices.

    :param example_indices: 1-D numpy array of integer indices.
    :return: sampled_indices: 1-D numpy array of integer indices.  Empty if
        `example_indices` is empty, in which case
        `bootstrapping.draw_sample` is not called.
    """

    if len(example_indices) == 0:
        return numpy.array([], dtype=int)

    return bootstrapping.draw_sample(example_indices)[0]


def _compute_scores(forecast_probabilities,
                    observed_labels,
                    num_bootstrap_reps,
                    output_file_name,
                    best_prob_threshold=None,
                    downsampling_dict=None):
    """Computes evaluation scores.

    E = number of examples (storm objects)

    Thresholds, the best threshold (if not supplied) and the per-bin example
    counts are computed once, from the full (or downsampled) set of examples.
    `model_eval.run_evaluation` is then called once per bootstrap replicate and
    the resulting tables are concatenated row-wise before being written.

    :param forecast_probabilities: length-E numpy array of forecast event
        probabilities.
    :param observed_labels: length-E numpy array of observations (1 for event,
        0 for non-event).
    :param num_bootstrap_reps: Number of bootstrap replicates.  If 1, no
        bootstrap resampling is done.
    :param output_file_name: Path to output file (will be written by
        `model_evaluation.write_evaluation`).
    :param best_prob_threshold: Best probability threshold.  If None, will be
        determined on the fly (as the threshold that maximizes CSI).
    :param downsampling_dict: Dictionary with downsampling fractions.  See doc
        for `deep_learning_utils.sample_by_class`.  If this is None,
        downsampling will not be used.
    """

    num_examples = len(observed_labels)
    num_examples_by_class = numpy.unique(
        observed_labels, return_counts=True)[-1]

    print('Number of examples by class (no downsampling): {0:s}'.format(
        str(num_examples_by_class)))

    positive_example_indices = numpy.where(observed_labels == 1)[0]
    negative_example_indices = numpy.where(observed_labels == 0)[0]

    these_indices = _select_example_indices(
        observed_labels=observed_labels, downsampling_dict=downsampling_dict)

    if downsampling_dict is not None:
        this_num_ex_by_class = numpy.unique(
            observed_labels[these_indices], return_counts=True)[-1]

        print('Number of examples by class (after downsampling): {0:s}'.format(
            str(this_num_ex_by_class)))

    all_prob_thresholds = model_eval.get_binarization_thresholds(
        threshold_arg=model_eval.THRESHOLD_ARG_FOR_UNIQUE_FORECASTS,
        forecast_probabilities=forecast_probabilities[these_indices],
        forecast_precision=FORECAST_PRECISION)

    if best_prob_threshold is None:
        best_prob_threshold, best_csi = (
            model_eval.find_best_binarization_threshold(
                forecast_probabilities=forecast_probabilities[these_indices],
                observed_labels=observed_labels[these_indices],
                threshold_arg=all_prob_thresholds,
                criterion_function=model_eval.get_csi,
                optimization_direction=model_eval.MAX_OPTIMIZATION_STRING))
    else:
        # Threshold was supplied by the caller; compute its CSI only so that
        # it can be reported below.
        these_forecast_labels = model_eval.binarize_forecast_probs(
            forecast_probabilities=forecast_probabilities[these_indices],
            binarization_threshold=best_prob_threshold)

        this_contingency_dict = model_eval.get_contingency_table(
            forecast_labels=these_forecast_labels,
            observed_labels=observed_labels[these_indices])

        best_csi = model_eval.get_csi(this_contingency_dict)

    print(
        ('Best probability threshold = {0:.4f} ... corresponding CSI = {1:.4f}'
         ).format(best_prob_threshold, best_csi))

    num_examples_by_forecast_bin = model_eval.get_points_in_reliability_curve(
        forecast_probabilities=forecast_probabilities[these_indices],
        observed_labels=observed_labels[these_indices],
        num_forecast_bins=model_eval.DEFAULT_NUM_RELIABILITY_BINS)[-1]

    list_of_evaluation_tables = []

    for i in range(num_bootstrap_reps):
        print(('Computing scores for {0:d}th of {1:d} bootstrap replicates...'
               ).format(i + 1, num_bootstrap_reps))

        if num_bootstrap_reps == 1:
            # NOTE(review): with downsampling this is a second call to
            # `dl_utils.sample_by_class`; if that sampler is random, the
            # examples evaluated here may differ from those used above to
            # pick thresholds -- confirm this is intended.
            these_indices = _select_example_indices(
                observed_labels=observed_labels,
                downsampling_dict=downsampling_dict)
        else:
            # Resample each class separately.  Either class may be empty, in
            # which case it contributes no indices.
            these_indices = numpy.concatenate((
                _draw_bootstrap_sample(positive_example_indices),
                _draw_bootstrap_sample(negative_example_indices)
            ))

            if downsampling_dict is not None:
                these_subindices = dl_utils.sample_by_class(
                    sampling_fraction_by_class_dict=downsampling_dict,
                    target_name=DUMMY_TARGET_NAME,
                    target_values=observed_labels[these_indices],
                    num_examples_total=num_examples)

                these_indices = these_indices[these_subindices]

        if downsampling_dict is not None:
            this_num_ex_by_class = numpy.unique(
                observed_labels[these_indices], return_counts=True)[-1]

            print('Number of examples by class: {0:s}'.format(
                str(this_num_ex_by_class)))

        this_evaluation_table = model_eval.run_evaluation(
            forecast_probabilities=forecast_probabilities[these_indices],
            observed_labels=observed_labels[these_indices],
            best_prob_threshold=best_prob_threshold,
            all_prob_thresholds=all_prob_thresholds,
            climatology=numpy.mean(observed_labels[these_indices]))

        list_of_evaluation_tables.append(this_evaluation_table)

        if i == num_bootstrap_reps - 1:
            print(SEPARATOR_STRING)
        else:
            print(MINOR_SEPARATOR_STRING)

        if i > 0:
            # Give every later table the same columns as the first one, so
            # that the row-wise concatenation below lines up.
            list_of_evaluation_tables[-1] = list_of_evaluation_tables[-1].align(
                list_of_evaluation_tables[0], axis=1)[0]

    evaluation_table = pandas.concat(
        list_of_evaluation_tables, axis=0, ignore_index=True)

    print('Writing results to: "{0:s}"...'.format(output_file_name))

    model_eval.write_evaluation(
        pickle_file_name=output_file_name,
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels,
        best_prob_threshold=best_prob_threshold,
        all_prob_thresholds=all_prob_thresholds,
        num_examples_by_forecast_bin=num_examples_by_forecast_bin,
        downsampling_dict=downsampling_dict,
        evaluation_table=evaluation_table)
Example #4
0
def run_evaluation(forecast_probabilities, observed_labels, output_dir_name):
    """Evaluates forecast-observation pairs from any forecasting method.

    Specifically, this method does the following:

    - finds the binarization threshold that maximizes CSI
    - creates ROC (receiver operating characteristic) curve
    - creates performance diagram
    - creates attributes diagram
    - saves each of the aforelisted figures to a .jpg file
    - computes many performance metrics and saves them to a Pickle file
      ("<output_dir_name>/model_evaluation.p")

    :param forecast_probabilities: length-N numpy array of forecast event
        probabilities.
    :param observed_labels: length-N numpy array of observed labels (1 for
        "yes", 0 for "no").
    :param output_dir_name: Name of output directory.
    """

    # NOTE: every `print` below takes exactly one parenthesized argument, so
    # the output is the same whether `print` is a statement (Python 2) or a
    # function (Python 3).

    file_system_utils.mkdir_recursive_if_necessary(
        directory_name=output_dir_name)

    # TODO(thunderhoser): Make binarization threshold an input argument to this
    # method.
    binarization_threshold, best_csi = (
        model_eval.find_best_binarization_threshold(
            forecast_probabilities=forecast_probabilities,
            observed_labels=observed_labels,
            threshold_arg=model_eval.THRESHOLD_ARG_FOR_UNIQUE_FORECASTS,
            criterion_function=model_eval.get_csi,
            optimization_direction=model_eval.MAX_OPTIMIZATION_DIRECTION,
            unique_forecast_precision=FORECAST_PRECISION_FOR_THRESHOLDS))

    print((
        'Best binarization threshold = {0:.4f} ... corresponding CSI = {1:.4f}'
    ).format(binarization_threshold, best_csi))

    print('Binarizing forecast probabilities...')
    forecast_labels = model_eval.binarize_forecast_probs(
        forecast_probabilities=forecast_probabilities,
        binarization_threshold=binarization_threshold)

    print('Creating contingency table...')
    contingency_table_as_dict = model_eval.get_contingency_table(
        forecast_labels=forecast_labels, observed_labels=observed_labels)
    print('{0:s}\n'.format(str(contingency_table_as_dict)))

    print('Computing performance metrics...')
    pod = model_eval.get_pod(contingency_table_as_dict)
    pofd = model_eval.get_pofd(contingency_table_as_dict)
    success_ratio = model_eval.get_success_ratio(contingency_table_as_dict)
    focn = model_eval.get_focn(contingency_table_as_dict)
    accuracy = model_eval.get_accuracy(contingency_table_as_dict)
    csi = model_eval.get_csi(contingency_table_as_dict)
    frequency_bias = model_eval.get_frequency_bias(contingency_table_as_dict)
    peirce_score = model_eval.get_peirce_score(contingency_table_as_dict)
    heidke_score = model_eval.get_heidke_score(contingency_table_as_dict)

    print((
        'POD = {0:.4f} ... POFD = {1:.4f} ... success ratio = {2:.4f} ... '
        'FOCN = {3:.4f} ... accuracy = {4:.4f} ... CSI = {5:.4f} ... frequency '
        'bias = {6:.4f} ... Peirce score = {7:.4f} ... Heidke score = {8:.4f}\n'
    ).format(
        pod, pofd, success_ratio, focn, accuracy, csi, frequency_bias,
        peirce_score, heidke_score))

    auc, scikit_learn_auc = _create_roc_curve(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels, output_dir_name=output_dir_name)
    print('\n')

    bss_dict = _create_attributes_diagram(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels, output_dir_name=output_dir_name)
    print('\n')

    aupd = _create_performance_diagram(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels, output_dir_name=output_dir_name)
    print('\n')

    evaluation_file_name = '{0:s}/model_evaluation.p'.format(output_dir_name)
    print('Writing results to: "{0:s}"...'.format(evaluation_file_name))

    model_eval.write_results(
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels,
        binarization_threshold=binarization_threshold, pod=pod, pofd=pofd,
        success_ratio=success_ratio, focn=focn, accuracy=accuracy, csi=csi,
        frequency_bias=frequency_bias, peirce_score=peirce_score,
        heidke_score=heidke_score, auc=auc, scikit_learn_auc=scikit_learn_auc,
        aupd=aupd, bss_dict=bss_dict, pickle_file_name=evaluation_file_name)