# Example #1
# 0
    def test_sample_by_class_wind(self):
        """Verifies output of sample_by_class.

        Here the target variable is the wind-speed category (four classes).
        """

        actual_indices = dl_utils.sample_by_class(
            sampling_fraction_by_class_dict=(
                SAMPLING_FRACTION_BY_WIND_4CLASS_DICT),
            target_name=WIND_TARGET_NAME_4CLASSES,
            target_values=WIND_LABELS_TO_SAMPLE,
            num_examples_total=NUM_EXAMPLES_TOTAL,
            test_mode=True)

        self.assertTrue(
            numpy.array_equal(actual_indices, WIND_INDICES_TO_KEEP))
    def test_sample_by_class_tornado(self):
        """Verifies output of sample_by_class.

        Here the target variable is tornado occurrence (balanced sampling).
        """

        actual_indices = dl_utils.sample_by_class(
            sampling_fraction_by_class_dict=(
                BALANCED_FRACTION_BY_TORNADO_CLASS_DICT),
            target_name=TORNADO_TARGET_NAME,
            target_values=TORNADO_LABELS_TO_SAMPLE,
            num_examples_total=NUM_EXAMPLES_TOTAL,
            test_mode=True)

        self.assertTrue(
            numpy.array_equal(actual_indices, TORNADO_INDICES_TO_KEEP))
def _downsampling_base(storm_ids,
                       storm_times_unix_sec,
                       target_values,
                       target_name,
                       class_fraction_dict,
                       test_mode=False):
    """Base for `downsample_for_training` and `downsample_for_non_training`.

    The procedure is described below.

    [1] Find all storm objects in the highest class (e.g., tornadic).  Call this
        set {s_highest}.
    [2] Find all storm cells with at least one object in {s_highest}.  Call this
        set {S_highest}.
    [3] Find all time steps with at least one storm cell in {S_highest}.  Call
        this set {t_highest}.
    [4] Randomly remove a large fraction of time steps NOT in {t_highest}.
    [5] Downsample remaining storm objects, leaving a prescribed fraction in
        each class (according to `class_fraction_dict`).

    N = number of storm objects before downsampling
    K = number of storm objects after intermediate downsampling
    n = number of storm objects after final downsampling

    :param storm_ids: length-N list of storm IDs (strings).
    :param storm_times_unix_sec: length-N numpy array of corresponding times.
    :param target_values: length-N numpy array of corresponding target values
        (integer class labels).
    :param target_name: Name of target variable (must be accepted by
        `target_val_utils.target_name_to_params`).
    :param class_fraction_dict: Dictionary, where each key is an integer class
        label (-2 for "dead storm") and the corresponding value is the
        sampling fraction.
    :param test_mode: For unit tests only; makes step 4 deterministic instead
        of random.
    :return: storm_ids: length-K list of storm IDs (strings).
    :return: storm_times_unix_sec: length-K numpy array of corresponding times.
    :return: target_values: length-K numpy array of corresponding target values.
    :return: indices_to_keep: length-n numpy array of indices to keep.  These
        indices are into the output arrays `storm_ids`,
        `storm_times_unix_sec`, and `target_values`.
    """

    _report_class_fractions(target_values)
    error_checking.assert_is_boolean(test_mode)

    num_storm_objects = len(storm_ids)
    num_classes = target_val_utils.target_name_to_num_classes(
        target_name=target_name, include_dead_storms=False)

    # Step 1: find storm objects in the highest class.  NOTE: `.format` must
    # be applied to the string *inside* print(), not to print()'s return
    # value; the original form broke under Python 3.
    print(
        'Finding storm objects in class {0:d} (the highest class), yielding '
        'set {{s_highest}}...'.format(num_classes - 1)
    )

    highest_class_indices = numpy.where(target_values == num_classes - 1)[0]

    print('{{s_highest}} contains {0:d} of {1:d} storm objects.'.format(
        len(highest_class_indices), num_storm_objects))

    # Step 2: expand to every object of every storm cell that contains at
    # least one highest-class object.
    print(
        'Finding storm cells with at least one object in {s_highest}, '
        'yielding set {S_highest}...')

    highest_class_indices = _find_storm_cells(
        storm_id_by_object=storm_ids,
        desired_storm_cell_ids=[storm_ids[k] for k in highest_class_indices])

    print('{{S_highest}} contains {0:d} of {1:d} storm objects.'.format(
        len(highest_class_indices), num_storm_objects))

    # Step 3: times NOT in {t_highest} = all times minus times at which any
    # object in {S_highest} occurs.
    print(
        'Finding all time steps with at least one storm cell in '
        '{S_highest}, yielding set {t_highest}...')

    lower_class_times_unix_sec = (
        set(storm_times_unix_sec.tolist()) -
        set(storm_times_unix_sec[highest_class_indices].tolist()))
    lower_class_times_unix_sec = numpy.array(
        list(lower_class_times_unix_sec), dtype=int)

    # Step 4: remove a fraction of the "uninteresting" times.
    print('Randomly removing {0:.1f}% of times not in {{t_highest}}...'.format(
        FRACTION_UNINTERESTING_TIMES_TO_OMIT * 100))

    this_num_times = int(numpy.round(
        FRACTION_UNINTERESTING_TIMES_TO_OMIT * len(lower_class_times_unix_sec)
    ))

    if test_mode:
        # Deterministic choice, so unit tests can predict the output.
        times_to_remove_unix_sec = lower_class_times_unix_sec[:this_num_times]
    else:
        times_to_remove_unix_sec = numpy.random.choice(
            lower_class_times_unix_sec, size=this_num_times, replace=False)

    indices_to_keep = _find_uncovered_times(
        all_times_unix_sec=storm_times_unix_sec,
        covered_times_unix_sec=times_to_remove_unix_sec)

    storm_ids = [storm_ids[k] for k in indices_to_keep]
    storm_times_unix_sec = storm_times_unix_sec[indices_to_keep]
    target_values = target_values[indices_to_keep]

    _report_class_fractions(target_values)

    # Step 5: class-based downsampling of whatever survived step 4.
    print('Downsampling storm objects from remaining times...')

    indices_to_keep = dl_utils.sample_by_class(
        sampling_fraction_by_class_dict=class_fraction_dict,
        target_name=target_name,
        target_values=target_values,
        num_examples_total=LARGE_INTEGER,
        test_mode=test_mode)

    return storm_ids, storm_times_unix_sec, target_values, indices_to_keep
def _compute_scores(forecast_probabilities,
                    observed_labels,
                    num_bootstrap_reps,
                    output_file_name,
                    best_prob_threshold=None,
                    downsampling_dict=None):
    """Computes evaluation scores.

    E = number of examples (storm objects)

    :param forecast_probabilities: length-E numpy array of forecast event
        probabilities.
    :param observed_labels: length-E numpy array of observations (1 for event,
        0 for non-event).
    :param num_bootstrap_reps: Number of bootstrap replicates.
    :param output_file_name: Path to output file (will be written by
        `model_evaluation.write_evaluation`).
    :param best_prob_threshold: Best probability threshold.  If None, will be
        determined on the fly.
    :param downsampling_dict: Dictionary with downsampling fractions.  See doc
        for `deep_learning_utils.sample_by_class`.  If this is None,
        downsampling will not be used.
    :return: None.  Results are written to `output_file_name`.
    """

    num_examples = len(observed_labels)
    # numpy.unique with return_counts=True returns (unique_values, counts);
    # [-1] grabs the counts array.
    num_examples_by_class = numpy.unique(observed_labels,
                                         return_counts=True)[-1]

    print('Number of examples by class (no downsampling): {0:s}'.format(
        str(num_examples_by_class)))

    positive_example_indices = numpy.where(observed_labels == 1)[0]
    negative_example_indices = numpy.where(observed_labels == 0)[0]

    if downsampling_dict is None:
        # No downsampling: select every example (indices 0 ... E - 1).
        these_indices = numpy.linspace(0,
                                       num_examples - 1,
                                       num=num_examples,
                                       dtype=int)
    else:
        # Downsample examples by class, per the prescribed fractions.
        these_indices = dl_utils.sample_by_class(
            sampling_fraction_by_class_dict=downsampling_dict,
            target_name=DUMMY_TARGET_NAME,
            target_values=observed_labels,
            num_examples_total=num_examples)

        this_num_ex_by_class = numpy.unique(observed_labels[these_indices],
                                            return_counts=True)[-1]

        print('Number of examples by class (after downsampling): {0:s}'.format(
            str(this_num_ex_by_class)))

    # Candidate binarization thresholds, derived from the forecast
    # probabilities of the selected examples.
    all_prob_thresholds = model_eval.get_binarization_thresholds(
        threshold_arg=model_eval.THRESHOLD_ARG_FOR_UNIQUE_FORECASTS,
        forecast_probabilities=forecast_probabilities[these_indices],
        forecast_precision=FORECAST_PRECISION)

    if best_prob_threshold is None:
        # No threshold given: find the one that maximizes CSI on the
        # selected examples.
        best_prob_threshold, best_csi = (
            model_eval.find_best_binarization_threshold(
                forecast_probabilities=forecast_probabilities[these_indices],
                observed_labels=observed_labels[these_indices],
                threshold_arg=all_prob_thresholds,
                criterion_function=model_eval.get_csi,
                optimization_direction=model_eval.MAX_OPTIMIZATION_STRING))
    else:
        # Threshold given by caller: just compute the corresponding CSI.
        these_forecast_labels = model_eval.binarize_forecast_probs(
            forecast_probabilities=forecast_probabilities[these_indices],
            binarization_threshold=best_prob_threshold)

        this_contingency_dict = model_eval.get_contingency_table(
            forecast_labels=these_forecast_labels,
            observed_labels=observed_labels[these_indices])

        best_csi = model_eval.get_csi(this_contingency_dict)

    print(
        ('Best probability threshold = {0:.4f} ... corresponding CSI = {1:.4f}'
         ).format(best_prob_threshold, best_csi))

    # [-1] keeps only the last return value (per-bin example counts) from
    # the reliability curve.
    num_examples_by_forecast_bin = model_eval.get_points_in_reliability_curve(
        forecast_probabilities=forecast_probabilities[these_indices],
        observed_labels=observed_labels[these_indices],
        num_forecast_bins=model_eval.DEFAULT_NUM_RELIABILITY_BINS)[-1]

    list_of_evaluation_tables = []

    # One evaluation table per bootstrap replicate.
    for i in range(num_bootstrap_reps):
        print(('Computing scores for {0:d}th of {1:d} bootstrap replicates...'
               ).format(i + 1, num_bootstrap_reps))

        if num_bootstrap_reps == 1:
            # Single replicate: no bootstrap resampling; select examples
            # the same way as above.
            if downsampling_dict is None:
                these_indices = numpy.linspace(0,
                                               num_examples - 1,
                                               num=num_examples,
                                               dtype=int)
            else:
                these_indices = dl_utils.sample_by_class(
                    sampling_fraction_by_class_dict=downsampling_dict,
                    target_name=DUMMY_TARGET_NAME,
                    target_values=observed_labels,
                    num_examples_total=num_examples)
        else:
            # Stratified bootstrap: resample positive and negative examples
            # separately, so each replicate contains both classes.
            if len(positive_example_indices) > 0:
                these_positive_indices = bootstrapping.draw_sample(
                    positive_example_indices)[0]
            else:
                these_positive_indices = numpy.array([], dtype=int)

            these_negative_indices = bootstrapping.draw_sample(
                negative_example_indices)[0]

            these_indices = numpy.concatenate(
                (these_positive_indices, these_negative_indices))

            if downsampling_dict is not None:
                # NOTE(review): num_examples_total here is the size of the
                # full dataset, although only len(these_indices) examples
                # are available in this replicate -- confirm intended.
                these_subindices = dl_utils.sample_by_class(
                    sampling_fraction_by_class_dict=downsampling_dict,
                    target_name=DUMMY_TARGET_NAME,
                    target_values=observed_labels[these_indices],
                    num_examples_total=num_examples)

                these_indices = these_indices[these_subindices]

        if downsampling_dict is not None:
            this_num_ex_by_class = numpy.unique(observed_labels[these_indices],
                                                return_counts=True)[-1]

            print('Number of examples by class: {0:s}'.format(
                str(this_num_ex_by_class)))

        this_evaluation_table = model_eval.run_evaluation(
            forecast_probabilities=forecast_probabilities[these_indices],
            observed_labels=observed_labels[these_indices],
            best_prob_threshold=best_prob_threshold,
            all_prob_thresholds=all_prob_thresholds,
            climatology=numpy.mean(observed_labels[these_indices]))

        list_of_evaluation_tables.append(this_evaluation_table)

        if i == num_bootstrap_reps - 1:
            print(SEPARATOR_STRING)
        else:
            print(MINOR_SEPARATOR_STRING)

        # First table is the alignment reference; nothing to align yet.
        if i == 0:
            continue

        # Align columns of the newest table with the first table so the
        # tables can be concatenated.  `align` returns a pair; [0] is the
        # newest table after alignment.
        list_of_evaluation_tables[-1] = list_of_evaluation_tables[-1].align(
            list_of_evaluation_tables[0], axis=1)[0]

    evaluation_table = pandas.concat(list_of_evaluation_tables,
                                     axis=0,
                                     ignore_index=True)

    print('Writing results to: "{0:s}"...'.format(output_file_name))

    model_eval.write_evaluation(
        pickle_file_name=output_file_name,
        forecast_probabilities=forecast_probabilities,
        observed_labels=observed_labels,
        best_prob_threshold=best_prob_threshold,
        all_prob_thresholds=all_prob_thresholds,
        num_examples_by_forecast_bin=num_examples_by_forecast_bin,
        downsampling_dict=downsampling_dict,
        evaluation_table=evaluation_table)
# Example #5
# 0
def _find_examples_to_read(option_dict, num_examples_total):
    """Determines which examples to read.

    E = number of examples to read

    :param option_dict: See doc for any generator in this file.
    :param num_examples_total: Number of examples to generate.
    :return: storm_ids: length-E list of storm IDs (strings).
    :return: storm_times_unix_sec: length-E numpy array of storm times.
    """

    error_checking.assert_is_integer(num_examples_total)
    error_checking.assert_is_greater(num_examples_total, 0)

    example_file_names = option_dict[trainval_io.EXAMPLE_FILES_KEY]

    radar_field_names = option_dict[trainval_io.RADAR_FIELDS_KEY]
    radar_heights_m_agl = option_dict[trainval_io.RADAR_HEIGHTS_KEY]
    first_storm_time_unix_sec = option_dict[trainval_io.FIRST_STORM_TIME_KEY]
    last_storm_time_unix_sec = option_dict[trainval_io.LAST_STORM_TIME_KEY]
    num_grid_rows = option_dict[trainval_io.NUM_ROWS_KEY]
    num_grid_columns = option_dict[trainval_io.NUM_COLUMNS_KEY]

    class_to_sampling_fraction_dict = option_dict[
        trainval_io.SAMPLING_FRACTIONS_KEY]

    storm_ids = []
    storm_times_unix_sec = numpy.array([], dtype=int)
    target_values = numpy.array([], dtype=int)

    target_name = None
    num_files = len(example_file_names)

    for i in range(num_files):
        print('Reading target values from: "{0:s}"...'.format(
            example_file_names[i]))

        # Only metadata and target values are used below, so read just one
        # radar field at one height to keep this pass cheap.
        this_example_dict = input_examples.read_example_file(
            netcdf_file_name=example_file_names[i], include_soundings=False,
            radar_field_names_to_keep=[radar_field_names[0]],
            radar_heights_to_keep_m_agl=radar_heights_m_agl[[0]],
            first_time_to_keep_unix_sec=first_storm_time_unix_sec,
            last_time_to_keep_unix_sec=last_storm_time_unix_sec,
            num_rows_to_keep=num_grid_rows,
            num_columns_to_keep=num_grid_columns)

        target_name = this_example_dict[input_examples.TARGET_NAME_KEY]

        storm_ids += this_example_dict[input_examples.STORM_IDS_KEY]
        storm_times_unix_sec = numpy.concatenate((
            storm_times_unix_sec,
            this_example_dict[input_examples.STORM_TIMES_KEY]
        ))
        target_values = numpy.concatenate((
            target_values, this_example_dict[input_examples.TARGET_VALUES_KEY]
        ))

    # Drop examples whose target value is flagged as invalid.
    indices_to_keep = numpy.where(
        target_values != target_val_utils.INVALID_STORM_INTEGER
    )[0]

    storm_ids = [storm_ids[k] for k in indices_to_keep]
    storm_times_unix_sec = storm_times_unix_sec[indices_to_keep]
    target_values = target_values[indices_to_keep]
    num_examples_found = len(storm_ids)

    if class_to_sampling_fraction_dict is None:
        # No class-based sampling: keep all examples, or a random subset if
        # more were found than requested.
        indices_to_keep = numpy.arange(num_examples_found, dtype=int)

        if num_examples_found > num_examples_total:
            indices_to_keep = numpy.random.choice(
                indices_to_keep, size=num_examples_total, replace=False)
    else:
        # Downsample examples by class, per the prescribed fractions.
        indices_to_keep = dl_utils.sample_by_class(
            sampling_fraction_by_class_dict=class_to_sampling_fraction_dict,
            target_name=target_name, target_values=target_values,
            num_examples_total=num_examples_total)

    storm_ids = [storm_ids[k] for k in indices_to_keep]
    storm_times_unix_sec = storm_times_unix_sec[indices_to_keep]

    return storm_ids, storm_times_unix_sec