コード例 #1
0
    def test_replace_missing_values_with_median(self):
        """Checks replace_missing_values with the median-replacement method.

        Verifies three things, in order: the feature values returned after
        replacement, the set of keys in the returned replacement dictionary,
        and the medians stored in that dictionary.
        """

        median_method = feature_trans.MEDIAN_VALUE_REPLACEMENT_METHOD
        actual_table, actual_dict = feature_trans.replace_missing_values(
            FEATURE_TABLE, replacement_method=median_method)

        # Feature values after replacement must match the expected matrix.
        values_match = numpy.allclose(
            FEATURE_MATRIX_MISSING_TO_MEDIAN, actual_table.to_numpy(),
            atol=TOLERANCE)
        self.assertTrue(values_match)

        # Returned and expected dictionaries must have the same keys.
        actual_keys = set(actual_dict.keys())
        expected_keys = set(REPLACEMENT_DICT_MEDIAN.keys())
        self.assertTrue(actual_keys == expected_keys)

        # Medians stored in the dictionary must match the expected ones.
        medians_key = feature_trans.ORIGINAL_MEDIANS_KEY
        medians_match = numpy.allclose(
            actual_dict[medians_key], REPLACEMENT_DICT_MEDIAN[medians_key],
            atol=TOLERANCE)
        self.assertTrue(medians_match)
コード例 #2
0
def _preprocess_data_for_learning(
        input_table, feature_names, learning_phase, replace_missing,
        standardize, transform_via_svd,
        fraction_of_explained_variance_for_svd=
        DEFAULT_EXP_VARIANCE_FRACTION_FOR_SVD,
        replacement_method=feature_trans.MEAN_VALUE_REPLACEMENT_METHOD,
        replacement_dict_for_training_data=None,
        standardization_dict_for_training_data=None,
        svd_dict_for_training_data=None):
    """Pre-processes data for input to any machine-learning algorithm.

    "Input" to a machine-learning algorithm means training, validation, or
    testing.  Data must be pre-processed in the same way for all three phases of
    learning.

    For training, `*dict_for_training_data` should all be left as None, because
    they will be computed on the fly.  However, for validation and testing,
    `*dict_for_training_data` should be the dictionaries created for training
    data.  In other words, these values are *not* computed on the fly for
    validation or testing.

    Exactly one transformation is applied, chosen in this order of precedence:
    SVD, then standardization, then missing-value replacement.

    If transform_via_svd = True, data will be standardized and missing values
    will be replaced.  Thus, transform_via_svd = True implies that
    replace_missing = standardize = True.

    Similarly, if standardize = True, missing values will be replaced.  Thus,
    standardize = True implies that replace_missing = True.

    If replace_missing = standardize = transform_via_svd = False, the inputs
    (table, feature names, and the three dictionaries) are returned unchanged
    and `learning_phase` is not validated.

    :param input_table: pandas DataFrame, where each row is one example (data
        point).
    :param feature_names: 1-D list with names of features (predictor variables).
        Each feature must be a column of input_table.
    :param learning_phase: Learning phase ("training", "validation", or
        "testing").
    :param replace_missing: Boolean flag.  If True, missing values of feature F
        will be replaced with the mean or median F-value.
    :param standardize: Boolean flag.  If True, each feature will be
        standardized to z-scores.
    :param transform_via_svd: Boolean flag.  If True, will transform features to
        empirical orthogonal functions (EOFs), using singular-value
        decomposition (SVD).
    :param fraction_of_explained_variance_for_svd:
        [used only if transform_via_svd = True and learning_phase == "training"]
        Determines number of modes (transformed features) to keep.  Will select
        modes in descending order of explained variance, until cumulative
        explained variance >= `fraction_of_explained_variance_for_svd` of
        variance in full dataset.
    :param replacement_method:
        [used only if replace_missing is the transformation applied and
        learning_phase == "training"]
        See doc for `feature_transformation.replace_missing_values`.
    :param replacement_dict_for_training_data:
        [used only if replace_missing is the transformation applied and
        learning_phase != "training"]
        Dictionary created earlier for training data.  See doc for
        `feature_transformation.replace_missing_values`.
    :param standardization_dict_for_training_data:
        [used only if standardize = True or transform_via_svd = True]
        If learning phase is "training", this will not be used (means and
        standard deviations are created on the fly, from the training data
        themselves).  Otherwise, this must be the dictionary created earlier for
        training data.
    :param svd_dict_for_training_data:
        [used only if transform_via_svd = True]
        If learning phase is "training", this will not be used (SVD parameters
        are created on the fly, from the training data themselves).  Otherwise,
        this must be the dictionary created earlier for training data.
    :return: transformed_input_table: Same as input_table, except that feature
        columns may have been transformed by standardization or SVD.  All non-
        feature columns are copied over by position, with values unchanged.
        If SVD transformation was used, feature names (ergo, column names) are
        different.
    :return: transformed_feature_names: 1-D list with names of transformed
        features (predictor variables).  Each transformed feature is a column of
        transformed_input_table.
    :return: replacement_dict_for_training_data:
        [None if standardize = True or transform_via_svd = True]
        See documentation for `feature_transformation.replace_missing_values`.
        If learning phase is "training", this dictionary was just created on the
        fly.  Otherwise, this is merely the input dictionary.
    :return: standardization_dict_for_training_data:
        [None if replace_missing is the transformation applied]
        See documentation for `feature_transformation.standardize_features`.  If
        learning phase is "training", this dictionary was just created on the
        fly.  Otherwise, this is merely the input dictionary.
    :return: svd_dict_for_training_data:
        [None if standardize or replace_missing is the transformation applied]
        See documentation for `feature_transformation.perform_svd`.  If learning
        phase is "training", this dictionary was just created on the fly.
        Otherwise, this is merely the input dictionary.
    """

    error_checking.assert_is_boolean(replace_missing)
    error_checking.assert_is_boolean(standardize)
    error_checking.assert_is_boolean(transform_via_svd)

    # If no pre-processing, exit now.
    if not (replace_missing or standardize or transform_via_svd):
        return (input_table, feature_names, replacement_dict_for_training_data,
                standardization_dict_for_training_data,
                svd_dict_for_training_data)

    _check_learning_phase(learning_phase)
    _check_input_data_for_learning(
        input_table=input_table, feature_names=feature_names, target_name=None)

    is_training = learning_phase == TRAINING_PHASE

    # NOTE: `input_table[feature_names]` is deliberately re-evaluated at each
    # call below, so every helper receives its own fresh sub-table.
    if transform_via_svd:
        if is_training:
            # Fit standardization and SVD parameters on the training data, then
            # keep only enough modes to explain the requested variance.
            (standardization_dict_for_training_data,
             svd_dict_for_training_data) = feature_trans.perform_svd(
                 input_table[feature_names])
            svd_dict_for_training_data = (
                feature_trans.filter_svd_by_explained_variance(
                    svd_dict_for_training_data, fraction_of_variance_to_keep=
                    fraction_of_explained_variance_for_svd))

        transformed_input_table = pandas.DataFrame(
            feature_trans.transform_features_via_svd(
                feature_table=input_table[feature_names],
                standardization_dict=standardization_dict_for_training_data,
                svd_dictionary=svd_dict_for_training_data))
        transformed_input_table = _rename_svd_transformed_features(
            transformed_input_table)

    elif standardize:
        if is_training:
            transformed_input_table, standardization_dict_for_training_data = (
                feature_trans.standardize_features(
                    feature_table=input_table[feature_names],
                    standardization_dict=None))
        else:
            # Re-use training parameters; the returned dictionary is ignored.
            transformed_input_table, _ = feature_trans.standardize_features(
                feature_table=input_table[feature_names],
                standardization_dict=standardization_dict_for_training_data)

    elif replace_missing:
        if is_training:
            transformed_input_table, replacement_dict_for_training_data = (
                feature_trans.replace_missing_values(
                    feature_table=input_table[feature_names],
                    replacement_method=replacement_method,
                    replacement_dict=None))
        else:
            # Re-use training parameters; the returned dictionary is ignored.
            transformed_input_table, _ = feature_trans.replace_missing_values(
                feature_table=input_table[feature_names],
                replacement_dict=replacement_dict_for_training_data)

    # Record feature names before non-feature columns are appended.
    transformed_feature_names = list(transformed_input_table)

    # Build the lookup set once, rather than scanning the feature list for
    # every column of the input table.
    feature_name_set = set(feature_names)
    non_feature_columns = [
        s for s in list(input_table) if s not in feature_name_set]

    # Copy raw values (not Series) so that columns are attached by position,
    # regardless of the index of the transformed table.
    for this_column in non_feature_columns:
        transformed_input_table = transformed_input_table.assign(
            **{this_column: input_table[this_column].values})

    if transform_via_svd:
        return (transformed_input_table, transformed_feature_names, None,
                standardization_dict_for_training_data,
                svd_dict_for_training_data)

    if standardize:
        return (transformed_input_table, transformed_feature_names, None,
                standardization_dict_for_training_data, None)

    return (transformed_input_table, transformed_feature_names,
            replacement_dict_for_training_data, None, None)