def full_pipeline(model_type,
                  predicted_column,
                  grain_column,
                  impute=True,
                  verbose=True):
    """
    Assemble the data preparation pipeline.

    The returned pipeline chains seven named steps, in this order:
    ``remove_DTS_columns``, ``remove_grain_column``, ``imputation``,
    ``null_row_filter``, ``convert_target_to_binary``,
    ``prediction_to_numeric`` and ``create_dummy_variables``.

    Advanced users may prefer to build their own custom pipeline instead.

    Args:
        model_type: Forwarded to ``DataFrameConvertTargetToBinary``.
        predicted_column: Target column. Forwarded to the binary and numeric
            target converters and excluded from dummy-variable creation.
        grain_column: Column handed to ``DataframeColumnRemover``.
        impute: Forwarded to ``DataFrameImputer`` (default ``True``).
        verbose: Forwarded to ``DataFrameImputer`` (default ``True``).

    Returns:
        A newly constructed ``Pipeline``; it is not fitted here.
    """

    # Note: FeatureUnions would be a more elegant way to express this, but
    #   only if the later pipelines do not need pandas dataframes as inputs:
    #   FeatureUnion intrinsically converts outputs to numpy arrays.

    # Each step is built in the same order it runs in the pipeline.
    suffix_filter = hcai_filters.DataframeColumnSuffixFilter()
    grain_remover = hcai_filters.DataframeColumnRemover(grain_column)

    # Perform one of two basic imputation methods
    # TODO we need to think about making this optional to solve the problem of rare and very predictive values
    imputer = hcai_transformers.DataFrameImputer(
        impute=impute, verbose=verbose)

    null_row_filter = hcai_filters.DataframeNullValueFilter(
        excluded_columns=None)
    target_to_binary = hcai_transformers.DataFrameConvertTargetToBinary(
        model_type, predicted_column)
    target_to_numeric = hcai_transformers.DataFrameConvertColumnToNumeric(
        predicted_column)
    # The target column is kept out of dummy-variable creation.
    dummy_encoder = hcai_transformers.DataFrameCreateDummyVariables(
        excluded_columns=[predicted_column])

    steps = [
        ('remove_DTS_columns', suffix_filter),
        ('remove_grain_column', grain_remover),
        ('imputation', imputer),
        ('null_row_filter', null_row_filter),
        ('convert_target_to_binary', target_to_binary),
        ('prediction_to_numeric', target_to_numeric),
        ('create_dummy_variables', dummy_encoder),
    ]
    return Pipeline(steps)
Esempio n. 2
0
    def test_imputation_false_returns_unmodified(self):
        # With impute=False the imputer is expected to hand the frame back
        # unchanged, including the last row that still holds missing values.
        rows = [
            ['a', 1, 2],
            ['b', 1, 1],
            ['b', 2, 2],
            ['a', None, None],
        ]
        frame = pd.DataFrame(rows)
        expected = pd.DataFrame(rows)

        imputer = transformers.DataFrameImputer(impute=False)
        result = imputer.fit_transform(frame)

        self.assertEqual(len(result), 4)
        # Column dtypes of the output must match those of the input frame
        output_dtypes = list(result.dtypes)
        input_dtypes = list(frame.dtypes)
        self.assertTrue(output_dtypes == input_dtypes)
        self.assertTrue(expected.equals(result))
Esempio n. 3
0
    def test_imputation_removes_nones(self):
        # Default imputation should leave no nulls behind. The all-None row is
        # expected to become the most frequent label ('b') and the column
        # means (4/3 and 5/3) of the three observed rows.
        observed_rows = [['a', 1, 2], ['b', 1, 1], ['b', 2, 2]]
        frame = pd.DataFrame(observed_rows + [[None, None, None]])
        expected = pd.DataFrame(observed_rows + [['b', 4 / 3.0, 5 / 3.0]])

        imputer = transformers.DataFrameImputer()
        result = imputer.fit_transform(frame)

        self.assertEqual(len(result), 4)
        any_nulls_left = result.isnull().values.any()
        self.assertFalse(any_nulls_left)
        # Column dtypes of the output must match those of the input frame
        output_dtypes = list(result.dtypes)
        input_dtypes = list(frame.dtypes)
        self.assertTrue(output_dtypes == input_dtypes)
        self.assertTrue(expected.equals(result))
Esempio n. 4
0
    def test_imputation_for_mean_of_numeric_and_mode_for_categorical(self):
        # The all-None row should be filled with the mode of the text column
        # ('b') and the means of the numeric columns (4/3 and 5/3).
        observed_rows = [['a', 1, 2], ['b', 1, 1], ['b', 2, 2]]
        frame = pd.DataFrame(observed_rows + [[None, None, None]])

        imputer = transformers.DataFrameImputer()
        result = imputer.fit_transform(frame)

        expected = pd.DataFrame(observed_rows + [['b', 4. / 3, 5. / 3]])

        self.assertEqual(len(result), 4)
        # Imputed values must match exactly
        self.assertTrue(expected.equals(result))
        # Column dtypes of the output must match those of the input frame
        output_dtypes = list(result.dtypes)
        input_dtypes = list(frame.dtypes)
        self.assertTrue(output_dtypes == input_dtypes)
Esempio n. 5
0
    def test_imputeStrategy_None_impute_for_None(self):
        # With impute=True and imputeStrategy=None the all-None row is
        # expected to be filled with the mode of the text column ('b') and
        # the means of the numeric columns (22/9 and 30/9).
        observed_rows = [
            ['a', 1, 2],
            ['b', 1, 1],
            ['b', 4, 1],
            ['a', 2, 8],
            ['b', 2, 6],
            ['b', 1, 2],
            ['a', 6, 2],
            ['b', 3, 1],
            ['b', 2, 7],
        ]
        frame = pd.DataFrame(observed_rows + [[None, None, None]])
        expected = pd.DataFrame(observed_rows + [['b', 22 / 9.0, 30 / 9.0]])

        imputer = transformers.DataFrameImputer(
            impute=True, imputeStrategy=None)
        result = imputer.fit_transform(frame)

        self.assertEqual(len(result), 10)
        # Column dtypes of the output must match those of the input frame
        output_dtypes = list(result.dtypes)
        input_dtypes = list(frame.dtypes)
        self.assertTrue(output_dtypes == input_dtypes)
        self.assertTrue(expected.equals(result))
Esempio n. 6
0
    def test_imputeStrategy_RandomForest_impute_for_NaN(self):
        # The all-NaN last row should be filled by the 'RandomForest'
        # strategy; the result is compared after rounding to 3 decimals.
        # NOTE(review): the expected 1.567 / 6.032 presumably rely on the
        # imputer being deterministic (fixed seed) - confirm.
        #
        # np.nan is used instead of the np.NaN alias, which was removed in
        # NumPy 2.0 and raises AttributeError there.
        df = pd.DataFrame([['a', 1, 2], ['b', 1, 1], ['b', 4, 1], ['a', 2, 8],
                           ['b', 2, 6], ['b', 1, 2], ['a', 6, 2], ['b', 3, 1],
                           ['b', 2, 7], [np.nan, np.nan, np.nan]])
        expected = pd.DataFrame([['a', 1, 2], ['b', 1, 1], ['b', 4, 1],
                                 ['a', 2, 8], ['b', 2, 6], ['b', 1, 2],
                                 ['a', 6, 2], ['b', 3, 1], ['b', 2, 7],
                                 ['b', 1.567, 6.032]])

        result = transformers.DataFrameImputer(
            impute=True, imputeStrategy='RandomForest').fit_transform(df)
        # Same rounding as builtin round(result, 3), spelled with the
        # DataFrame method.
        result = result.round(3)

        self.assertEqual(len(result), 10)
        # Assert column types remain identical
        self.assertTrue(list(result.dtypes) == list(df.dtypes))
        self.assertTrue(expected.equals(result))
Esempio n. 7
0
    def test_imputation_false_and_imputeStrategy_RandomForest_returns_unmodified(
            self):
        # impute=False is expected to win over imputeStrategy='RandomForest':
        # the frame, including its all-None last row, should come back
        # unchanged.
        rows = [
            ['a', 1, 2],
            ['b', 1, 1],
            ['b', 4, 1],
            ['a', 2, 8],
            ['b', 2, 6],
            ['b', 1, 2],
            ['a', 6, 2],
            ['b', 3, 1],
            ['b', 2, 7],
            [None, None, None],
        ]
        frame = pd.DataFrame(rows)
        expected = pd.DataFrame(rows)

        imputer = transformers.DataFrameImputer(
            impute=False, imputeStrategy='RandomForest')
        result = imputer.fit_transform(frame)

        self.assertEqual(len(result), 10)
        # Column dtypes of the output must match those of the input frame
        output_dtypes = list(result.dtypes)
        input_dtypes = list(frame.dtypes)
        self.assertTrue(output_dtypes == input_dtypes)
        self.assertTrue(expected.equals(result))