def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True):
    """
    Assemble the default data preparation pipeline.

    The returned pipeline chains the following named steps, in this order:
    ``remove_DTS_columns``, ``remove_grain_column``, ``imputation``,
    ``null_row_filter``, ``convert_target_to_binary``,
    ``prediction_to_numeric`` and ``create_dummy_variables``.

    Advanced users may prefer to assemble their own custom pipeline instead.

    Args:
        model_type: Forwarded to ``DataFrameConvertTargetToBinary``.
        predicted_column: Name of the target column. It is forwarded to the
            binary and numeric conversion steps and is excluded from dummy
            variable creation.
        grain_column: Forwarded to ``DataframeColumnRemover``.
        impute: Forwarded to ``DataFrameImputer``.
        verbose: Forwarded to ``DataFrameImputer``.

    Returns:
        A ``Pipeline`` built from the steps listed above.
    """
    # Note: FeatureUnions would be a more elegant way to express this, but
    # only if the later pipelines did not need pandas dataframes as input:
    # FeatureUnion intrinsically converts its outputs to numpy arrays.
    suffix_filter = hcai_filters.DataframeColumnSuffixFilter()
    grain_remover = hcai_filters.DataframeColumnRemover(grain_column)

    # Perform one of two basic imputation methods
    # TODO we need to think about making this optional to solve the problem
    # of rare and very predictive values
    imputer = hcai_transformers.DataFrameImputer(impute=impute, verbose=verbose)

    null_filter = hcai_filters.DataframeNullValueFilter(excluded_columns=None)
    target_to_binary = hcai_transformers.DataFrameConvertTargetToBinary(
        model_type, predicted_column)
    target_to_numeric = hcai_transformers.DataFrameConvertColumnToNumeric(
        predicted_column)
    dummy_encoder = hcai_transformers.DataFrameCreateDummyVariables(
        excluded_columns=[predicted_column])

    steps = [
        ('remove_DTS_columns', suffix_filter),
        ('remove_grain_column', grain_remover),
        ('imputation', imputer),
        ('null_row_filter', null_filter),
        ('convert_target_to_binary', target_to_binary),
        ('prediction_to_numeric', target_to_numeric),
        ('create_dummy_variables', dummy_encoder),
    ]
    return Pipeline(steps)
def test_raises_error_on_non_dataframe_inputs(self):
    """fit_transform raises HealthcareAIError for every non-dataframe input."""
    not_dataframes = [
        'foo',
        42,
        [1, 2, 3],
        [[1, 2, 3], [1, 2, 3], [1, 2, 3], ],
        {'a': 1}
    ]

    for candidate in not_dataframes:
        # Build the remover and resolve the method outside the guarded block
        # so that only the fit_transform call itself may raise the error.
        fit_transform = filters.DataframeColumnRemover(None).fit_transform
        with self.assertRaises(HealthcareAIError):
            fit_transform(candidate)
def test_removes_nothing_when_it_finds_no_matches(self):
    """All columns survive when the requested column is not in the frame."""
    df = pd.DataFrame({
        'category': ['a', 'b', 'c'],
        'gender': ['F', 'M', 'F'],
        'age': [1, 5, 4]
    })

    result = filters.DataframeColumnRemover('PatientID').fit_transform(df)

    self.assertEqual(len(result.columns), 3)
    # Compare with sorted(): list.sort() sorts in place and returns None, so
    # the previous `.sort()` comparison was always None == None and could
    # never fail. Sorting makes the check independent of column order.
    self.assertEqual(sorted(result.columns), sorted(df.columns))
def test_removes_list(self):
    """Every listed column present in the frame is removed; unknown names are ignored."""
    df = pd.DataFrame({
        'category': ['a', 'b', 'c'],
        'gender': ['F', 'M', 'F'],
        'patientid': [1, 5, 4]
    })

    # 'foo' is not a column of df; the test expects it to be tolerated.
    result = filters.DataframeColumnRemover(['gender', 'patientid', 'foo']).fit_transform(df)
    expected = ['category']

    self.assertEqual(len(result.columns), 1)
    # Compare with sorted(): list.sort() sorts in place and returns None, so
    # the previous `.sort()` comparison was always None == None and could
    # never fail.
    self.assertEqual(sorted(result.columns), sorted(expected))
def test_does_not_remove_lowercase_match(self):
    """Column matching is case-sensitive: 'PatientID' must not remove 'patientid'."""
    df = pd.DataFrame({
        'category': ['a', 'b', 'c'],
        'gender': ['F', 'M', 'F'],
        'patientid': [1, 5, 4]
    })

    result = filters.DataframeColumnRemover('PatientID').fit_transform(df)
    expected = ['category', 'gender', 'patientid']

    self.assertEqual(len(result.columns), 3)
    # Compare with sorted(): list.sort() sorts in place and returns None, so
    # the previous `.sort()` comparison was always None == None and could
    # never fail. Sorting makes the check independent of column order.
    self.assertEqual(sorted(result.columns), sorted(expected))