def test_p_percent_pandas(sensitive_classification_dataset):
    """A model trained on the sensitive column scores 0; one that drops it scores 0.9."""
    X, y = sensitive_classification_dataset

    # Full feature set includes the sensitive column "x2" -> fully unfair.
    unfair_model = LogisticRegression().fit(X, y)
    assert p_percent_score("x2")(unfair_model, X) == 0

    # Restricting the model to "x1" removes direct access to the sensitive column.
    fair_model = make_pipeline(ColumnSelector("x1"), LogisticRegression()).fit(X, y)
    assert p_percent_score("x2")(fair_model, X) == 0.9
def test_select_two(df):
    """Selecting two columns keeps exactly those columns with their values intact."""
    selected = ColumnSelector(["d", "e"]).fit_transform(df)
    expected = pd.DataFrame(
        {
            "d": ["b", "a", "a", "b", "a", "b"],
            "e": [0, 1, 0, 1, 0, 1],
        }
    )
    assert_frame_equal(selected, expected)
def test_subset_score_pipeline(slicing_classification_dataset):
    """subset_score restricts the metric to rows where the predicate (x1 == 0) holds."""
    X, y = slicing_classification_dataset

    # Constant classifier: always predicts 1, regardless of input.
    pipe = make_pipeline(
        ColumnSelector("x1"),
        DummyClassifier(strategy="constant", constant=1),
    ).fit(X, y)

    score_on_x1_zero = subset_score(lambda X, y_true: X["x1"] == 0, accuracy_score)
    assert score_on_x1_zero(estimator=pipe, X=X, y_true=y) == 0.25
def test_warning_is_logged(sensitive_classification_dataset):
    """Scoring with a positive_target not produced by the model emits a RuntimeWarning."""
    X, y = sensitive_classification_dataset
    fair_model = make_pipeline(ColumnSelector("x1"), LogisticRegression()).fit(X, y)

    with warnings.catch_warnings(record=True) as caught:
        # Make sure no filter suppresses the warning we want to observe.
        warnings.simplefilter("always")
        # positive_target=2 does not exist in the binary predictions -> warns.
        p_percent_score("x2", positive_target=2)(fair_model, X)
        assert issubclass(caught[-1].category, RuntimeWarning)
def test_p_percent_pandas_multiclass(sensitive_multiclass_classification_dataset):
    """p_percent_score handles multiclass targets for both fair and unfair models."""
    X, y = sensitive_multiclass_classification_dataset

    # Model with access to the sensitive column is maximally unfair for both targets.
    unfair_model = LogisticRegression(multi_class="ovr").fit(X, y)
    assert p_percent_score("x2")(unfair_model, X) == 0
    assert p_percent_score("x2", positive_target=2)(unfair_model, X) == 0

    # Dropping the sensitive column improves the score for the default target only.
    fair_model = make_pipeline(ColumnSelector("x1"), LogisticRegression()).fit(X, y)
    assert p_percent_score("x2")(fair_model, X) == pytest.approx(0.9333333)
    assert p_percent_score("x2", positive_target=2)(fair_model, X) == 0
def test_select_not_in_frame(df):
    """Requesting a column that is absent from the frame raises a KeyError."""
    missing_selector = ColumnSelector(["f"])
    with pytest.raises(KeyError):
        missing_selector.fit_transform(df)
"""Module that contains the model configuration used in the training pipeline.""" from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklego.preprocessing import ColumnSelector from sklearn.compose import ColumnTransformer from xgboost import XGBRegressor from src.config import config RUN_NAME = "xgboost" #Prepare pipeline numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())]) categorical_transformer = OneHotEncoder(handle_unknown='ignore') column_selector = ColumnSelector(config.FEATURES) preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, config.NUMERIC_FEATURES), ('cat', categorical_transformer, config.CATEGORICAL_FEATURES)]) #Create model xgb_model = XGBRegressor() model = Pipeline([('column_selector', column_selector), ("preprocessor", preprocessor), ("regressor", xgb_model)])
def test_select_all(df):
    """Selecting every column returns a frame equal to the input."""
    every_column = ["a", "b", "c", "d", "e"]
    assert_frame_equal(ColumnSelector(every_column).fit_transform(df), df)
def _generate_features(self, X, y=None, numeric_extra=None, categorical_extra=None):
    """Build (once) and apply the TOWT-style feature pipeline, returning transform(X).

    On the first call the pipeline is constructed from the day-of-week / hour
    structure of ``X`` and fitted; subsequent calls reuse ``self.feature_pipeline_``
    and only transform.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain 'dayofweek', 'hour' and ``self.temperature_col`` columns,
        plus any columns named in ``numeric_extra`` / ``categorical_extra``.
    y : array-like, optional
        Target, forwarded to ``fit`` (needed by the TargetEncoder branch).
    numeric_extra : list of str, optional
        Extra numeric regressor columns; each gets its own IntervalEncoder.
    categorical_extra : list of str, optional
        Extra categorical regressor columns, target-encoded jointly.

    Returns
    -------
    Transformed feature matrix produced by the fitted pipeline.

    Notes
    -----
    The original implementation used ``try/except AttributeError/finally: return``.
    A ``return`` inside ``finally`` silently discards any exception raised while
    building or fitting the pipeline, which then surfaces as a confusing failure
    in ``transform``. This version uses ``hasattr`` and a plain ``return`` so
    fit-time errors propagate with their real traceback.
    """
    if not hasattr(self, 'feature_pipeline_'):
        # Degenerate calendars (single day and/or single hour present) get
        # simpler encodings, so inspect the data before building the pipeline.
        n_days = X['dayofweek'].nunique()
        n_hours = X['hour'].nunique()
        self.feature_pipeline_ = Pipeline([(
            'features',
            FeatureUnion([
                # --- time-of-week part of TOWT ---
                # Three mutually exclusive variants chosen by a chained
                # conditional expression:
                #   both days and hours vary -> day x hour interaction term,
                #   only days vary           -> one-hot day-of-week,
                #   only hours vary          -> one-hot hour.
                ('weeks', Pipeline([
                    ('split', FeatureUnion([
                        ('days', Pipeline([
                            ('select', ColumnSelector('dayofweek')),
                            ('ordinal', OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                            # OrdinalEncoder maps unseen categories to -1;
                            # impute those with the most frequent known value.
                            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent'))
                        ])),
                        ('hours', Pipeline([
                            ('select', ColumnSelector('hour')),
                            ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent'))
                        ]))
                    ])),
                    # PatsyTransformer needs named columns, so wrap back into a frame.
                    ('to_pandas', FunctionTransformer(lambda x: pd.DataFrame(
                        x, columns=['dayofweek', 'hour']))),
                    ('term', PatsyTransformer('-1 + C(dayofweek):C(hour)'))
                ])) if (n_days > 1) and (n_hours > 1)
                else ('days', Pipeline([
                    ('select', ColumnSelector('dayofweek')),
                    ('ordinal', OrdinalEncoder(cols=['dayofweek'], return_df=False)),
                    ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                    ('to_pandas', FunctionTransformer(lambda x: pd.DataFrame(
                        x, columns=['dayofweek']))),
                    ('one_hot', OneHotEncoder(cols=['dayofweek'], return_df=False))
                ])) if n_days > 1
                else ('hours', Pipeline([
                    ('select', ColumnSelector('hour')),
                    ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                    ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
                    ('to_pandas', FunctionTransformer(
                        lambda x: pd.DataFrame(x, columns=['hour']))),
                    ('one_hot', OneHotEncoder(cols=['hour'], return_df=False))
                ])),
                # --- temperature part of TOWT ---
                ('temperature', ColumnTransformer([
                    ('encode_temperature', IntervalEncoder(
                        n_chunks=10,
                        span=0.1 * X[self.temperature_col].std(),
                        method='normal'), [self.temperature_col])
                ])),
                # Temperature x hour interaction; dropped when hours don't vary.
                ('temperature_interact', 'drop' if n_hours == 1 else Pipeline([
                    ('split', FeatureUnion([
                        ('temperature_part', Pipeline([
                            ('select', ColumnSelector(self.temperature_col)),
                            ('create_bins', KBinsDiscretizer(
                                n_bins=self.n_bins_temperature,
                                strategy='quantile',
                                encode='ordinal')),
                        ])),
                        ('hour_part', Pipeline([
                            ('select', ColumnSelector('hour')),
                            ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)),
                            ('unknown', SimpleImputer(
                                missing_values=-1, strategy='most_frequent'))
                        ]))
                    ])),
                    ('to_pandas', FunctionTransformer(lambda x: pd.DataFrame(
                        x, columns=[self.temperature_col, 'hour']))),
                    ('term', PatsyTransformer(
                        f'-1 + C({self.temperature_col}):C(hour)'))
                ])),
                # --- extra numerical regressors (one IntervalEncoder per column) ---
                ('numerical_regressors', 'drop' if not numeric_extra else ColumnTransformer(
                    [(f'encode_{col}',
                      IntervalEncoder(n_chunks=4,
                                      span=0.1 * X[col].std(),
                                      method='normal'),
                      [col]) for col in numeric_extra])),
                # --- extra categorical regressors (jointly target-encoded) ---
                ('categorical_regressors', 'drop' if not categorical_extra
                    else TargetEncoder(cols=categorical_extra,
                                       return_df=False,
                                       handle_missing='value',
                                       handle_unknown='value'))
            ])
        )])
        # Fit the pipeline; any failure now propagates instead of being
        # swallowed by a `return` inside a `finally` clause.
        self.feature_pipeline_.fit(X, y)

    return self.feature_pipeline_.transform(X)
def create_features(df):
    """Return a copy of ``df`` with a time_index column taken from the index."""
    return df.copy().assign(time_index=df.index)


df = load_data().pipe(create_features)

##############################################################
# Grouped predictor
# Create separate model for separate groups.
##############################################################
features = ["day", "mnth", "year", "season", "holiday", "weekday"]
group_features = ["season"]

regressor = GroupedPredictor(LinearRegression(), groups=group_features)
pipeline = Pipeline([
    ("grab_cols", ColumnSelector(features)),
    ("regression", regressor),
])
pipeline.fit(df, df.rentals)
y_hat = pipeline.predict(df)

# Predict again with the feature columns in a different order; the
# result must be identical because selection is done by name.
y_hat_shuffled_features = pipeline.predict(
    df[["holiday", "weekday", "day", "mnth", "year", "season"]]
)
assert all(
    y_hat == y_hat_shuffled_features
), "Shuffling feature order leads to different results."

# Notice that a missing column throws an error (intentional demo).
pipeline.predict(df.drop("year", axis=1))
##############################################################
def test_select_one_in_pipeline(df):
    """ColumnSelector composes with make_pipeline and still yields a DataFrame."""
    pipeline = make_pipeline(ColumnSelector(["d"]))
    transformed = pipeline.fit_transform(df)
    assert_frame_equal(transformed, pd.DataFrame({"d": ["b", "a", "a", "b", "a", "b"]}))
def test_get_feature_names():
    """After fitting, get_feature_names reports the single selected column."""
    frame = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
    fitted_selector = ColumnSelector("a").fit(frame)
    assert fitted_selector.get_feature_names() == ["a"]
# Shared preprocessing applied to the whole dataset: rare-category collapsing,
# year binning, then label-encoding of all categoricals.
all_data_pipe = Pipeline([
    ('headfirst', utils.DFAllDataTransform()),
    # Keep only the top 14 car companies / top 3 cylinder counts; rest -> "rare".
    ('cat2rare', DFCategoriesToRare(columns=['car_company', 'cylinders'], top_size=[14, 3])),
    # Discretize model_year into 4 bins.
    ('col2bin', DFColumnToBins(columns=['model_year'], bins=[4])),
    ('cat2dummy', DFCategoricalEncoders(columns=['origin', 'cylinders', 'model_year', 'car_company'], encoding_type='label',)),
])

# Load the cleaned data, drop the target, run the shared transforms and
# persist the result for downstream steps.
df = pd.read_csv(INPUT_PATH + 'mpg_clean.csv')
df.drop('target', axis=1, inplace=True)
df = all_data_pipe.fit_transform(df)
df.to_csv(INPUT_PATH + "alldata_transform.csv", index=False)

# Continuous columns are the contiguous slice displacement..acceleration
# (inclusive, per pandas label slicing).
continous_cols = list(df.loc[:, 'displacement':'acceleration'].columns)
cont_cols_selector = ColumnSelector(continous_cols)
cont_cols_dropper = ColumnDropper(continous_cols)

# Standard-scale displacement/weight; robust-scale horsepower/acceleration.
cont_cols_scaler = Pipeline([
    ('std_scl', DFContinousScalers(columns=['displacement', 'weight'], scaling_type='std_scl')),
    ('robust_scl', DFContinousScalers(columns=['horsepower', 'acceleration'], scaling_type='robust'))
])
cat_cols_scaler = DFContinousScalers()

''' Pipelines '''
# NOTE(review): definition continues beyond this chunk.
pipe_1 = Pipeline([
    ('grab_cols', cont_cols_selector),
    ('scale_cols', cont_cols_scaler),
def test_select_none(df):
    """An empty column selection is rejected with a ValueError."""
    empty_selector = ColumnSelector([])
    with pytest.raises(ValueError):
        empty_selector.fit_transform(df)
def test_select_all(df):
    """Passing the complete column list returns the frame unchanged."""
    result = ColumnSelector(['a', 'b', 'c', 'd', 'e']).fit_transform(df)
    assert_frame_equal(result, df)
def test_select_one(df):
    """Selecting a single column yields a one-column DataFrame."""
    result = ColumnSelector(['e']).fit_transform(df)
    assert_frame_equal(result, pd.DataFrame({"e": [0, 1, 0, 1, 0, 1]}))
def test_get_feature_names():
    """A fitted single-column selector exposes that column via get_feature_names."""
    data = pd.DataFrame({"a": [4, 5, 6], "b": ["4", "5", "6"]})
    selector = ColumnSelector("a").fit(data)
    assert selector.get_feature_names() == ["a"]