def test_design_matrix_in_pipeline(df):
    """PatsyTransformer can be used as the first step of a sklearn Pipeline.

    Chains the design-matrix step with a scaler and a logistic regression,
    fits on columns ``a``-``d`` with ``e`` as the target, and checks that the
    predictions come back with shape ``(6,)``.
    """
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]].values.ravel()

    steps = [
        ("design", PatsyTransformer("a + np.log(a) + b - 1")),
        ("scale", StandardScaler()),
        ("model", LogisticRegression(solver="lbfgs")),
    ]
    fitted = Pipeline(steps).fit(features, target)

    predictions = fitted.predict(features)
    assert predictions.shape == (6,)
def test_design_matrix_error(df):
    """Predicting on the rows after the first four must raise RuntimeError.

    The pipeline is fitted on ``df[:4]`` and asked to predict on ``df[4:]``.
    NOTE(review): presumably the held-out rows contain categories that were
    not present in the training slice -- confirm against the ``df`` fixture.
    """
    feature_cols = ["a", "b", "c", "d"]

    train_part = df[:4]
    X_train = train_part[feature_cols]
    y_train = train_part[["e"]].values.ravel()

    test_part = df[4:]
    # The test target is computed (as in the training split) but never used.
    X_test, _ = test_part[feature_cols], test_part[["e"]].values.ravel()

    model = Pipeline(
        [
            ("design", PatsyTransformer("a + np.log(a) + b + c + d - 1")),
            ("scale", StandardScaler()),
            ("model", LogisticRegression(solver="lbfgs")),
        ]
    )
    model.fit(X_train, y_train)

    with pytest.raises(RuntimeError):
        model.predict(X_test)
def test_subset_categories_in_test(df):
    """Train and test design matrices must have the same number of columns.

    The transformer is fitted on ``df[:5]``; transforming ``df[5:]`` has to
    produce as many columns as transforming the training slice itself.
    """
    feature_cols = ["a", "b", "c", "d"]

    train_part = df[:5]
    X_train = train_part[feature_cols]
    y_train = train_part[["e"]].values.ravel()

    test_part = df[5:]
    # The test target is computed (as in the training split) but never used.
    X_test, _ = test_part[feature_cols], test_part[["e"]].values.ravel()

    transformer = PatsyTransformer("a + np.log(a) + b + c + d - 1")
    transformer.fit(X_train, y_train)

    n_test_columns = transformer.transform(X_test).shape[1]
    n_train_columns = transformer.transform(X_train).shape[1]
    assert n_test_columns == n_train_columns
def test_mult_usage(df):
    """The formula ``a*b - 1`` must yield a design matrix of shape ``(6, 3)``.

    The transformer is fitted on columns ``a``-``d`` with ``e`` as the target
    and then applied to the same data.
    """
    X, y = df[["a", "b", "c", "d"]], df[["e"]]
    tf = PatsyTransformer("a*b - 1")
    # Fit once and assert on the result; the former debug ``print`` refitted
    # the transformer and only added noise to the test output.
    design = tf.fit(X, y).transform(X)
    assert design.shape == (6, 3)
def test_apply_numpy_transform(df):
    """A formula containing ``np.log(a)`` must yield a ``(6, 3)`` design matrix."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]

    transformer = PatsyTransformer("a + np.log(a) + b - 1")
    design = transformer.fit(features, target).transform(features)

    assert design.shape == (6, 3)
def test_transform_dummy2(df):
    """The formula ``a + b + c + d`` must yield a design matrix of shape ``(6, 6)``.

    The transformer is fitted on columns ``a``-``d`` with ``e`` as the target
    and then applied to the same data.
    """
    X, y = df[["a", "b", "c", "d"]], df[["e"]]
    tf = PatsyTransformer("a + b + c + d")
    # Fit once and assert on the result; the former debug ``print`` refitted
    # the transformer and only added noise to the test output.
    design = tf.fit(X, y).transform(X)
    assert design.shape == (6, 6)
def test_min_sign_usage(df):
    """The formula ``a + b - 1`` must yield a design matrix of shape ``(6, 2)``."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]

    transformer = PatsyTransformer("a + b - 1")
    design = transformer.fit(features, target).transform(features)

    assert design.shape == (6, 2)
def test_basic_usage(df):
    """The formula ``a + b`` must yield a design matrix of shape ``(6, 3)``."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]

    transformer = PatsyTransformer("a + b")
    design = transformer.fit(features, target).transform(features)

    assert design.shape == (6, 3)
def _generate_features(self, X, y=None, numeric_extra=None, categorical_extra=None):
    """Return the feature matrix for ``X``, building the pipeline on first use.

    When ``self.feature_pipeline_`` does not exist yet, a pipeline with a
    single ``'features'`` step is built, fitted on ``(X, y)`` and stored on
    the instance. That step is a ``FeatureUnion`` of:

    * a time-of-week part: the ``dayofweek`` x ``hour`` interaction when both
      columns have more than one distinct value, otherwise a one-hot encoding
      of ``dayofweek`` (if it varies) or of ``hour``;
    * an ``IntervalEncoder`` on ``self.temperature_col``;
    * the interaction of binned temperature with ``hour`` (dropped when
      ``hour`` has exactly one distinct value);
    * optional ``IntervalEncoder`` features for ``numeric_extra`` columns;
    * optional ``TargetEncoder`` features for ``categorical_extra`` columns.

    On later calls the stored pipeline is reused as-is.

    Parameters
    ----------
    X : pandas-like frame
        Must provide the columns ``'dayofweek'``, ``'hour'``,
        ``self.temperature_col`` and any column named in ``numeric_extra`` /
        ``categorical_extra``.
    y : optional
        Target passed to the pipeline's ``fit``; only used on the first call.
    numeric_extra : list of column names, optional
        Extra numerical regressors. Only consulted when the pipeline is built.
    categorical_extra : list of column names, optional
        Extra categorical regressors. Only consulted when the pipeline is
        built.

    Returns
    -------
    The output of ``self.feature_pipeline_.transform(X)``.

    Raises
    ------
    Any exception raised while building, fitting or applying the pipeline is
    propagated unchanged. Previously a ``return`` inside ``finally`` discarded
    build/fit errors and surfaced an unrelated follow-up error instead.
    """

    def ordinal_steps(column):
        # Select one column, ordinal-encode it, then impute the code -1 with
        # the most frequent code (presumably -1 marks categories unseen
        # during fit -- the step is named 'unknown').
        return [
            ('select', ColumnSelector(column)),
            ('ordinal', OrdinalEncoder(cols=[column], return_df=False)),
            ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')),
        ]

    def one_hot_steps(column):
        # Same as ordinal_steps, followed by a one-hot encoding. The array is
        # wrapped in a DataFrame first because the encoder addresses the
        # column by name.
        return ordinal_steps(column) + [
            ('to_pandas', FunctionTransformer(
                lambda x: pd.DataFrame(x, columns=[column]))),
            ('one_hot', OneHotEncoder(cols=[column], return_df=False)),
        ]

    if not hasattr(self, 'feature_pipeline_'):
        n_days = X['dayofweek'].nunique()
        n_hours = X['hour'].nunique()

        # time of week part of TOWT
        if n_days > 1 and n_hours > 1:
            time_of_week = ('weeks', Pipeline([
                ('split', FeatureUnion([
                    ('days', Pipeline(ordinal_steps('dayofweek'))),
                    ('hours', Pipeline(ordinal_steps('hour'))),
                ])),
                ('to_pandas', FunctionTransformer(
                    lambda x: pd.DataFrame(x, columns=['dayofweek', 'hour']))),
                ('term', PatsyTransformer('-1 + C(dayofweek):C(hour)')),
            ]))
        elif n_days > 1:
            time_of_week = ('days', Pipeline(one_hot_steps('dayofweek')))
        else:
            time_of_week = ('hours', Pipeline(one_hot_steps('hour')))

        # temperature part of TOWT
        temperature = ('temperature', ColumnTransformer([
            ('encode_temperature',
             IntervalEncoder(n_chunks=10,
                             span=0.1 * X[self.temperature_col].std(),
                             method='normal'),
             [self.temperature_col]),
        ]))

        if n_hours == 1:
            temperature_by_hour = 'drop'
        else:
            temperature_by_hour = Pipeline([
                ('split', FeatureUnion([
                    ('temperature_part', Pipeline([
                        ('select', ColumnSelector(self.temperature_col)),
                        ('create_bins', KBinsDiscretizer(
                            n_bins=self.n_bins_temperature,
                            strategy='quantile',
                            encode='ordinal')),
                    ])),
                    ('hour_part', Pipeline(ordinal_steps('hour'))),
                ])),
                ('to_pandas', FunctionTransformer(
                    lambda x: pd.DataFrame(
                        x, columns=[self.temperature_col, 'hour']))),
                ('term', PatsyTransformer(
                    f'-1 + C({self.temperature_col}):C(hour)')),
            ])

        # deal with extra numerical regressors
        if not numeric_extra:
            numerical = 'drop'
        else:
            numerical = ColumnTransformer([
                (f'encode_{col}',
                 IntervalEncoder(n_chunks=4,
                                 span=0.1 * X[col].std(),
                                 method='normal'),
                 [col])
                for col in numeric_extra
            ])

        # deal with extra categorical regressors
        if not categorical_extra:
            categorical = 'drop'
        else:
            categorical = TargetEncoder(cols=categorical_extra,
                                        return_df=False,
                                        handle_missing='value',
                                        handle_unknown='value')

        pipeline = Pipeline([
            ('features', FeatureUnion([
                time_of_week,
                temperature,
                ('temperature_interact', temperature_by_hour),
                ('numerical_regressors', numerical),
                ('categorical_regressors', categorical),
            ])),
        ])

        # Fit the pipeline, and only store it once fitting succeeded so a
        # failed fit never leaves an unfitted pipeline cached on the instance.
        pipeline.fit(X, y)
        self.feature_pipeline_ = pipeline

    return self.feature_pipeline_.transform(X)
def test_return_type_dataframe(df):
    """With ``return_type="dataframe"`` the transform output is a DataFrame."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]

    transformer = PatsyTransformer("a + b - 1", return_type="dataframe")
    transformed = transformer.fit(features, target).transform(features)

    assert isinstance(transformed, pd.DataFrame)
def test_return_type_dmatrix(df):
    """With ``return_type="matrix"`` the output exposes ``design_info``."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]

    transformer = PatsyTransformer("a + b - 1", return_type="matrix")
    transformed = transformer.fit(features, target).transform(features)

    # test for DesignMatrix this way as per
    # https://patsy.readthedocs.io/en/latest/API-reference.html#patsy.DesignMatrix
    assert hasattr(transformed, "design_info")