Example #1
0
def test_design_matrix_in_pipeline(df):
    """Check that PatsyTransformer can be the first step of a Pipeline.

    The design matrix feeds a scaler and a logistic regression; the fitted
    pipeline is expected to return one prediction per row (6 in total).
    """
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]].values.ravel()
    steps = [
        ("design", PatsyTransformer("a + np.log(a) + b - 1")),
        ("scale", StandardScaler()),
        ("model", LogisticRegression(solver='lbfgs')),
    ]
    predictions = Pipeline(steps).fit(features, target).predict(features)
    assert predictions.shape == (6,)
Example #2
0
def test_design_matrix_error(df):
    """Fit on the first four rows; predicting the rest must raise RuntimeError.

    NOTE(review): presumably the held-out rows contain category levels that
    are absent from the training rows -- confirm against the ``df`` fixture.
    """
    feature_cols = ["a", "b", "c", "d"]
    train_rows, test_rows = df[:4], df[4:]

    X_train = train_rows[feature_cols]
    y_train = train_rows[["e"]].values.ravel()
    X_test = test_rows[feature_cols]

    steps = [
        ("design", PatsyTransformer("a + np.log(a) + b + c + d - 1")),
        ("scale", StandardScaler()),
        ("model", LogisticRegression(solver='lbfgs')),
    ]
    fitted = Pipeline(steps).fit(X_train, y_train)

    with pytest.raises(RuntimeError):
        fitted.predict(X_test)
Example #3
0
def test_subset_categories_in_test(df):
    """Transforming held-out rows must give as many columns as training rows.

    The transformer is fitted on the first five rows and then applied to
    both the training rows and the remaining rows; the column counts of the
    two outputs are compared.
    """
    feature_cols = ["a", "b", "c", "d"]
    train_rows, test_rows = df[:5], df[5:]

    X_train = train_rows[feature_cols]
    y_train = train_rows[["e"]].values.ravel()
    X_test = test_rows[feature_cols]

    transformer = PatsyTransformer("a + np.log(a) + b + c + d - 1")
    transformer.fit(X_train, y_train)

    n_cols_test = transformer.transform(X_test).shape[1]
    n_cols_train = transformer.transform(X_train).shape[1]
    assert n_cols_test == n_cols_train
Example #4
0
def test_mult_usage(df):
    """The formula ``a*b - 1`` must produce a design matrix of shape (6, 3).

    NOTE(review): in patsy formula syntax ``a*b`` presumably expands to
    ``a + b + a:b`` and ``- 1`` removes the intercept, which would explain
    the three columns -- confirm against the ``df`` fixture.
    """
    X, y = df[["a", "b", "c", "d"]], df[["e"]]
    # Fit and transform once; the former debug print repeated this work and
    # polluted the test output.
    transformed = PatsyTransformer("a*b - 1").fit(X, y).transform(X)
    assert transformed.shape == (6, 3)
Example #5
0
def test_apply_numpy_transform(df):
    """A numpy call inside the formula (``np.log(a)``) yields its own column.

    The expected output has 6 rows and 3 columns.
    """
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]
    transformer = PatsyTransformer("a + np.log(a) + b - 1")
    design = transformer.fit(features, target).transform(features)
    assert design.shape == (6, 3)
Example #6
0
def test_transform_dummy2(df):
    """The formula ``a + b + c + d`` must produce a (6, 6) design matrix.

    NOTE(review): six columns from four terms suggests an intercept plus
    dummy-coded categorical columns -- confirm against the ``df`` fixture.
    """
    X, y = df[["a", "b", "c", "d"]], df[["e"]]
    # Fit and transform once; the former debug print repeated this work and
    # polluted the test output.
    transformed = PatsyTransformer("a + b + c + d").fit(X, y).transform(X)
    assert transformed.shape == (6, 6)
Example #7
0
def test_min_sign_usage(df):
    """The formula ``a + b - 1`` must produce a design matrix of shape (6, 2)."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]
    transformer = PatsyTransformer("a + b - 1")
    design = transformer.fit(features, target).transform(features)
    assert design.shape == (6, 2)
Example #8
0
def test_basic_usage(df):
    """The plain formula ``a + b`` must produce a design matrix of shape (6, 3)."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]
    transformer = PatsyTransformer("a + b")
    design = transformer.fit(features, target).transform(features)
    assert design.shape == (6, 3)
Example #9
0
    def _generate_features(self,
                           X,
                           y=None,
                           numeric_extra=None,
                           categorical_extra=None):
        """Build (on first use), fit, cache and apply the feature pipeline.

        On the first call the pipeline is assembled from the content of
        ``X``, fitted on ``(X, y)`` and stored as ``self.feature_pipeline_``.
        Later calls reuse the stored pipeline and only transform ``X``, so
        ``y``, ``numeric_extra`` and ``categorical_extra`` are ignored then.

        Parameters
        ----------
        X : DataFrame-like
            Indexed by column name; must provide ``'dayofweek'``, ``'hour'``
            and ``self.temperature_col``, plus every column listed in
            ``numeric_extra`` / ``categorical_extra``.
        y : optional
            Target forwarded to the pipeline's ``fit``.
        numeric_extra : iterable of column names, optional
            Extra numeric columns, each encoded with its own
            ``IntervalEncoder``. Falsy values drop this part.
        categorical_extra : list of column names, optional
            Extra categorical columns, passed to ``TargetEncoder(cols=...)``.
            Falsy values drop this part.

        Returns
        -------
        The result of ``self.feature_pipeline_.transform(X)``.

        Raises
        ------
        Any exception raised while building or fitting the pipeline now
        propagates unchanged. Previously a ``return`` inside ``finally``
        replaced it, and a pipeline whose fit failed stayed cached.
        """
        if not hasattr(self, 'feature_pipeline_'):
            n_days = X['dayofweek'].nunique()
            n_hours = X['hour'].nunique()

            def ordinal_steps(col):
                # Select one column, ordinal-encode it, then replace the -1
                # code with the most frequent code. Fresh estimator
                # instances are created on every call.
                return [
                    ('select', ColumnSelector(col)),
                    ('ordinal', OrdinalEncoder(cols=[col], return_df=False)),
                    ('unknown',
                     SimpleImputer(missing_values=-1,
                                   strategy='most_frequent')),
                ]

            def one_hot_pipeline(col):
                # Ordinal-encode a single column, then one-hot encode it.
                return Pipeline(ordinal_steps(col) + [
                    ('to_pandas',
                     FunctionTransformer(
                         lambda x: pd.DataFrame(x, columns=[col]))),
                    ('one_hot', OneHotEncoder(cols=[col], return_df=False)),
                ])

            # time of week part of TOWT
            if (n_days > 1) and (n_hours > 1):
                time_step = (
                    'weeks',
                    Pipeline([
                        ('split',
                         FeatureUnion([
                             ('days', Pipeline(ordinal_steps('dayofweek'))),
                             ('hours', Pipeline(ordinal_steps('hour'))),
                         ])),
                        ('to_pandas',
                         FunctionTransformer(lambda x: pd.DataFrame(
                             x, columns=['dayofweek', 'hour']))),
                        ('term',
                         PatsyTransformer('-1 + C(dayofweek):C(hour)')),
                    ]))
            elif n_days > 1:
                time_step = ('days', one_hot_pipeline('dayofweek'))
            else:
                time_step = ('hours', one_hot_pipeline('hour'))

            # temperature part of TOWT
            temperature_step = (
                'temperature',
                ColumnTransformer([
                    ('encode_temperature',
                     IntervalEncoder(
                         n_chunks=10,
                         span=0.1 * X[self.temperature_col].std(),
                         method='normal'), [self.temperature_col])
                ]))

            # temperature-by-hour interaction, dropped when only one hour
            # value is present
            if n_hours == 1:
                temperature_interact = 'drop'
            else:
                temperature_interact = Pipeline([
                    ('split',
                     FeatureUnion([
                         ('temperature_part',
                          Pipeline([
                              ('select',
                               ColumnSelector(self.temperature_col)),
                              ('create_bins',
                               KBinsDiscretizer(
                                   n_bins=self.n_bins_temperature,
                                   strategy='quantile',
                                   encode='ordinal')),
                          ])),
                         ('hour_part', Pipeline(ordinal_steps('hour'))),
                     ])),
                    ('to_pandas',
                     FunctionTransformer(lambda x: pd.DataFrame(
                         x, columns=[self.temperature_col, 'hour']))),
                    ('term',
                     PatsyTransformer(
                         f'-1 + C({self.temperature_col}):C(hour)')),
                ])

            # deal with extra numerical regressors
            if not numeric_extra:
                numerical_regressors = 'drop'
            else:
                numerical_regressors = ColumnTransformer(
                    [(f'encode_{col}',
                      IntervalEncoder(n_chunks=4,
                                      span=0.1 * X[col].std(),
                                      method='normal'), [col])
                     for col in numeric_extra])

            # deal with extra categorical regressors
            if not categorical_extra:
                categorical_regressors = 'drop'
            else:
                categorical_regressors = TargetEncoder(
                    cols=categorical_extra,
                    return_df=False,
                    handle_missing='value',
                    handle_unknown='value')

            pipeline = Pipeline([(
                'features',
                FeatureUnion([
                    time_step,
                    temperature_step,
                    ('temperature_interact', temperature_interact),
                    ('numerical_regressors', numerical_regressors),
                    ('categorical_regressors', categorical_regressors),
                ]))])

            # Fit first and cache only on success, so a failed fit never
            # leaves an unfitted pipeline behind for later calls.
            pipeline.fit(X, y)
            self.feature_pipeline_ = pipeline

        return self.feature_pipeline_.transform(X)
Example #10
0
def test_return_type_dataframe(df):
    """With ``return_type="dataframe"`` the output must be a pandas DataFrame."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]
    transformer = PatsyTransformer("a + b - 1", return_type="dataframe")
    transformer.fit(features, target)
    result = transformer.transform(features)
    assert isinstance(result, pd.DataFrame)
Example #11
0
def test_return_type_dmatrix(df):
    """With ``return_type="matrix"`` the output must carry ``design_info``."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]
    transformer = PatsyTransformer("a + b - 1", return_type="matrix")
    transformer.fit(features, target)
    result = transformer.transform(features)
    # A DesignMatrix is recognised by its ``design_info`` attribute, as per
    # https://patsy.readthedocs.io/en/latest/API-reference.html#patsy.DesignMatrix
    assert hasattr(result, "design_info")