Example #1
def fit_model(X, y, classifier_settings=None, fit_settings=None):
    # Avoid mutable default arguments; fall back to empty dicts.
    classifier_settings = classifier_settings or {}
    fit_settings = fit_settings or {}

    strategy = classifier_settings.get("numerical_impute_strategy")
    if strategy is None:
        raise ValueError("Missing impute strategy for numerical features")

    numeric_transformer = Pipeline(steps=[("num_impute",
                                           SimpleImputer(strategy=strategy))])

    strategy = classifier_settings.get("categorical_impute_strategy")
    if strategy is None:
        raise Exception("Missing impute strategy for categorical features")

    categorical_transformer = Pipeline(steps=[
        ("cat_impute", SimpleImputer(strategy=strategy)),
        ("ohe", OneHotEncoder(drop="if_binary", handle_unknown="error")),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer,
         make_column_selector(dtype_include=np.number)),
        (
            "cat",
            categorical_transformer,
            make_column_selector(dtype_exclude=np.number),
        ),
    ])

    clf = Pipeline(steps=[("preprocessor",
                           preprocessor), ("classifier",
                                           LogisticRegression())])

    clf.fit(X, y)

    return clf
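
A quick usage sketch for Example #1 — a minimal sketch assuming the imports the snippet relies on (numpy, pandas, and sklearn's Pipeline, SimpleImputer, OneHotEncoder, ColumnTransformer, make_column_selector, LogisticRegression); the toy frame below is hypothetical:

import numpy as np
import pandas as pd

X = pd.DataFrame({
    "age": [25.0, 32.0, np.nan, 41.0],             # numeric column with a missing value
    "city": ["Paris", "London", "Paris", np.nan],  # categorical column with a missing value
})
y = pd.Series([0, 1, 0, 1])

clf = fit_model(X, y, classifier_settings={
    "numerical_impute_strategy": "median",
    "categorical_impute_strategy": "most_frequent",
})
print(clf.predict(X))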
Example #2
def fit_model(X, y, classifier_settings=None, fit_settings=None):
    # Avoid mutable default arguments; fall back to empty dicts.
    classifier_settings = classifier_settings or {}
    fit_settings = fit_settings or {}

    numeric_transformer = Pipeline(steps=[("identity", IdentityTransformer())])

    categorical_transformer = Pipeline(steps=[
        ("ohe", OneHotEncoder(drop="if_binary", handle_unknown="error")),
    ])

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer,
         make_column_selector(dtype_include=np.number)),
        (
            "cat",
            categorical_transformer,
            make_column_selector(dtype_exclude=np.number),
        ),
    ])

    xgb_clf = xgb.sklearn.XGBClassifier(**classifier_settings)

    clf = Pipeline(steps=[("preprocessor",
                           preprocessor), ("classifier", xgb_clf)])

    clf.fit(X, y, **fit_settings)

    return clf
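
Example #2 leans on a project-specific IdentityTransformer that passes numeric columns through unchanged. If you want to run it, a plausible stand-in (an assumption, not the project's class) is sklearn's FunctionTransformer, which is the identity by default; fit parameters are routed to the XGBoost step via the step-name prefix:

from sklearn.preprocessing import FunctionTransformer

IdentityTransformer = FunctionTransformer  # hypothetical stand-in for the project's class

clf = fit_model(
    X.fillna({"age": 0.0, "city": "unknown"}),  # Example #2 has no imputers, so fill missing values first
    y,  # reusing the toy frame from the sketch after Example #1
    classifier_settings={"n_estimators": 50, "max_depth": 3},
    fit_settings={"classifier__verbose": False},  # Pipeline.fit routes this to XGBClassifier.fit
)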
Example #3
def test_ColumnTransformer_with_selector():
    expected = pd.DataFrame({
        "name": [
            "hotel",
            "hotel",
            "meal",
            "meal",
            "meal",
            "lead_time",
            "average_daily_rate",
        ],
        "feature": [
            "x0_City_Hotel",
            "x0_Resort_Hotel",
            "x1_BB",
            "x1_HB",
            "x1_SC",
            "lead_time",
            "average_daily_rate",
        ],
    })

    preprocess = make_column_transformer(
        (OneHotEncoder(sparse=False),
         make_column_selector(dtype_include=object)),
        (StandardScaler(),
         make_column_selector(dtype_exclude=object)),
    )

    preprocess.fit(X)

    assert feat(preprocess, X.columns).equals(expected)
Example #4
    def build_pipeline(X_train):
        categorical_values = []

        cat_subset = X_train.select_dtypes(
            include=['object', 'category', 'bool'])

        for i in range(cat_subset.shape[1]):
            categorical_values.append(
                list(cat_subset.iloc[:, i].dropna().unique()))

        # NOTE: date_pipeline is defined but not included in the ColumnTransformer returned below.
        date_pipeline = Pipeline([('dateFeatures', process.DateTransform())])

        num_pipeline = Pipeline([('cleaner', SimpleImputer()),
                                 ('scaler', StandardScaler())])

        cat_pipeline = Pipeline([
            ('cleaner', SimpleImputer(strategy='most_frequent')),
            ('encoder',
             OneHotEncoder(sparse=False, categories=categorical_values))
        ])

        preprocessor = ColumnTransformer([
            ('numerical', num_pipeline,
             make_column_selector(
                 dtype_exclude=['object', 'category', 'bool'])),
            ('categorical', cat_pipeline,
             make_column_selector(
                 dtype_include=['object', 'category', 'bool']))
        ])

        return preprocessor
Example #5
def test_column_transformer_with_make_column_selector():
    # Functional test for column transformer + column selector
    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(
        {
            'col_int': np.array([0, 1, 2], dtype=np.int64),
            'col_float': np.array([0.0, 1.0, 2.0], dtype=np.float64),
            'col_cat': ["one", "two", "one"],
            'col_str': ["low", "middle", "high"]
        },
        columns=['col_int', 'col_float', 'col_cat', 'col_str'])
    X_df['col_str'] = X_df['col_str'].astype('category')

    cat_selector = make_column_selector(dtype_include=['category', object])
    num_selector = make_column_selector(dtype_include=np.number)

    ohe = OneHotEncoder()
    scaler = StandardScaler()

    ct_selector = make_column_transformer((ohe, cat_selector),
                                          (scaler, num_selector))
    ct_direct = make_column_transformer((ohe, ['col_cat', 'col_str']),
                                        (scaler, ['col_float', 'col_int']))

    X_selector = ct_selector.fit_transform(X_df)
    X_direct = ct_direct.fit_transform(X_df)

    assert_allclose(X_selector, X_direct)
Example #6
    def set_pipeline(self):

        self.pipeline = self.kwargs.get("pipeline", None)

        # Create a temp folder
        cachedir = mkdtemp()

        # Pipeline structure
        num_transformer = MinMaxScaler()
        cat_transformer = OneHotEncoder(handle_unknown='ignore')
        feateng_blocks = [
            ("num_transformer", num_transformer,
             make_column_selector(dtype_include=['int', 'float'])),
            ("cat_transformer", cat_transformer,
             make_column_selector(dtype_include=['object', 'bool']))
        ]
        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        # Combine preprocessing and model:
        self.pipeline = Pipeline(steps=[
            ('features', features_encoder),
            ('model', self.get_estimator())
        ],
            # Avoid recalculating transformers during cross-validation or grid search
            memory=cachedir
        )

        # Keep a handle on the cache directory; call rmtree(self.cachedir)
        # only after cross-validation has finished, not here.
        self.cachedir = cachedir
Example #7
def features1():
    return [
        ("shift_0", Shift(0), make_column_selector(dtype_include=np.number)),
        ("shift_1", Shift(1), make_column_selector(dtype_include=np.number)),
        (
            "moving_average_3",
            MovingAverage(window_size=3),
            make_column_selector(dtype_include=np.number),
        ),
    ]
Example #8
def columns_transform():
    return make_column_transformer(
        (
            StandardScaler(),
            make_column_selector("^(?!crashYear)", dtype_include=np.number),
        ),
        (
            OneHotEncoder(handle_unknown="ignore"),
            make_column_selector(dtype_include=object),
        ),
    )
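
The regex in Example #8 deserves a note: make_column_selector keeps the columns whose names match its pattern, so "^(?!crashYear)" keeps every numeric column except crashYear. A quick check with a hypothetical frame:

import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector

df = pd.DataFrame({"crashYear": [2019, 2020],
                   "speed": [50.0, 65.0],
                   "severity": ["low", "high"]})
sel = make_column_selector("^(?!crashYear)", dtype_include=np.number)
print(sel(df))  # ['speed'] -- crashYear is excluded by the negative lookahead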
Example #9
def feature_selection(X: pd.DataFrame, y: pd.DataFrame):
    fe_column_transformer = ColumnTransformer(
        transformers=[('numeric', SelectKBest(score_func=f_classif, k="all"),
                       make_column_selector(dtype_include=np.number)),
                      ('categorical', SelectKBest(score_func=chi2, k="all"),
                       make_column_selector(dtype_include="category"))])

    fe_column_transformer.fit(X, y)
    X = fe_column_transformer.transform(X)

    return X, y
Example #10
def dataset_transform():
    return ColumnTransformer(
        [
            ("scaler", StandardScaler(),
             make_column_selector(dtype_include="number")),
            (
                "encoder",
                OneHotEncoder(handle_unknown="ignore"),
                make_column_selector(dtype_include=object),
            ),
        ],
        remainder="passthrough",
    )
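
In Example #10, remainder="passthrough" keeps any column matched by neither selector (a datetime column, for instance) untouched. A quick check with a hypothetical frame:

import numpy as np
import pandas as pd

t = dataset_transform()
X = pd.DataFrame({
    "x": [1.0, 2.0, 3.0],
    "c": ["a", "b", "a"],
    "ts": pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]),
})
out = t.fit_transform(X)  # scaled x, one-hot c, and ts passed through
print(out.shape)  # (3, 4)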
Example #11
    def __init__(self, X, y):
        '''
        This creates the X_train, X_test, y_train, y_test arrays:

        self.X_train
        self.X_test
        self.y_train
        self.y_test

        It also creates a simple preprocessing object: a column transformer
        with two branches able to automatically handle numerical and
        categorical data.

        The default numerical SimpleImputer uses strategy 'mean';
        the default categorical SimpleImputer uses fill value 'Other'.

        self.preprocessing
        '''
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        # Below is the basic preprocessing pipeline

        # https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

        # set up numerical pipeline
        numeric_transformer = Pipeline(
            steps=[('num_imputer', SimpleImputer(
                strategy='mean')), ('num_scaler', StandardScaler())])

        # Set up categorical pipeline
        categorical_transformer = Pipeline(
            steps=[('cat_imputer',
                    SimpleImputer(strategy='constant', fill_value='Other')
                    ), ('cat_onehot', OneHotEncoder(handle_unknown='ignore')
                        ), ('cat_scaler', StandardScaler(with_mean=False))])

        # set up preprocessing column transformer
        preprocessing = ColumnTransformer(
            transformers=[('num', numeric_transformer,
                           make_column_selector(dtype_include=np.number)),
                          ('cat', categorical_transformer,
                           make_column_selector(dtype_include='object'))])

        self.preprocessing = preprocessing
Example #12
    def __init__(self, return_df=True):
        self.return_df = return_df

        self.impute_median = SimpleImputer(strategy='median')
        self.impute_const = SimpleImputer(strategy='constant')
        self.ss = StandardScaler()
        self.ohe = OneHotEncoder(handle_unknown='ignore')

        self.num_cols = make_column_selector(dtype_include='number')
        self.cat_cols = make_column_selector(dtype_exclude='number')

        self.preprocessor = make_column_transformer(
            (make_pipeline(self.impute_median, self.ss), self.num_cols),
            (make_pipeline(self.impute_const, self.ohe), self.cat_cols),
        )
Example #13
def get_pipeline(model, impute_cat='default', impute_num='default', scale='default', onehot='default', remove_outliers='default'):
    # in essence this splits the input into a categorical pipeline and a numeric pipeline
    # merged with a ColumnTransformer
    # on top a model is plugged (within OutlierExtractor if remove_outliers = True)
    # this works very nicely!

    cat_steps = []
    if impute_cat == 'default':
        cat_steps.append(('impute_cat', DFSimpleImputer(strategy='constant', fill_value='None')))
    elif impute_cat:
        cat_steps.append(('impute_cat', impute_cat))
    
    if onehot == 'default':
        cat_steps.append(('cat_to_num', DFGetDummies()))
    elif onehot: 
        cat_steps.append(('cat_to_num', onehot))
        # equal to: cat_steps.append(('cat_to_num', DFOneHotEncoder(handle_unknown="ignore")))
    categorical_transformer = Pipeline(steps=cat_steps)

    num_steps = []
    if impute_num == 'default':
        num_steps.append(('impute_num', DFSimpleImputer(strategy='mean')))
    elif impute_num:
        num_steps.append(('impute_num', impute_num))
    
    if scale == 'default': 
        num_steps.append(('scale_num', DFStandardScaler()))
    elif scale:
        num_steps.append(('scale_num', scale))

    numeric_transformer = Pipeline(steps=num_steps)

    col_trans = DFColumnTransformer(transformers=[
        ('numeric', numeric_transformer, make_column_selector(dtype_include=np.number)),
        ('category', categorical_transformer, make_column_selector(dtype_exclude=np.number)),
        ])
    
    preprocessor_steps = [('col_trans', col_trans)]
    preprocessor = Pipeline(steps=preprocessor_steps, memory=memory)  # `memory` is a module-level cache defined elsewhere

    final_pipe = [('preprocess', preprocessor)]
    if remove_outliers == 'default':
        final_pipe.append(('model', model))
    elif remove_outliers:
        final_pipe.append(('model', remove_outliers))  # e.g. DFOutlierExtractor(model, corruption=0.005)

    return Pipeline(steps=final_pipe)
    
Example #14
def run_cv_model(model, X, y):
    '''
    Runs cross-validation for a model and dataset.
    :param model: model to run
    :param X: features
    :param y: target
    :return: array of test scores, one per fold
    '''
    one_hot_encoder = make_column_transformer(
        (OneHotEncoder(sparse=False, handle_unknown='ignore'),
         make_column_selector(dtype_include='category')),
        remainder='passthrough')
    # pipeline = Pipeline(one_hot_encoder, model)
    pipeline = make_pipeline(one_hot_encoder, model)
    cv_results = cross_validate(pipeline,
                                X,
                                y,
                                cv=4,
                                scoring='neg_root_mean_squared_error',
                                verbose=1,
                                n_jobs=6)
    # print(sorted(cv_results.keys()))
    print("Model: " + str(model))
    print("test_score")
    print(cv_results['test_score'])
    print("average: ", np.average(cv_results['test_score']))
    return cv_results['test_score']
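
A hedged usage sketch for run_cv_model, assuming the sklearn imports the snippet relies on and an sklearn version that still accepts OneHotEncoder(sparse=...); the toy regression frame is hypothetical:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "size": rng.normal(100, 20, 40),
    "kind": pd.Categorical(rng.choice(["a", "b"], 40)),
})
y = 2 * X["size"] + rng.normal(0, 1, 40)
scores = run_cv_model(LinearRegression(), X, y)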
Example #15
    def __init__(self, max_features_to_select=0, n_jobs: int = -1):
        """Initialize the class.

        Parameters
        ----------
        n_jobs
            Number of parallel processes to use for cross-validation.
            If `n_jobs == -1` (default), use all available CPUs.
        """
        self.max_features_to_select = max_features_to_select
        self.n_jobs = n_jobs
        
        transformer = ColumnTransformer([('scale', StandardScaler(),
                                          make_column_selector(dtype_include=np.floating))],
                                        remainder="passthrough")
                                        
        logistic = LogisticRegressionCV(class_weight="balanced",
                                        scoring="roc_auc",
                                        solver="lbfgs",
                                        max_iter=1000,
                                        n_jobs=self.n_jobs)
        if self.max_features_to_select > 0:
            select = SelectMRMRe()
            pipe = make_pipeline(transformer, select, logistic)
            param_grid = {"selectmrmre__n_features": np.arange(2, self.max_features_to_select + 1)}
            self.model = GridSearchCV(pipe, param_grid, n_jobs=self.n_jobs)
        else:
            self.model = make_pipeline(transformer, logistic)
Example #16
    def __init__(self, max_features_to_select=0, n_jobs: int = -1):
        """Initialize the class.

        Parameters
        ----------
        n_jobs
            Number of parallel processes to use for cross-validation.
            If `n_jobs == -1` (default), use all available CPUs.
        """
        self.max_features_to_select = max_features_to_select
        self.n_jobs = n_jobs
        self.transformer = ColumnTransformer([('scale', StandardScaler(),
                                                make_column_selector(dtype_include=np.floating))],
                                             remainder="passthrough")
        CoxRegression = sklearn_adapter(CoxPHFitter,
                                        event_col="death",
                                        predict_method="predict_partial_hazard")
        cox = CoxRegression(step_size=0.5)
        param_grid = {"sklearncoxphfitter__penalizer": 10.0**np.arange(-2, 3)}
        if self.max_features_to_select > 0:
            select = SelectMRMRe(target_col="death")
            # can't put CoxRegression in the pipeline since sklearn
            # transformers cannot return data frames
            pipe = make_pipeline(select, cox)
            param_grid["selectmrmre__n_features"] = np.arange(2, self.max_features_to_select + 1)
        else:
            pipe = make_pipeline(cox)

        # XXX lifelines sklearn adapter does not support parallelization
        # for now, need to find a better workaround
        self.model = GridSearchCV(pipe, param_grid, n_jobs=1)
Example #17
def full_training(model, X, y):
    '''
    Trains a model on a train/test split; prints error metrics and plots
    predictions for the test set.
    :param model: model to train
    :param X: features
    :param y: labels
    :return: None
    '''
    one_hot_encoder = make_column_transformer(
        (OneHotEncoder(sparse=False, handle_unknown='ignore'),
         make_column_selector(dtype_include='category')),
        remainder='passthrough')
    pipeline = make_pipeline(one_hot_encoder, model)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    model_predict = pipeline.fit(X_train, y_train)
    predictions = model_predict.predict(X_test)

    # The coefficients
    # print('Coefficients: \n', model_predict[1].coef_)
    # The mean squared error
    print('Mean squared error: %.2f' % mean_squared_error(y_test, predictions))
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f' % r2_score(y_test, predictions))
    sns.regplot(x=y_test, y=predictions)
    st.pyplot()
Example #18
def select_dtype_data(df,
                      dtype: Union[str, list] = 'NUMERIC',
                      return_type='pandas'):
    if isinstance(dtype, str):
        if dtype == 'NUMERIC':
            num_ndarr = num_selector.fit_transform(df)
            if return_type == 'pandas':
                return pd.DataFrame(num_ndarr,
                                    columns=num_selector.get_feature_names())
            else:
                return num_ndarr
        if dtype == 'CATEGORICAL':
            num_ndarr = cat_selector.fit_transform(df)
            if return_type == 'pandas':
                return pd.DataFrame(num_ndarr,
                                    columns=cat_selector.get_feature_names())
            else:
                return num_ndarr
    else:
        data_selector = make_column_transformer(
            ('passthrough', make_column_selector(dtype_include=dtype)),
            remainder='drop')

        num_ndarr = data_selector.fit_transform(df)
        if return_type == 'pandas':
            return pd.DataFrame(num_ndarr,
                                columns=data_selector.get_feature_names())
        else:
            return num_ndarr
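
select_dtype_data relies on module-level num_selector and cat_selector objects that are not shown; a plausible definition, as an assumption, is a pair of passthrough column transformers (note that get_feature_names implies an older sklearn; newer versions use get_feature_names_out):

import numpy as np
from sklearn.compose import make_column_transformer, make_column_selector

num_selector = make_column_transformer(
    ('passthrough', make_column_selector(dtype_include=np.number)),
    remainder='drop')
cat_selector = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=np.number)),
    remainder='drop')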
Example #19
    def __init__(self, p: int, horizon: int):
        features = [
            (f"s{i}", Shift(i),
             make_column_selector(dtype_include=np.number))
            for i in range(1, p + 1)
        ]
        model = GAR(LinearRegression())
        super().__init__(features=features, horizon=horizon, model=model)
Example #20
    def __init__(self, data_fn=None, df=None, **kwargs) -> None:
        self._logging = watexlog().get_watex_logger(self.__class__.__name__)

        self._data_fn = data_fn
        self._df = df

        self.categorial_features = kwargs.pop('categorial_features', None)
        self.numerical_features = kwargs.pop('numerical_features', None)

        self.target = kwargs.pop('target', 'flow')
        self._drop_features = kwargs.pop('drop_features', ['lwi'])
        self.random_state = kwargs.pop('random_state', 0)
        self.default_estimator = kwargs.pop('default_estimator', 'svc')

        self._df_cache = None
        self._features = None

        self.y = None
        self.X = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        self._num_column_selector = make_column_selector(
            dtype_include=np.number)
        self._cat_colum_selector = make_column_selector(
            dtype_exclude=np.number)
        self._features_engineering = PolynomialFeatures(
            10, include_bias=False)
        self._selectors = SelectKBest(f_classif, k=4)
        self._scalers = RobustScaler()
        self._encodages = OneHotEncoder()

        self._select_estimator_ = None

        for key in kwargs.keys():
            setattr(self, key, kwargs[key])

        if self._data_fn is not None:
            self.data_fn = self._data_fn

        if self.df is not None:
            self._read_and_encode_catFeatures()
Example #21
def get_col_transf():
    num_pipe = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                         ('scaler', StandardScaler()),
                         ('poly', 'passthrough')])

    cat_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder()),
    ])

    col_t = ColumnTransformer([
        ('num', num_pipe,
         make_column_selector(dtype_include=['int64', 'float64'])),
        ('cat', cat_pipe, make_column_selector(dtype_include='object'))
    ])

    return col_t
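
The ('poly', 'passthrough') step above is a placeholder: a Pipeline step set to the string 'passthrough' does nothing, but it reserves a slot that can later be filled via set_params, e.g. from a grid search. A small sketch, assuming sklearn's PolynomialFeatures:

from sklearn.preprocessing import PolynomialFeatures

col_t = get_col_transf()
# Swap the placeholder on the numeric branch for a real transformer.
col_t.set_params(num__poly=PolynomialFeatures(degree=2))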
Example #22
def create_pipeline(params: dict = None):
    """
    Create sklearn.pipeline.Pipeline

    Parameters
    ----------
    params : dict
        dictionary of parameters for the pipeline

    Returns
    -------
    sklearn.pipeline.Pipeline
    """

    # pipeline for numeric variables
    p_num = Pipeline([("num_nan_ind", AddMissingIndicator(missing_only=True)),
                      ("rmmean", MeanMedianImputer()),
                      ("drop_quasi_constant", DropConstantFeatures(tol=0.97))])

    # pipeline for categorical variables
    p_cat = Pipeline([("fill_cat_nas",
                       CategoricalImputer(fill_value='MISSING')),
                      ("rlc", RareLabelEncoder()),
                      ("one_hot_encoder", OneHotEncoder())])

    # list of pipelines to combine
    transformers = [("num", p_num,
                     make_column_selector(dtype_include=np.number)),
                    ("cat", p_cat, make_column_selector(dtype_include=object))]

    # combine pipelines and add XGBClassifier
    col_transforms = ColumnTransformer(transformers)
    p = Pipeline([("col_transformers", col_transforms),
                  ("xgb",
                   XGBClassifier(min_child_weight=1,
                                 gamma=0,
                                 objective='binary:logistic',
                                 nthread=4,
                                 scale_pos_weight=1,
                                 seed=1,
                                 gpu_id=0,
                                 tree_method='gpu_hist'))])

    if params:
        p.set_params(**params)
    return p
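
create_pipeline follows sklearn's step__parameter naming, so the optional params dict can reach any nested step; a hypothetical override:

p = create_pipeline(params={
    "xgb__max_depth": 4,
    "xgb__n_estimators": 200,
    "col_transformers__num__drop_quasi_constant__tol": 0.99,
})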
Example #23
    def test_pipeline_make_column_selector(self):
        X = pandas.DataFrame({
            'city': ['London', 'London', 'Paris', 'Sallisaw'],
            'rating': [5, 3, 4, 5]})
        X['rating'] = X['rating'].astype(numpy.float32)
        ct = make_column_transformer(
            (StandardScaler(), make_column_selector(
                dtype_include=numpy.number)),
            (OneHotEncoder(), make_column_selector(
                dtype_include=object)))
        expected = ct.fit_transform(X)
        onx = to_onnx(ct, X, target_opset=TARGET_OPSET)
        sess = InferenceSession(onx.SerializeToString())
        names = [i.name for i in sess.get_inputs()]
        got = sess.run(None, {names[0]: X[names[0]].values.reshape((-1, 1)),
                              names[1]: X[names[1]].values.reshape((-1, 1))})
        assert_almost_equal(expected, got[0])
Example #24
    def create_pipeline(self):
        preprocessing_pipeline = Pipeline([
            ('drop_columns', FunctionTransformer(self.drop_columns, kw_args={'columns_to_drop': FEATURES_TO_DROP})),
            # Need to convert to string otherwise leads to error when imputing object data:
            ('convert_object_columns_to_string', FunctionTransformer(self.convert_object_columns_to_string))
        ])

        # Pipeline of operations to perform on any object columns in DataFrame
        object_pipeline = Pipeline(
            [
                ('most_frequent_imputer', SimpleImputer(strategy='most_frequent')),  # Very slow
                ('ohe', OneHotEncoder())
            ]
        )

        # Pipeline of operations to perform on any numeric columns in DataFrame
        numeric_pipeline = Pipeline(
            [
                ('mean_imputer', SimpleImputer(strategy='mean')),
                ('min_max_scalar', MinMaxScaler())
            ]
        )

        full_pipeline = Pipeline(
            [
                (
                    'process_data',
                    ColumnTransformer(
                        [
                            ('numeric_processing', numeric_pipeline, make_column_selector(dtype_include=np.number)),
                            ('object_processing', object_pipeline, make_column_selector(dtype_include=object))
                        ]
                    )
                )
            ]
        )

        end_to_end_pipeline = Pipeline(
            [
                ('preprocessing', preprocessing_pipeline),
                ('processing', full_pipeline),
                ('model', lightgbm.LGBMClassifier())
            ]
        )

        return end_to_end_pipeline
Example #25
def make_column_transformer(num_missing_impute_strategy='mean'):
    """[create a data preprocess pipeline using sklearn pipeline]
    Todo
    """
    num_imputer = Pipeline([
        ("imputer",
         SimpleImputer(strategy=num_missing_impute_strategy,
                       add_indicator=False))
    ])
    cat_ohe = Pipeline([
        ("cat_imputer", SimpleImputer(strategy='constant', fill_value='NA')),
        ('ohe', OneHotEncoder(dtype=np.int64, handle_unknown='ignore'))
    ])
    return ColumnTransformer(
        [('imp', num_imputer, make_column_selector(dtype_include=np.number)),
         ('ohe', cat_ohe,
          make_column_selector(dtype_include=['object', 'category']))],
        remainder='passthrough')
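
A usage sketch for Example #25 (remember the shadowing caveat above); the toy frame is hypothetical:

import numpy as np
import pandas as pd

X = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": ["x", "y", np.nan]})
ct = make_column_transformer(num_missing_impute_strategy="median")
print(ct.fit_transform(X))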
Example #26
    def __init__(self, horizon: int, seasonal_length: int):
        features = [
            ("s1", Shift(0), make_column_selector()),
        ]
        super().__init__(
            features=features,
            horizon=horizon,
            model=SeasonalNaiveForecaster(seasonal_length),
        )
Example #27
def make_normalizer_column_transformer(normalizations: Dict[SensorComponent,
                                                            Normalization]):
    transformers = []
    for sensor_component, normalization in normalizations.items():
        selector = make_column_selector(pattern=sensor_component +
                                        MATCH_REST_REGEX)
        transformers.append(
            (sensor_component, get_normalizer(normalization), selector))
    return PandasColumnTransformer(transformers)
Example #28
def test_column_selector():
    X = pd.DataFrame({
        "country": ["GB", "GB", "FR", "US"],
        "city": ["London", "London", "Paris", "Sallisaw"],
        "int": [5, 3, 4, 5],
    })
    ct = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=np.number)),
        (OneHotEncoder(), make_column_selector("city")),
    )
    expected = ct.fit_transform(X)

    ct = make_column_transformer(
        (StandardScaler(), ColumnSelector(AllNumeric())),
        (OneHotEncoder(), ColumnSelector(StartsWith("c") & ~AnyOf("country"))),
    )
    actual = ct.fit_transform(X)

    assert_array_equal(actual, expected)
Example #29
def train_and_persist():
    # Read in data into a Pandas DataFrame
    df = pd.read_csv("hour.csv", parse_dates=["dteday"])

    # Assign features to independent (X) and predicted (y) variables
    X = df.drop(columns=["instant", "cnt", "casual", "registered"])
    y = df["cnt"]

    # Use ffill_missing to build a forward-fill imputer
    ffiller = FunctionTransformer(ffill_missing)

    # Make weather imputer pipeline for later use
    weather_enc = make_pipeline(
        ffiller,
        OrdinalEncoder(handle_unknown="use_encoded_value",
                       unknown_value=X["weathersit"].nunique()),
    )

    # Make column transformer for the imputation and encoding process
    ct = make_column_transformer(
        (ffiller, make_column_selector(dtype_include=np.number)),
        (weather_enc, ["weathersit"]),
    )

    # Make preprocessing object for Feature Engineering
    preprocessing = FeatureUnion([("is_weekend",
                                   FunctionTransformer(is_weekend)),
                                  ("year", FunctionTransformer(year)),
                                  ("column_transform", ct)])

    # Define Pipeline to separate preprocessing and modelling
    reg = Pipeline([("preprocessing", preprocessing),
                    ("model", RandomForestRegressor())])

    # Train, test split: Train is before 10/2012 and Test is after 10/2012
    X_train, y_train = X.loc[X["dteday"] < "2012-10"], y.loc[
        X["dteday"] < "2012-10"]

    X_test, y_test = X.loc["2012-10" <= X["dteday"]], y.loc[
        "2012-10" <= X["dteday"]]

    # Train the model
    reg.fit(X_train, y_train)

    #     # Evaluate to get R-squared
    #     reg.score(X_test, y_test)

    #     # Predict
    #     y_pred = reg.predict(X_test)

    # Create the joblib file
    joblib.dump(reg, "biking.joblib")

    print("Model trained successfully")
Example #30
    def fit(self, X, y=None):
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OrdinalEncoder

        df = pd.DataFrame(data=X, index=range(X.shape[0]), columns=range(X.shape[1])).infer_objects()
        categorical = make_column_selector(dtype_exclude=np.number)

        self.estimator_ = ColumnTransformer(
            [('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical)],
            remainder='passthrough')
        self.estimator_.fit(df, y)
        return self
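
The fit above stores the fitted ColumnTransformer on self.estimator_; the matching transform, under the same assumptions (numpy-array input, pandas available as pd), would be a short wrapper:

    def transform(self, X):
        # Rebuild the same positional-column DataFrame used in fit.
        df = pd.DataFrame(data=X, index=range(X.shape[0]),
                          columns=range(X.shape[1])).infer_objects()
        return self.estimator_.transform(df)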