def test_time_encoder():
    """Check that ``TimeFeaturesEncoder`` produces four output columns."""
    sample = get_data(nrows=1)
    features = sample.drop(columns='fare_amount')
    target = sample.fare_amount

    encoder = TimeFeaturesEncoder('pickup_datetime')
    encoded = encoder.fit_transform(features, target)

    column_count = encoded.shape[1]
    assert column_count == 4, "shape[1] is not 4"
def set_pipeline(self, estimator):
    """Build the preprocessing + model pipeline and store it on the instance.

    Args:
        estimator: final pipeline step, registered under the name ``'model'``.

    Returns:
        The instance itself, after ``self.pipeline`` has been assigned.
    """
    # Columns routed to each preprocessing branch.
    coordinate_columns = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
    datetime_columns = ['pickup_datetime']

    # Coordinate branch: DistanceTransformer followed by standard scaling.
    distance_steps = [
        ('dist_transformer', DistanceTransformer()),
        ('standardizer', StandardScaler()),
    ]
    distance_branch = Pipeline(distance_steps)

    # Time branch: extracted time features are one-hot encoded.
    # NOTE(review): OneHotEncoder() keeps its default handling of unknown
    # categories - confirm that is what prediction on new data needs.
    time_steps = [
        ('time_extractor', TimeFeaturesEncoder('pickup_datetime')),
        ('encoder', OneHotEncoder()),
    ]
    time_branch = Pipeline(time_steps)

    column_router = ColumnTransformer([
        ('distance', distance_branch, coordinate_columns),
        ('time', time_branch, datetime_columns),
    ])

    self.pipeline = Pipeline([
        ('preprocessing', column_router),
        ('model', estimator),
    ])
    return self
def set_pipeline(self, estimator):
    """Create the full modelling pipeline, store it and return it.

    Args:
        estimator: final pipeline step, registered under the name ``'model'``.

    Returns:
        The pipeline that was also stored as ``self.pipeline``.
    """
    coordinate_columns = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    # Distance branch.
    distance_branch = Pipeline([
        ('to_distance', DistanceTransformer()),
        ('std_scale', StandardScaler()),
    ])

    # Time branch.
    time_branch = Pipeline([
        ('to_time_feat', TimeFeaturesEncoder('pickup_datetime')),
        ('ohe_encode', OneHotEncoder(handle_unknown="ignore")),
    ])

    column_router = ColumnTransformer([
        ('pipe_distance', distance_branch, coordinate_columns),
        ('pipe_time', time_branch, datetime_columns),
    ])

    # The single-step wrapper is kept on purpose: it determines the nested
    # step names (``preproc__preproc__...``) seen by parameter lookups.
    preprocessing = Pipeline([('preproc', column_router)])

    model_steps = [('preproc', preprocessing), ('model', estimator)]
    self.pipeline = Pipeline(model_steps)
    return self.pipeline
def set_pipeline(self):
    """Define the pipeline as an instance attribute and return it.

    The original body only returned the pipeline, although its docstring
    promised an attribute; the pipeline is now stored in ``self.pipeline``
    as well, while the return value is unchanged for existing callers.

    Returns:
        A ``Pipeline`` made of the column preprocessing step followed by a
        ``LinearRegression`` model.
    """
    X_distance = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
    X_time = ['pickup_datetime']

    # Time branch: extracted time features are one-hot encoded.
    pipe_time = Pipeline([
        ('features', TimeFeaturesEncoder('pickup_datetime')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Distance branch: DistanceTransformer followed by standard scaling.
    pipe_distance = Pipeline([
        ('distance_transformer', DistanceTransformer()),
        ('standardize', StandardScaler()),
    ])

    pipe_preproc = ColumnTransformer([
        ("pipe_distance", pipe_distance, X_distance),
        ("pipe_time", pipe_time, X_time),
    ])

    final_pipe = Pipeline([
        ('pipelines_aggregated', pipe_preproc),
        ('model', LinearRegression()),
    ])

    # Honour the documented contract (attribute) and keep the return value.
    self.pipeline = final_pipe
    return final_pipe
def set_pipeline(self):
    """Assemble the feature-engineering + estimator pipeline in ``self.pipeline``.

    Options read from ``self.kwargs``:
        distance_type: forwarded to ``DistanceTransformer``
            (default ``"haversine"``).
        feateng: names of the feature blocks to keep
            (default ``["distance", "time_features"]``).

    The final step, named ``'rgs'``, is whatever ``self.get_estimator()``
    returns.
    """
    dist = self.kwargs.get("distance_type", "haversine")
    feateng_steps = self.kwargs.get("feateng", ["distance", "time_features"])

    # Feature-engineering building blocks.
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(DistanceTransformer(distance_type=dist),
                                  StandardScaler())
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())

    # All available blocks: (name, transformer, input columns).
    feateng_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('geohash', pipe_geohash, list(DIST_ARGS.values())),
    ]

    # Keep only the requested blocks. A new list is built instead of calling
    # remove() on the list being iterated: removing during iteration skips
    # the element that follows each removed one, so unwanted blocks could
    # survive the filter.
    feateng_blocks = [
        bloc for bloc in feateng_blocks if bloc[0] in feateng_steps
    ]

    features_encoder = ColumnTransformer(feateng_blocks,
                                         n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(steps=[('features', features_encoder),
                                    ('rgs', self.get_estimator())])
def set_pipeline(self):
    """Define the pipeline as an instance attribute and return it.

    The original body only returned the pipeline, although its docstring
    promised an attribute; the pipeline is now stored in ``self.pipeline``
    as well, while the return value is unchanged for existing callers.

    Returns:
        A ``Pipeline`` whose last step, ``'regressor'``, is ``self.model``.
    """
    dist_cols = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    time_cols = ['pickup_datetime']
    dist_2_cols = ['dropoff_latitude', 'dropoff_longitude']

    distance_pipeline = Pipeline([
        ('DistanceTransformer', DistanceTransformer()),
        ('Scaler', RobustScaler()),
    ])
    time_pipeline = Pipeline([
        ('TimeFeaturesEncoder', TimeFeaturesEncoder('pickup_datetime')),
        ('Encoder', OneHotEncoder(handle_unknown='ignore')),
    ])
    # Fed with the drop-off coordinates only.
    distance_2_pipeline = Pipeline([
        ('DistanceToCenterTransformer', DistanceToCenterTransformer()),
        ('Scaler', RobustScaler()),
    ])

    preproc = ColumnTransformer([
        ('distance', distance_pipeline, dist_cols),
        ('time', time_pipeline, time_cols),
        ('distance_center', distance_2_pipeline, dist_2_cols),
    ])

    model_pipeline = Pipeline([('preproc', preproc),
                               ('regressor', self.model)])

    # Honour the documented contract (attribute) and keep the return value.
    self.pipeline = model_pipeline
    return model_pipeline
def set_pipeline(self):
    """Build the preprocessing + ``LassoCV`` pipeline and store it in ``self.pipeline``."""
    coordinate_columns = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    # Distance branch: DistanceTransformer followed by standard scaling.
    distance_branch = Pipeline([
        ('dist_trans', DistanceTransformer()),
        ('scaler', StandardScaler()),
    ])

    # Time branch: extracted time features are one-hot encoded.
    # NOTE(review): OneHotEncoder() keeps its default handling of unknown
    # categories - confirm that is what prediction on new data needs.
    time_branch = Pipeline([
        ('time_features', TimeFeaturesEncoder('pickup_datetime')),
        ('cat_transformer', OneHotEncoder()),
    ])

    # Columns not listed in a branch are dropped.
    branches = [
        ('dist_pipeline', distance_branch, coordinate_columns),
        ('time_pipeline', time_branch, datetime_columns),
    ]
    column_router = ColumnTransformer(branches, remainder='drop')

    self.pipeline = Pipeline([
        ('preprocessing', column_router),
        ('linear_regression', LassoCV()),
    ])
def set_pipeline(self):
    """Assemble the feature-engineering + estimator pipeline in ``self.pipeline``.

    Options read from ``self.kwargs``:
        pipeline_memory: when truthy, a fresh temporary directory (from
            ``mkdtemp()``) is passed as the pipeline ``memory`` argument.
        distance_type: forwarded to ``DistanceTransformer``
            (default ``"euclidian"``).
        feateng: names of the feature blocks to keep (default: distance,
            time_features, direction and distance_to_center).

    The final step, named ``'rgs'``, is whatever ``self.get_estimator()``
    returns.
    """
    memory = self.kwargs.get("pipeline_memory", None)
    dist = self.kwargs.get("distance_type", "euclidian")
    feateng_steps = self.kwargs.get(
        "feateng",
        ["distance", "time_features", 'direction', 'distance_to_center'])
    if memory:
        # The configured value only acts as a switch; the directory actually
        # used is always a newly created temporary one.
        memory = mkdtemp()

    # Feature-engineering building blocks.
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist, **DIST_ARGS), RobustScaler())
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    pipe_direction = make_pipeline(Direction(), RobustScaler())
    pipe_distance_to_center = make_pipeline(DistanceToCenter(), RobustScaler())

    # All available blocks: (name, transformer, input columns).
    feateng_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('geohash', pipe_geohash, list(DIST_ARGS.values())),
        ('direction', pipe_direction, list(DIST_ARGS.values())),
        ('distance_to_center', pipe_distance_to_center,
         list(DIST_ARGS.values())),
    ]

    # Keep only the requested blocks. A new list is built instead of calling
    # remove() on the list being iterated: removing during iteration skips
    # the element that follows each removed one, so unwanted blocks could
    # survive the filter.
    feateng_blocks = [
        bloc for bloc in feateng_blocks if bloc[0] in feateng_steps
    ]

    features_encoder = ColumnTransformer(feateng_blocks,
                                         n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(steps=[('features', features_encoder),
                                    ('rgs', self.get_estimator())],
                             memory=memory)
def set_pipeline(self):
    """Define the pipeline as an instance attribute and return it.

    Returns:
        The pipelined model, also stored as ``self.pipeline``; its final
        step is a ``RandomForestRegressor``.
    """
    datetime_columns = ['pickup_datetime']
    coordinate_columns = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]

    # Distance branch: DistanceTransformer followed by robust scaling.
    distance_branch = Pipeline([
        ('transformer', DistanceTransformer()),
        ('scaler', RobustScaler()),
    ])

    # Time branch: one-hot encoding with sparse output disabled.
    time_branch = Pipeline([
        ('transformer', TimeFeaturesEncoder()),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ])

    # The time branch comes first, so its columns lead the output.
    column_router = ColumnTransformer([
        ('time', time_branch, datetime_columns),
        ('dist', distance_branch, coordinate_columns),
    ])

    model_steps = [
        ('preprocess', column_router),
        ('estimator', RandomForestRegressor()),
    ]
    self.pipeline = Pipeline(model_steps)
    return self.pipeline
def set_pipeline(self):
    """Build the linear-regression pipeline, log the model name, store the pipeline.

    The parameter ``"model" = "linear"`` is logged through
    ``self.mlflow_client`` before ``self.pipeline`` is assigned.
    """
    coordinate_columns = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    distance_branch = make_pipeline(DistanceTransformer(), RobustScaler())
    # Time features are scaled here, not one-hot encoded.
    time_branch = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        StandardScaler(),
    )

    # The time branch comes first, so its columns lead the output.
    column_router = ColumnTransformer([
        ('time', time_branch, datetime_columns),
        ('distance', distance_branch, coordinate_columns),
    ])

    model_steps = [
        ('preprocessor', column_router),
        ('regressor', LinearRegression()),
    ]
    assembled = Pipeline(steps=model_steps)

    run_id = self.mlflow_run.info.run_id
    self.mlflow_client.log_param(run_id, "model", "linear")
    self.pipeline = assembled
def set_pipeline(self):
    """Build the preprocessing + ``LinearRegression`` pipeline in ``self.pipeline``."""
    coordinate_columns = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
    # NOTE(review): 'pickup_latitude' is routed to the time branch together
    # with the datetime column - presumably unintended; confirm how
    # TimeFeaturesEncoder treats the extra column before changing it.
    time_branch_columns = ['pickup_datetime', 'pickup_latitude']

    distance_branch = make_pipeline(DistanceTransformer(), StandardScaler())
    time_branch = make_pipeline(
        TimeFeaturesEncoder('pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'),
    )

    column_router = make_column_transformer(
        (distance_branch, coordinate_columns),
        (time_branch, time_branch_columns),
    )

    self.pipeline = make_pipeline(column_router, LinearRegression())
def set_pipeline(self):
    """Build the preprocessing + ``LinearRegression`` pipeline, store and return it.

    Returns:
        The pipeline that was also stored as ``self.pipeline``.
    """
    coordinate_columns = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
    datetime_columns = ['pickup_datetime']

    distance_branch = make_pipeline(DistanceTransformer(), StandardScaler())
    # NOTE(review): OneHotEncoder() keeps its default handling of unknown
    # categories - confirm that is what prediction on new data needs.
    time_branch = make_pipeline(TimeFeaturesEncoder(), OneHotEncoder())

    column_router = ColumnTransformer([
        ('dist_transformer', distance_branch, coordinate_columns),
        ('time_transformer', time_branch, datetime_columns),
    ])

    regressor = LinearRegression()
    self.pipeline = make_pipeline(column_router, regressor)
    return self.pipeline
def set_pipeline(self):
    """Build the preprocessing + ``LinearRegression`` pipeline in ``self.pipeline``."""
    coordinate_columns = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
    datetime_columns = ['pickup_datetime']

    # The first step of each branch is named 'imputer' although it holds a
    # feature transformer; the names are kept because they are part of the
    # pipeline's parameter names.
    distance_branch = Pipeline([
        ('imputer', DistanceTransformer()),
        ('scaler', StandardScaler()),
    ])
    time_branch = Pipeline([
        ('imputer', TimeFeaturesEncoder('pickup_datetime')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_router = ColumnTransformer([
        ('dist_transformer', distance_branch, coordinate_columns),
        ('time_transformer', time_branch, datetime_columns),
    ])

    model_steps = [
        ('preprocessing', column_router),
        ('linear_regression', LinearRegression()),
    ]
    self.pipeline = Pipeline(model_steps)
def set_pipeline(self):
    """Build the preprocessing + ``LinearRegression`` pipeline.

    The result is stored in ``self.pipe`` (not ``self.pipeline``).
    """
    coordinate_columns = [
        "pickup_latitude",
        "pickup_longitude",
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    distance_branch = Pipeline([
        ('dist_trans', DistanceTransformer()),
        ('stdscaler', StandardScaler()),
    ])
    time_branch = Pipeline([
        ('time_enc', TimeFeaturesEncoder('pickup_datetime')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Columns not listed in a branch are dropped.
    branches = [
        ('distance', distance_branch, coordinate_columns),
        ('time', time_branch, datetime_columns),
    ]
    column_router = ColumnTransformer(branches, remainder="drop")

    self.pipe = Pipeline([
        ('preproc', column_router),
        ('linear_model', LinearRegression()),
    ])
def set_pipeline(self):
    """Define the pipeline as an instance attribute and return it.

    Returns:
        The pipeline stored in ``self.pipeline``: column preprocessing
        followed by a ``LinearRegression`` model.
    """
    dist_cols = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    time_cols = ['pickup_datetime']

    distance_pipe = Pipeline([
        ("dist_transformer", DistanceTransformer()),
        ('scaler', StandardScaler()),
    ])
    time_pipe = Pipeline([
        ('time_encoder', TimeFeaturesEncoder("pickup_datetime")),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Columns not listed in a branch are dropped.
    preprocessor = ColumnTransformer(
        [('distance_pipe', distance_pipe, dist_cols),
         ('time_pipe', time_pipe, time_cols)],
        remainder="drop")

    self.pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ("model", LinearRegression())])
    # The original returned the undefined local name `pipeline`, which
    # raised NameError on every call; return the attribute just assigned.
    return self.pipeline
def set_pipeline(self):
    """Build the feature encoder + estimator pipeline in ``self.pipeline``.

    The final step, named ``'rgs'``, is whatever ``self.get_estimator()``
    returns.
    """
    time_branch = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'),
    )
    geohash_branch = make_pipeline(AddGeohash(), ce.HashingEncoder())

    # The distance transformer is used directly, with no scaler after it.
    distance_block = (
        'distance',
        DistanceTransformer(**DIST_ARGS),
        list(DIST_ARGS.values()),
    )
    time_block = ('time_features', time_branch, ['pickup_datetime'])
    geohash_block = ('geohash', geohash_branch, list(DIST_ARGS.values()))

    encoder = ColumnTransformer([distance_block, time_block, geohash_block])

    model_steps = [('features', encoder), ('rgs', self.get_estimator())]
    self.pipeline = Pipeline(steps=model_steps)
def set_pipeline(self):
    """Build the preprocessing + ``XGBRegressor`` pipeline and store it.

    Returns:
        The instance itself, after ``self.pipeline`` has been assigned.
    """
    coordinate_columns = [
        "pickup_latitude",
        "pickup_longitude",
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    distance_branch = Pipeline([
        ('dist_trans', DistanceTransformer()),
        ('stdscaler', StandardScaler()),
    ])
    time_branch = Pipeline([
        ('time_enc', TimeFeaturesEncoder('pickup_datetime')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Columns not listed in a branch are dropped.
    branches = [
        ('distance', distance_branch, coordinate_columns),
        ('time', time_branch, datetime_columns),
    ]
    column_router = ColumnTransformer(branches, remainder="drop")

    self.pipeline = Pipeline([
        ('preproc', column_router),
        ('Xgbregressor', XGBRegressor()),
    ])
    return self
def set_pipeline(self):
    """Build the feature encoder + estimator pipeline in ``self.pipeline``.

    ``self.distance_type`` is forwarded to ``DistanceTransformer``; the final
    step, named ``"model"``, is whatever ``self.get_estimator()`` returns.
    """
    # Distance branch: the transformer alone, with no scaler after it.
    distance_transformer = DistanceTransformer(
        distance_type=self.distance_type, **DIST_ARGS)
    distance_branch = make_pipeline(distance_transformer)

    # NOTE(review): OneHotEncoder() keeps its default handling of unknown
    # categories - confirm that is what prediction on new data needs.
    time_branch = make_pipeline(
        TimeFeaturesEncoder(time_column="pickup_datetime"),
        OneHotEncoder(),
    )

    encoder = ColumnTransformer([
        ("distance_feature", distance_branch, list(DIST_ARGS.values())),
        ("time_feature", time_branch, ["pickup_datetime"]),
    ])

    model_steps = [
        ("features_encoder", encoder),
        ("model", self.get_estimator()),
    ]
    self.pipeline = Pipeline(steps=model_steps)
def set_pipeline(self):
    """Build the preprocessing + ``RandomForestRegressor`` pipeline in ``self.pipeline``."""
    datetime_columns = ['pickup_datetime']
    coordinate_columns = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]

    distance_branch = make_pipeline(DistanceTransformer(), RobustScaler())
    time_branch = make_pipeline(
        TimeFeaturesEncoder('pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'),
    )

    column_router = make_column_transformer(
        (distance_branch, coordinate_columns),
        (time_branch, datetime_columns),
    )

    regressor = RandomForestRegressor()
    self.pipeline = make_pipeline(column_router, regressor)
def set_pipeline(self):
    """Assemble the feature-engineering + estimator pipeline in ``self.pipeline``.

    Options read from ``self.kwargs``:
        pipeline_memory: when truthy, a fresh temporary directory (from
            ``mkdtemp()``) is passed as the pipeline ``memory`` argument.
        distance_type: forwarded to ``DistanceTransformer``
            (default ``'haversine'``).
        feateng: names of the feature blocks to keep
            (default ``['distance', 'time_features']``).

    When ``self.optimize`` is truthy, an ``OptimizeSize`` step is inserted
    just before the final estimator step.
    """
    memory = self.kwargs.get('pipeline_memory', None)
    dist = self.kwargs.get('distance_type', 'haversine')
    feateng_steps = self.kwargs.get('feateng', ['distance', 'time_features'])
    if memory:
        # The configured value only acts as a switch; the directory actually
        # used is always a newly created temporary one.
        memory = mkdtemp()

    # Feature-engineering building blocks.
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist, **DIST_ARGS), StandardScaler())
    # Built as in the original, but not registered in feateng_blocks below.
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    pipe_direction = make_pipeline(Direction(), StandardScaler())
    pipe_distance_to_center = make_pipeline(DistanceToCenter(),
                                            StandardScaler())

    # All available blocks: (name, transformer, input columns).
    feateng_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('direction', pipe_direction, list(DIST_ARGS.values())),
        ('distance_to_center', pipe_distance_to_center,
         list(DIST_ARGS.values())),
    ]

    # Keep only the requested blocks. A new list is built instead of calling
    # remove() on the list being iterated: removing during iteration skips
    # the element that follows each removed one, so with the default
    # `feateng` the 'distance_to_center' block survived the old filter.
    feateng_blocks = [
        bloc for bloc in feateng_blocks if bloc[0] in feateng_steps
    ]

    features_encoder = ColumnTransformer(feateng_blocks,
                                         n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(steps=[('features', features_encoder),
                                    ('rgs', self.get_estimator())],
                             memory=memory)

    if self.optimize:
        # Index -1 places the extra step right before the estimator.
        self.pipeline.steps.insert(
            -1, ['optimize_size', OptimizeSize(verbose=False)])
def set_pipeline(self):
    """Build the preprocessing + ``LinearRegression`` pipeline in ``self.pipeline``."""
    coordinate_columns = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    distance_branch = make_pipeline(DistanceTransformer(), StandardScaler())
    # NOTE(review): OneHotEncoder() keeps its default handling of unknown
    # categories - confirm that is what prediction on new data needs.
    time_branch = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(),
    )

    column_router = ColumnTransformer([
        ('distance', distance_branch, coordinate_columns),
        ('time', time_branch, datetime_columns),
    ])

    model_steps = [
        ('feat_eng_bloc', column_router),
        ('regressor', LinearRegression()),
    ]
    self.pipeline = Pipeline(steps=model_steps)
def set_pipeline(self):
    """Build the preprocessing + ``RandomForestRegressor`` pipeline, store and return it.

    Returns:
        The pipeline that was also stored as ``self.pipeline``.
    """
    coordinate_columns = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    distance_branch = make_pipeline(DistanceTransformer(), RobustScaler())
    time_branch = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'),
    )

    # The time branch comes first, so its columns lead the output.
    column_router = ColumnTransformer([
        ('time', time_branch, datetime_columns),
        ('distance', distance_branch, coordinate_columns),
    ])

    model_steps = [
        ('feat_eng_bloc', column_router),
        ('regressor', RandomForestRegressor()),
    ]
    self.pipeline = Pipeline(steps=model_steps)
    return self.pipeline
def set_pipeline(self):
    """Build the preprocessing + ``KNeighborsRegressor`` pipeline in ``self.pipeline``."""
    coordinate_columns = [
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
    datetime_columns = ['pickup_datetime']

    distance_steps = [
        ('distance', DistanceTransformer()),
        ('scaler', StandardScaler()),
    ]
    time_steps = [
        ('timefeatures', TimeFeaturesEncoder("pickup_datetime")),
        ('encoding', OneHotEncoder(handle_unknown='ignore')),
    ]
    distance_branch = Pipeline(distance_steps)
    time_branch = Pipeline(time_steps)

    column_router = ColumnTransformer([
        ('distance', distance_branch, coordinate_columns),
        ('time', time_branch, datetime_columns),
    ])

    model_steps = [
        ('preproc', column_router),
        ('KNN', KNeighborsRegressor()),
    ]
    self.pipeline = Pipeline(model_steps)
def set_pipeline(self):
    """Build the preprocessing + ``SGDRegressor`` pipeline in ``self.pipeline``."""
    # Kept as a function-scope import, as in the original, so this method
    # does not depend on the module's import block.
    from sklearn.linear_model import SGDRegressor

    # Distance pipeline: DistanceTransformer followed by standard scaling.
    dist_pipe = make_pipeline(DistanceTransformer(), StandardScaler())

    # Time pipeline: extracted time features are one-hot encoded.
    time_pipe = make_pipeline(TimeFeaturesEncoder('pickup_datetime'),
                              OneHotEncoder(handle_unknown='ignore'))

    # Preprocessing: route each column group to its pipeline. The stray
    # bare-expression statement that followed (a notebook "display" leftover
    # with no effect inside a function) has been removed.
    prepro_pipe = ColumnTransformer([
        ('distance', dist_pipe, [
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
        ]),
        ('time', time_pipe, ['pickup_datetime']),
    ])

    # Full pipeline: preprocessing followed by the model.
    self.pipeline = Pipeline([('preprocessing', prepro_pipe),
                              ('sgd', SGDRegressor())])
def set_pipeline(self):
    """Define the pipeline as an instance attribute and return it.

    The original body only returned the pipeline, although its docstring
    promised an attribute; the pipeline is now stored in ``self.pipeline``
    as well, while the return value is unchanged for existing callers.

    Returns:
        A ``Pipeline`` made of the column preprocessing step followed by a
        ``LinearRegression`` model.
    """
    time_cols = ['pickup_datetime']
    dist_cols = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]

    pipe_distance = make_pipeline(DistanceTransformer(), RobustScaler())
    # NOTE(review): OneHotEncoder() keeps its default handling of unknown
    # categories - confirm that is what prediction on new data needs.
    pipe_time = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'), OneHotEncoder())

    # The time branch comes first, so its columns lead the output.
    preprocessing = ColumnTransformer([('time', pipe_time, time_cols),
                                       ('distance', pipe_distance, dist_cols)])

    pipe_cols = Pipeline(steps=[('preprocessing', preprocessing),
                                ('regressor', LinearRegression())])

    # Honour the documented contract (attribute) and keep the return value.
    self.pipeline = pipe_cols
    return pipe_cols
def set_pipeline(self):
    """Build the preprocessing + ``LinearRegression`` pipeline and store it.

    Returns:
        The instance itself, after ``self.pipeline`` has been assigned.
    """
    coordinate_columns = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    distance_branch = make_pipeline(DistanceTransformer(), StandardScaler())
    time_branch = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'),
    )

    column_router = ColumnTransformer([
        ('distance_trans', distance_branch, coordinate_columns),
        ('time_trans', time_branch, datetime_columns),
    ])

    model_steps = [
        ('preprocessing', column_router),
        ('regressor', LinearRegression()),
    ]
    self.pipeline = Pipeline(steps=model_steps)
    return self
def set_pipeline(self):
    """Assemble the feature-engineering + estimator pipeline in ``self.pipeline``.

    Options read from ``self.kwargs``:
        pipeline_memory: when truthy, a fresh temporary directory (from
            ``mkdtemp()``) is passed as the pipeline ``memory`` argument.
        distance_type: forwarded to ``DistanceTransformer``
            (default ``"euclidian"``).
        feateng: names of the feature blocks to keep
            (default ``["distance", "time_features"]``).

    The pipeline runs the feature encoder, then a ``DataframeCleaner`` step,
    then whatever ``self.get_estimator()`` returns.
    """
    memory = self.kwargs.get("pipeline_memory", None)
    dist = self.kwargs.get("distance_type", "euclidian")
    feateng_steps = self.kwargs.get("feateng", ["distance", "time_features"])
    if memory:
        # The configured value only acts as a switch; the directory actually
        # used is always a newly created temporary one.
        memory = mkdtemp()

    # Feature-engineering building blocks. Step names are part of the
    # pipeline's parameter names and are therefore kept exactly as they were.
    time_pipe = Pipeline([
        ("time_enc", TimeFeaturesEncoder("pickup_datetime")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])
    dist_pipe = Pipeline([
        ("dist_trans", DistanceTransformer(distance_type=dist, **DIST_ARGS)),
        ("stdscaler", StandardScaler()),
    ])
    center_pipe = Pipeline([("distance_center", DistanceToCenter()),
                            ("stdscaler", StandardScaler())])
    # Built as in the original, but not registered in feateng_blocks below.
    geohash_pipe = Pipeline([("deohash_add", AddGeohash()),
                             ("hash_encode", ce.HashingEncoder())])
    direction_pipe = Pipeline([("direction_add", Direction()),
                               ("stdscaler", StandardScaler())])

    # All available blocks: (name, transformer, input columns).
    feateng_blocks = [
        ("distance", dist_pipe, list(DIST_ARGS.values())),
        ("time_features", time_pipe, ["pickup_datetime"]),
        ("direction", direction_pipe, list(DIST_ARGS.values())),
        ("distance_to_center", center_pipe, list(DIST_ARGS.values())),
    ]

    # Keep only the requested blocks. A new list is built instead of calling
    # remove() on the list being iterated: removing during iteration skips
    # the element that follows each removed one, so with the default
    # `feateng` the "distance_to_center" block survived the old filter.
    feateng_blocks = [
        bloc for bloc in feateng_blocks if bloc[0] in feateng_steps
    ]

    features_encoder = ColumnTransformer(feateng_blocks,
                                         n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(
        steps=[("features", features_encoder),
               ("df_clener", DataframeCleaner(verbose=False)),
               ("rgs", self.get_estimator())],
        memory=memory,
    )
def set_pipeline(self):
    """Build the feature encoder + estimator pipeline in ``self.pipeline``.

    The final step, named ``'rgs'``, is whatever ``self.get_estimator()``
    returns.
    """
    # Keyword arguments for DistanceTransformer; the dict values double as
    # the list of input columns for the coordinate-based blocks.
    coordinate_kwargs = dict(
        start_lat="pickup_latitude",
        start_lon="pickup_longitude",
        end_lat="dropoff_latitude",
        end_lon="dropoff_longitude",
    )

    time_branch = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'),
    )
    geohash_branch = make_pipeline(AddGeohash(), ce.HashingEncoder())

    # The distance transformer is used directly, with no scaler after it.
    distance_block = (
        'distance',
        DistanceTransformer(**coordinate_kwargs),
        list(coordinate_kwargs.values()),
    )
    time_block = ('time_features', time_branch, ['pickup_datetime'])
    geohash_block = (
        'geohash',
        geohash_branch,
        list(coordinate_kwargs.values()),
    )

    encoder = ColumnTransformer([distance_block, time_block, geohash_block])

    model_steps = [('features', encoder), ('rgs', self.get_estimator())]
    self.pipeline = Pipeline(steps=model_steps)
def set_pipeline(self):
    """Define the pipeline as an instance attribute.

    The regressor is selected by ``self.estimator``:
        ``'Lasso'``   -> ``LassoCV(cv=5, n_alphas=5)``
        anything else -> ``XGBRegressor(n_estimators=300, learning_rate=0.05)``

    For a value that is neither ``'Lasso'`` nor ``'XGBoost'``,
    ``self.estimator`` is reset to ``'XGBoost'`` so that it matches the model
    actually placed in the pipeline.
    """
    distance_columns = [
        "pickup_latitude",
        "pickup_longitude",
        "dropoff_latitude",
        "dropoff_longitude",
    ]
    time_columns = ['pickup_datetime']
    passenger_columns = ['passenger_count']

    pipe_distance = Pipeline(steps=[
        ('distance_transformer', DistanceTransformer()),
        ('distance_scaling', StandardScaler()),
    ])
    pipe_time = Pipeline([
        ('time_transformer', TimeFeaturesEncoder("pickup_datetime")),
        ('time_encode', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ])
    pipe_passengers = Pipeline([('passenger_scaler', RobustScaler())])

    # Columns not listed in a branch are dropped.
    preproc_pipe = ColumnTransformer(
        [('distance', pipe_distance, distance_columns),
         ('time', pipe_time, time_columns),
         ('passenger', pipe_passengers, passenger_columns)],
        remainder='drop')

    if self.estimator == 'Lasso':
        regressor = LassoCV(cv=5, n_alphas=5)
    else:
        # The original fallback branch contained `self.estimator == 'XGBoost'`,
        # a comparison whose result was discarded; an assignment was clearly
        # intended. The 'XGBoost' and fallback branches built the same model,
        # so they are merged here.
        self.estimator = 'XGBoost'
        regressor = XGBRegressor(n_estimators=300, learning_rate=0.05)

    self.pipeline = Pipeline([('transformer', preproc_pipe),
                              ('regressor', regressor)])
def set_pipeline(self):
    """Build the preprocessing + ``LinearRegression`` pipeline in ``self.pipeline``."""
    # Columns routed to each preprocessing branch.
    coordinate_columns = [
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
    datetime_columns = ['pickup_datetime']

    # Distance branch: DistanceTransformer followed by standard scaling.
    distance_steps = [
        ('distance', DistanceTransformer()),
        ('scaler', StandardScaler()),
    ]
    distance_branch = Pipeline(distance_steps)

    # Time branch: extracted time features are one-hot encoded.
    # NOTE(review): OneHotEncoder() keeps its default handling of unknown
    # categories - confirm that is what prediction on new data needs.
    time_steps = [
        ('features', TimeFeaturesEncoder('pickup_datetime')),
        ('OneHot', OneHotEncoder()),
    ]
    time_branch = Pipeline(time_steps)

    column_router = ColumnTransformer([
        ('dist', distance_branch, coordinate_columns),
        ('time', time_branch, datetime_columns),
    ])

    model_steps = [
        ('preprocessing', column_router),
        ('regression', LinearRegression()),
    ]
    self.pipeline = Pipeline(model_steps)