Example #1
0
def test_time_encoder():
    """Check that TimeFeaturesEncoder yields four columns on a one-row sample."""
    sample = get_data(nrows=1)
    target = sample.fare_amount
    features = sample.drop(columns='fare_amount')
    encoder = TimeFeaturesEncoder('pickup_datetime')
    encoded = encoder.fit_transform(features, target)
    n_columns = encoded.shape[1]
    assert n_columns == 4, "shape[1] is not 4"
Example #2
0
 def set_pipeline(self, estimator):
     """Build the preprocessing + model pipeline and store it on ``self.pipeline``.

     Args:
         estimator: final step of the pipeline, registered under the
             step name ``'model'``.

     Returns:
         ``self``, so that calls can be chained.
     """
     dist_cols = [
         'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
         'dropoff_latitude'
     ]
     time_cols = ['pickup_datetime']

     # Distance branch: DistanceTransformer followed by standard scaling.
     dist_pipe = Pipeline([('dist_transformer', DistanceTransformer()),
                           ('standardizer', StandardScaler())])

     # Time branch. handle_unknown='ignore' makes categories that were not
     # present at fit time encode as all zeros at transform time instead of
     # raising a ValueError.
     time_pipe = Pipeline([('time_extractor',
                            TimeFeaturesEncoder('pickup_datetime')),
                           ('encoder',
                            OneHotEncoder(handle_unknown='ignore'))])

     # Route each column group to its branch.
     preproc_pipe = ColumnTransformer([('distance', dist_pipe, dist_cols),
                                       ('time', time_pipe, time_cols)])

     self.pipeline = Pipeline([('preprocessing', preproc_pipe),
                               ('model', estimator)])
     return self
Example #3
0
    def set_pipeline(self, estimator):
        """Assemble the full pipeline around ``estimator`` and keep it on ``self.pipeline``.

        The four coordinate columns go through ``DistanceTransformer`` and
        ``StandardScaler``; ``pickup_datetime`` goes through
        ``TimeFeaturesEncoder`` and ``OneHotEncoder``.

        Args:
            estimator: final step of the pipeline, named ``'model'``.

        Returns:
            The value of ``self.pipeline`` after assignment.
        """
        coord_columns = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]
        datetime_columns = ['pickup_datetime']

        distance_branch = Pipeline([
            ('to_distance', DistanceTransformer()),
            ('std_scale', StandardScaler()),
        ])
        time_branch = Pipeline([
            ('to_time_feat', TimeFeaturesEncoder('pickup_datetime')),
            ('ohe_encode', OneHotEncoder(handle_unknown="ignore")),
        ])

        column_branches = ColumnTransformer([
            ('pipe_distance', distance_branch, coord_columns),
            ('pipe_time', time_branch, datetime_columns),
        ])
        # The single-step wrapper is kept on purpose: it determines the
        # nested 'preproc' -> 'preproc' step layout of the final pipeline.
        wrapped_preproc = Pipeline([('preproc', column_branches)])

        self.pipeline = Pipeline([
            ('preproc', wrapped_preproc),
            ('model', estimator),
        ])
        return self.pipeline
Example #4
0
    def set_pipeline(self):
        """Build and return the preprocessing + linear-regression pipeline.

        Note that the pipeline is handed back to the caller; this method
        does not assign it to any attribute of ``self``.
        """
        distance_columns = [
            'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
            'dropoff_latitude'
        ]
        time_columns = ['pickup_datetime']

        # Distance branch: transformer, then standard scaling.
        distance_branch = Pipeline([
            ('distance_transformer', DistanceTransformer()),
            ('standardize', StandardScaler()),
        ])
        # Time branch: feature extraction, then one-hot encoding.
        time_branch = Pipeline([
            ('features', TimeFeaturesEncoder('pickup_datetime')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ])

        preprocessing = ColumnTransformer([
            ("pipe_distance", distance_branch, distance_columns),
            ("pipe_time", time_branch, time_columns),
        ])

        return Pipeline([
            ('pipelines_aggregated', preprocessing),
            ('model', LinearRegression()),
        ])
Example #5
0
    def set_pipeline(self):
        """Build the feature-engineering + estimator pipeline on ``self.pipeline``.

        Two optional settings are read from ``self.kwargs``:

        * ``distance_type`` (default ``"haversine"``), forwarded to
          ``DistanceTransformer``.
        * ``feateng`` (default ``["distance", "time_features"]``), the names
          of the feature-engineering blocks to keep.

        The final step is whatever ``self.get_estimator()`` returns.
        """
        dist = self.kwargs.get("distance_type", "haversine")
        feateng_steps = self.kwargs.get("feateng",
                                        ["distance", "time_features"])

        # Feature-engineering building blocks. New blocks (e.g. direction,
        # distance to center) are defined here and registered in the list
        # below.
        pipe_time_features = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(DistanceTransformer(distance_type=dist),
                                      StandardScaler())
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())

        # All available blocks as (name, transformer, columns).
        available_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('geohash', pipe_geohash, list(DIST_ARGS.values())),
        ]
        # Keep only the requested blocks. Build a new list rather than
        # calling remove() while iterating: removing during iteration skips
        # the element following each removal and can leave unwanted blocks.
        feateng_blocks = [
            bloc for bloc in available_blocks if bloc[0] in feateng_steps
        ]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        self.pipeline = Pipeline(steps=[('features', features_encoder),
                                        ('rgs', self.get_estimator())], )
Example #6
0
    def set_pipeline(self):
        """Build and return the preprocessing + regressor pipeline.

        The regressor is taken from ``self.model``. The pipeline is returned
        to the caller; this method does not assign it to ``self``.
        """
        trip_columns = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]
        datetime_columns = ['pickup_datetime']
        dropoff_columns = ['dropoff_latitude', 'dropoff_longitude']

        trip_distance = Pipeline([
            ('DistanceTransformer', DistanceTransformer()),
            ('Scaler', RobustScaler()),
        ])
        time_features = Pipeline([
            ('TimeFeaturesEncoder', TimeFeaturesEncoder('pickup_datetime')),
            ('Encoder', OneHotEncoder(handle_unknown='ignore')),
        ])
        center_distance = Pipeline([
            ('DistanceToCenterTransformer', DistanceToCenterTransformer()),
            ('Scaler', RobustScaler()),
        ])

        column_router = ColumnTransformer([
            ('distance', trip_distance, trip_columns),
            ('time', time_features, datetime_columns),
            ('distance_center', center_distance, dropoff_columns),
        ])

        return Pipeline([
            ('preproc', column_router),
            ('regressor', self.model),
        ])
Example #7
0
    def set_pipeline(self):
        """Build the preprocessing + LassoCV pipeline and store it on ``self.pipeline``.

        The final step keeps its existing name ``'linear_regression'`` even
        though the estimator is ``LassoCV``, so existing parameter paths
        stay valid.
        """
        time_features = ['pickup_datetime']
        dist_features = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]

        # Distance branch: transformer, then standard scaling.
        dist_pipe = Pipeline([('dist_trans', DistanceTransformer()),
                              ('scaler', StandardScaler())])

        # Time branch. handle_unknown='ignore' makes categories that were
        # not seen at fit time encode as all zeros at transform time instead
        # of raising a ValueError.
        time_pipe = Pipeline([('time_features',
                               TimeFeaturesEncoder('pickup_datetime')),
                              ('cat_transformer',
                               OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            [('dist_pipeline', dist_pipe, dist_features),
             ('time_pipeline', time_pipe, time_features)],
            remainder='drop')

        self.pipeline = Pipeline([('preprocessing', preprocessor),
                                  ('linear_regression', LassoCV())])
Example #8
0
    def set_pipeline(self):
        """Build the feature-engineering + estimator pipeline on ``self.pipeline``.

        Optional settings read from ``self.kwargs``:

        * ``pipeline_memory`` (default ``None``): when truthy, a fresh
          temporary directory from ``mkdtemp()`` is passed as the pipeline's
          ``memory``.
        * ``distance_type`` (default ``"euclidian"``), forwarded to
          ``DistanceTransformer``.
        * ``feateng``: names of the feature-engineering blocks to keep
          (default: distance, time_features, direction, distance_to_center).

        The final step is whatever ``self.get_estimator()`` returns.
        """
        memory = self.kwargs.get("pipeline_memory", None)
        dist = self.kwargs.get("distance_type", "euclidian")
        feateng_steps = self.kwargs.get(
            "feateng",
            ["distance", "time_features", 'direction', 'distance_to_center'])
        if memory:
            memory = mkdtemp()

        # Feature-engineering building blocks.
        pipe_time_features = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(
            DistanceTransformer(distance_type=dist, **DIST_ARGS),
            RobustScaler())
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
        pipe_direction = make_pipeline(Direction(), RobustScaler())
        pipe_distance_to_center = make_pipeline(DistanceToCenter(),
                                                RobustScaler())

        # All available blocks as (name, transformer, columns).
        available_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('geohash', pipe_geohash, list(DIST_ARGS.values())),
            ('direction', pipe_direction, list(DIST_ARGS.values())),
            ('distance_to_center', pipe_distance_to_center,
             list(DIST_ARGS.values())),
        ]
        # Keep only the requested blocks. Build a new list rather than
        # calling remove() while iterating: removing during iteration skips
        # the element following each removal and can leave unwanted blocks.
        feateng_blocks = [
            bloc for bloc in available_blocks if bloc[0] in feateng_steps
        ]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        self.pipeline = Pipeline(steps=[
            ('features', features_encoder),
            ('rgs', self.get_estimator())], memory=memory)
Example #9
0
    def set_pipeline(self):
        """Define the pipeline as an instance attribute and return it.

        The pipeline one-hot encodes the output of ``TimeFeaturesEncoder``
        (dense output), robust-scales the output of ``DistanceTransformer``,
        and ends with a ``RandomForestRegressor``.
        """
        datetime_columns = ['pickup_datetime']
        coordinate_columns = [
            'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
            'dropoff_latitude'
        ]

        time_branch = Pipeline([
            ('transformer', TimeFeaturesEncoder()),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
        ])
        distance_branch = Pipeline([
            ('transformer', DistanceTransformer()),
            ('scaler', RobustScaler()),
        ])

        # Time features come first in the transformed output, then distance.
        column_router = ColumnTransformer([
            ('time', time_branch, datetime_columns),
            ('dist', distance_branch, coordinate_columns),
        ])

        self.pipeline = Pipeline([
            ('preprocess', column_router),
            ('estimator', RandomForestRegressor()),
        ])
        return self.pipeline
Example #10
0
    def set_pipeline(self):
        """Build the linear-regression pipeline and store it on ``self.pipeline``.

        Also logs the parameter ``model = "linear"`` on the current MLflow
        run through ``self.mlflow_client``. Nothing is returned.
        """
        datetime_columns = ['pickup_datetime']
        coordinate_columns = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]

        # Time features are standard-scaled here, not one-hot encoded.
        time_branch = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            StandardScaler(),
        )
        distance_branch = make_pipeline(DistanceTransformer(), RobustScaler())

        column_router = ColumnTransformer([
            ('time', time_branch, datetime_columns),
            ('distance', distance_branch, coordinate_columns),
        ])
        assembled = Pipeline(steps=[
            ('preprocessor', column_router),
            ('regressor', LinearRegression()),
        ])

        run_id = self.mlflow_run.info.run_id
        self.mlflow_client.log_param(run_id, "model", "linear")
        self.pipeline = assembled
Example #11
0
 def set_pipeline(self):
     """Build the linear-regression pipeline and store it on ``self.pipeline``."""
     coordinate_columns = [
         'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
         'dropoff_latitude'
     ]
     # NOTE(review): 'pickup_latitude' is routed to the time branch as well
     # as the distance branch -- confirm this is intended.
     time_branch_columns = ['pickup_datetime', 'pickup_latitude']

     distance_branch = make_pipeline(DistanceTransformer(), StandardScaler())
     time_branch = make_pipeline(
         TimeFeaturesEncoder('pickup_datetime'),
         OneHotEncoder(handle_unknown='ignore'),
     )

     column_router = make_column_transformer(
         (distance_branch, coordinate_columns),
         (time_branch, time_branch_columns),
     )
     self.pipeline = make_pipeline(column_router, LinearRegression())
Example #12
0
 def set_pipeline(self):
     """Build the linear-regression pipeline, store it on ``self.pipeline``, and return it."""
     dist_cols = [
         'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
         'dropoff_latitude'
     ]
     time_cols = ['pickup_datetime']

     dist_pipe = make_pipeline(DistanceTransformer(), StandardScaler())
     # handle_unknown='ignore' makes categories that were not seen at fit
     # time encode as all zeros at transform time instead of raising a
     # ValueError.
     time_pipe = make_pipeline(TimeFeaturesEncoder(),
                               OneHotEncoder(handle_unknown='ignore'))

     preprocessor = ColumnTransformer([
         ('dist_transformer', dist_pipe, dist_cols),
         ('time_transformer', time_pipe, time_cols),
     ])
     self.pipeline = make_pipeline(preprocessor, LinearRegression())
     return self.pipeline
Example #13
0
 def set_pipeline(self):
     """Build the linear-regression pipeline and store it on ``self.pipeline``."""
     coordinate_columns = [
         'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
         'dropoff_latitude'
     ]
     datetime_columns = ['pickup_datetime']

     # The first step of each branch is named 'imputer' although it holds a
     # feature transformer; the names are kept as they are.
     distance_branch = Pipeline([
         ('imputer', DistanceTransformer()),
         ('scaler', StandardScaler()),
     ])
     time_branch = Pipeline([
         ('imputer', TimeFeaturesEncoder('pickup_datetime')),
         ('encoder', OneHotEncoder(handle_unknown='ignore')),
     ])

     column_router = ColumnTransformer([
         ('dist_transformer', distance_branch, coordinate_columns),
         ('time_transformer', time_branch, datetime_columns),
     ])
     self.pipeline = Pipeline([
         ('preprocessing', column_router),
         ('linear_regression', LinearRegression()),
     ])
Example #14
0
 def set_pipeline(self):
     """Build the linear-regression pipeline and store it on ``self.pipe``.

     Note that the attribute written is ``self.pipe``, not
     ``self.pipeline``.
     """
     coordinate_columns = [
         "pickup_latitude", "pickup_longitude", 'dropoff_latitude',
         'dropoff_longitude'
     ]
     datetime_columns = ['pickup_datetime']

     distance_branch = Pipeline([
         ('dist_trans', DistanceTransformer()),
         ('stdscaler', StandardScaler()),
     ])
     time_branch = Pipeline([
         ('time_enc', TimeFeaturesEncoder('pickup_datetime')),
         ('ohe', OneHotEncoder(handle_unknown='ignore')),
     ])

     column_router = ColumnTransformer(
         [
             ('distance', distance_branch, coordinate_columns),
             ('time', time_branch, datetime_columns),
         ],
         remainder="drop",
     )
     self.pipe = Pipeline([
         ('preproc', column_router),
         ('linear_model', LinearRegression()),
     ])
Example #15
0
    def set_pipeline(self):
        """Build the linear-regression pipeline, store it on ``self.pipeline``, and return it."""
        dist_cols = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]
        time_cols = ['pickup_datetime']

        distance_pipe = Pipeline([("dist_transformer", DistanceTransformer()),
                                  ('scaler', StandardScaler())])
        time_pipe = Pipeline([
            ('time_encoder', TimeFeaturesEncoder("pickup_datetime")),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            [('distance_pipe', distance_pipe, dist_cols),
             ('time_pipe', time_pipe, time_cols)],
            remainder="drop")

        self.pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                        ("model", LinearRegression())])
        # Return the attribute that was just set; no local name `pipeline`
        # exists in this method.
        return self.pipeline
Example #16
0
    def set_pipeline(self):
        """Build the feature-encoding + estimator pipeline on ``self.pipeline``.

        Three feature blocks are combined: distance, one-hot encoded time
        features, and hash-encoded geohash. The final step is whatever
        ``self.get_estimator()`` returns.
        """
        time_branch = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'),
        )
        geohash_branch = make_pipeline(AddGeohash(), ce.HashingEncoder())

        # The distance block is a bare transformer, not a pipeline.
        blocks = [
            ('distance', DistanceTransformer(**DIST_ARGS),
             list(DIST_ARGS.values())),
            ('time_features', time_branch, ['pickup_datetime']),
            ('geohash', geohash_branch, list(DIST_ARGS.values())),
        ]
        encoder = ColumnTransformer(blocks)

        self.pipeline = Pipeline(steps=[
            ('features', encoder),
            ('rgs', self.get_estimator()),
        ])
Example #17
0
 def set_pipeline(self):
     """Build the XGBRegressor pipeline, store it on ``self.pipeline``, and return ``self``."""
     coordinate_columns = [
         "pickup_latitude", "pickup_longitude", 'dropoff_latitude',
         'dropoff_longitude'
     ]
     datetime_columns = ['pickup_datetime']

     distance_branch = Pipeline([
         ('dist_trans', DistanceTransformer()),
         ('stdscaler', StandardScaler()),
     ])
     time_branch = Pipeline([
         ('time_enc', TimeFeaturesEncoder('pickup_datetime')),
         ('ohe', OneHotEncoder(handle_unknown='ignore')),
     ])

     column_router = ColumnTransformer(
         [
             ('distance', distance_branch, coordinate_columns),
             ('time', time_branch, datetime_columns),
         ],
         remainder="drop",
     )
     self.pipeline = Pipeline([
         ('preproc', column_router),
         ('Xgbregressor', XGBRegressor()),
     ])
     return self
Example #18
0
    def set_pipeline(self):
        """Build the feature-encoding + estimator pipeline on ``self.pipeline``.

        ``self.distance_type`` is forwarded to ``DistanceTransformer``; the
        final step is whatever ``self.get_estimator()`` returns.
        """
        # handle_unknown='ignore' makes categories that were not seen at fit
        # time encode as all zeros at transform time instead of raising a
        # ValueError.
        pipe_time_feature = make_pipeline(
            TimeFeaturesEncoder(time_column="pickup_datetime"),
            OneHotEncoder(handle_unknown="ignore"))
        pipe_distance = make_pipeline(
            DistanceTransformer(distance_type=self.distance_type, **DIST_ARGS))

        features_encoder = ColumnTransformer([
            ("distance_feature", pipe_distance, list(DIST_ARGS.values())),
            ("time_feature", pipe_time_feature, ["pickup_datetime"])
        ])

        self.pipeline = Pipeline(
            steps=[("features_encoder", features_encoder),
                   ("model", self.get_estimator())])
Example #19
0
    def set_pipeline(self):
        """Build the random-forest pipeline and store it on ``self.pipeline``."""
        distance_branch = make_pipeline(DistanceTransformer(), RobustScaler())
        time_branch = make_pipeline(
            TimeFeaturesEncoder('pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'),
        )

        coordinate_columns = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]
        datetime_columns = ['pickup_datetime']

        column_router = make_column_transformer(
            (distance_branch, coordinate_columns),
            (time_branch, datetime_columns),
        )
        regressor = RandomForestRegressor()
        self.pipeline = make_pipeline(column_router, regressor)
Example #20
0
    def set_pipeline(self):
        """Build the feature-engineering + estimator pipeline on ``self.pipeline``.

        Optional settings read from ``self.kwargs``:

        * ``pipeline_memory`` (default ``None``): when truthy, a fresh
          temporary directory from ``mkdtemp()`` is passed as the pipeline's
          ``memory``.
        * ``distance_type`` (default ``'haversine'``), forwarded to
          ``DistanceTransformer``.
        * ``feateng`` (default ``['distance', 'time_features']``): names of
          the feature-engineering blocks to keep.

        The final step is whatever ``self.get_estimator()`` returns. When
        ``self.optimize`` is truthy, an ``OptimizeSize`` step is inserted
        just before it.
        """
        memory = self.kwargs.get('pipeline_memory', None)
        dist = self.kwargs.get('distance_type', 'haversine')
        feateng_steps = self.kwargs.get('feateng',
                                        ['distance', 'time_features'])

        if memory:
            memory = mkdtemp()

        # Feature-engineering building blocks.
        pipe_time_features = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(
            DistanceTransformer(distance_type=dist, **DIST_ARGS),
            StandardScaler())
        # Built but not registered below: the geohash block is disabled.
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
        pipe_direction = make_pipeline(Direction(), StandardScaler())
        pipe_distance_to_center = make_pipeline(DistanceToCenter(),
                                                StandardScaler())

        # All available blocks as (name, transformer, columns).
        available_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('direction', pipe_direction, list(DIST_ARGS.values())),
            ('distance_to_center', pipe_distance_to_center,
             list(DIST_ARGS.values())),
        ]
        # Keep only the requested blocks. Build a new list rather than
        # calling remove() while iterating: removing during iteration skips
        # the element following each removal, which would let
        # 'distance_to_center' survive right after 'direction' is removed.
        feateng_blocks = [
            bloc for bloc in available_blocks if bloc[0] in feateng_steps
        ]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        self.pipeline = Pipeline(steps=[('features', features_encoder),
                                        ('rgs', self.get_estimator())],
                                 memory=memory)

        if self.optimize:
            # Insert before the final estimator, as a (name, step) tuple
            # like the other pipeline steps.
            self.pipeline.steps.insert(
                -1,
                ('optimize_size', OptimizeSize(verbose=False)))
Example #21
0
 def set_pipeline(self):
     """Build the linear-regression pipeline and store it on ``self.pipeline``."""
     dist_cols = [
         'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
         'dropoff_longitude'
     ]
     time_cols = ['pickup_datetime']

     pipe_distance = make_pipeline(DistanceTransformer(), StandardScaler())
     # handle_unknown='ignore' makes categories that were not seen at fit
     # time encode as all zeros at transform time instead of raising a
     # ValueError.
     pipe_time = make_pipeline(
         TimeFeaturesEncoder(time_column='pickup_datetime'),
         OneHotEncoder(handle_unknown='ignore'))

     feat_eng_bloc = ColumnTransformer([('distance', pipe_distance,
                                         dist_cols),
                                        ('time', pipe_time, time_cols)])
     self.pipeline = Pipeline(
         steps=[('feat_eng_bloc', feat_eng_bloc),
                ('regressor', LinearRegression())])
Example #22
0
 def set_pipeline(self):
     """Build the random-forest pipeline, store it on ``self.pipeline``, and return it."""
     datetime_columns = ['pickup_datetime']
     coordinate_columns = [
         'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
         'dropoff_longitude'
     ]

     time_branch = make_pipeline(
         TimeFeaturesEncoder(time_column='pickup_datetime'),
         OneHotEncoder(handle_unknown='ignore'),
     )
     distance_branch = make_pipeline(DistanceTransformer(), RobustScaler())

     # Time features come first in the transformed output, then distance.
     column_router = ColumnTransformer([
         ('time', time_branch, datetime_columns),
         ('distance', distance_branch, coordinate_columns),
     ])
     steps = [
         ('feat_eng_bloc', column_router),
         ('regressor', RandomForestRegressor()),
     ]
     self.pipeline = Pipeline(steps=steps)
     return self.pipeline
Example #23
0
 def set_pipeline(self):
     """Build the k-nearest-neighbours pipeline and store it on ``self.pipeline``."""
     coordinate_columns = [
         'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
         'dropoff_latitude'
     ]
     datetime_columns = ['pickup_datetime']

     distance_steps = [
         ('distance', DistanceTransformer()),
         ('scaler', StandardScaler()),
     ]
     time_steps = [
         ('timefeatures', TimeFeaturesEncoder("pickup_datetime")),
         ('encoding', OneHotEncoder(handle_unknown='ignore')),
     ]

     column_router = ColumnTransformer([
         ('distance', Pipeline(distance_steps), coordinate_columns),
         ('time', Pipeline(time_steps), datetime_columns),
     ])
     self.pipeline = Pipeline([
         ('preproc', column_router),
         ('KNN', KNeighborsRegressor()),
     ])
Example #24
0
    def set_pipeline(self):
        """Build the SGD-regressor pipeline and store it on ``self.pipeline``."""
        coordinate_columns = [
            'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
            'dropoff_latitude'
        ]
        datetime_columns = ['pickup_datetime']

        # Distance branch.
        distance_branch = make_pipeline(DistanceTransformer(),
                                        StandardScaler())
        # Time branch.
        time_branch = make_pipeline(
            TimeFeaturesEncoder('pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'),
        )

        # Route each column group to its branch.
        column_router = ColumnTransformer([
            ('distance', distance_branch, coordinate_columns),
            ('time', time_branch, datetime_columns),
        ])

        # The regressor is imported locally, as in the rest of this method's
        # history; the import stays inside the function.
        from sklearn.linear_model import SGDRegressor
        self.pipeline = Pipeline([
            ('preprocessing', column_router),
            ('sgd', SGDRegressor()),
        ])
Example #25
0
    def set_pipeline(self):
        """Build and return the preprocessing + linear-regression pipeline.

        The pipeline is returned to the caller; this method does not assign
        it to any attribute of ``self``.
        """
        time_cols = ['pickup_datetime']
        dist_cols = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]

        pipe_distance = make_pipeline(DistanceTransformer(), RobustScaler())
        # handle_unknown='ignore' makes categories that were not seen at fit
        # time encode as all zeros at transform time instead of raising a
        # ValueError.
        pipe_time = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))

        # Time features come first in the transformed output, then distance.
        preprocessing = ColumnTransformer([('time', pipe_time, time_cols),
                                           ('distance', pipe_distance,
                                            dist_cols)])

        pipe_cols = Pipeline(
            steps=[('preprocessing', preprocessing),
                   ('regressor', LinearRegression())])
        return pipe_cols
Example #26
0
    def set_pipeline(self):
        """Build the linear-regression pipeline, store it on ``self.pipeline``, and return ``self``."""
        coordinate_columns = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]
        datetime_columns = ['pickup_datetime']

        distance_branch = make_pipeline(DistanceTransformer(),
                                        StandardScaler())
        time_branch = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'),
        )

        column_router = ColumnTransformer([
            ('distance_trans', distance_branch, coordinate_columns),
            ('time_trans', time_branch, datetime_columns),
        ])

        self.pipeline = Pipeline(steps=[
            ('preprocessing', column_router),
            ('regressor', LinearRegression()),
        ])
        return self
Example #27
0
    def set_pipeline(self):
        """Build the feature-engineering + estimator pipeline on ``self.pipeline``.

        Optional settings read from ``self.kwargs``:

        * ``pipeline_memory`` (default ``None``): when truthy, a fresh
          temporary directory from ``mkdtemp()`` is passed as the pipeline's
          ``memory``.
        * ``distance_type`` (default ``"euclidian"``), forwarded to
          ``DistanceTransformer``.
        * ``feateng`` (default ``["distance", "time_features"]``): names of
          the feature-engineering blocks to keep.

        The feature encoder is followed by a ``DataframeCleaner`` step and
        by whatever ``self.get_estimator()`` returns.
        """
        memory = self.kwargs.get("pipeline_memory", None)
        dist = self.kwargs.get("distance_type", "euclidian")
        feateng_steps = self.kwargs.get("feateng",
                                        ["distance", "time_features"])
        if memory:
            memory = mkdtemp()

        # Feature-engineering building blocks.
        time_pipe = Pipeline([
            ("time_enc", TimeFeaturesEncoder("pickup_datetime")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ])
        dist_pipe = Pipeline([
            ("dist_trans", DistanceTransformer(distance_type=dist,
                                               **DIST_ARGS)),
            ("stdscaler", StandardScaler()),
        ])
        center_pipe = Pipeline([("distance_center", DistanceToCenter()),
                                ("stdscaler", StandardScaler())])
        # Built but not registered below: the geohash block is disabled.
        geohash_pipe = Pipeline([("deohash_add", AddGeohash()),
                                 ("hash_encode", ce.HashingEncoder())])
        direction_pipe = Pipeline([("direction_add", Direction()),
                                   ("stdscaler", StandardScaler())])

        # All available blocks as (name, transformer, columns).
        available_blocks = [
            ("distance", dist_pipe, list(DIST_ARGS.values())),
            ("time_features", time_pipe, ["pickup_datetime"]),
            ("direction", direction_pipe, list(DIST_ARGS.values())),
            ("distance_to_center", center_pipe, list(DIST_ARGS.values())),
        ]
        # Keep only the requested blocks. Build a new list rather than
        # calling remove() while iterating: removing during iteration skips
        # the element following each removal, which would let
        # "distance_to_center" survive right after "direction" is removed.
        feateng_blocks = [
            bloc for bloc in available_blocks if bloc[0] in feateng_steps
        ]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")
        self.pipeline = Pipeline(
            steps=[("features", features_encoder),
                   ("df_clener", DataframeCleaner(verbose=False)),
                   ("rgs", self.get_estimator())],
            memory=memory,
        )
Example #28
0
    def set_pipeline(self):
        """Build the feature-encoding + estimator pipeline on ``self.pipeline``.

        Three feature blocks are combined: distance, one-hot encoded time
        features, and hash-encoded geohash. The final step is whatever
        ``self.get_estimator()`` returns.
        """
        # Keyword arguments for DistanceTransformer; the values double as
        # the column selection for the coordinate-based blocks.
        dist_args = {
            "start_lat": "pickup_latitude",
            "start_lon": "pickup_longitude",
            "end_lat": "dropoff_latitude",
            "end_lon": "dropoff_longitude",
        }

        time_branch = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'),
        )
        geohash_branch = make_pipeline(AddGeohash(), ce.HashingEncoder())

        blocks = [
            ('distance', DistanceTransformer(**dist_args),
             list(dist_args.values())),
            ('time_features', time_branch, ['pickup_datetime']),
            ('geohash', geohash_branch, list(dist_args.values())),
        ]
        encoder = ColumnTransformer(blocks)

        self.pipeline = Pipeline(steps=[
            ('features', encoder),
            ('rgs', self.get_estimator()),
        ])
Example #29
0
    def set_pipeline(self):
        """Build the preprocessing + regressor pipeline on ``self.pipeline``.

        The regressor is chosen from ``self.estimator``:

        * ``'Lasso'`` selects ``LassoCV(cv=5, n_alphas=5)``.
        * ``'XGBoost'`` selects
          ``XGBRegressor(n_estimators=300, learning_rate=0.05)``.
        * Any other value falls back to the XGBoost regressor, and
          ``self.estimator`` is set to ``'XGBoost'`` to record the fallback.
        """
        distance_columns = [
            "pickup_latitude", "pickup_longitude", "dropoff_latitude",
            "dropoff_longitude"
        ]
        time_columns = ['pickup_datetime']
        passenger_columns = ['passenger_count']

        pipe_distance = Pipeline(steps=[
            ('distance_transformer', DistanceTransformer()),
            ('distance_scaling', StandardScaler())
        ])
        pipe_time = Pipeline([
            ('time_transformer', TimeFeaturesEncoder("pickup_datetime")),
            ('time_encode', OneHotEncoder(handle_unknown='ignore',
                                          sparse=False))
        ])
        pipe_passengers = Pipeline([('passenger_scaler', RobustScaler())])

        preproc_pipe = ColumnTransformer(
            [('distance', pipe_distance, distance_columns),
             ('time', pipe_time, time_columns),
             ('passenger', pipe_passengers, passenger_columns)],
            remainder='drop')

        if self.estimator == 'Lasso':
            regressor = LassoCV(cv=5, n_alphas=5)
        else:
            if self.estimator != 'XGBoost':
                # Unrecognised choice: fall back to XGBoost and record the
                # fallback on the instance (assignment, not comparison).
                self.estimator = 'XGBoost'
            regressor = XGBRegressor(n_estimators=300, learning_rate=0.05)

        self.pipeline = Pipeline([('transformer', preproc_pipe),
                                  ('regressor', regressor)])
Example #30
0
    def set_pipeline(self):
        """Build the linear-regression pipeline and store it on ``self.pipeline``."""
        dist_cols = [
            'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
            'dropoff_longitude'
        ]
        time_cols = ['pickup_datetime']

        # Distance branch: transformer, then standard scaling.
        pipe_dist = Pipeline([
            ('distance', DistanceTransformer()),
            ('scaler', StandardScaler())])

        # Time branch. handle_unknown='ignore' makes categories that were
        # not seen at fit time encode as all zeros at transform time instead
        # of raising a ValueError.
        pipe_time = Pipeline([
            ('features', TimeFeaturesEncoder('pickup_datetime')),
            ('OneHot', OneHotEncoder(handle_unknown='ignore'))])

        preprocess_pipe = ColumnTransformer([
            ('dist', pipe_dist, dist_cols),
            ('time', pipe_time, time_cols)])

        self.pipeline = Pipeline([
            ('preprocessing', preprocess_pipe),
            ('regression', LinearRegression())])