def set_pipeline(self):
    """Assemble the preprocessing + estimator pipeline into ``self.pipeline``.

    Options are read from ``self.kwargs``:

    * ``pipeline_memory`` (default ``None``): any truthy value causes a fresh
      directory from ``mkdtemp()`` to be passed as ``memory`` to ``Pipeline``;
      the value itself is not used as a path.
    * ``distance_type`` (default ``"euclidian"``): forwarded to
      ``DistanceTransformer``.
    * ``feateng`` (default: distance, time_features, direction,
      distance_to_center): names of the feature blocks to keep.

    The last pipeline step, named ``'rgs'``, is whatever
    ``self.get_estimator()`` returns.
    """
    memory = self.kwargs.get("pipeline_memory", None)
    dist = self.kwargs.get("distance_type", "euclidian")
    feateng_steps = self.kwargs.get(
        "feateng",
        ["distance", "time_features", 'direction', 'distance_to_center'])
    if memory:
        # NOTE: the temporary directory is not cleaned up by this method.
        memory = mkdtemp()

    # Feature engineering pipeline blocks.
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist, **DIST_ARGS),
        RobustScaler())
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    pipe_direction = make_pipeline(Direction(), RobustScaler())
    pipe_distance_to_center = make_pipeline(DistanceToCenter(), RobustScaler())

    # All available blocks as (name, transformer, input columns).
    all_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('geohash', pipe_geohash, list(DIST_ARGS.values())),
        ('direction', pipe_direction, list(DIST_ARGS.values())),
        ('distance_to_center', pipe_distance_to_center,
         list(DIST_ARGS.values())),
    ]

    # Keep only the requested blocks. A new list is built on purpose:
    # calling remove() on a list while iterating over it skips the element
    # right after each removal, so unwanted blocks could slip through.
    feateng_blocks = [bloc for bloc in all_blocks if bloc[0] in feateng_steps]

    features_encoder = ColumnTransformer(
        feateng_blocks, n_jobs=None, remainder="drop")

    self.pipeline = Pipeline(
        steps=[
            ('features', features_encoder),
            ('rgs', self.get_estimator()),
        ],
        memory=memory)
def set_pipeline(self):
    """Assemble the preprocessing + estimator pipeline into ``self.pipeline``.

    Options are read from ``self.kwargs``:

    * ``distance_type`` (default ``"haversine"``): forwarded to
      ``DistanceTransformer``.
    * ``feateng`` (default: distance, time_features): names of the feature
      blocks to keep.

    The last pipeline step, named ``'rgs'``, is whatever
    ``self.get_estimator()`` returns.
    """
    dist = self.kwargs.get("distance_type", "haversine")
    feateng_steps = self.kwargs.get("feateng", ["distance", "time_features"])

    # Feature engineering pipeline blocks.
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist), StandardScaler())
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    # TODO: 'direction' and 'distance_to_center' pipelines are not defined
    # in this version; add them here and register them in the list below.

    # All available blocks as (name, transformer, input columns).
    all_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('geohash', pipe_geohash, list(DIST_ARGS.values())),
    ]

    # Keep only the requested blocks. A new list is built on purpose:
    # calling remove() on a list while iterating over it skips the element
    # right after each removal, so unwanted blocks could slip through.
    feateng_blocks = [bloc for bloc in all_blocks if bloc[0] in feateng_steps]

    features_encoder = ColumnTransformer(
        feateng_blocks, n_jobs=None, remainder="drop")

    self.pipeline = Pipeline(
        steps=[
            ('features', features_encoder),
            ('rgs', self.get_estimator()),
        ])
def set_pipeline(self):
    """Build ``self.pipeline``: a column-wise feature encoder, then the estimator.

    Three feature blocks are always used: ``'distance'``, ``'time_features'``
    and ``'geohash'``. The estimator step, named ``'rgs'``, comes from
    ``self.get_estimator()``.
    """
    # Encode the pickup timestamp, then one-hot the resulting features.
    datetime_branch = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'),
    )
    # Add a geohash and hash-encode it.
    geohash_branch = make_pipeline(AddGeohash(), ce.HashingEncoder())

    # Each entry is (name, transformer, input columns).
    transformers = [
        ('distance',
         DistanceTransformer(**DIST_ARGS),
         list(DIST_ARGS.values())),
        ('time_features', datetime_branch, ['pickup_datetime']),
        ('geohash', geohash_branch, list(DIST_ARGS.values())),
    ]
    encoder = ColumnTransformer(transformers)

    pipeline_steps = [
        ('features', encoder),
        ('rgs', self.get_estimator()),
    ]
    self.pipeline = Pipeline(steps=pipeline_steps)
def set_pipeline(self):
    """Assemble the preprocessing + estimator pipeline into ``self.pipeline``.

    Options are read from ``self.kwargs``:

    * ``pipeline_memory`` (default ``None``): any truthy value causes a fresh
      directory from ``mkdtemp()`` to be passed as ``memory`` to ``Pipeline``;
      the value itself is not used as a path.
    * ``distance_type`` (default ``'haversine'``): forwarded to
      ``DistanceTransformer``.
    * ``feateng`` (default: distance, time_features): names of the feature
      blocks to keep.

    When ``self.optimize`` is truthy, an ``'optimize_size'`` step is inserted
    just before the final ``'rgs'`` estimator step.
    """
    memory = self.kwargs.get('pipeline_memory', None)
    dist = self.kwargs.get('distance_type', 'haversine')
    feateng_steps = self.kwargs.get('feateng', ['distance', 'time_features'])
    if memory:
        # NOTE: the temporary directory is not cleaned up by this method.
        memory = mkdtemp()

    # Feature engineering pipeline blocks.
    # The geohash block is currently disabled (it was commented out of the
    # block list), so its pipeline is not built here.
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist, **DIST_ARGS),
        StandardScaler())
    pipe_direction = make_pipeline(Direction(), StandardScaler())
    pipe_distance_to_center = make_pipeline(
        DistanceToCenter(), StandardScaler())

    # All available blocks as (name, transformer, input columns).
    all_blocks = [
        ('distance', pipe_distance, list(DIST_ARGS.values())),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('direction', pipe_direction, list(DIST_ARGS.values())),
        ('distance_to_center', pipe_distance_to_center,
         list(DIST_ARGS.values())),
    ]

    # Keep only the requested blocks. A new list is built on purpose:
    # calling remove() on a list while iterating over it skips the element
    # right after each removal. With the default ``feateng`` that left
    # 'distance_to_center' in the encoder even though it was not requested.
    feateng_blocks = [bloc for bloc in all_blocks if bloc[0] in feateng_steps]

    features_encoder = ColumnTransformer(
        feateng_blocks, n_jobs=None, remainder="drop")

    self.pipeline = Pipeline(
        steps=[
            ('features', features_encoder),
            ('rgs', self.get_estimator()),
        ],
        memory=memory)

    if self.optimize:
        # Index -1 places the new step right before the final estimator.
        # A tuple is used so the entry matches the other (name, step) pairs.
        self.pipeline.steps.insert(
            -1, ('optimize_size', OptimizeSize(verbose=False)))
def set_pipeline(self):
    """Assemble the preprocessing + estimator pipeline into ``self.pipeline``.

    Options are read from ``self.kwargs``:

    * ``pipeline_memory`` (default ``None``): any truthy value causes a fresh
      directory from ``mkdtemp()`` to be passed as ``memory`` to ``Pipeline``;
      the value itself is not used as a path.
    * ``distance_type`` (default ``"euclidian"``): forwarded to
      ``DistanceTransformer``.
    * ``feateng`` (default: distance, time_features): names of the feature
      blocks to keep.

    The resulting pipeline runs the feature encoder, then a
    ``DataframeCleaner`` step, then the estimator returned by
    ``self.get_estimator()``.
    """
    memory = self.kwargs.get("pipeline_memory", None)
    dist = self.kwargs.get("distance_type", "euclidian")
    feateng_steps = self.kwargs.get("feateng", ["distance", "time_features"])
    if memory:
        # NOTE: the temporary directory is not cleaned up by this method.
        memory = mkdtemp()

    # Feature engineering pipeline blocks.
    # The geohash block is currently disabled (it was commented out of the
    # block list), so its pipeline is not built here.
    time_pipe = Pipeline([
        ("time_enc", TimeFeaturesEncoder("pickup_datetime")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])
    dist_pipe = Pipeline([
        ("dist_trans", DistanceTransformer(distance_type=dist, **DIST_ARGS)),
        ("stdscaler", StandardScaler()),
    ])
    center_pipe = Pipeline([
        ("distance_center", DistanceToCenter()),
        ("stdscaler", StandardScaler()),
    ])
    direction_pipe = Pipeline([
        ("direction_add", Direction()),
        ("stdscaler", StandardScaler()),
    ])

    # All available blocks as (name, transformer, input columns).
    all_blocks = [
        ("distance", dist_pipe, list(DIST_ARGS.values())),
        ("time_features", time_pipe, ["pickup_datetime"]),
        ("direction", direction_pipe, list(DIST_ARGS.values())),
        ("distance_to_center", center_pipe, list(DIST_ARGS.values())),
    ]

    # Keep only the requested blocks. A new list is built on purpose:
    # calling remove() on a list while iterating over it skips the element
    # right after each removal. With the default ``feateng`` that left
    # "distance_to_center" in the encoder even though it was not requested.
    feateng_blocks = [bloc for bloc in all_blocks if bloc[0] in feateng_steps]

    features_encoder = ColumnTransformer(
        feateng_blocks, n_jobs=None, remainder="drop")

    self.pipeline = Pipeline(
        steps=[
            ("features", features_encoder),
            ("df_clener", DataframeCleaner(verbose=False)),
            ("rgs", self.get_estimator()),
        ],
        memory=memory,
    )
def set_pipeline(self):
    """Build ``self.pipeline``: a column-wise feature encoder, then the estimator.

    Three feature blocks are always used: ``'distance'``, ``'time_features'``
    and ``'geohash'``. The coordinate column names are defined locally and
    serve both as ``DistanceTransformer`` keyword arguments and as the input
    columns of the distance and geohash blocks. The estimator step, named
    ``'rgs'``, comes from ``self.get_estimator()``.
    """
    # Encode the pickup timestamp, then one-hot the resulting features.
    datetime_branch = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'),
    )
    # Add a geohash and hash-encode it.
    geohash_branch = make_pipeline(AddGeohash(), ce.HashingEncoder())

    # Keyword argument name -> dataframe column holding that coordinate.
    coord_kwargs = {
        "start_lat": "pickup_latitude",
        "start_lon": "pickup_longitude",
        "end_lat": "dropoff_latitude",
        "end_lon": "dropoff_longitude",
    }

    # Each entry is (name, transformer, input columns).
    transformers = [
        ('distance',
         DistanceTransformer(**coord_kwargs),
         list(coord_kwargs.values())),
        ('time_features', datetime_branch, ['pickup_datetime']),
        ('geohash', geohash_branch, list(coord_kwargs.values())),
    ]
    encoder = ColumnTransformer(transformers)

    pipeline_steps = [
        ('features', encoder),
        ('rgs', self.get_estimator()),
    ]
    self.pipeline = Pipeline(steps=pipeline_steps)