def set_pipeline(self):
    """Build the feature-engineering + estimator pipeline into ``self.pipeline``.

    Options read from ``self.kwargs``:

    * ``pipeline_memory`` -- when truthy, a fresh temporary directory
      (``mkdtemp()``) is passed as the ``memory`` argument of the
      ``Pipeline``; otherwise the value is passed through unchanged.
    * ``distance_type`` -- forwarded to ``DistanceTransformer``
      (default ``"euclidian"``).
    * ``feateng`` -- names of the feature-engineering blocks to keep
      (default: distance, time_features, direction, distance_to_center).

    When ``self.optimize`` is truthy, an ``OptimizeSize`` step is inserted
    right before the final estimator step.
    """
    memory = self.kwargs.get("pipeline_memory", None)
    dist = self.kwargs.get("distance_type", "euclidian")
    feateng_steps = self.kwargs.get(
        "feateng",
        ["distance", "time_features", 'direction', 'distance_to_center'])
    if memory:
        # NOTE(review): the temporary directory is never removed here;
        # cleanup, if any, has to happen elsewhere.
        memory = mkdtemp()

    # Define feature engineering pipeline blocks here
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist, **DIST_ARGS),
        RobustScaler())
    pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
    pipe_direction = make_pipeline(Direction(), RobustScaler())
    pipe_distance_to_center = make_pipeline(DistanceToCenter(), RobustScaler())

    # Define default feature engineering blocks
    geo_columns = list(DIST_ARGS.values())
    all_blocks = [
        ('distance', pipe_distance, geo_columns),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('geohash', pipe_geohash, geo_columns),
        ('direction', pipe_direction, geo_columns),
        ('distance_to_center', pipe_distance_to_center, geo_columns),
    ]

    # Keep only the requested blocks. Build a new list instead of calling
    # ``remove`` while iterating: removing during iteration skips the element
    # that follows each removed one, leaving unwanted blocks in place.
    feateng_blocks = [
        bloc for bloc in all_blocks if bloc[0] in feateng_steps
    ]

    features_encoder = ColumnTransformer(feateng_blocks,
                                         n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(steps=[('features', features_encoder),
                                    ('rgs', self.get_estimator())],
                             memory=memory)

    if self.optimize:
        # Insert before the last step so the estimator stays final.
        self.pipeline.steps.insert(
            -1, ('optimize_size', OptimizeSize(verbose=False)))
def set_pipeline(self):
    """Build the feature-engineering + estimator pipeline into ``self.pipeline``.

    Options read from ``self.kwargs``:

    * ``pipeline_memory`` -- when truthy, a fresh temporary directory
      (``mkdtemp()``) is passed as the ``memory`` argument of the
      ``Pipeline``; otherwise the value is passed through unchanged.
    * ``distance_type`` -- forwarded to ``DistanceTransformer``
      (default ``'haversine'``).
    * ``feateng`` -- names of the feature-engineering blocks to keep
      (default: distance, time_features).

    When ``self.optimize`` is truthy, an ``OptimizeSize`` step is inserted
    right before the final estimator step.
    """
    memory = self.kwargs.get('pipeline_memory', None)
    dist = self.kwargs.get('distance_type', 'haversine')
    feateng_steps = self.kwargs.get('feateng', ['distance', 'time_features'])
    if memory:
        # NOTE(review): the temporary directory is never removed here;
        # cleanup, if any, has to happen elsewhere.
        memory = mkdtemp()

    # Define feature engineering pipeline blocks here
    pipe_time_features = make_pipeline(
        TimeFeaturesEncoder(time_column='pickup_datetime'),
        OneHotEncoder(handle_unknown='ignore'))
    pipe_distance = make_pipeline(
        DistanceTransformer(distance_type=dist, **DIST_ARGS),
        StandardScaler())
    pipe_direction = make_pipeline(Direction(), StandardScaler())
    pipe_distance_to_center = make_pipeline(DistanceToCenter(),
                                            StandardScaler())

    # Combine pipes.
    # NOTE: a 'geohash' block (AddGeohash + ce.HashingEncoder over the
    # DIST_ARGS columns) was disabled in this method and is not selectable.
    geo_columns = list(DIST_ARGS.values())
    all_blocks = [
        ('distance', pipe_distance, geo_columns),
        ('time_features', pipe_time_features, ['pickup_datetime']),
        ('direction', pipe_direction, geo_columns),
        ('distance_to_center', pipe_distance_to_center, geo_columns),
    ]

    # Keep only the requested blocks. Build a new list instead of calling
    # ``remove`` while iterating: with the default steps, removing
    # 'direction' mid-loop made the loop end before 'distance_to_center'
    # was examined, so that block was kept by mistake.
    feateng_blocks = [
        bloc for bloc in all_blocks if bloc[0] in feateng_steps
    ]

    features_encoder = ColumnTransformer(feateng_blocks,
                                         n_jobs=None,
                                         remainder="drop")

    self.pipeline = Pipeline(steps=[('features', features_encoder),
                                    ('rgs', self.get_estimator())],
                             memory=memory)

    if self.optimize:
        # Insert before the last step so the estimator stays final.
        self.pipeline.steps.insert(
            -1, ('optimize_size', OptimizeSize(verbose=False)))