Esempio n. 1
0
    def set_pipeline(self):
        """Build the feature-engineering + estimator pipeline into ``self.pipeline``.

        Options read from ``self.kwargs``:

        - ``pipeline_memory``: when truthy, a fresh temporary directory is
          created and passed as the ``memory`` argument of the ``Pipeline``.
        - ``distance_type``: forwarded to ``DistanceTransformer``
          (default ``"euclidian"``).
        - ``feateng``: names of the feature-engineering blocks to keep; blocks
          whose name is not listed are dropped.
        """
        memory = self.kwargs.get("pipeline_memory", None)
        dist = self.kwargs.get("distance_type", "euclidian")
        feateng_steps = self.kwargs.get("feateng", ["distance", "time_features", 'direction', 'distance_to_center'])
        if memory:
            memory = mkdtemp()

        # Define feature engineering pipeline blocks here
        pipe_time_features = make_pipeline(TimeFeaturesEncoder(time_column='pickup_datetime'),
                                           OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(DistanceTransformer(distance_type=dist, **DIST_ARGS), RobustScaler())
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
        pipe_direction = make_pipeline(Direction(), RobustScaler())
        pipe_distance_to_center = make_pipeline(DistanceToCenter(), RobustScaler())

        # Define all available feature engineering blocks
        all_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('geohash', pipe_geohash, list(DIST_ARGS.values())),
            ('direction', pipe_direction, list(DIST_ARGS.values())),
            ('distance_to_center', pipe_distance_to_center, list(DIST_ARGS.values())),
        ]
        # Keep only the requested blocks. A new list is built on purpose:
        # calling .remove() on a list while iterating it skips the element
        # that follows each removal, so unwanted blocks could survive.
        feateng_blocks = [bloc for bloc in all_blocks if bloc[0] in feateng_steps]

        features_encoder = ColumnTransformer(feateng_blocks, n_jobs=None, remainder="drop")

        self.pipeline = Pipeline(steps=[
            ('features', features_encoder),
            ('rgs', self.get_estimator())], memory=memory)
Esempio n. 2
0
    def set_pipeline(self):
        """Build the feature-engineering + estimator pipeline into ``self.pipeline``.

        Options read from ``self.kwargs``:

        - ``distance_type``: forwarded to ``DistanceTransformer``
          (default ``"haversine"``).
        - ``feateng``: names of the feature-engineering blocks to keep
          (default ``["distance", "time_features"]``); blocks whose name is
          not listed are dropped.
        """
        dist = self.kwargs.get("distance_type", "haversine")
        feateng_steps = self.kwargs.get("feateng",
                                        ["distance", "time_features"])

        # Define feature engineering pipeline blocks here
        pipe_time_features = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(DistanceTransformer(distance_type=dist),
                                      StandardScaler())
        pipe_geohash = make_pipeline(AddGeohash(), ce.HashingEncoder())
        # TODO: add 'direction' and 'distance_to_center' pipelines here and
        # register them in the list of blocks below.

        # Define all available feature engineering blocks
        all_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('geohash', pipe_geohash, list(DIST_ARGS.values())),
        ]
        # Keep only the requested blocks. A new list is built on purpose:
        # calling .remove() on a list while iterating it skips the element
        # that follows each removal, so unwanted blocks could survive.
        feateng_blocks = [bloc for bloc in all_blocks
                          if bloc[0] in feateng_steps]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        self.pipeline = Pipeline(steps=[('features', features_encoder),
                                        ('rgs', self.get_estimator())], )
Esempio n. 3
0
    def set_pipeline(self):
        """Assemble ``self.pipeline``: a column-wise feature encoder followed by the estimator.

        The encoder combines three blocks: a ``DistanceTransformer`` and a
        geohash pipeline, both applied to the columns listed in ``DIST_ARGS``,
        and a one-hot encoded time-features pipeline applied to
        ``pickup_datetime``.
        """
        # Sub-pipelines used as blocks of the column transformer.
        time_block = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'),
        )
        geohash_block = make_pipeline(AddGeohash(), ce.HashingEncoder())

        # (name, transformer, input columns) triples for the encoder.
        transformers = [
            ('distance', DistanceTransformer(**DIST_ARGS), list(DIST_ARGS.values())),
            ('time_features', time_block, ['pickup_datetime']),
            ('geohash', geohash_block, list(DIST_ARGS.values())),
        ]
        encoder = ColumnTransformer(transformers)

        regressor = self.get_estimator()
        self.pipeline = Pipeline(steps=[('features', encoder), ('rgs', regressor)])
Esempio n. 4
0
    def set_pipeline(self):
        """Build the feature-engineering + estimator pipeline into ``self.pipeline``.

        Options read from ``self.kwargs``:

        - ``pipeline_memory``: when truthy, a fresh temporary directory is
          created and passed as the ``memory`` argument of the ``Pipeline``.
        - ``distance_type``: forwarded to ``DistanceTransformer``
          (default ``'haversine'``).
        - ``feateng``: names of the feature-engineering blocks to keep
          (default ``['distance', 'time_features']``); blocks whose name is
          not listed are dropped.

        When ``self.optimize`` is truthy, an ``OptimizeSize`` step is inserted
        just before the final estimator.
        """
        memory = self.kwargs.get('pipeline_memory', None)
        dist = self.kwargs.get('distance_type', 'haversine')
        feateng_steps = self.kwargs.get('feateng',
                                        ['distance', 'time_features'])

        if memory:
            memory = mkdtemp()

        # Define feature engineering pipeline blocks here
        pipe_time_features = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        pipe_distance = make_pipeline(
            DistanceTransformer(distance_type=dist, **DIST_ARGS),
            StandardScaler())
        pipe_direction = make_pipeline(Direction(), StandardScaler())
        pipe_distance_to_center = make_pipeline(DistanceToCenter(),
                                                StandardScaler())

        # All available blocks.
        # NOTE: the 'geohash' block (AddGeohash + ce.HashingEncoder) is
        # currently disabled and therefore not built here.
        all_blocks = [
            ('distance', pipe_distance, list(DIST_ARGS.values())),
            ('time_features', pipe_time_features, ['pickup_datetime']),
            ('direction', pipe_direction, list(DIST_ARGS.values())),
            ('distance_to_center', pipe_distance_to_center,
             list(DIST_ARGS.values())),
        ]

        # Keep only the requested blocks. A new list is built on purpose:
        # calling .remove() on a list while iterating it skips the element
        # that follows each removal (with the default ``feateng`` this left
        # 'distance_to_center' in the pipeline).
        feateng_blocks = [bloc for bloc in all_blocks
                          if bloc[0] in feateng_steps]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")

        self.pipeline = Pipeline(steps=[('features', features_encoder),
                                        ('rgs', self.get_estimator())],
                                 memory=memory)

        if self.optimize:
            # Insert before the last step (the estimator); steps are
            # (name, estimator) tuples.
            self.pipeline.steps.insert(
                -1,
                ('optimize_size', OptimizeSize(verbose=False)))
Esempio n. 5
0
    def set_pipeline(self):
        """Assemble ``self.pipeline``: a column-wise feature encoder followed by the model.

        The encoder has three blocks: a distance pipeline (using
        ``self.distance_type``) and a ``DistanceTojfk`` pipeline, both applied
        to the columns listed in ``DIST_ARGS``, plus a one-hot encoded
        time-features pipeline applied to ``pickup_datetime``.
        """
        # Sub-pipelines used as blocks of the column transformer.
        time_block = make_pipeline(
            TimeFeaturesEncoder(time_column="pickup_datetime"),
            OneHotEncoder(),
        )
        distance_block = make_pipeline(
            DistanceTransformer(distance_type=self.distance_type, **DIST_ARGS),
        )
        jfk_block = make_pipeline(DistanceTojfk())

        # (name, transformer, input columns) triples for the encoder.
        transformers = [
            ("distance_feature", distance_block, list(DIST_ARGS.values())),
            ("time_feature", time_block, ["pickup_datetime"]),
            ("distance_jfk", jfk_block, list(DIST_ARGS.values())),
        ]
        encoder = ColumnTransformer(transformers)

        model = self.get_estimator()
        self.pipeline = Pipeline(
            steps=[("features_encoder", encoder), ("model", model)],
        )
Esempio n. 6
0
    def set_pipeline(self):
        """Build the feature-engineering + estimator pipeline into ``self.pipeline``.

        Options read from ``self.kwargs``:

        - ``pipeline_memory``: when truthy, a fresh temporary directory is
          created and passed as the ``memory`` argument of the ``Pipeline``.
        - ``distance_type``: forwarded to ``DistanceTransformer``
          (default ``"euclidian"``).
        - ``feateng``: names of the feature-engineering blocks to keep
          (default ``["distance", "time_features"]``); blocks whose name is
          not listed are dropped.

        The final pipeline is: features encoder -> ``DataframeCleaner`` ->
        estimator returned by ``self.get_estimator()``.
        """
        memory = self.kwargs.get("pipeline_memory", None)
        dist = self.kwargs.get("distance_type", "euclidian")
        feateng_steps = self.kwargs.get("feateng",
                                        ["distance", "time_features"])
        if memory:
            memory = mkdtemp()

        # Individual feature-engineering pipelines.
        # NOTE: the "geohash" block (AddGeohash + ce.HashingEncoder) is
        # currently disabled and therefore not built here.
        time_pipe = Pipeline([
            ("time_enc", TimeFeaturesEncoder("pickup_datetime")),
            ("ohe", OneHotEncoder(handle_unknown="ignore")),
        ])
        dist_pipe = Pipeline([
            ("dist_trans", DistanceTransformer(distance_type=dist,
                                               **DIST_ARGS)),
            ("stdscaler", StandardScaler()),
        ])
        center_pipe = Pipeline([("distance_center", DistanceToCenter()),
                                ("stdscaler", StandardScaler())])
        direction_pipe = Pipeline([("direction_add", Direction()),
                                   ("stdscaler", StandardScaler())])

        # All available blocks as (name, transformer, input columns).
        all_blocks = [
            ("distance", dist_pipe, list(DIST_ARGS.values())),
            ("time_features", time_pipe, ["pickup_datetime"]),
            ("direction", direction_pipe, list(DIST_ARGS.values())),
            ("distance_to_center", center_pipe, list(DIST_ARGS.values())),
        ]
        # Keep only the requested blocks. A new list is built on purpose:
        # calling .remove() on a list while iterating it skips the element
        # that follows each removal (with the default ``feateng`` this left
        # "distance_to_center" in the pipeline).
        feateng_blocks = [bloc for bloc in all_blocks
                          if bloc[0] in feateng_steps]

        features_encoder = ColumnTransformer(feateng_blocks,
                                             n_jobs=None,
                                             remainder="drop")
        self.pipeline = Pipeline(
            steps=[("features", features_encoder),
                   ("df_clener", DataframeCleaner(verbose=False)),
                   ("rgs", self.get_estimator())],
            memory=memory,
        )