    def setup_method(self, method):
        # super().setup_method(method)
        self.model = XGBoost(config={
            'n_estimators': 5,
            'max_depth': 2,
            'tree_method': 'hist'
        })
        feature_cols = ["f", "f2"]
        target_col = "t"
        # Small random datasets: 20 training rows and 5 validation rows.
        # Note: randint needs an explicit size; a bare np.random.randint(20)
        # returns a single scalar that pandas would broadcast to every row.
        train_df = pd.DataFrame({
            "f": np.random.randn(20),
            "f2": np.random.randn(20),
            "t": np.random.randint(20, size=20)
        })
        val_df = pd.DataFrame({
            "f": np.random.randn(5),
            "f2": np.random.randn(5),
            "t": np.random.randint(20, size=5)
        })

        ft = IdentityTransformer(feature_cols=feature_cols,
                                 target_col=target_col)

        self.x, self.y = ft.transform(train_df)
        self.val_x, self.val_y = ft.transform(val_df)
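
For reference, the transformer used above does no feature engineering. A minimal sketch of what its transform step amounts to (a hypothetical re-implementation for illustration, not the actual zoo.zouwu class):

import pandas as pd

class IdentityTransformerSketch:
    """Hypothetical stand-in for IdentityTransformer, illustration only."""

    def __init__(self, feature_cols=None, target_col=None):
        self.feature_cols = feature_cols
        self.target_col = target_col

    def transform(self, df):
        # Pass the selected columns through unchanged and return
        # (features, target) as ndarrays, matching ft.transform(df) above.
        x = df[self.feature_cols].values
        y = df[[self.target_col]].values
        return x, y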
Example #2
def load_xgboost_pipeline(file, model_type="regressor"):
    from zoo.zouwu.feature.identity_transformer import IdentityTransformer
    # Rebuild an empty transformer/model pair, then restore their state
    # (and the saved config) from the zip archive.
    feature_transformers = IdentityTransformer()
    model = XGBoost(model_type=model_type)

    all_config = restore_zip(file, feature_transformers, model)
    ts_pipeline = TimeSequencePipeline(
        feature_transformers=feature_transformers,
        model=model,
        config=all_config)
    print("Restored pipeline from", file)
    return ts_pipeline
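
A restore-and-predict round trip might then look as follows; the file path is illustrative, and the predict(df) call is an assumption about the TimeSequencePipeline API:

# Hypothetical usage of the loader above.
pipeline = load_xgboost_pipeline("/tmp/xgb_pipeline.zip", model_type="regressor")
# result_df = pipeline.predict(test_df)  # assuming a predict(df)-style API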
Example #3
    def compile(self,
                data,
                model_create_func,
                recipe,
                search_space=None,
                search_alg=None,
                search_alg_params=None,
                scheduler=None,
                scheduler_params=None,
                feature_transformers=None,
                mc=False,
                metric="mse"):
        """
        Do necessary preparations for the engine
        :param input_df:
        :param search_space:
        :param num_samples:
        :param stop:
        :param search_algorithm:
        :param search_algorithm_params:
        :param fixed_params:
        :param feature_transformers:
        :param model:
        :param validation_df:
        :param metric:
        :return:
        """

        # data mode detection
        assert isinstance(data, dict), \
            "ERROR: Argument 'data' should be a dictionary."
        data_mode = None  # data_mode can only be 'dataframe' or 'ndarray'
        data_schema = set(data.keys())
        if "df" in data_schema:
            data_mode = 'dataframe'
        if {"x", "y"}.issubset(data_schema):
            data_mode = 'ndarray'
        assert data_mode in ['dataframe', 'ndarray'], \
            "ERROR: Argument 'data' should fit either the dataframe schema " \
            "(include 'df' in keys) or the ndarray schema " \
            "(include 'x' and 'y' in keys)."

        # data extract
        if data_mode == 'dataframe':
            input_df = data['df']
            feature_cols = data.get("feature_cols", None)
            target_col = data.get("target_col", None)
            validation_df = data.get("val_df", None)
        else:
            if data["x"].ndim == 1:
                data["x"] = data["x"].reshape(-1, 1)
            if data["y"].ndim == 1:
                data["y"] = data["y"].reshape(-1, 1)
            if "val_x" in data.keys() and data["val_x"].ndim == 1:
                data["val_x"] = data["val_x"].reshape(-1, 1)
            if "val_y" in data.keys() and data["val_y"].ndim == 1:
                data["val_y"] = data["val_y"].reshape(-1, 1)

            input_data = {"x": data["x"], "y": data["y"]}
            if 'val_x' in data.keys():
                validation_data = {"x": data["val_x"], "y": data["val_y"]}
            else:
                validation_data = None

        # prepare parameters for search engine
        runtime_params = recipe.runtime_params()
        self.num_samples = runtime_params['num_samples']
        stop = dict(runtime_params)
        del stop['num_samples']
        self.stop_criteria = stop
        if search_space is None:
            search_space = recipe.search_space(all_available_features=None)
        self._search_alg = RayTuneSearchEngine._set_search_alg(
            search_alg, search_alg_params, recipe, search_space)
        self._scheduler = RayTuneSearchEngine._set_scheduler(
            scheduler, scheduler_params)
        self.search_space = self._prepare_tune_config(search_space)

        if feature_transformers is None and data_mode == 'dataframe':
            feature_transformers = IdentityTransformer(feature_cols,
                                                       target_col)

        if data_mode == 'dataframe':
            self.train_func = self._prepare_train_func(
                input_data=input_df,
                model_create_func=model_create_func,
                feature_transformers=feature_transformers,
                validation_data=validation_df,
                metric=metric,
                mc=mc,
                remote_dir=self.remote_dir,
                numpy_format=False)
        else:
            self.train_func = self._prepare_train_func(
                input_data=input_data,
                model_create_func=model_create_func,
                feature_transformers=None,
                validation_data=validation_data,
                metric=metric,
                mc=mc,
                remote_dir=self.remote_dir,
                numpy_format=True)
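
To make the accepted schemas concrete, here is a hedged sketch of the two data dictionaries that compile() recognizes; the engine and call site are hypothetical:

import numpy as np
import pandas as pd

# Dataframe mode: "df" is required, the other keys are optional.
df_data = {
    "df": pd.DataFrame({"f": np.random.randn(20), "t": np.random.randn(20)}),
    "feature_cols": ["f"],
    "target_col": "t",
    "val_df": pd.DataFrame({"f": np.random.randn(5), "t": np.random.randn(5)}),
}

# Ndarray mode: "x" and "y" are required; 1-D arrays are reshaped
# to (-1, 1) by compile() itself, as shown above.
nd_data = {
    "x": np.random.randn(20),
    "y": np.random.randn(20),
}

# engine.compile(data=df_data, model_create_func=..., recipe=...)  # hypothetical call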

Example #4

    def create_feature_transformer(self):
        ft = IdentityTransformer(self.feature_cols, self.target_col)
        return ft

Example #5
    def compile(self,
                data,
                model_create_func,
                recipe,
                search_space=None,
                search_alg=None,
                search_alg_params=None,
                scheduler=None,
                scheduler_params=None,
                feature_transformers=None,
                mc=False,
                metric="mse"):
        """
        Do necessary preparations for the engine
        :param data: data dictionary
               Pandas Dataframe API keys:
                   "df": dataframe for training
                   "val_df": (optional) dataframe for validation
                   "feature_cols": (optional) column name for extra features
                   "target_col": (optional) column name for target
               Numpy ndarray API keys:
                   "x": ndarray for training input
                   "y": ndarray for training output
                   "x_val": (optional) ndarray for validation input
                   "y_val": (optional) ndarray for validation output
               Note: For Pandas Dataframe API keys, if "feature_cols" or "target_col" is missing,
                     then feature_transformers is required.
        :param model_create_func: model creation function
        :param recipe: search recipe
        :param search_space: search space; when omitted, recipe.search_space() is used
        :param search_alg: str, one of "skopt", "bayesopt" and "sigopt"
        :param search_alg_params: extra parameters for the search algorithm
        :param scheduler: str, any scheduler supported by ray tune
        :param scheduler_params: parameters for the scheduler
        :param feature_transformers: feature transformer instance
        :param mc: whether to calculate uncertainty
        :param metric: metric name
        """

        # data mode detection
        assert isinstance(data, dict), 'ERROR: Argument \'data\' should be a dictionary.'
        data_mode = None  # data_mode can only be 'dataframe' or 'ndarray'
        data_schema = set(data.keys())
        if set(["df"]).issubset(data_schema):
            data_mode = 'dataframe'
        if set(["x", "y"]).issubset(data_schema):
            data_mode = 'ndarray'
        assert data_mode in ['dataframe', 'ndarray'],\
            'ERROR: Argument \'data\' should fit either \
                dataframe schema (include \'df\' in keys) or\
                     ndarray (include \'x\' and \'y\' in keys) schema.'

        # data extract
        if data_mode == 'dataframe':
            input_data = data['df']
            feature_cols = data.get("feature_cols", None)
            target_col = data.get("target_col", None)
            validation_data = data.get("val_df", None)
        else:
            input_data = {"x": data["x"], "y": data["y"]}
            if 'val_x' in data:
                validation_data = {"x": data["val_x"], "y": data["val_y"]}
            else:
                validation_data = None

        # metric and metric's mode
        self.metric = metric
        self.mode = Evaluator.get_metric_mode(metric)

        # prepare parameters for search engine
        runtime_params = recipe.runtime_params()
        self.num_samples = runtime_params['num_samples']
        stop = dict(runtime_params)
        del stop['num_samples']

        # temp operation for reward_metric
        redundant_stop_keys = stop.keys() - {"reward_metric", "training_iteration"}
        assert len(redundant_stop_keys) == 0, \
            f"{redundant_stop_keys} is not expected in stop criteria; " \
            'only "reward_metric" and "training_iteration" are expected.'

        if "reward_metric" in stop.keys():
            stop[self.metric] = -stop["reward_metric"] if \
                self.mode == "min" else stop["reward_metric"]
            del stop["reward_metric"]
        stop.setdefault("training_iteration", 1)

        self.stopper = TrialStopper(stop=stop, metric=self.metric, mode=self.mode)

        if search_space is None:
            search_space = recipe.search_space()
        self.search_space = search_space

        self._search_alg = RayTuneSearchEngine._set_search_alg(search_alg, search_alg_params,
                                                               recipe, self.metric, self.mode)
        self._scheduler = RayTuneSearchEngine._set_scheduler(scheduler, scheduler_params,
                                                             self.metric, self.mode)

        if feature_transformers is None and data_mode == 'dataframe':
            feature_transformers = IdentityTransformer(feature_cols, target_col)

        numpy_format = (data_mode == 'ndarray')

        self.train_func = self._prepare_train_func(input_data=input_data,
                                                   model_create_func=model_create_func,
                                                   feature_transformers=feature_transformers,
                                                   validation_data=validation_data,
                                                   metric=metric,
                                                   mc=mc,
                                                   remote_dir=self.remote_dir,
                                                   numpy_format=numpy_format
                                                   )
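
The reward_metric rewriting above flips the sign for metrics that are minimized. A worked example, assuming Evaluator.get_metric_mode("mse") returns "min":

# Suppose recipe.runtime_params() returns:
#     {"num_samples": 4, "reward_metric": 0.05}
# With metric="mse" (mode "min"), the stop criteria become:
#     {"mse": -0.05, "training_iteration": 1}
# so a trial stops once -mse reaches -0.05 (i.e. mse falls below 0.05)
# or after one training iteration, whichever comes first.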