def setup_method(self, method):
    """Build a tiny XGBoost model and random train/validation numpy pairs.

    Creates an IdentityTransformer over two feature columns and one target
    column, then materializes (self.x, self.y) and (self.val_x, self.val_y)
    for the tests in this class.
    """
    # super().setup_method(method)
    self.model = XGBoost(config={
        'n_estimators': 5,
        'max_depth': 2,
        'tree_method': 'hist'
    })
    feature_cols = ["f", "f2"]
    target_col = "t"
    # NOTE: pass size= so the target is a per-row random vector.  A bare
    # np.random.randint(20) returns one scalar that pandas broadcasts to
    # every row, making the target column constant and the fit degenerate.
    train_df = pd.DataFrame({"f": np.random.randn(20),
                             "f2": np.random.randn(20),
                             "t": np.random.randint(20, size=20)})
    val_df = pd.DataFrame({"f": np.random.randn(5),
                           "f2": np.random.randn(5),
                           "t": np.random.randint(5, size=5)})
    ft = IdentityTransformer(feature_cols=feature_cols, target_col=target_col)
    self.x, self.y = ft.transform(train_df)
    self.val_x, self.val_y = ft.transform(val_df)
def load_xgboost_pipeline(file, model_type="regressor"):
    """Restore a saved XGBoost time-sequence pipeline from a zip file.

    :param file: path to the saved pipeline archive
    :param model_type: XGBoost model type, "regressor" by default
    :return: a TimeSequencePipeline rebuilt from the archive
    """
    from zoo.zouwu.feature.identity_transformer import IdentityTransformer
    ft = IdentityTransformer()
    xgb_model = XGBoost(model_type=model_type)
    # restore_zip populates ft and xgb_model in place and returns the config
    restored_config = restore_zip(file, ft, xgb_model)
    pipeline = TimeSequencePipeline(feature_transformers=ft,
                                    model=xgb_model,
                                    config=restored_config)
    print("Restore pipeline from", file)
    return pipeline
def compile(self, data, model_create_func, recipe, search_space=None, search_alg=None, search_alg_params=None, scheduler=None, scheduler_params=None, feature_transformers=None, mc=False, metric="mse"): """ Do necessary preparations for the engine :param input_df: :param search_space: :param num_samples: :param stop: :param search_algorithm: :param search_algorithm_params: :param fixed_params: :param feature_transformers: :param model: :param validation_df: :param metric: :return: """ # data mode detection assert isinstance( data, dict), 'ERROR: Argument \'data\' should be a dictionary.' data_mode = None # data_mode can only be 'dataframe' or 'ndarray' data_schema = set(data.keys()) if set(["df"]).issubset(data_schema): data_mode = 'dataframe' if set(["x", "y"]).issubset(data_schema): data_mode = 'ndarray' assert data_mode in ['dataframe', 'ndarray'],\ 'ERROR: Argument \'data\' should fit either \ dataframe schema (include \'df\' in keys) or\ ndarray (include \'x\' and \'y\' in keys) schema.' 
# data extract if data_mode == 'dataframe': input_df = data['df'] feature_cols = data.get("feature_cols", None) target_col = data.get("target_col", None) validation_df = data.get("val_df", None) else: if data["x"].ndim == 1: data["x"] = data["x"].reshape(-1, 1) if data["y"].ndim == 1: data["y"] = data["y"].reshape(-1, 1) if "val_x" in data.keys() and data["val_x"].ndim == 1: data["val_x"] = data["val_x"].reshape(-1, 1) if "val_y" in data.keys() and data["val_y"].ndim == 1: data["val_y"] = data["val_y"].reshape(-1, 1) input_data = {"x": data["x"], "y": data["y"]} if 'val_x' in data.keys(): validation_data = {"x": data["val_x"], "y": data["val_y"]} else: validation_data = None # prepare parameters for search engine runtime_params = recipe.runtime_params() self.num_samples = runtime_params['num_samples'] stop = dict(runtime_params) del stop['num_samples'] self.stop_criteria = stop if search_space is None: search_space = recipe.search_space(all_available_features=None) self._search_alg = RayTuneSearchEngine._set_search_alg( search_alg, search_alg_params, recipe, search_space) self._scheduler = RayTuneSearchEngine._set_scheduler( scheduler, scheduler_params) self.search_space = self._prepare_tune_config(search_space) if feature_transformers is None and data_mode == 'dataframe': feature_transformers = IdentityTransformer(feature_cols, target_col) if data_mode == 'dataframe': self.train_func = self._prepare_train_func( input_data=input_df, model_create_func=model_create_func, feature_transformers=feature_transformers, validation_data=validation_df, metric=metric, mc=mc, remote_dir=self.remote_dir, numpy_format=False) else: self.train_func = self._prepare_train_func( input_data=input_data, model_create_func=model_create_func, feature_transformers=None, validation_data=validation_data, metric=metric, mc=mc, remote_dir=self.remote_dir, numpy_format=True)
def create_feature_transformer(self):
    """Return an IdentityTransformer built from this object's configured
    feature and target columns."""
    return IdentityTransformer(self.feature_cols, self.target_col)
def compile(self, data, model_create_func, recipe, search_space=None, search_alg=None, search_alg_params=None, scheduler=None, scheduler_params=None, feature_transformers=None, mc=False, metric="mse"): """ Do necessary preparations for the engine :param data: data dictionary Pandas Dataframe API keys: "df": dataframe for training "val_df": (optional) dataframe for validation "feature_cols": (optional) column name for extra features "target_col": (optional) column name for target Numpy ndarray API keys: "x": ndarray for training input "y": ndarray for training output "x_val": (optional) ndarray for validation input "y_val": (optional) ndarray for validation output Note: For Pandas Dataframe API keys, if "feature_cols" or "target_col" is missing, then feature_transformers is required. :param model_create_func: model creation function :param recipe: search recipe :param search_space: search_space, required if recipe is not provided :param search_alg: str, one of "skopt", "bayesopt" and "sigopt" :param search_alg_params: extra parameters for searcher algorithm :param scheduler: str, all supported scheduler provided by ray tune :param scheduler_params: parameters for scheduler :param feature_transformers: feature transformer instance :param mc: if calculate uncertainty :param metric: metric name """ # data mode detection assert isinstance(data, dict), 'ERROR: Argument \'data\' should be a dictionary.' data_mode = None # data_mode can only be 'dataframe' or 'ndarray' data_schema = set(data.keys()) if set(["df"]).issubset(data_schema): data_mode = 'dataframe' if set(["x", "y"]).issubset(data_schema): data_mode = 'ndarray' assert data_mode in ['dataframe', 'ndarray'],\ 'ERROR: Argument \'data\' should fit either \ dataframe schema (include \'df\' in keys) or\ ndarray (include \'x\' and \'y\' in keys) schema.' 
# data extract if data_mode == 'dataframe': input_data = data['df'] feature_cols = data.get("feature_cols", None) target_col = data.get("target_col", None) validation_data = data.get("val_df", None) else: input_data = {"x": data["x"], "y": data["y"]} if 'val_x' in data.keys(): validation_data = {"x": data["val_x"], "y": data["val_y"]} else: validation_data = None # metric and metric's mode self.metric = metric self.mode = Evaluator.get_metric_mode(metric) # prepare parameters for search engine runtime_params = recipe.runtime_params() self.num_samples = runtime_params['num_samples'] stop = dict(runtime_params) del stop['num_samples'] # temp operation for reward_metric redundant_stop_keys = stop.keys() - {"reward_metric", "training_iteration"} assert len(redundant_stop_keys) == 0, \ f"{redundant_stop_keys} is not expected in stop criteria, \ only \"reward_metric\", \"training_iteration\" are expected." if "reward_metric" in stop.keys(): stop[self.metric] = -stop["reward_metric"] if \ self.mode == "min" else stop["reward_metric"] del stop["reward_metric"] stop.setdefault("training_iteration", 1) self.stopper = TrialStopper(stop=stop, metric=self.metric, mode=self.mode) if search_space is None: search_space = recipe.search_space() self.search_space = search_space self._search_alg = RayTuneSearchEngine._set_search_alg(search_alg, search_alg_params, recipe, self.metric, self.mode) self._scheduler = RayTuneSearchEngine._set_scheduler(scheduler, scheduler_params, self.metric, self.mode) if feature_transformers is None and data_mode == 'dataframe': feature_transformers = IdentityTransformer(feature_cols, target_col) numpy_format = True if data_mode == 'ndarray' else False self.train_func = self._prepare_train_func(input_data=input_data, model_create_func=model_create_func, feature_transformers=feature_transformers, validation_data=validation_data, metric=metric, mc=mc, remote_dir=self.remote_dir, numpy_format=numpy_format )