Example #1
 def _make_pipeline(self, analysis, feature_transformers, model,
                    remote_dir):
     metric = self.metric
     mode = Evaluator.get_metric_mode(metric)
     best_config = analysis.get_best_config(metric=metric, mode=mode)
     best_logdir = analysis.get_best_logdir(metric=metric, mode=mode)
     print("best log dir is ", best_logdir)
     dataframe = analysis.dataframe(metric=metric, mode=mode)
     # print(dataframe)
     model_path = os.path.join(best_logdir, dataframe["checkpoint"].iloc[0])
     config = convert_bayes_configs(best_config).copy()
     self._print_config(config)
     if remote_dir is not None:
         all_config = restore_hdfs(model_path,
                                   remote_dir,
                                   feature_transformers,
                                   model)
     else:
         all_config = restore_zip(model_path,
                                  feature_transformers,
                                  model)
     return TimeSequencePipeline(name=self.name,
                                 feature_transformers=feature_transformers,
                                 model=model,
                                 config=all_config)
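For reference, the checkpoint path assembled above is simply the best trial's log directory joined with the checkpoint file name the trial reported. A minimal sketch with made-up values standing in for the Ray Tune analysis results:

    import os

    # Hypothetical values; in the code above they come from
    # analysis.get_best_logdir() and the "checkpoint" column of analysis.dataframe().
    best_logdir = "/tmp/ray_results/automl/train_func_0"
    checkpoint_name = "best.ckpt"

    model_path = os.path.join(best_logdir, checkpoint_name)
    print(model_path)  # /tmp/ray_results/automl/train_func_0/best.ckpt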
Example #2
        def train_func(config):
            # make a copy from global variables for trial to make changes
            global_ft = ray.get(ft_id)
            trial_ft = deepcopy(global_ft)
            if isinstance(model_create_func, ModelBuilder):
                trial_model = model_create_func.build(config)
            else:
                trial_model = model_create_func()

            imputer = None
            if "imputation" in config:
                if config["imputation"] == "LastFillImpute":
                    imputer = LastFillImpute()
                elif config["imputation"] == "FillZeroImpute":
                    imputer = FillZeroImpute()

            # handling input
            global_input_df = ray.get(input_df_id)
            trial_input_df = deepcopy(global_input_df)
            if imputer:
                trial_input_df = imputer.impute(trial_input_df)
            config = convert_bayes_configs(config).copy()
            # print("config is ", config)
            (x_train,
             y_train) = trial_ft.fit_transform(trial_input_df, **config)
            # trial_ft.fit(trial_input_df, **config)

            # handling validation data
            validation_data = None
            if is_val_df_valid:
                global_validation_df = ray.get(validation_df_id)
                trial_validation_df = deepcopy(global_validation_df)
                validation_data = trial_ft.transform(trial_validation_df)

            # no need to call build since it is called the first time fit_eval is called.
            # callbacks = [TuneCallback(tune_reporter)]
            # fit model
            best_reward_m = None
            # print("config:", config)
            for i in range(1, 101):
                result = trial_model.fit_eval(
                    x_train,
                    y_train,
                    validation_data=validation_data,
                    mc=mc,
                    metric=metric,
                    # verbose=1,
                    **config)
                reward_m = result if Evaluator.get_metric_mode(
                    metric) == "max" else -result
                ckpt_name = "best.ckpt"
                if best_reward_m is None or reward_m > best_reward_m:
                    best_reward_m = reward_m
                    save_zip(ckpt_name, trial_ft, trial_model, config)
                    if remote_dir is not None:
                        upload_ppl_hdfs(remote_dir, ckpt_name)

                tune.track.log(training_iteration=i,
                               reward_metric=reward_m,
                               checkpoint="best.ckpt")
Example #3
    def _detach_recipe(self, recipe):
        self.search_space = recipe.search_space()

        stop = recipe.runtime_params()
        self.metric_threshold = None
        if "reward_metric" in stop.keys():
            self.mode = Evaluator.get_metric_mode(self.metric)
            self.metric_threshold = -stop["reward_metric"] if \
                self.mode == "min" else stop["reward_metric"]
        self.epochs = stop["training_iteration"]
        self.num_samples = stop["num_samples"]
Example #4
 def _validate_metric_mode(metric, mode):
     from zoo.automl.common.metrics import Evaluator
     if not mode:
         try:
             mode = Evaluator.get_metric_mode(metric)
         except ValueError:
             pass
     if not mode:
         raise ValueError(f"We cannot infer metric mode with metric name of {metric}. "
                          f"Please specify the `metric_mode` parameter in AutoEstimator.fit().")
     if mode not in ["min", "max"]:
         raise ValueError("`mode` has to be one of ['min', 'max']")
     return mode
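The same validation pattern can be sketched without the zoo dependency; the lookup table below is hypothetical and only stands in for Evaluator.get_metric_mode:

    # Hypothetical name-to-mode table standing in for Evaluator.get_metric_mode.
    _METRIC_MODES = {"mse": "min", "mae": "min", "rmse": "min", "r2": "max"}

    def infer_metric_mode(metric, mode=None):
        # An explicit mode wins; otherwise try to infer it from the metric name.
        if not mode:
            mode = _METRIC_MODES.get(metric)
        if not mode:
            raise ValueError(f"We cannot infer metric mode with metric name of {metric}. "
                             f"Please specify the `metric_mode` parameter.")
        if mode not in ["min", "max"]:
            raise ValueError("`mode` has to be one of ['min', 'max']")
        return mode

    print(infer_metric_mode("mse"))        # min
    print(infer_metric_mode("r2", "max"))  # max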
Example #5
 def _train(self):
     # print("self.config in train is ", self.config)
     result = self.trial_model.fit_eval(self.x_train, self.y_train,
                                        validation_data=self.validation_data,
                                        # verbose=1,
                                        **self.config)
     self.reward_m = result if Evaluator.get_metric_mode(metric) == "max" else -result
     # if metric == "mean_squared_error":
     #     self.reward_m = (-1) * result
     #     # print("running iteration: ",i)
     # elif metric == "r_square":
     #     self.reward_m = result
     # else:
     #     raise ValueError("metric can only be \"mean_squared_error\" or \"r_square\"")
     return {"reward_metric": self.reward_m, "checkpoint": self.ckpt_name}
Example #6
 def _validate_metric_mode(metric, mode):
     if not mode:
         if callable(metric):
             raise ValueError("You must specify `metric_mode` for your metric function")
         try:
             from zoo.automl.common.metrics import Evaluator
             mode = Evaluator.get_metric_mode(metric)
         except ValueError:
             pass
         if not mode:
             raise ValueError(f"We cannot infer metric mode with metric name of {metric}. Please"
                              f" specify the `metric_mode` parameter in AutoEstimator.fit().")
     if mode not in ["min", "max"]:
         raise ValueError("`mode` has to be one of ['min', 'max']")
     return mode
        def train_func(config):
            train_data = ray.get(data_id)
            val_data = ray.get(validation_data_id)
            config = convert_bayes_configs(config).copy()
            if not isinstance(model_builder, ModelBuilder):
                raise ValueError(f"You must input a ModelBuilder instance for model_builder")
            trial_model = model_builder.build(config)

            # no need to call build since it is called the first time fit_eval is called.
            # callbacks = [TuneCallback(tune_reporter)]
            # fit model
            best_reward = None
            for i in range(1, 101):
                result = trial_model.fit_eval(data=train_data,
                                              validation_data=val_data,
                                              mc=mc,
                                              metric=metric,
                                              **config)
                reward = result
                checkpoint_filename = "best.ckpt"

                # Save best reward iteration
                mode = Evaluator.get_metric_mode(metric)
                if mode == "max":
                    has_best_reward = best_reward is None or reward > best_reward
                else:
                    has_best_reward = best_reward is None or reward < best_reward

                if has_best_reward:
                    best_reward = reward
                    trial_model.save(checkpoint_filename)
                    # Save to hdfs
                    if remote_dir is not None:
                        put_ckpt_hdfs(remote_dir, checkpoint_filename)

                report_dict = {"training_iteration": i,
                               metric: reward,
                               "checkpoint": checkpoint_filename,
                               "best_" + metric: best_reward}
                tune.report(**report_dict)
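The best-checkpoint bookkeeping above reduces to one comparison whose direction depends on the metric mode. A small self-contained sketch:

    def is_better(reward, best_reward, mode):
        # The first result always wins; afterwards compare in the metric's direction.
        if best_reward is None:
            return True
        return reward > best_reward if mode == "max" else reward < best_reward

    assert is_better(0.30, None, "min")      # first iteration always improves
    assert is_better(0.20, 0.30, "min")      # lower mse is better
    assert not is_better(0.40, 0.30, "min")
    assert is_better(0.95, 0.90, "max")      # higher r2 is better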
    def compile(self,
                data,
                model_builder,
                epochs=1,
                validation_data=None,
                metric="mse",
                metric_threshold=None,
                n_sampling=1,
                search_space=None,
                search_alg=None,
                search_alg_params=None,
                scheduler=None,
                scheduler_params=None,
                mc=False):
        """
        Do necessary preparations for the engine
        :param data: data for training
               Pandas Dataframe:
                   a Pandas dataframe for training
               Numpy ndarray:
                   a tuple in form of (x, y)
                        x: ndarray for training input
                        y: ndarray for training output
        :param model_builder: model creation function
        :param search_space: a dict describing the search space
        :param metric_threshold: a trial will be terminated when the metric threshold is met
        :param n_sampling: number of times to sample from the search space
        :param epochs: max number of epochs for training
        :param validation_data: data for validation
               Pandas Dataframe:
                   a Pandas dataframe for validation
               Numpy ndarray:
                   a tuple in form of (x, y)
                        x: ndarray for validation input
                        y: ndarray for validation output
        :param search_alg: str, any search algorithm supported by Ray Tune
               (i.e. "variant_generator", "random", "ax", "dragonfly", "skopt",
               "hyperopt", "bayesopt", "bohb", "nevergrad", "optuna", "zoopt" and
               "sigopt")
        :param search_alg_params: extra parameters for the search algorithm
        :param scheduler: str, any trial scheduler supported by Ray Tune
        :param scheduler_params: parameters for the scheduler
        :param mc: whether to calculate uncertainty
        :param metric: metric name
        """

        # metric and metric's mode
        self.metric = metric
        self.mode = Evaluator.get_metric_mode(metric)
        self.num_samples = n_sampling
        self.stopper = TrialStopper(metric_threshold=metric_threshold,
                                    epochs=epochs,
                                    metric=self.metric,
                                    mode=self.mode)

        self.search_space = search_space

        self._search_alg = RayTuneSearchEngine._set_search_alg(search_alg, search_alg_params,
                                                               self.metric, self.mode)
        self._scheduler = RayTuneSearchEngine._set_scheduler(scheduler, scheduler_params,
                                                             self.metric, self.mode)

        self.train_func = self._prepare_train_func(data=data,
                                                   model_builder=model_builder,
                                                   validation_data=validation_data,
                                                   metric=metric,
                                                   mc=mc,
                                                   remote_dir=self.remote_dir
                                                   )
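As described in the docstring, this compile variant accepts training (and validation) data either as a plain Pandas DataFrame or as an (x, y) tuple of ndarrays. A hedged illustration, with made-up column names and shapes:

    import numpy as np
    import pandas as pd

    # Form 1: a Pandas DataFrame for training (columns are illustrative).
    train_df = pd.DataFrame({"datetime": pd.date_range("2021-01-01", periods=8, freq="D"),
                             "value": np.arange(8.0)})

    # Form 2: an (x, y) tuple of ndarrays.
    x = np.random.rand(8, 3)  # training input
    y = np.random.rand(8, 1)  # training output
    train_tuple = (x, y)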
    def compile(self,
                data,
                model_create_func,
                recipe,
                search_space=None,
                search_alg=None,
                search_alg_params=None,
                scheduler=None,
                scheduler_params=None,
                feature_transformers=None,
                mc=False,
                metric="mse"):
        """
        Do necessary preparations for the engine
        :param data: data dictionary
               Pandas Dataframe API keys:
                   "df": dataframe for training
                   "val_df": (optional) dataframe for validation
                   "feature_cols": (optional) column name for extra features
                   "target_col": (optional) column name for target
               Numpy ndarray API keys:
                   "x": ndarray for training input
                   "y": ndarray for training output
                   "x_val": (optional) ndarray for validation input
                   "y_val": (optional) ndarray for validation output
               Note: For Pandas Dataframe API keys, if "feature_cols" or "target_col" is missing,
                     then feature_transformers is required.
        :param model_create_func: model creation function
        :param recipe: search recipe
        :param search_space: search_space, required if recipe is not provided
        :param search_alg: str, one of "skopt", "bayesopt" and "sigopt"
        :param search_alg_params: extra parameters for searcher algorithm
        :param scheduler: str, any trial scheduler supported by Ray Tune
        :param scheduler_params: parameters for the scheduler
        :param feature_transformers: feature transformer instance
        :param mc: whether to calculate uncertainty
        :param metric: metric name
        """

        # data mode detection
        assert isinstance(data, dict), 'ERROR: Argument \'data\' should be a dictionary.'
        data_mode = None  # data_mode can only be 'dataframe' or 'ndarray'
        data_schema = set(data.keys())
        if set(["df"]).issubset(data_schema):
            data_mode = 'dataframe'
        if set(["x", "y"]).issubset(data_schema):
            data_mode = 'ndarray'
        assert data_mode in ['dataframe', 'ndarray'], \
            "ERROR: Argument 'data' should fit either the dataframe schema " \
            "(include 'df' in keys) or the ndarray schema (include 'x' and 'y' in keys)."

        # data extract
        if data_mode == 'dataframe':
            input_data = data['df']
            feature_cols = data.get("feature_cols", None)
            target_col = data.get("target_col", None)
            validation_data = data.get("val_df", None)
        else:
            input_data = {"x": data["x"], "y": data["y"]}
            if 'val_x' in data.keys():
                validation_data = {"x": data["val_x"], "y": data["val_y"]}
            else:
                validation_data = None

        # metric and metric's mode
        self.metric = metric
        self.mode = Evaluator.get_metric_mode(metric)

        # prepare parameters for search engine
        runtime_params = recipe.runtime_params()
        self.num_samples = runtime_params['num_samples']
        stop = dict(runtime_params)
        del stop['num_samples']

        # temp operation for reward_metric
        redundant_stop_keys = stop.keys() - {"reward_metric", "training_iteration"}
        assert len(redundant_stop_keys) == 0, \
            f"{redundant_stop_keys} is not expected in stop criteria; " \
            "only \"reward_metric\" and \"training_iteration\" are expected."

        if "reward_metric" in stop.keys():
            stop[self.metric] = -stop["reward_metric"] if \
                self.mode == "min" else stop["reward_metric"]
            del stop["reward_metric"]
        stop.setdefault("training_iteration", 1)

        self.stopper = TrialStopper(stop=stop, metric=self.metric, mode=self.mode)

        if search_space is None:
            search_space = recipe.search_space()
        self.search_space = search_space

        self._search_alg = RayTuneSearchEngine._set_search_alg(search_alg, search_alg_params,
                                                               recipe, self.metric, self.mode)
        self._scheduler = RayTuneSearchEngine._set_scheduler(scheduler, scheduler_params,
                                                             self.metric, self.mode)

        if feature_transformers is None and data_mode == 'dataframe':
            feature_transformers = IdentityTransformer(feature_cols, target_col)

        numpy_format = (data_mode == 'ndarray')

        self.train_func = self._prepare_train_func(input_data=input_data,
                                                   model_create_func=model_create_func,
                                                   feature_transformers=feature_transformers,
                                                   validation_data=validation_data,
                                                   metric=metric,
                                                   mc=mc,
                                                   remote_dir=self.remote_dir,
                                                   numpy_format=numpy_format
                                                   )
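Unlike the variant above, this compile expects data as a dictionary matching one of the two schemas checked at the top of the method. A hedged sketch of both forms (column names and shapes are illustrative):

    import numpy as np
    import pandas as pd

    # Pandas DataFrame schema: "df" is required; the other keys are optional.
    df_data = {
        "df": pd.DataFrame({"datetime": pd.date_range("2021-01-01", periods=8, freq="D"),
                            "extra_feature": np.random.rand(8),
                            "value": np.arange(8.0)}),
        "feature_cols": ["extra_feature"],  # optional extra feature columns
        "target_col": "value",              # optional target column
    }

    # Numpy ndarray schema: "x" and "y" are required; "val_x"/"val_y" are optional.
    nd_data = {
        "x": np.random.rand(8, 3),
        "y": np.random.rand(8, 1),
        "val_x": np.random.rand(2, 3),
        "val_y": np.random.rand(2, 1),
    }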
        def train_func(config):
            numpy_format = ray.get(numpy_format_id)

            if isinstance(model_create_func, ModelBuilder):
                trial_model = model_create_func.build(config)
            else:
                trial_model = model_create_func()

            if not numpy_format:
                global_ft = ray.get(ft_id)
                trial_ft = deepcopy(global_ft)
                imputer = None
                if "imputation" in config:
                    if config["imputation"] == "LastFillImpute":
                        imputer = LastFillImpute()
                    elif config["imputation"] == "FillZeroImpute":
                        imputer = FillZeroImpute()

                # handling input
                global_input_df = ray.get(input_data_id)
                trial_input_df = deepcopy(global_input_df)
                if imputer:
                    trial_input_df = imputer.impute(trial_input_df)
                config = convert_bayes_configs(config).copy()
                (x_train, y_train) = trial_ft.fit_transform(trial_input_df, **config)

                # handling validation data
                validation_data = None
                if is_val_valid:
                    global_validation_df = ray.get(validation_data_id)
                    trial_validation_df = deepcopy(global_validation_df)
                    validation_data = trial_ft.transform(trial_validation_df)
            else:
                train_data = ray.get(input_data_id)
                x_train, y_train = (train_data["x"], train_data["y"])
                validation_data = None
                if is_val_valid:
                    validation_data = ray.get(validation_data_id)
                    validation_data = (validation_data["x"], validation_data["y"])
                trial_ft = None

            # no need to call build since it is called the first time fit_eval is called.
            # callbacks = [TuneCallback(tune_reporter)]
            # fit model
            best_reward = None
            for i in range(1, 101):
                result = trial_model.fit_eval(x_train,
                                              y_train,
                                              validation_data=validation_data,
                                              mc=mc,
                                              metric=metric,
                                              # verbose=1,
                                              **config)
                reward = result
                checkpoint_filename = "best.ckpt"

                # Save best reward iteration
                mode = Evaluator.get_metric_mode(metric)
                if mode == "max":
                    has_best_reward = best_reward is None or reward > best_reward
                else:
                    has_best_reward = best_reward is None or reward < best_reward

                if has_best_reward:
                    best_reward = reward
                    if isinstance(model_create_func, ModelBuilder):
                        trial_model.save(checkpoint_filename)
                    else:
                        save_zip(checkpoint_filename, trial_ft, trial_model, config)
                    # Save to hdfs
                    if remote_dir is not None:
                        upload_ppl_hdfs(remote_dir, checkpoint_filename)

                report_dict = {"training_iteration": i,
                               metric: reward,
                               "checkpoint": checkpoint_filename,
                               "best_" + metric: best_reward}
                tune.report(**report_dict)
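For metric="mse", the dictionary handed to tune.report above would look roughly like this (values are illustrative):

    report_dict = {
        "training_iteration": 1,    # loop index i
        "mse": 0.031,               # raw metric value for this iteration
        "checkpoint": "best.ckpt",  # file name saved whenever the metric improves
        "best_mse": 0.031,          # best raw metric value seen so far
    }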
    def compile(self,
                data,
                model_create_func,
                recipe,
                validation_data=None,
                search_space=None,
                search_alg=None,
                search_alg_params=None,
                scheduler=None,
                scheduler_params=None,
                feature_transformers=None,
                mc=False,
                metric="mse"):
        """
        Do necessary preparations for the engine
        :param data: data for training
               Pandas Dataframe:
                   a Pandas dataframe for training
               Numpy ndarray:
                   a tuple in form of (x, y)
                        x: ndarray for training input
                        y: ndarray for training output
        :param model_create_func: model creation function
        :param recipe: search recipe
        :param validation_data: data for validation
               Pandas Dataframe:
                   a Pandas dataframe for validation
               Numpy ndarray:
                   a tuple in form of (x, y)
                        x: ndarray for validation input
                        y: ndarray for validation output
        :param search_space: search_space, required if recipe is not provided
        :param search_alg: str, any search algorithm supported by Ray Tune
               (i.e. "variant_generator", "random", "ax", "dragonfly", "skopt",
               "hyperopt", "bayesopt", "bohb", "nevergrad", "optuna", "zoopt" and
               "sigopt")
        :param search_alg_params: extra parameters for the search algorithm
        :param scheduler: str, any trial scheduler supported by Ray Tune
        :param scheduler_params: parameters for the scheduler
        :param feature_transformers: feature transformer instance
        :param mc: whether to calculate uncertainty
        :param metric: metric name
        """

        # metric and metric's mode
        self.metric = metric
        self.mode = Evaluator.get_metric_mode(metric)

        # prepare parameters for search engine
        runtime_params = recipe.runtime_params()
        self.num_samples = runtime_params['num_samples']
        stop = dict(runtime_params)
        del stop['num_samples']

        # temp operation for reward_metric
        redundant_stop_keys = stop.keys() - {
            "reward_metric", "training_iteration"
        }
        assert len(redundant_stop_keys) == 0, \
            f"{redundant_stop_keys} is not expected in stop criteria; " \
            "only \"reward_metric\" and \"training_iteration\" are expected."

        if "reward_metric" in stop.keys():
            stop[self.metric] = -stop["reward_metric"] if \
                self.mode == "min" else stop["reward_metric"]
            del stop["reward_metric"]
        stop.setdefault("training_iteration", 1)

        self.stopper = TrialStopper(stop=stop,
                                    metric=self.metric,
                                    mode=self.mode)

        if search_space is None:
            search_space = recipe.search_space()
        self.search_space = search_space

        self._search_alg = RayTuneSearchEngine._set_search_alg(
            search_alg, search_alg_params, recipe, self.metric, self.mode)
        self._scheduler = RayTuneSearchEngine._set_scheduler(
            scheduler, scheduler_params, self.metric, self.mode)

        self.train_func = self._prepare_train_func(
            data=data,
            model_create_func=model_create_func,
            feature_transformers=feature_transformers,
            validation_data=validation_data,
            metric=metric,
            mc=mc,
            remote_dir=self.remote_dir)