def evaluate(self, x, y, metric=['mean_squared_error']):
    """
    Evaluate on x, y.

    :param x: input
    :param y: target
    :param metric: a list of metrics in string format
    :return: a list of metric evaluation results
    """
    e = Evaluator()
    y_pred = self.predict(x)
    return [e.evaluate(m, y, y_pred) for m in metric]
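# Illustrative sketch, not part of the pipeline code: the snippets in this
# section all funnel through Evaluator.evaluate. As a hedged, minimal
# stand-in, a string-dispatched evaluator over sklearn metrics might look
# like the following; the real zoo.automl Evaluator supports more metrics.
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

_METRICS = {
    "mean_squared_error": mean_squared_error,
    "mse": mean_squared_error,
    "r_square": r2_score,
    "r2": r2_score,
}

class EvaluatorSketch:
    @staticmethod
    def evaluate(metric, y_true, y_pred, multioutput="uniform_average"):
        # look up the metric function by name and apply it
        if metric not in _METRICS:
            raise ValueError("Unsupported metric: {}".format(metric))
        return _METRICS[metric](y_true, y_pred, multioutput=multioutput)

# usage
y_true = np.array([[1.0], [2.0], [3.0]])
y_pred = np.array([[1.1], [1.9], [3.2]])
print(EvaluatorSketch.evaluate("mse", y_true, y_pred))  # ~0.02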
def evaluate(self, input_df, metric=None):
    """
    Evaluate the model on a list of metrics.

    :param input_df: the input time series data frame, e.g.:

        datetime    value  "extra feature 1"  "extra feature 2"
        2019-01-01  1.9    1                  2
        2019-01-02  2.3    0                  2

    :param metric: a list of Strings. Available string values are
        "mean_squared_error" and "r_square".
    :return: a list of metric evaluation results.
    """
    Evaluator.check_metric(metric)
    return self.pipeline.evaluate(input_df, metric)
def evaluate_with_onnx(self, data, metrics=['mse'], multioutput="uniform_average",
                       batch_size=32):
    '''
    Evaluate the time series pipeline with ONNX.

    :param data: data can be a TSDataset or a data creator (to be supported).
        The TSDataset should follow the same operations as the training
        TSDataset used in AutoTSEstimator.fit.
    :param metrics: list. The evaluation metric names, e.g. ["mse"].
    :param multioutput: defines aggregation of multiple output values.
        String in ['raw_values', 'uniform_average']. The value defaults to
        'uniform_average'.
    :param batch_size: prediction batch size. A smaller batch_size costs more
        time but less memory. The param is only effective when data is a
        TSDataset. The value defaults to 32.
    :return: a list of metric evaluation results.
    '''
    # predict with onnx
    x, y = self._tsdataset_to_numpy(data, is_predict=False)
    yhat = self._best_model.predict_with_onnx(x, batch_size=batch_size)
    # unscale
    yhat = self._tsdataset_unscale(yhat)
    y = self._tsdataset_unscale(y)
    # evaluate
    return [Evaluator.evaluate(m, y_true=y, y_pred=yhat,
                               multioutput=multioutput)
            for m in metrics]
def evaluate(self, x, target, metrics=['mse']):
    """
    Evaluate on the prediction results and target. We predict horizon
    time-points ahead of the training data in fit_eval before evaluation,
    where the horizon length equals the length of target.

    :param x: We don't support input x currently.
    :param target: target for evaluation.
    :param metrics: a list of metrics in string format
    :return: a list of metric evaluation results
    """
    if x is not None:
        raise ValueError("We don't support input x currently")
    if target is None:
        raise ValueError("Input invalid target of None")
    if self.model is None:
        raise Exception("Needs to call fit_eval or restore first before "
                        "calling evaluate")
    horizon = len(target)
    target = target[['y']]
    target_pred = self.predict(horizon=horizon)[['yhat']]
    return [Evaluator.evaluate(m, target.values, target_pred.values)
            for m in metrics]
def evaluate(self, x, y, metric=['mse']):
    """
    Evaluate on x, y.

    :param x: input
    :param y: target
    :param metric: a list of metrics in string format
    :return: a list of metric evaluation results
    """
    y_pred = self.predict(x)
    if self.target_col_num == 1:
        return [Evaluator.evaluate(m, y, y_pred) for m in metric]
    else:
        # score each future step separately for multi-step targets
        return [np.array([Evaluator.evaluate(m, y[:, i, :], y_pred[:, i, :])
                          for i in range(self.future_seq_len)])
                for m in metric]
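# Illustrative sketch, not part of the pipeline code: the per-step branch
# above scores each forecast step separately by slicing along axis 1. A
# quick numpy demonstration with hypothetical shapes, assuming y has shape
# (samples, future_seq_len, targets):
import numpy as np
from sklearn.metrics import mean_squared_error

future_seq_len = 3
y = np.random.rand(8, future_seq_len, 2)
y_pred = np.random.rand(8, future_seq_len, 2)

per_step_mse = np.array([mean_squared_error(y[:, i, :], y_pred[:, i, :])
                         for i in range(future_seq_len)])
print(per_step_mse.shape)  # (3,) -- one score per forecast step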
def test_evaluate_predict_future_more_1(self):
    target_col = "values"
    metrics = ["mse", "r2"]
    future_seq_len = np.random.randint(2, 6)
    train_df, test_df, tsp, test_sample_num = self.get_input_tsp(
        future_seq_len, target_col)
    pipeline = tsp.fit(train_df, test_df)
    mse, rs = pipeline.evaluate(test_df, metrics=metrics)
    assert len(mse) == future_seq_len
    assert len(rs) == future_seq_len
    y_pred = pipeline.predict(test_df)
    assert y_pred.shape == (test_sample_num - default_past_seq_len + 1,
                            future_seq_len + 1)

    y_pred_df = pipeline.predict(test_df[:-future_seq_len])
    columns = ["{}_{}".format(target_col, i) for i in range(future_seq_len)]
    y_pred_value = y_pred_df[columns].values

    y_df = test_df[default_past_seq_len:]
    y_value = TimeSequenceFeatureTransformer()._roll_test(
        y_df[target_col], future_seq_len)

    mse_pred_eval, rs_pred_eval = [Evaluator.evaluate(m, y_value, y_pred_value)
                                   for m in metrics]
    mse_eval, rs_eval = pipeline.evaluate(test_df, metrics)
    assert_array_almost_equal(mse_pred_eval, mse_eval, decimal=2)
    assert_array_almost_equal(rs_pred_eval, rs_eval, decimal=2)
def evaluate(self, input_df, metrics=["mse"], multioutput='raw_values'): """ evaluate the pipeline :param input_df: :param metrics: subset of ['mean_squared_error', 'r_square', 'sMAPE'] :param multioutput: string in ['raw_values', 'uniform_average'] 'raw_values' : Returns a full set of errors in case of multioutput input. 'uniform_average' : Errors of all outputs are averaged with uniform weight. :return: """ if isinstance(metrics, str): metrics = [metrics] # if not isinstance(metrics, list): # raise ValueError("Expected metrics to be a list!") x, y = self.feature_transformers.transform(input_df, is_train=True) y_pred = self.model.predict(x) if y_pred.shape[1] == 1: multioutput = 'uniform_average' y_unscale, y_pred_unscale = self.feature_transformers.post_processing( input_df, y_pred, is_train=True) return [ Evaluator.evaluate(m, y_unscale, y_pred_unscale, multioutput=multioutput) for m in metrics ]
def evaluate(self, data, metrics=['mse'], multioutput="uniform_average",
             batch_size=32):
    '''
    Evaluate the time series pipeline.

    :param data: data can be a TSDataset or a data creator (to be supported).
        The TSDataset should follow the same operations as the training
        TSDataset used in AutoTSEstimator.fit.
    :param metrics: list. The evaluation metric names, e.g. ["mse"].
    :param multioutput: defines aggregation of multiple output values.
        String in ['raw_values', 'uniform_average']. The value defaults to
        'uniform_average'.
    :param batch_size: prediction batch size. A smaller batch_size costs more
        time but less memory. The param is only effective when data is a
        TSDataset. The value defaults to 32.
    :return: a list of metric evaluation results.
    '''
    _, y = self._tsdataset_to_numpy(data, is_predict=False)
    yhat = self.predict(data, batch_size=batch_size)
    if self._scaler:
        from zoo.chronos.data.utils.scale import unscale_timeseries_numpy
        y = unscale_timeseries_numpy(y, self._scaler, self._scaler_index)
    return [Evaluator.evaluate(m, y_true=y, y_pred=yhat[:y.shape[0]],
                               multioutput=multioutput)
            for m in metrics]
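# Illustrative sketch, not part of the pipeline code: unscale_timeseries_numpy
# undoes the training-time scaling before metrics are computed, so errors are
# reported in the original units. A rough sketch with a sklearn
# StandardScaler; the real helper in zoo.chronos.data.utils.scale may differ
# in detail.
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(np.random.rand(100, 2))

def unscale_sketch(y, scaler, scaler_index):
    # invert standardization column-by-column for the target columns
    out = y.copy()
    for i, col in enumerate(scaler_index):
        out[..., i] = y[..., i] * scaler.scale_[col] + scaler.mean_[col]
    return out

yhat = np.random.rand(16, 5, 1)  # (samples, horizon, targets)
print(unscale_sketch(yhat, scaler, scaler_index=[0]).shape)  # (16, 5, 1)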
def evaluate(self, x, y, metrics=['mse']):
    """
    Evaluate on the prediction results and y. We predict horizon time-points
    ahead of the input x in fit_eval before evaluation, where the horizon
    length equals the second dimension size of y.

    :param x: input
    :param y: target. We interpret the second dimension of y as the horizon
        length for evaluation.
    :param metrics: a list of metrics in string format, or callables
    :return: a list of metric evaluation results
    """
    if x is None:
        raise ValueError("Input invalid x of None")
    if y is None:
        raise ValueError("Input invalid y of None")
    if self.model is None:
        raise Exception("Needs to call fit_eval or restore first before "
                        "calling evaluate")
    if isinstance(y, pd.DataFrame):
        y = y.values
    self.model.n_jobs = self.n_jobs
    y_pred = self.predict(x)
    result_list = []
    for metric in metrics:
        if callable(metric):
            result_list.append(metric(y, y_pred))
        else:
            result_list.append(Evaluator.evaluate(metric, y, y_pred))
    return result_list
def train_func(config):
    # make a copy of the global variables for the trial to make changes on
    global_ft = ray.get(ft_id)
    trial_ft = deepcopy(global_ft)
    if isinstance(model_create_func, ModelBuilder):
        trial_model = model_create_func.build(config)
    else:
        trial_model = model_create_func()

    imputer = None
    if "imputation" in config:
        if config["imputation"] == "LastFillImpute":
            imputer = LastFillImpute()
        elif config["imputation"] == "FillZeroImpute":
            imputer = FillZeroImpute()

    # handle input
    global_input_df = ray.get(input_df_id)
    trial_input_df = deepcopy(global_input_df)
    if imputer:
        trial_input_df = imputer.impute(trial_input_df)

    config = convert_bayes_configs(config).copy()
    (x_train, y_train) = trial_ft.fit_transform(trial_input_df, **config)

    # handle validation data
    validation_data = None
    if is_val_df_valid:
        global_validation_df = ray.get(validation_df_id)
        trial_validation_df = deepcopy(global_validation_df)
        validation_data = trial_ft.transform(trial_validation_df)

    # no need to call build since it is called the first time fit_eval is called
    # fit the model, tracking the best reward seen so far
    best_reward_m = None
    for i in range(1, 101):
        result = trial_model.fit_eval(x_train, y_train,
                                      validation_data=validation_data,
                                      mc=mc, metric=metric, **config)
        reward_m = result if Evaluator.get_metric_mode(metric) == "max" \
            else -result
        ckpt_name = "best.ckpt"
        if best_reward_m is None or reward_m > best_reward_m:
            best_reward_m = reward_m
            save_zip(ckpt_name, trial_ft, trial_model, config)
            if remote_dir is not None:
                upload_ppl_hdfs(remote_dir, ckpt_name)
        tune.track.log(training_iteration=i,
                       reward_metric=reward_m,
                       checkpoint="best.ckpt")
def evaluate(self, x=None, y=None, metrics=None, num_workers=None):
    """
    Evaluate on the prediction results and y. We predict horizon time-points
    ahead of the training data in fit_eval before evaluation, where the
    horizon length equals the second dimension size of y.

    :param x: We don't support input x currently.
    :param y: target. We interpret the second dimension of y as the horizon
        length for evaluation.
    :param metrics: a list of metrics in string format
    :param num_workers: the number of workers to use in evaluate. It defaults
        to 1.
    :return: a list of metric evaluation results
    """
    if x is not None:
        raise ValueError("We don't support input x directly.")
    if y is None:
        raise ValueError("Input invalid y of None")
    if self.model is None:
        raise Exception("Needs to call fit_eval or restore first before "
                        "calling evaluate")
    if len(y.shape) == 1:
        y = np.expand_dims(y, axis=1)
        horizon = 1
    else:
        horizon = y.shape[1]
    result = self.predict(x=None, horizon=horizon, num_workers=num_workers)

    if y.shape[1] == 1:
        multioutput = 'uniform_average'
    else:
        multioutput = 'raw_values'
    return [Evaluator.evaluate(m, y, result, multioutput=multioutput)
            for m in metrics]
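# Illustrative sketch, not part of the pipeline code: the multioutput switch
# above follows sklearn's convention -- 'raw_values' keeps one score per
# output column, 'uniform_average' collapses them. A quick check with
# sklearn (illustrative values only):
import numpy as np
from sklearn.metrics import mean_squared_error

y = np.array([[1.0, 10.0], [2.0, 20.0]])
yhat = np.array([[1.0, 12.0], [2.0, 18.0]])

print(mean_squared_error(y, yhat, multioutput="raw_values"))       # [0. 4.]
print(mean_squared_error(y, yhat, multioutput="uniform_average"))  # 2.0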
def _make_pipeline(self, analysis, feature_transformers, model, remote_dir):
    metric = self.metric
    mode = Evaluator.get_metric_mode(metric)
    best_config = analysis.get_best_config(metric=metric, mode=mode)
    best_logdir = analysis.get_best_logdir(metric=metric, mode=mode)
    print("best log dir is ", best_logdir)
    dataframe = analysis.dataframe(metric=metric, mode=mode)
    model_path = os.path.join(best_logdir, dataframe["checkpoint"].iloc[0])
    config = convert_bayes_configs(best_config).copy()
    self._print_config(config)
    if remote_dir is not None:
        all_config = restore_hdfs(model_path, remote_dir,
                                  feature_transformers, model)
    else:
        all_config = restore_zip(model_path, feature_transformers, model)
    return TimeSequencePipeline(name=self.name,
                                feature_transformers=feature_transformers,
                                model=model,
                                config=all_config)
def evaluate_with_onnx(self, x, y, metrics=['mse'], dirname=None,
                       multioutput="raw_values"):
    # reshape 1-dim input
    x = self._reshape_input(x)
    y = self._reshape_input(y)

    yhat = self.predict_with_onnx(x, dirname=dirname)
    return [Evaluator.evaluate(m, y_true=y, y_pred=yhat,
                               multioutput=multioutput)
            for m in metrics]
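# Illustrative sketch, not part of the pipeline code: predict_with_onnx
# presumably runs the exported model through onnxruntime. A hedged,
# self-contained sketch: export a toy torch model, then score a batch with
# an InferenceSession. Names, file path, and shapes here are hypothetical.
import numpy as np
import torch
import onnxruntime as ort

torch.onnx.export(torch.nn.Linear(4, 1), torch.randn(1, 4), "tiny.onnx",
                  input_names=["x"], output_names=["y"],
                  dynamic_axes={"x": {0: "batch"}})

def predict_with_onnx_sketch(onnx_path, x):
    sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
    input_name = sess.get_inputs()[0].name
    # onnxruntime returns a list of output arrays; take the first
    return sess.run(None, {input_name: x.astype(np.float32)})[0]

x = np.random.rand(8, 4).astype(np.float32)
print(predict_with_onnx_sketch("tiny.onnx", x).shape)  # (8, 1)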
def evaluate(self, x, y, metric=['mse']):
    yhat = self.predict(x)
    return [Evaluator.evaluate(m, y_true=y, y_pred=yhat,
                               multioutput="raw_values")
            for m in metric]
def evaluate(self, x, y, metrics=['mse']):
    """
    Evaluate on x, y.

    :param x: input
    :param y: target
    :param metrics: a list of metrics in string format
    :return: a list of metric evaluation results
    """
    y_pred = self.predict(x)
    return [Evaluator.evaluate(m, y, y_pred) for m in metrics]
def _validate(self, x, y, metric):
    self.model.eval()
    with torch.no_grad():
        yhat = self.model(x)
        val_loss = self.criterion(yhat, y)
        eval_result = Evaluator.evaluate(metric=metric,
                                         y_true=y.numpy(),
                                         y_pred=yhat.numpy(),
                                         multioutput='uniform_average')
    return {"val_loss": val_loss.item(), metric: eval_result}
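# Illustrative sketch, not part of the pipeline code: a self-contained
# version of the validation step above, assuming a toy regression model and
# MSE criterion (model, data, and metric here are hypothetical):
import torch
from sklearn.metrics import mean_squared_error

model = torch.nn.Linear(4, 1)
criterion = torch.nn.MSELoss()
x, y = torch.randn(32, 4), torch.randn(32, 1)

model.eval()
with torch.no_grad():  # disable autograd for a cheap, side-effect-free eval
    yhat = model(x)
    val_loss = criterion(yhat, y)
    eval_result = mean_squared_error(y.numpy(), yhat.numpy(),
                                     multioutput="uniform_average")
print({"val_loss": val_loss.item(), "mse": eval_result})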
def evaluate(self, x=None, y=None, metrics=None, target_covariates=None,
             target_dti=None, num_workers=None):
    """
    Evaluate on the prediction results and y. We predict horizon time-points
    ahead of the training data in fit_eval before evaluation, where the
    horizon length equals the second dimension size of y.

    :param x: We don't support input x currently.
    :param y: target. We interpret the second dimension of y as the horizon
        length for evaluation.
    :param metrics: a list of metrics in string format
    :param target_covariates: covariates corresponding to target_value.
        2-D ndarray or None. The shape of the ndarray should be (r, horizon),
        where r is the number of covariates. These are global covariates for
        all time series. If None, only default time covariates will be used
        while use_time is True. If not None, the time covariates used are the
        stack of the input covariates and the default time covariates.
    :param target_dti: dti corresponding to target_value. DatetimeIndex or
        None. If None, use a default fixed-frequency DatetimeIndex generated
        with the last date of x in fit and freq.
    :param num_workers: the number of workers to use in evaluate. It defaults
        to 1.
    :return: a list of metric evaluation results
    """
    if x is not None:
        raise ValueError("We don't support input x directly.")
    if y is None:
        raise ValueError("Input invalid y of None")
    if self.model is None:
        raise Exception("Needs to call fit_eval or restore first before "
                        "calling evaluate")
    if len(y.shape) == 1:
        y = np.expand_dims(y, axis=1)
        horizon = 1
    else:
        horizon = y.shape[1]
    result = self.predict(x=None, horizon=horizon,
                          future_covariates=target_covariates,
                          future_dti=target_dti,
                          num_workers=num_workers)

    if y.shape[1] == 1:
        multioutput = 'uniform_average'
    else:
        multioutput = 'raw_values'
    return [Evaluator.evaluate(m, y, result, multioutput=multioutput)
            for m in metrics]
def _detach_recipe(self, recipe):
    self.search_space = recipe.search_space()
    stop = recipe.runtime_params()
    self.metric_threshold = None
    if "reward_metric" in stop.keys():
        self.mode = Evaluator.get_metric_mode(self.metric)
        # reward_metric is stated as a reward (higher is better), so negate
        # it for metrics that are minimized
        self.metric_threshold = -stop["reward_metric"] \
            if self.mode == "min" else stop["reward_metric"]
    self.epochs = stop["training_iteration"]
    self.num_samples = stop["num_samples"]
def evaluate(self, x, y, metrics=['mse']):
    # reshape 1-dim input
    x = self._reshape_input(x)
    y = self._reshape_input(y)

    yhat = self.predict(x)
    return [Evaluator.evaluate(m, y_true=y, y_pred=yhat,
                               multioutput="raw_values")
            for m in metrics]
def _validate_metric_mode(metric, mode):
    from zoo.automl.common.metrics import Evaluator
    if not mode:
        try:
            mode = Evaluator.get_metric_mode(metric)
        except ValueError:
            pass
    if not mode:
        raise ValueError(f"We cannot infer metric mode with metric name of "
                         f"{metric}. Please specify the `metric_mode` "
                         f"parameter in AutoEstimator.fit().")
    if mode not in ["min", "max"]:
        raise ValueError("`mode` has to be one of ['min', 'max']")
    return mode
def _train(self):
    result = self.trial_model.fit_eval(self.x_train, self.y_train,
                                       validation_data=self.validation_data,
                                       **self.config)
    # flip the sign for metrics that are minimized so a higher reward is
    # always better
    self.reward_m = result if Evaluator.get_metric_mode(metric) == "max" \
        else -result
    return {"reward_metric": self.reward_m, "checkpoint": self.ckpt_name}
def _validate_metric_mode(metric, mode):
    if not mode:
        if callable(metric):
            raise ValueError("You must specify `metric_mode` for your "
                             "metric function")
        try:
            from zoo.automl.common.metrics import Evaluator
            mode = Evaluator.get_metric_mode(metric)
        except ValueError:
            pass
    if not mode:
        raise ValueError(f"We cannot infer metric mode with metric name of "
                         f"{metric}. Please specify the `metric_mode` "
                         f"parameter in AutoEstimator.fit().")
    if mode not in ["min", "max"]:
        raise ValueError("`mode` has to be one of ['min', 'max']")
    return mode
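# Illustrative sketch, not part of the pipeline code: Evaluator.get_metric_mode
# maps a metric name to whether it should be minimized or maximized. A minimal
# hypothetical table-based version:
_METRIC_MODES = {"mse": "min", "mae": "min", "rmse": "min",
                 "r_square": "max", "r2": "max"}

def get_metric_mode_sketch(metric):
    if metric not in _METRIC_MODES:
        raise ValueError("Cannot infer mode for metric: {}".format(metric))
    return _METRIC_MODES[metric]

# usage: "min" means lower is better, so the tuning code negates it as a reward
print(get_metric_mode_sketch("mse"))  # "min"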
def _check_input(self, input_df, validation_df, metric):
    input_is_list = self._check_input_format(input_df)
    if not input_is_list:
        self._check_missing_col(input_df)
        if validation_df is not None:
            self._check_missing_col(validation_df)
    else:
        for d in input_df:
            self._check_missing_col(d)
        if validation_df is not None:
            for val_d in validation_df:
                self._check_missing_col(val_d)
    if not Evaluator.check_metric(metric):
        raise ValueError("metric " + metric + " is not supported")
def evaluate(self, x, y, metrics=['mse']):
    """
    Evaluate on x, y.

    :param x: input
    :param y: target
    :param metrics: a list of metrics in string format
    :return: a list of metric evaluation results
    """
    y_pred = self.predict(x)
    if y_pred.shape[1] == 1:
        multioutput = 'uniform_average'
    else:
        multioutput = 'raw_values'
    return [Evaluator.evaluate(m, y, y_pred, multioutput=multioutput)
            for m in metrics]
def _validate(self, validation_loader, metric):
    self.model.eval()
    with torch.no_grad():
        yhat_list = []
        y_list = []
        for x_valid_batch, y_valid_batch in validation_loader:
            yhat_list.append(self.model(x_valid_batch).numpy())
            y_list.append(y_valid_batch.numpy())
        yhat = np.concatenate(yhat_list, axis=0)
        y = np.concatenate(y_list, axis=0)
    eval_result = Evaluator.evaluate(metric=metric,
                                     y_true=y,
                                     y_pred=yhat,
                                     multioutput='uniform_average')
    return {metric: eval_result}
def evaluate(self, df, metric=['mse']):
    """
    Evaluate on df.

    :param df: the input data frame carrying both input and target
    :param metric: a list of metrics in string format
    :return: a list of metric evaluation results
    """
    if isinstance(metric, str):
        metric = [metric]
    x, y = self._process_data(df, mode="val")
    y_pred = self.model.predict(x)
    y_unscale, y_pred_unscale = self.ft.post_processing(df, y_pred,
                                                        is_train=True)
    if len(y_pred.shape) > 1 and y_pred.shape[1] == 1:
        multioutput = 'uniform_average'
    else:
        multioutput = 'raw_values'
    return [Evaluator.evaluate(m, y_unscale, y_pred_unscale,
                               multioutput=multioutput)
            for m in metric]
def _validate(self, validation_loader, metric_name, metric_func=None):
    if not metric_name:
        assert metric_func, "You must input a valid metric_func or metric_name"
        metric_name = metric_func.__name__
    self.model.eval()
    with torch.no_grad():
        yhat_list = []
        y_list = []
        for x_valid_batch, y_valid_batch in validation_loader:
            yhat_list.append(self.model(x_valid_batch).numpy())
            y_list.append(y_valid_batch.numpy())
        yhat = np.concatenate(yhat_list, axis=0)
        y = np.concatenate(y_list, axis=0)
    if metric_func:
        eval_result = metric_func(y, yhat)
    else:
        eval_result = Evaluator.evaluate(metric=metric_name,
                                         y_true=y,
                                         y_pred=yhat,
                                         multioutput='uniform_average')
    return {metric_name: eval_result}
def train_func(config):
    train_data = ray.get(data_id)
    val_data = ray.get(validation_data_id)
    config = convert_bayes_configs(config).copy()
    if not isinstance(model_builder, ModelBuilder):
        raise ValueError("You must input a ModelBuilder instance for "
                         "model_builder")
    trial_model = model_builder.build(config)

    # fit the model, tracking the best reward seen so far
    best_reward = None
    for i in range(1, 101):
        result = trial_model.fit_eval(data=train_data,
                                      validation_data=val_data,
                                      mc=mc, metric=metric, **config)
        reward = result
        checkpoint_filename = "best.ckpt"

        # save the iteration with the best reward
        mode = Evaluator.get_metric_mode(metric)
        if mode == "max":
            has_best_reward = best_reward is None or reward > best_reward
        else:
            has_best_reward = best_reward is None or reward < best_reward

        if has_best_reward:
            best_reward = reward
            trial_model.save(checkpoint_filename)
            # save to hdfs
            if remote_dir is not None:
                put_ckpt_hdfs(remote_dir, checkpoint_filename)

        report_dict = {"training_iteration": i,
                       metric: reward,
                       "checkpoint": checkpoint_filename,
                       "best_" + metric: best_reward}
        tune.report(**report_dict)
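# Illustrative sketch, not part of the pipeline code: the checkpoint
# bookkeeping above, isolated -- keep the best reward seen so far, honoring
# the metric's min/max mode. The reward values below are made up.
def is_new_best(reward, best_reward, mode):
    if best_reward is None:
        return True
    return reward > best_reward if mode == "max" else reward < best_reward

best = None
for reward in [0.9, 0.4, 0.6, 0.2]:  # e.g. validation mse per iteration
    if is_new_best(reward, best, mode="min"):
        best = reward  # here the trial model would be checkpointed
print(best)  # 0.2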
def evaluate(self, x, y, metrics=['mse']):
    """
    Evaluate on the prediction results and y. We predict horizon time-points
    ahead of the input x in fit_eval before evaluation, where the horizon
    length equals the second dimension size of y.

    :param x: input
    :param y: target. We interpret the second dimension of y as the horizon
        length for evaluation.
    :param metrics: a list of metrics in string format
    :return: a list of metric evaluation results
    """
    if x is None:
        raise ValueError("Input invalid x of None")
    if y is None:
        raise ValueError("Input invalid y of None")
    if self.model is None:
        raise Exception("Needs to call fit_eval or restore first before "
                        "calling evaluate")
    y_pred = self.predict(x)
    return [Evaluator.evaluate(m, y, y_pred) for m in metrics]
def evaluate(self, target, data=None, metrics=['mse']):
    """
    Evaluate on the prediction results. We predict horizon time-points ahead
    of the training data in fit_eval before evaluation, where the horizon
    length equals the length of target.

    :param data: Prophet predicts the horizon steps forward from the training
        data, so data should be None as it is not used.
    :param target: target for evaluation.
    :param metrics: a list of metrics in string format
    :return: a list of metric evaluation results
    """
    if data is not None:
        raise ValueError("We don't support input data currently")
    if target is None:
        raise ValueError("Input invalid target of None")
    if self.model is None:
        raise Exception("Needs to call fit_eval or restore first before "
                        "calling evaluate")
    horizon = len(target)
    target_pred = self.predict(horizon=horizon)[['yhat']]
    return [Evaluator.evaluate(m, target[['y']].values, target_pred.values)
            for m in metrics]
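# Illustrative sketch, not part of the pipeline code: end-to-end flavor of
# the Prophet evaluate flow above, assuming the fbprophet API of that era
# (a ds/y frame in, a yhat column out). Toy data; column names follow
# Prophet's convention.
import numpy as np
import pandas as pd
from fbprophet import Prophet

df = pd.DataFrame({"ds": pd.date_range("2019-01-01", periods=100),
                   "y": np.sin(np.arange(100) / 7.0)})
train, target = df[:-10], df[-10:]

model = Prophet().fit(train)
future = model.make_future_dataframe(periods=len(target))
# predict covers history + horizon; keep only the forecast tail
target_pred = model.predict(future)[["yhat"]].tail(len(target))

mse = float(((target[["y"]].values - target_pred.values) ** 2).mean())
print(mse)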