def evaluate(self, input_df, metric=None):
    """
    Evaluate the model on a list of metrics.

    :param input_df: the input time series data frame, e.g.

           datetime    value   "extra feature 1"   "extra feature 2"
           2019-01-01  1.9     1                   2
           2019-01-02  2.3     0                   2

    :param metric: a list of Strings. Available string values are
           "mean_squared_error" and "r_square".
    :return: a list of metric evaluation results.
    """
    Evaluator.check_metric(metric)
    return self.pipeline.evaluate(input_df, metric)
def evaluate(self, target, data=None, metrics=['mse']):
    """
    Evaluate on the prediction results. We predict horizon time-points ahead of
    the input data in fit_eval before evaluation, where the horizon length
    equals the second dimension size of target.

    :param target: target for evaluation. A dataframe with 2 columns, where
           column 'ds' indicates the date and column 'y' indicates the target.
    :param data: Prophet predicts the horizon steps forward from the training
           data, so data should be None as it is not used.
    :param metrics: a list of metrics in string format.
    :return: a list of metric evaluation results.
    """
    if data is not None:
        raise ValueError("We don't support input data currently")
    if target is None:
        raise ValueError("Input invalid target of None")
    if self.model is None:
        raise Exception(
            "Needs to call fit_eval or restore first before calling evaluate")
    target_pred = self.model.predict(target)
    return [Evaluator.evaluate(m, target.y.values, target_pred.yhat.values)
            for m in metrics]
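A minimal usage sketch (not from the source), assuming `model` is an already fitted instance of the Prophet wrapper above; it shows the 'ds'/'y' target frame layout the method expects.

# Sketch only: `model` is assumed fitted; the 7-day horizon is illustrative.
import numpy as np
import pandas as pd

target = pd.DataFrame({
    "ds": pd.date_range("2019-01-01", periods=7, freq="D"),  # forecast dates
    "y": np.random.rand(7),                                   # ground truth
})
mse, smape = model.evaluate(target, metrics=["mse", "smape"])  # data stays None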
def evaluate_with_onnx(self, data, metrics=['mse'],
                       multioutput="uniform_average", batch_size=32):
    '''
    Evaluate the time series pipeline with onnx.

    :param data: data can be a TSDataset or a data creator (to be supported).
           The TSDataset should follow the same operations as the training
           TSDataset used in AutoTSEstimator.fit.
    :param metrics: list. The evaluation metric names to optimize, e.g. ["mse"].
    :param multioutput: defines aggregating of multiple output values.
           String in ['raw_values', 'uniform_average']. The value defaults
           to 'uniform_average'.
    :param batch_size: predict batch_size. The process will cost more time if
           batch_size is small while it costs less memory. The param is only
           effective when data is a TSDataset. The value defaults to 32.
    '''
    # predict with onnx
    x, y = self._tsdataset_to_numpy(data, is_predict=False)
    yhat = self._best_model.predict_with_onnx(x, batch_size=batch_size)
    # unscale
    yhat = self._tsdataset_unscale(yhat)
    y = self._tsdataset_unscale(y)
    # evaluate
    eval_result = [Evaluator.evaluate(m, y_true=y, y_pred=yhat,
                                      multioutput=multioutput)
                   for m in metrics]
    return eval_result
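A hedged usage sketch: `ts_pipeline` stands for a TSPipeline returned by AutoTSEstimator.fit, and `tsdata_test` for a TSDataset preprocessed with the same operations as the training set.

# Assumed objects: ts_pipeline (from AutoTSEstimator.fit) and tsdata_test
# (same scaling/feature operations as the training TSDataset).
mse, mae = ts_pipeline.evaluate_with_onnx(tsdata_test,
                                          metrics=["mse", "mae"],
                                          multioutput="uniform_average",
                                          batch_size=64)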
def test_evaluate_predict_future_more_1(self):
    target_col = "values"
    metrics = ["mse", "r2"]
    future_seq_len = np.random.randint(2, 6)
    train_df, test_df, tsp, test_sample_num = self.get_input_tsp(
        future_seq_len, target_col)
    pipeline = tsp.fit(train_df, test_df)
    mse, rs = pipeline.evaluate(test_df, metrics=metrics)
    assert len(mse) == future_seq_len
    assert len(rs) == future_seq_len
    y_pred = pipeline.predict(test_df)
    assert y_pred.shape == (test_sample_num - default_past_seq_len + 1,
                            future_seq_len + 1)

    y_pred_df = pipeline.predict(test_df[:-future_seq_len])
    columns = ["{}_{}".format(target_col, i) for i in range(future_seq_len)]
    y_pred_value = y_pred_df[columns].values

    y_df = test_df[default_past_seq_len:]
    y_value = TimeSequenceFeatureTransformer()._roll_test(
        y_df[target_col], future_seq_len)

    mse_pred_eval, rs_pred_eval = [Evaluator.evaluate(m, y_value, y_pred_value)
                                   for m in metrics]
    mse_eval, rs_eval = pipeline.evaluate(test_df, metrics)
    assert_array_almost_equal(mse_pred_eval, mse_eval, decimal=2)
    assert_array_almost_equal(rs_pred_eval, rs_eval, decimal=2)
def evaluate(self, df, metric=['mse']):
    """
    Evaluate on a validation data frame.

    :param df: the validation data frame; it is processed into features and
           targets internally before evaluation.
    :param metric: a list of metrics in string format.
    :return: a list of metric evaluation results.
    """
    if isinstance(metric, str):
        metric = [metric]
    x, y = self._process_data(df, mode="val")
    y_pred = self.model.predict(x)
    y_unscale, y_pred_unscale = self.ft.post_processing(df, y_pred, is_train=True)
    if len(y_pred.shape) > 1 and y_pred.shape[1] == 1:
        multioutput = 'uniform_average'
    else:
        multioutput = 'raw_values'
    return [Evaluator.evaluate(m, y_unscale, y_pred_unscale,
                               multioutput=multioutput)
            for m in metric]
def evaluate(self, x, y, metrics=['mse']): """ Evaluate on the prediction results and y. We predict horizon time-points ahead the input x in fit_eval before evaluation, where the horizon length equals the second dimension size of y. :param x: We don't support input x currently. :param y: target. We interpret the second dimension of y as the horizon length for evaluation. :param metrics: a list of metrics in string format :return: a list of metric evaluation results """ if x is None: raise ValueError("Input invalid x of None") if y is None: raise ValueError("Input invalid y of None") if self.model is None: raise Exception("Needs to call fit_eval or restore first before calling predict") if isinstance(y, pd.DataFrame): y = y.values self.model.n_jobs = self.n_jobs y_pred = self.predict(x) result_list = [] for metric in metrics: if callable(metric): result_list.append(metric(y, y_pred)) else: result_list.append(Evaluator.evaluate(metric, y, y_pred)) return result_list
def evaluate(self, x, y, metric=['mse']):
    """
    Evaluate on x, y.

    :param x: input
    :param y: target
    :param metric: a list of metrics in string format.
    :return: a list of metric evaluation results.
    """
    y_pred = self.predict(x)
    # y = np.squeeze(y, axis=2)
    if self.target_col_num == 1:
        return [Evaluator.evaluate(m, y, y_pred) for m in metric]
    else:
        # evaluate each future time step separately when forecasting multiple targets
        return [np.array([Evaluator.evaluate(m, y[:, i, :], y_pred[:, i, :])
                          for i in range(self.future_seq_len)])
                for m in metric]
def evaluate(self, x, y, metrics=['mse']): """ Evaluate on x, y :param x: input :param y: target :param metrics: a list of metrics in string format :return: a list of metric evaluation results """ y_pred = self.predict(x) return [Evaluator.evaluate(m, y, y_pred) for m in metrics]
def _detach_recipe(self, recipe):
    self.search_space = recipe.search_space()
    stop = recipe.runtime_params()
    self.metric_threshold = None
    if "reward_metric" in stop.keys():
        self.mode = Evaluator.get_metric_mode(self.metric)
        # negate the reward for "min" metrics so the threshold stays on the
        # original metric scale
        self.metric_threshold = -stop["reward_metric"] if \
            self.mode == "min" else stop["reward_metric"]
    self.epochs = stop["training_iteration"]
    self.num_samples = stop["num_samples"]
def evaluate(self, x, y, metrics=['mse']): """ Evaluate on x, y :param x: input :param y: target :param metric: a list of metrics in string format :return: a list of metric evaluation results """ y_pred = self.predict(x) if y_pred.shape[1] == 1: multioutput = 'uniform_average' else: multioutput = 'raw_values' # y = np.squeeze(y, axis=2) return [Evaluator.evaluate(m, y, y_pred, multioutput=multioutput) for m in metrics]
def _validate_metric_mode(metric, mode):
    if not mode:
        if callable(metric):
            raise ValueError(
                "You must specify `metric_mode` for your metric function")
        try:
            from bigdl.orca.automl.metrics import Evaluator
            mode = Evaluator.get_metric_mode(metric)
        except ValueError:
            pass
        if not mode:
            raise ValueError(
                f"We cannot infer metric mode with metric name of {metric}. Please"
                f" specify the `metric_mode` parameter in AutoEstimator.fit().")
    if mode not in ["min", "max"]:
        raise ValueError("`mode` has to be one of ['min', 'max']")
    return mode
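A short sketch of the two paths handled above, assuming "mse" is registered in Evaluator as a "min" metric; `my_score` is a hypothetical callable metric:

def my_score(y_true, y_pred):   # hypothetical callable metric
    ...

mode = _validate_metric_mode("mse", None)      # inferred from Evaluator -> "min"
mode = _validate_metric_mode(my_score, "max")  # callables need an explicit mode
# _validate_metric_mode(my_score, None)        # would raise: mode cannot be inferred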
def evaluate(self, x, y, metrics=['mse'], multioutput="raw_values", batch_size=32):
    # reshape 1-dim input
    x = self._reshape_input(x)
    y = self._reshape_input(y)

    yhat = self.predict(x, batch_size=batch_size)
    eval_result = [Evaluator.evaluate(m, y_true=y, y_pred=yhat,
                                      multioutput=multioutput)
                   for m in metrics]
    return eval_result
def evaluate(self,
             x=None,
             y=None,
             metrics=None,
             target_covariates=None,
             target_dti=None,
             num_workers=None):
    """
    Evaluate on the prediction results and y. We predict horizon time-points
    ahead of the input x in fit_eval before evaluation, where the horizon
    length equals the second dimension size of y.

    :param x: We don't support input x currently.
    :param y: target. We interpret the second dimension of y as the horizon
           length for evaluation.
    :param metrics: a list of metrics in string format.
    :param target_covariates: covariates corresponding to target_value.
           2-D ndarray or None. The shape of the ndarray should be
           (r, horizon), where r is the number of covariates. These are global
           covariates for all time series. If None, only default time
           covariates will be used while use_time is True. Otherwise, the time
           covariates used are the stack of the input covariates and the
           default time covariates.
    :param target_dti: dti corresponding to target_value. DatetimeIndex or
           None. If None, use a default fixed-frequency DatetimeIndex
           generated with the last date of x in fit and freq.
    :param num_workers: the number of workers to use in evaluate. It defaults
           to 1.
    :return: a list of metric evaluation results.
    """
    if x is not None:
        raise ValueError("We don't support input x directly.")
    if y is None:
        raise ValueError("Input invalid y of None")
    if self.model is None:
        raise Exception("Needs to call fit_eval or restore first before calling evaluate")
    if len(y.shape) == 1:
        y = np.expand_dims(y, axis=1)
        horizon = 1
    else:
        horizon = y.shape[1]
    result = self.predict(x=None, horizon=horizon,
                          future_covariates=target_covariates,
                          future_dti=target_dti,
                          num_workers=num_workers)

    if y.shape[1] == 1:
        multioutput = 'uniform_average'
    else:
        multioutput = 'raw_values'
    return [Evaluator.evaluate(m, y, result, multioutput=multioutput)
            for m in metrics]
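A hedged sketch assuming `model` is a fitted instance of the class above (a TCMF-style model forecasting many series jointly); it shows how the horizon is read from y's second dimension and that per-step results come back when horizon > 1:

import numpy as np

horizon = 24
y_target = np.random.rand(300, horizon)   # assumed layout: one row per series
covs = np.random.rand(2, horizon)         # 2 global covariates over the horizon

# horizon > 1, so multioutput='raw_values' yields one value per forecast step
smape_per_step = model.evaluate(y=y_target,
                                metrics=["smape"],
                                target_covariates=covs,
                                num_workers=4)[0]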
def _validate(self, validation_loader, metric_name, metric_func=None):
    if not metric_name:
        assert metric_func, "You must input valid metric_func or metric_name"
        metric_name = metric_func.__name__
    self.model.eval()
    with torch.no_grad():
        yhat_list = []
        y_list = []
        for x_valid_batch, y_valid_batch in validation_loader:
            yhat_list.append(self.model(x_valid_batch).numpy())
            y_list.append(y_valid_batch.numpy())
        yhat = np.concatenate(yhat_list, axis=0)
        y = np.concatenate(y_list, axis=0)
    # val_loss = self.criterion(yhat, y)
    if metric_func:
        eval_result = metric_func(y, yhat)
    else:
        eval_result = Evaluator.evaluate(metric=metric_name,
                                         y_true=y,
                                         y_pred=yhat,
                                         multioutput='uniform_average')
    return {metric_name: eval_result}
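A sketch of the two result shapes of _validate, assuming `trainer` is an instance of the class above and `val_loader` is a torch DataLoader: a registered metric name keys the returned dict by that name, while a custom callable keys it by its __name__.

import numpy as np

def scaled_abs_err(y_true, y_pred):   # hypothetical callable metric
    return float(np.mean(np.abs(y_true - y_pred)) / (np.mean(np.abs(y_true)) + 1e-8))

res = trainer._validate(val_loader, metric_name="mse")      # -> {"mse": ...}
res = trainer._validate(val_loader, metric_name=None,
                        metric_func=scaled_abs_err)         # -> {"scaled_abs_err": ...}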
def train_example(args):
    auto_est = AutoEstimator.from_torch(model_creator=model_creator,
                                        optimizer="Adam",
                                        loss="BCELoss",
                                        logs_dir="/tmp/zoo_automl_logs",
                                        resources_per_trial={"cpu": args.cpus_per_trial},
                                        name="test_fit")
    train_data, val_data = get_train_val_data()
    auto_est.fit(data=train_data,
                 epochs=args.epochs,
                 validation_data=val_data,
                 metric="accuracy",
                 n_sampling=args.trials,
                 search_space=create_linear_search_space())
    # Choose the best model
    best_model = auto_est.get_best_model()

    y_hat = best_model(torch.from_numpy(val_data[0]).float()).detach().numpy()
    from bigdl.orca.automl.metrics import Evaluator
    accuracy = Evaluator.evaluate(metric="accuracy",
                                  y_true=val_data[1],
                                  y_pred=y_hat)
    print("Evaluate: accuracy is", accuracy)
def evaluate(self, target, x=None, metrics=['mse'], rolling=False):
    """
    Evaluate on the prediction results and target. We predict horizon
    time-points ahead of the input x in fit_eval before evaluation, where the
    horizon length equals the second dimension size of target.

    :param target: target for evaluation.
    :param x: ARIMA predicts the horizon steps forward from the training data,
           so x should be None as it is not used.
    :param metrics: a list of metrics in string format.
    :param rolling: whether to use rolling prediction.
    :return: a list of metric evaluation results.
    """
    if x is not None:
        raise ValueError("We don't support input x currently")
    if target is None:
        raise ValueError("Input invalid target of None")
    if self.model is None:
        raise Exception(
            "Needs to call fit_eval or restore first before calling evaluate")
    forecasts = self.predict(horizon=len(target), rolling=rolling)
    return [Evaluator.evaluate(m, target, forecasts) for m in metrics]
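A hedged sketch assuming `model` is a fitted instance of the ARIMA wrapper above and `y_test` is a 1-D array of held-out values directly following the training data; rolling=True switches to the rolling-prediction path documented above.

one_shot_mse = model.evaluate(target=y_test, metrics=["mse"])[0]
rolling_mse = model.evaluate(target=y_test, metrics=["mse"], rolling=True)[0]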
                     output_target_num=1,
                     past_seq_len=20,
                     hidden_dim=hp.grid_search([32, 64]),
                     layer_num=hp.randint(1, 3),
                     lr=hp.choice([0.01, 0.03, 0.1]),
                     dropout=hp.uniform(0.1, 0.2),
                     optimizer='Adam',
                     loss=torch.nn.MSELoss(),
                     metric="mse")
x_train, y_train = tsdata_train.roll(lookback=20, horizon=1).to_numpy()
x_val, y_val = tsdata_test.roll(lookback=20, horizon=1).to_numpy()
x_test, y_test = tsdata_test.roll(lookback=20, horizon=1).to_numpy()

auto_lstm.fit(data=(x_train, y_train),
              epochs=args.epochs,
              validation_data=(x_val, y_val))
yhat = auto_lstm.predict(x_test)
unscale_y_test = tsdata_test.unscale_numpy(y_test)
unscale_yhat = tsdata_test.unscale_numpy(yhat)
rmse, smape = [Evaluator.evaluate(m,
                                  y_true=unscale_y_test,
                                  y_pred=unscale_yhat)
               for m in ['rmse', 'smape']]
print(f'rmse is {np.mean(rmse)}')
print(f'smape is {np.mean(smape)}')
stop_orca_context()
"max_depth": hp.grid_search(list(max_depth_range)), "lr": hp.loguniform(1e-4, 1e-1), "min_child_weight": hp.choice(min_child_weight), } search_alg = None search_alg_params = None scheduler = None scheduler_params = None auto_xgb_reg = AutoXGBRegressor(cpus_per_trial=2, name="auto_xgb_regressor", **config) auto_xgb_reg.fit(data=(X_train, y_train), validation_data=(X_val, y_val), metric="rmse", n_sampling=num_rand_samples, search_space=search_space, search_alg=search_alg, search_alg_params=None, scheduler=scheduler, scheduler_params=scheduler_params) print("Training completed.") best_model = auto_xgb_reg.get_best_model() y_hat = best_model.predict(X_val) from bigdl.orca.automl.metrics import Evaluator rmse = Evaluator.evaluate(metric="rmse", y_true=y_val, y_pred=y_hat) print(f"Evaluate: the square root of mean square error is {rmse:.2f}") stop_orca_context()
logger.info('Stopping context for yarn cluster and init context on local.')
stop_orca_context()
import ray
ray.init(num_cpus=args.num_predict_cores)

logger.info('Start prediction.')
yhat = model.predict(horizon=horizon,
                     num_workers=args.num_predict_workers
                     if args.predict_local else args.num_workers)
logger.info("Prediction ends")
yhat = yhat["prediction"]
target_value = dict({"y": target_data})

# evaluate with prediction results
from bigdl.orca.automl.metrics import Evaluator
evaluate_mse = Evaluator.evaluate("mse", target_data, yhat)

# You can also evaluate directly without prediction results.
mse, smape = model.evaluate(target_value=target_value,
                            metric=['mse', 'smape'],
                            num_workers=args.num_predict_workers
                            if args.predict_local else args.num_workers)
print(f"Evaluation results:\nmse: {mse}, \nsmape: {smape}")
logger.info("Evaluation ends")

# incremental fitting
logger.info("Start fit incremental")
model.fit_incremental({'y': target_data})
logger.info("Start evaluation after fit incremental")
incr_target_value = dict({"y": incr_target_data})
mse, smape = model.evaluate(target_value=incr_target_value,
                            metric=['mse', 'smape'],
                            num_workers=args.num_predict_workers
tsdata_train, tsdata_test = get_tsdata()
x_train, y_train = tsdata_train.to_numpy()
x_test, y_test = tsdata_test.to_numpy()

forecaster = Seq2SeqForecaster(past_seq_len=100,
                               future_seq_len=10,
                               input_feature_num=x_train.shape[-1],
                               output_feature_num=2,
                               metrics=['mse'],
                               distributed=True,
                               workers_per_node=args.workers_per_node,
                               seed=0)
forecaster.fit((x_train, y_train), epochs=args.epochs, batch_size=512)
yhat = forecaster.predict(x_test)
unscale_yhat = tsdata_test.unscale_numpy(yhat)
unscale_y_test = tsdata_test.unscale_numpy(y_test)
rmse, smape = [Evaluator.evaluate(m,
                                  y_true=unscale_y_test,
                                  y_pred=unscale_yhat,
                                  multioutput='raw_values')
               for m in ['rmse', 'smape']]
print(f'rmse is: {np.mean(rmse)}')
print(f'smape is: {np.mean(smape):.4f}')
stop_orca_context()