Example 1
 def evaluate(self, x, y, metric=['mean_squared_error']):
     """
     Evaluate on x, y
     :param x: input
     :param y: target
     :param metric: a list of metrics in string format
     :return: a list of metric evaluation results
     """
     e = Evaluator()
     y_pred = self.predict(x)
     return [e.evaluate(m, y, y_pred) for m in metric]
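These snippets all delegate the actual scoring to an Evaluator class from zoo.automl.common.metrics, whose implementation is not shown here. As a rough, non-authoritative sketch of the calling convention they rely on (a metric name plus y_true/y_pred, optionally multioutput), a minimal dispatcher built on scikit-learn could look like the following; the metric table is an assumption, not the library's actual code:

    # Minimal sketch of an Evaluator-style dispatcher (illustration only; not the
    # real zoo.automl.common.metrics.Evaluator).
    import numpy as np
    from sklearn.metrics import mean_squared_error, r2_score

    _METRIC_FUNCS = {
        "mse": mean_squared_error,
        "mean_squared_error": mean_squared_error,
        "r2": r2_score,
        "r_square": r2_score,
    }

    def evaluate_metric(name, y_true, y_pred, multioutput="uniform_average"):
        # Look up the metric by name and delegate to scikit-learn.
        if name not in _METRIC_FUNCS:
            raise ValueError("Unsupported metric: {}".format(name))
        return _METRIC_FUNCS[name](np.asarray(y_true), np.asarray(y_pred),
                                   multioutput=multioutput)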
Example 2
 def evaluate(self, input_df, metric=None):
     """
     Evaluate the model on a list of metrics.
     :param input_df: The input time series data frame, Example:
      datetime     value   "extra feature 1"   "extra feature 2"
      2019-01-01   1.9     1                   2
      2019-01-02   2.3     0                   2
     :param metric: A list of strings. Available string values are "mean_squared_error"
                    and "r_square".
     :return: a list of metric evaluation results.
     """
     Evaluator.check_metric(metric)
     return self.pipeline.evaluate(input_df, metric)
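Evaluator.check_metric is only used here to reject unsupported metric names before evaluation. A hedged sketch of such a check, assuming a fixed whitelist (the list actually accepted by the library may differ):

    # Sketch of a metric-name check against an assumed whitelist.
    _SUPPORTED_METRICS = {"mean_squared_error", "r_square"}

    def check_metric(metrics):
        if not metrics:
            raise ValueError("metric cannot be None or empty")
        for name in metrics:
            if name not in _SUPPORTED_METRICS:
                raise ValueError("metric {} is not supported".format(name))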
Example 3
    def evaluate_with_onnx(self,
                           data,
                           metrics=['mse'],
                           multioutput="uniform_average",
                           batch_size=32):
        '''
        Evaluate the time series pipeline with onnx.

        :param data: data can be a TSDataset or a data creator (data creator is not yet supported).
               The TSDataset should follow the same operations as the training
               TSDataset used in AutoTSEstimator.fit.
        :param metrics: a list of evaluation metric names, e.g. ["mse"].
        :param multioutput: Defines aggregating of multiple output values.
               String in ['raw_values', 'uniform_average']. The value defaults to
               'uniform_average'.
        :param batch_size: prediction batch size. A smaller batch_size costs more
               time but less memory. The parameter is only effective when data is
               a TSDataset. The value defaults to 32.
        '''
        # predict with onnx
        x, y = self._tsdataset_to_numpy(data, is_predict=False)
        yhat = self._best_model.predict_with_onnx(x, batch_size=batch_size)
        # unscale predictions and targets before computing metrics
        yhat = self._tsdataset_unscale(yhat)
        y = self._tsdataset_unscale(y)
        # evaluate
        eval_result = [
            Evaluator.evaluate(m,
                               y_true=y,
                               y_pred=yhat,
                               multioutput=multioutput) for m in metrics
        ]
        return eval_result
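The multioutput argument follows scikit-learn's convention: 'raw_values' returns one error per output column, while 'uniform_average' averages them into a scalar. A small self-contained illustration, independent of the pipeline above:

    # 'raw_values' vs 'uniform_average' aggregation of a multi-output error.
    import numpy as np
    from sklearn.metrics import mean_squared_error

    y_true = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    y_pred = np.array([[1.1, 1.9], [2.9, 4.2], [5.2, 5.8]])

    per_column = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    averaged = mean_squared_error(y_true, y_pred, multioutput="uniform_average")
    print(per_column)  # one MSE per output column: [0.02 0.03]
    print(averaged)    # their uniform average: 0.025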
Example 4
    def evaluate(self, x, target, metrics=['mse']):
        """
        Evaluate on the prediction results and target. We predict `horizon` time-points
        ahead of the data used in fit_eval, where the horizon length equals the number
        of rows in target.
        :param x: We don't support input x currently.
        :param target: target for evaluation.
        :param metrics: a list of metrics in string format
        :return: a list of metric evaluation results
        """
        if x is not None:
            raise ValueError("We don't support input x currently")
        if target is None:
            raise ValueError("Input invalid target of None")
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling evaluate"
            )

        horizon = len(target)
        target = target[['y']]
        target_pred = self.predict(horizon=horizon)[['yhat']]
        return [
            Evaluator.evaluate(m, target.values, target_pred.values)
            for m in metrics
        ]
Example 5
 def evaluate(self, x, y, metric=['mse']):
     """
     Evaluate on x, y
     :param x: input
     :param y: target
     :param metric: a list of metrics in string format
     :return: a list of metric evaluation results
     """
     y_pred = self.predict(x)
     # y = np.squeeze(y, axis=2)
     if self.target_col_num == 1:
         return [Evaluator.evaluate(m, y, y_pred) for m in metric]
     else:
         return [np.array([Evaluator.evaluate(m, y[:, i, :], y_pred[:, i, :])
                           for i in range(self.future_seq_len)])
                 for m in metric]
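When there is more than one target column, the example above scores each future step separately by slicing the (samples, horizon, targets) arrays along axis 1. A standalone numpy sketch of that slicing pattern (shapes and data are made up for illustration):

    # Per-horizon-step scoring of a (samples, horizon, targets) prediction.
    import numpy as np
    from sklearn.metrics import mean_squared_error

    samples, horizon, n_targets = 8, 3, 2
    rng = np.random.default_rng(0)
    y = rng.normal(size=(samples, horizon, n_targets))
    y_hat = y + rng.normal(scale=0.1, size=y.shape)

    # One MSE per horizon step, each computed over all samples and target columns.
    per_step_mse = np.array([
        mean_squared_error(y[:, i, :], y_hat[:, i, :]) for i in range(horizon)
    ])
    print(per_step_mse.shape)  # (3,)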
Example 6
    def test_evaluate_predict_future_more_1(self):
        target_col = "values"
        metrics = ["mse", "r2"]
        future_seq_len = np.random.randint(2, 6)
        train_df, test_df, tsp, test_sample_num = self.get_input_tsp(
            future_seq_len, target_col)
        pipeline = tsp.fit(train_df, test_df)
        mse, rs = pipeline.evaluate(test_df, metrics=metrics)
        assert len(mse) == future_seq_len
        assert len(rs) == future_seq_len
        y_pred = pipeline.predict(test_df)
        assert y_pred.shape == (test_sample_num - default_past_seq_len + 1,
                                future_seq_len + 1)

        y_pred_df = pipeline.predict(test_df[:-future_seq_len])
        columns = [
            "{}_{}".format(target_col, i) for i in range(future_seq_len)
        ]
        y_pred_value = y_pred_df[columns].values

        y_df = test_df[default_past_seq_len:]
        y_value = TimeSequenceFeatureTransformer()._roll_test(
            y_df[target_col], future_seq_len)

        mse_pred_eval, rs_pred_eval = [
            Evaluator.evaluate(m, y_value, y_pred_value) for m in metrics
        ]
        mse_eval, rs_eval = pipeline.evaluate(test_df, metrics)
        assert_array_almost_equal(mse_pred_eval, mse_eval, decimal=2)
        assert_array_almost_equal(rs_pred_eval, rs_eval, decimal=2)
Example 7
    def evaluate(self, input_df, metrics=["mse"], multioutput='raw_values'):
        """
        Evaluate the pipeline.
        :param input_df: the input time series data frame
        :param metrics: subset of ['mean_squared_error', 'r_square', 'sMAPE']
        :param multioutput: string in ['raw_values', 'uniform_average']
                'raw_values' :
                    Returns a full set of errors in case of multioutput input.
                'uniform_average' :
                    Errors of all outputs are averaged with uniform weight.
        :return: a list of metric evaluation results
        """
        if isinstance(metrics, str):
            metrics = [metrics]
        # if not isinstance(metrics, list):
        #    raise ValueError("Expected metrics to be a list!")

        x, y = self.feature_transformers.transform(input_df, is_train=True)
        y_pred = self.model.predict(x)
        if y_pred.shape[1] == 1:
            multioutput = 'uniform_average'
        y_unscale, y_pred_unscale = self.feature_transformers.post_processing(
            input_df, y_pred, is_train=True)

        return [
            Evaluator.evaluate(m,
                               y_unscale,
                               y_pred_unscale,
                               multioutput=multioutput) for m in metrics
        ]
Example 8
    def evaluate(self,
                 data,
                 metrics=['mse'],
                 multioutput="uniform_average",
                 batch_size=32):
        '''
        Evaluate the time series pipeline.

        :param data: data can be a TSDataset or a data creator (data creator is not yet supported).
               The TSDataset should follow the same operations as the training
               TSDataset used in AutoTSEstimator.fit.
        :param metrics: a list of evaluation metric names, e.g. ["mse"].
        :param multioutput: Defines aggregating of multiple output values.
               String in ['raw_values', 'uniform_average']. The value defaults to
               'uniform_average'.
        :param batch_size: prediction batch size. A smaller batch_size costs more
               time but less memory. The parameter is only effective when data is
               a TSDataset. The value defaults to 32.
        '''
        _, y = self._tsdataset_to_numpy(data, is_predict=False)
        yhat = self.predict(data, batch_size=batch_size)
        if self._scaler:
            from zoo.chronos.data.utils.scale import unscale_timeseries_numpy
            y = unscale_timeseries_numpy(y, self._scaler, self._scaler_index)
        eval_result = [
            Evaluator.evaluate(m,
                               y_true=y,
                               y_pred=yhat[:y.shape[0]],
                               multioutput=multioutput) for m in metrics
        ]
        return eval_result
Example 9
    def evaluate(self, x, y, metrics=['mse']):
        """
        Evaluate on the prediction results and y. We predict horizon time-points ahead of the input x
        in fit_eval before evaluation, where the horizon length equals the second dimension size of
        y.
        :param x: We don't support input x currently.
        :param y: target. We interpret the second dimension of y as the horizon length for
            evaluation.
        :param metrics: a list of metrics in string format
        :return: a list of metric evaluation results
        """
        if x is None:
            raise ValueError("Input invalid x of None")
        if y is None:
            raise ValueError("Input invalid y of None")
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling predict"
            )

        if isinstance(y, pd.DataFrame):
            y = y.values
        self.model.n_jobs = self.n_jobs
        y_pred = self.predict(x)

        result_list = []
        for metric in metrics:
            if callable(metric):
                result_list.append(metric(y, y_pred))
            else:
                result_list.append(Evaluator.evaluate(metric, y, y_pred))
        return result_list
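Unlike most of the other variants, this one also accepts plain callables as metrics. A minimal sketch of that dual dispatch, decoupled from the model class (the name table is an assumption):

    # Sketch: evaluate with a mixed list of metric names and callables.
    import numpy as np
    from sklearn.metrics import mean_squared_error

    _NAMED_METRICS = {"mse": mean_squared_error}  # assumed name table

    def evaluate_all(metrics, y_true, y_pred):
        results = []
        for m in metrics:
            if callable(m):
                results.append(m(y_true, y_pred))                   # custom metric function
            else:
                results.append(_NAMED_METRICS[m](y_true, y_pred))   # metric given by name
        return results

    y = np.array([1.0, 2.0, 3.0])
    yhat = np.array([1.1, 1.9, 3.2])
    print(evaluate_all(["mse", lambda a, b: float(np.abs(a - b).max())], y, yhat))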
Example 10
        def train_func(config):
            # make a copy from global variables for trial to make changes
            global_ft = ray.get(ft_id)
            trial_ft = deepcopy(global_ft)
            if isinstance(model_create_func, ModelBuilder):
                trial_model = model_create_func.build(config)
            else:
                trial_model = model_create_func()

            imputer = None
            if "imputation" in config:
                if config["imputation"] == "LastFillImpute":
                    imputer = LastFillImpute()
                elif config["imputation"] == "FillZeroImpute":
                    imputer = FillZeroImpute()

            # handling input
            global_input_df = ray.get(input_df_id)
            trial_input_df = deepcopy(global_input_df)
            if imputer:
                trial_input_df = imputer.impute(trial_input_df)
            config = convert_bayes_configs(config).copy()
            # print("config is ", config)
            (x_train,
             y_train) = trial_ft.fit_transform(trial_input_df, **config)
            # trial_ft.fit(trial_input_df, **config)

            # handling validation data
            validation_data = None
            if is_val_df_valid:
                global_validation_df = ray.get(validation_df_id)
                trial_validation_df = deepcopy(global_validation_df)
                validation_data = trial_ft.transform(trial_validation_df)

            # no need to call build since it is called the first time fit_eval is called.
            # callbacks = [TuneCallback(tune_reporter)]
            # fit model
            best_reward_m = None
            # print("config:", config)
            for i in range(1, 101):
                result = trial_model.fit_eval(
                    x_train,
                    y_train,
                    validation_data=validation_data,
                    mc=mc,
                    metric=metric,
                    # verbose=1,
                    **config)
                reward_m = result if Evaluator.get_metric_mode(
                    metric) == "max" else -result
                ckpt_name = "best.ckpt"
                if best_reward_m is None or reward_m > best_reward_m:
                    best_reward_m = reward_m
                    save_zip(ckpt_name, trial_ft, trial_model, config)
                    if remote_dir is not None:
                        upload_ppl_hdfs(remote_dir, ckpt_name)

                tune.track.log(training_iteration=i,
                               reward_metric=reward_m,
                               checkpoint="best.ckpt")
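The trial reports a reward that the tuner always maximizes, so a metric whose mode is "min" (such as MSE) is negated first, as in reward_m above. A tiny framework-free sketch of that sign convention:

    # Sketch: turn a metric value into a reward that can always be maximized.
    def to_reward(value, mode):
        return value if mode == "max" else -value

    print(to_reward(0.05, "min"))   # -0.05: smaller MSE -> larger reward
    print(to_reward(0.93, "max"))   # 0.93: e.g. R^2 is already "larger is better"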
Example 11
    def evaluate(self, x=None, y=None, metrics=None, num_workers=None):
        """
        Evaluate on the prediction results and y. We predict horizon time-points ahead of the input x
        in fit_eval before evaluation, where the horizon length equals the second dimension size of
        y.
        :param x: We don't support input x currently.
        :param y: target. We interpret the second dimension of y as the horizon length for
            evaluation.
        :param metrics: a list of metrics in string format
        :param num_workers: the number of workers to use in evaluate. It defaults to 1.
        :return: a list of metric evaluation results
        """
        if x is not None:
            raise ValueError("We don't support input x directly.")
        if y is None:
            raise ValueError("Input invalid y of None")
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling predict"
            )
        if len(y.shape) == 1:
            y = np.expand_dims(y, axis=1)
            horizon = 1
        else:
            horizon = y.shape[1]
        result = self.predict(x=None, horizon=horizon, num_workers=num_workers)

        if y.shape[1] == 1:
            multioutput = 'uniform_average'
        else:
            multioutput = 'raw_values'
        return [
            Evaluator.evaluate(m, y, result, multioutput=multioutput)
            for m in metrics
        ]
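Here the horizon is inferred from the target itself: a 1-D y is treated as horizon 1, otherwise the second dimension is the horizon, and multioutput is chosen accordingly. A numpy-only sketch of that shape handling:

    # Sketch: infer horizon and multioutput mode from the target's shape.
    import numpy as np

    def infer_horizon_and_mode(y):
        y = np.asarray(y)
        if y.ndim == 1:
            y = np.expand_dims(y, axis=1)   # treat a flat target as horizon 1
        horizon = y.shape[1]
        multioutput = "uniform_average" if horizon == 1 else "raw_values"
        return y, horizon, multioutput

    _, horizon, mode = infer_horizon_and_mode(np.arange(5, dtype=float))
    print(horizon, mode)  # 1 uniform_average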
Example 12
 def _make_pipeline(self, analysis, feature_transformers, model,
                    remote_dir):
     metric = self.metric
     mode = Evaluator.get_metric_mode(metric)
     best_config = analysis.get_best_config(metric=metric, mode=mode)
     best_logdir = analysis.get_best_logdir(metric=metric, mode=mode)
     print("best log dir is ", best_logdir)
     dataframe = analysis.dataframe(metric=metric, mode=mode)
     # print(dataframe)
     model_path = os.path.join(best_logdir, dataframe["checkpoint"].iloc[0])
     config = convert_bayes_configs(best_config).copy()
     self._print_config(config)
     if remote_dir is not None:
         all_config = restore_hdfs(model_path,
                                   remote_dir,
                                   feature_transformers,
                                   model,
                                   # config)
                                   )
     else:
         all_config = restore_zip(model_path,
                                  feature_transformers,
                                  model,
                                  # config)
                                  )
     return TimeSequencePipeline(name=self.name,
                                 feature_transformers=feature_transformers,
                                 model=model,
                                 config=all_config)
Example 13
    def evaluate_with_onnx(self, x, y, metrics=['mse'], dirname=None, multioutput="raw_values"):
        # reshape 1dim input
        x = self._reshape_input(x)
        y = self._reshape_input(y)

        yhat = self.predict_with_onnx(x, dirname=dirname)
        eval_result = [Evaluator.evaluate(m, y_true=y, y_pred=yhat, multioutput=multioutput)
                       for m in metrics]
        return eval_result
 def evaluate(self, x, y, metric=['mse']):
     yhat = self.predict(x)
     eval_result = [
         Evaluator.evaluate(m,
                            y_true=y,
                            y_pred=yhat,
                            multioutput="raw_values") for m in metric
     ]
     return eval_result
Example 15
 def evaluate(self, x, y, metrics=['mse']):
     """
     Evaluate on x, y
     :param x: input
     :param y: target
     :param metrics: a list of metrics in string format
     :return: a list of metric evaluation results
     """
     y_pred = self.predict(x)
     return [Evaluator.evaluate(m, y, y_pred) for m in metrics]
 def _validate(self, x, y, metric):
     self.model.eval()
     with torch.no_grad():
         yhat = self.model(x)
         val_loss = self.criterion(yhat, y)
         eval_result = Evaluator.evaluate(metric=metric,
                                          y_true=y.numpy(),
                                          y_pred=yhat.numpy(),
                                          multioutput='uniform_average')
     return {"val_loss": val_loss.item(), metric: eval_result}
Example 17
    def evaluate(self,
                 x=None,
                 y=None,
                 metrics=None,
                 target_covariates=None,
                 target_dti=None,
                 num_workers=None):
        """
        Evaluate on the prediction results and y. We predict horizon time-points ahead of the input x
        in fit_eval before evaluation, where the horizon length equals the second dimension size of
        y.
        :param x: We don't support input x currently.
        :param y: target. We interpret the second dimension of y as the horizon length for
            evaluation.
        :param metrics: a list of metrics in string format
        :param target_covariates: covariates corresponding to target_value.
            2-D ndarray or None.
            The shape of ndarray should be (r, horizon), where r is the number of covariates.
            Global covariates for all time series. If None, only default time covariates will be
            used while use_time is True. Otherwise, the time covariates used are the stack of the
            input covariates and the default time covariates.
        :param target_dti: dti corresponding to target_value.
            DatetimeIndex or None.
            If None, use default fixed frequency DatetimeIndex generated with the last date of x in
            fit and freq.
        :param num_workers: the number of workers to use in evaluate. It defaults to 1.
        :return: a list of metric evaluation results
        """
        if x is not None:
            raise ValueError("We don't support input x directly.")
        if y is None:
            raise ValueError("Input invalid y of None")
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling predict"
            )
        if len(y.shape) == 1:
            y = np.expand_dims(y, axis=1)
            horizon = 1
        else:
            horizon = y.shape[1]
        result = self.predict(x=None,
                              horizon=horizon,
                              future_covariates=target_covariates,
                              future_dti=target_dti,
                              num_workers=num_workers)

        if y.shape[1] == 1:
            multioutput = 'uniform_average'
        else:
            multioutput = 'raw_values'
        return [
            Evaluator.evaluate(m, y, result, multioutput=multioutput)
            for m in metrics
        ]
Example 18
    def _detach_recipe(self, recipe):
        self.search_space = recipe.search_space()

        stop = recipe.runtime_params()
        self.metric_threshold = None
        if "reward_metric" in stop.keys():
            self.mode = Evaluator.get_metric_mode(self.metric)
            self.metric_threshold = -stop["reward_metric"] if \
                self.mode == "min" else stop["reward_metric"]
        self.epochs = stop["training_iteration"]
        self.num_samples = stop["num_samples"]
    def evaluate(self, x, y, metrics=['mse']):
        # reshape 1dim input
        x = self._reshape_input(x)
        y = self._reshape_input(y)

        yhat = self.predict(x)
        eval_result = [
            Evaluator.evaluate(m,
                               y_true=y,
                               y_pred=yhat,
                               multioutput="raw_values") for m in metrics
        ]
        return eval_result
Example 20
 def _validate_metric_mode(metric, mode):
     from zoo.automl.common.metrics import Evaluator
     if not mode:
         try:
             mode = Evaluator.get_metric_mode(metric)
         except ValueError:
             pass
     if not mode:
         raise ValueError(f"We cannot infer metric mode with metric name of {metric}. "
                          f"Please specify the `metric_mode` parameter in AutoEstimator.fit().")
     if mode not in ["min", "max"]:
         raise ValueError("`mode` has to be one of ['min', 'max']")
     return mode
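Evaluator.get_metric_mode maps a metric name to whether larger or smaller values are better, which is what the fallback above relies on. A hedged sketch of such a mapping; the exact table used by the library is not reproduced here:

    # Assumed metric-name -> optimisation-mode table (illustration only).
    _METRIC_MODES = {
        "mse": "min",
        "mean_squared_error": "min",
        "mae": "min",
        "r2": "max",
        "r_square": "max",
    }

    def get_metric_mode(metric):
        try:
            return _METRIC_MODES[metric]
        except KeyError:
            raise ValueError("Cannot infer mode for metric {!r}".format(metric))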
Example 21
 def _train(self):
     # print("self.config in train is ", self.config)
     result = self.trial_model.fit_eval(self.x_train, self.y_train,
                                        validation_data=self.validation_data,
                                        # verbose=1,
                                        **self.config)
     self.reward_m = result if Evaluator.get_metric_mode(metric) == "max" else -result
     # if metric == "mean_squared_error":
     #     self.reward_m = (-1) * result
     #     # print("running iteration: ",i)
     # elif metric == "r_square":
     #     self.reward_m = result
     # else:
     #     raise ValueError("metric can only be \"mean_squared_error\" or \"r_square\"")
     return {"reward_metric": self.reward_m, "checkpoint": self.ckpt_name}
Example 22
 def _validate_metric_mode(metric, mode):
     if not mode:
         if callable(metric):
             raise ValueError("You must specify `metric_mode` for your metric function")
         try:
             from zoo.automl.common.metrics import Evaluator
             mode = Evaluator.get_metric_mode(metric)
         except ValueError:
             pass
         if not mode:
             raise ValueError(f"We cannot infer metric mode with metric name of {metric}. Please"
                              f" specify the `metric_mode` parameter in AutoEstimator.fit().")
     if mode not in ["min", "max"]:
         raise ValueError("`mode` has to be one of ['min', 'max']")
     return mode
    def _check_input(self, input_df, validation_df, metric):
        input_is_list = self._check_input_format(input_df)
        if not input_is_list:
            self._check_missing_col(input_df)
            if validation_df is not None:
                self._check_missing_col(validation_df)
        else:
            for d in input_df:
                self._check_missing_col(d)
            if validation_df is not None:
                for val_d in validation_df:
                    self._check_missing_col(val_d)

        if not Evaluator.check_metric(metric):
            raise ValueError("metric " + metric + " is not supported")
Example 24
 def evaluate(self, x, y, metrics=['mse']):
     """
     Evaluate on x, y
     :param x: input
     :param y: target
     :param metrics: a list of metrics in string format
     :return: a list of metric evaluation results
     """
     y_pred = self.predict(x)
     if y_pred.shape[1] == 1:
         multioutput = 'uniform_average'
     else:
         multioutput = 'raw_values'
     # y = np.squeeze(y, axis=2)
     return [Evaluator.evaluate(m, y, y_pred, multioutput=multioutput) for m in metrics]
Example 25
 def _validate(self, validation_loader, metric):
     self.model.eval()
     with torch.no_grad():
         yhat_list = []
         y_list = []
         for x_valid_batch, y_valid_batch in validation_loader:
             yhat_list.append(self.model(x_valid_batch).numpy())
             y_list.append(y_valid_batch.numpy())
         yhat = np.concatenate(yhat_list, axis=0)
         y = np.concatenate(y_list, axis=0)
     # val_loss = self.criterion(yhat, y)
     eval_result = Evaluator.evaluate(metric=metric,
                                      y_true=y,
                                      y_pred=yhat,
                                      multioutput='uniform_average')
     return {metric: eval_result}
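The validation helper runs the model batch by batch under torch.no_grad() and concatenates the per-batch outputs before scoring them once. A self-contained sketch of that loop with a toy linear model and random data (both are placeholders, not the pipeline's actual model):

    # Self-contained sketch of batch-wise validation in PyTorch (toy model and data).
    import numpy as np
    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset
    from sklearn.metrics import mean_squared_error

    model = nn.Linear(4, 1)                     # placeholder model
    x = torch.randn(64, 4)
    y = torch.randn(64, 1)
    loader = DataLoader(TensorDataset(x, y), batch_size=16)

    model.eval()
    with torch.no_grad():
        yhat_batches, y_batches = [], []
        for xb, yb in loader:
            yhat_batches.append(model(xb).numpy())
            y_batches.append(yb.numpy())
    yhat = np.concatenate(yhat_batches, axis=0)
    y_true = np.concatenate(y_batches, axis=0)
    print(mean_squared_error(y_true, yhat))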
Example 26
 def evaluate(self, df, metric=['mse']):
     """
     Evaluate on the input data frame.
     :param df: the input data frame
     :param metric: a list of metrics in string format
     :return: a list of metric evaluation results
     """
     if isinstance(metric, str):
         metric = [metric]
     x, y = self._process_data(df, mode="val")
     y_pred = self.model.predict(x)
     y_unscale, y_pred_unscale = self.ft.post_processing(df, y_pred, is_train=True)
     if len(y_pred.shape) > 1 and y_pred.shape[1] == 1:
         multioutput = 'uniform_average'
     else:
         multioutput = 'raw_values'
     return [Evaluator.evaluate(m, y_unscale, y_pred_unscale, multioutput=multioutput)
             for m in metric]
Example 27
 def _validate(self, validation_loader, metric_name, metric_func=None):
     if not metric_name:
         assert metric_func, "You must input valid metric_func or metric_name"
         metric_name = metric_func.__name__
     self.model.eval()
     with torch.no_grad():
         yhat_list = []
         y_list = []
         for x_valid_batch, y_valid_batch in validation_loader:
             yhat_list.append(self.model(x_valid_batch).numpy())
             y_list.append(y_valid_batch.numpy())
         yhat = np.concatenate(yhat_list, axis=0)
         y = np.concatenate(y_list, axis=0)
     # val_loss = self.criterion(yhat, y)
     if metric_func:
         eval_result = metric_func(y, yhat)
     else:
         eval_result = Evaluator.evaluate(metric=metric_name,
                                          y_true=y, y_pred=yhat,
                                          multioutput='uniform_average')
     return {metric_name: eval_result}
        def train_func(config):
            train_data = ray.get(data_id)
            val_data = ray.get(validation_data_id)
            config = convert_bayes_configs(config).copy()
            if not isinstance(model_builder, ModelBuilder):
                raise ValueError(f"You must input a ModelBuilder instance for model_builder")
            trial_model = model_builder.build(config)

            # no need to call build since it is called the first time fit_eval is called.
            # callbacks = [TuneCallback(tune_reporter)]
            # fit model
            best_reward = None
            for i in range(1, 101):
                result = trial_model.fit_eval(data=train_data,
                                              validation_data=val_data,
                                              mc=mc,
                                              metric=metric,
                                              **config)
                reward = result
                checkpoint_filename = "best.ckpt"

                # Save best reward iteration
                mode = Evaluator.get_metric_mode(metric)
                if mode == "max":
                    has_best_reward = best_reward is None or reward > best_reward
                else:
                    has_best_reward = best_reward is None or reward < best_reward

                if has_best_reward:
                    best_reward = reward
                    trial_model.save(checkpoint_filename)
                    # Save to hdfs
                    if remote_dir is not None:
                        put_ckpt_hdfs(remote_dir, checkpoint_filename)

                report_dict = {"training_iteration": i,
                               metric: reward,
                               "checkpoint": checkpoint_filename,
                               "best_" + metric: best_reward}
                tune.report(**report_dict)
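Both training loops above only keep the checkpoint from the iteration with the best reward so far, where "best" depends on the metric mode. A framework-free sketch of that bookkeeping (no Ray Tune or checkpointing involved):

    # Sketch: track the best score across iterations under a min/max metric mode.
    def is_better(score, best, mode):
        if best is None:
            return True
        return score > best if mode == "max" else score < best

    best = None
    for i, score in enumerate([0.30, 0.22, 0.25, 0.19], start=1):  # e.g. validation MSE
        if is_better(score, best, mode="min"):
            best = score
            # the real code would save the model checkpoint here, e.g. "best.ckpt"
    print(best)  # 0.19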
Example 29
    def evaluate(self, x, y, metrics=['mse']):
        """
        Evaluate on the prediction results and y. We predict horizon time-points ahead of the input x
        in fit_eval before evaluation, where the horizon length equals the second dimension size of
        y.
        :param x: input
        :param y: target. We interpret the second dimension of y as the horizon length for
            evaluation.
        :param metrics: a list of metrics in string format
        :return: a list of metric evaluation results
        """
        if x is None:
            raise ValueError("Input invalid x of None")
        if y is None:
            raise ValueError("Input invalid y of None")
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling predict"
            )

        y_pred = self.predict(x)
        return [Evaluator.evaluate(m, y, y_pred) for m in metrics]
Example 30
    def evaluate(self, target, data=None, metrics=['mse']):
        """
        Evaluate on the prediction results. We predict `horizon` time-points ahead of the data used
        in fit_eval, where the horizon length equals the number of rows in target.
        :param target: target for evaluation.
        :param data: Prophet predicts the horizon steps forward from the training data,
            so data should be None as it is not used.
        :param metrics: a list of metrics in string format
        :return: a list of metric evaluation results
        """
        if data is not None:
            raise ValueError("We don't support input data currently")
        if target is None:
            raise ValueError("Input invalid target of None")
        if self.model is None:
            raise Exception("Needs to call fit_eval or restore first before calling evaluate")

        horizon = len(target)
        target_pred = self.predict(horizon=horizon)[['yhat']]
        return [Evaluator.evaluate(m, target[['y']].values, target_pred.values) for m in metrics]
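The Prophet-style evaluate compares the target frame's y column against the yhat column of the horizon-length forecast. A pandas-only sketch of that alignment (no Prophet model involved; the y/yhat column names follow Prophet's convention and the numbers are made up):

    # Sketch: score ground-truth y against a horizon-length yhat forecast.
    import pandas as pd
    from sklearn.metrics import mean_squared_error

    target = pd.DataFrame({"y": [1.0, 1.2, 1.1, 1.4]})       # ground truth to evaluate on
    forecast = pd.DataFrame({"yhat": [0.9, 1.3, 1.0, 1.5]})  # horizon-length prediction
    horizon = len(target)

    mse = mean_squared_error(target[["y"]].values, forecast[["yhat"]].values[:horizon])
    print(mse)  # 0.01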