import numpy as np
from pmdarima import model_selection


def cross_val(endog_series, exog_series, model1, model2, model3):
    cv = model_selection.RollingForecastCV(step=5, h=14, initial=160)
    model1_cv_scores = model_selection.cross_val_score(
        model1, y=endog_series, exogenous=exog_series,
        scoring='mean_absolute_error', cv=cv)
    model2_cv_scores = model_selection.cross_val_score(
        model2, y=endog_series, exogenous=exog_series,
        scoring='mean_absolute_error', cv=cv)
    model3_cv_scores = model_selection.cross_val_score(
        model3, y=endog_series, exogenous=exog_series,
        scoring='mean_absolute_error', cv=cv)

    # Drop folds that failed to fit (their scores come back as NaN)
    model1_cv_scores = model1_cv_scores[~np.isnan(model1_cv_scores)]
    model2_cv_scores = model2_cv_scores[~np.isnan(model2_cv_scores)]
    model3_cv_scores = model3_cv_scores[~np.isnan(model3_cv_scores)]

    # Print the per-fold scores for each model
    model1_cv_scoreslist = ["%.4f" % elem for elem in model1_cv_scores]
    model2_cv_scoreslist = ["%.4f" % elem for elem in model2_cv_scores]
    model3_cv_scoreslist = ["%.4f" % elem for elem in model3_cv_scores]
    print("Model 1 CV scores: {}".format(model1_cv_scoreslist))
    print("Model 2 CV scores: {}".format(model2_cv_scoreslist))
    print("Model 3 CV scores: {}".format(model3_cv_scoreslist))

    # Pick the model with the lowest average error
    m1_average_error = np.average(model1_cv_scores)
    m2_average_error = np.average(model2_cv_scores)
    m3_average_error = np.average(model3_cv_scores)
    errors = [m1_average_error, m2_average_error, m3_average_error]
    models = [model1, model2, model3]

    # Report the best model (minimum error, ignoring NaNs); fall back to
    # model 1 if every average is NaN
    if np.isnan(np.nanmin(errors)):
        better_index = 0
    else:
        better_index = errors.index(np.nanmin(errors))
    best_order = models[better_index].order
    print("Lowest average MAE: {} (model{})".format(errors[better_index],
                                                    better_index + 1))
    print("Best model order: {}".format(best_order))
    return best_order
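# A minimal usage sketch for cross_val above (not from the original source).
# It assumes a pmdarima version that still accepts the `exogenous` keyword
# (renamed to `X` in pmdarima 1.8+), uses the wineind dataset as the
# endogenous series, and a simple trend column as a stand-in exogenous
# regressor. All names below are illustrative.
import numpy as np
import pmdarima as pm

y = pm.datasets.load_wineind()
X = np.arange(len(y)).reshape(-1, 1)  # hypothetical exogenous feature
m1 = pm.ARIMA(order=(1, 1, 1))
m2 = pm.ARIMA(order=(2, 1, 0))
m3 = pm.ARIMA(order=(0, 1, 2))
best_order = cross_val(y, X, m1, m2, m3)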
import warnings

import numpy as np
from pmdarima.model_selection import cross_val_score
from statsmodels.tools.sm_exceptions import ConvergenceWarning


def _cross_validate_single_model(model, group_series, metrics,
                                 cross_validator, error_score, exog,
                                 verbosity):
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        warnings.filterwarnings("ignore", category=ConvergenceWarning)

        # Wrap a lone metric name in a list; list("smape") would split the
        # string into single characters
        if isinstance(metrics, str):
            metrics = [metrics]

        output = {}
        for metric in metrics:
            scores = cross_val_score(
                estimator=model,
                y=group_series,
                X=exog,
                scoring=metric,
                cv=cross_validator,
                verbose=verbosity,
                error_score=error_score,
            )
            output[f"{metric}_mean"] = np.mean(scores)
            output[f"{metric}_stddev"] = np.std(scores)
        return output
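# Hypothetical driver for _cross_validate_single_model above: one ARIMA
# candidate scored on two metrics over a sliding window. The data, model
# order, and splitter settings are illustrative only.
import numpy as np
import pmdarima as pm
from pmdarima import model_selection

series = pm.datasets.load_wineind()
candidate = pm.ARIMA(order=(1, 1, 1))
splitter = model_selection.SlidingWindowForecastCV(window_size=100,
                                                   step=12, h=4)
summary = _cross_validate_single_model(
    candidate, series, metrics=['smape', 'mean_absolute_error'],
    cross_validator=splitter, error_score=np.nan, exog=None, verbosity=0)
print(summary)  # e.g. {'smape_mean': ..., 'smape_stddev': ..., ...}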
print("pmdarima version: %s" % pm.__version__) # Load the data and split it into separate pieces data = pm.datasets.load_wineind() train, test = model_selection.train_test_split(data, train_size=165) # Even though we have a dedicated train/test split, we can (and should) still # use cross-validation on our training set to get a good estimate of the model # performance. We can choose which model is better based on how it performs # over various folds. model1 = pm.ARIMA(order=(2, 1, 1), seasonal_order=(0, 0, 0, 1)) model2 = pm.ARIMA(order=(1, 1, 2), seasonal_order=(0, 1, 1, 12)) cv = model_selection.SlidingWindowForecastCV(window_size=100, step=24, h=1) model1_cv_scores = model_selection.cross_val_score( model1, train, scoring='smape', cv=cv, verbose=2) model2_cv_scores = model_selection.cross_val_score( model2, train, scoring='smape', cv=cv, verbose=2) print("Model 1 CV scores: {}".format(model1_cv_scores.tolist())) print("Model 2 CV scores: {}".format(model2_cv_scores.tolist())) # Pick based on which has a lower mean error rate m1_average_error = np.average(model1_cv_scores) m2_average_error = np.average(model2_cv_scores) errors = [m1_average_error, m2_average_error] models = [model1, model2] # print out the answer better_index = np.argmin(errors) # type: int
import datetime

import numpy as np
import pmdarima as pm
from pmdarima.model_selection import cross_val_score


# NOTE: the original snippet began mid-docstring; the function name and
# signature below are reconstructed from the body and are hypothetical.
def predict_race_time(name, df, n_forecasts=1):
    """Forecast a runner's next race time.

    Arguments:
        name {str} -- runner name to match (case-insensitive)
        df {pd.DataFrame} -- DataFrame of races

    Keyword Arguments:
        n_forecasts {int} -- number of steps ahead (default: {1})

    Returns:
        str -- The prediction, confidence intervals, and average cross
        validation error of the next race time
    """
    runner_df = df.loc[df.name.str.contains(name, case=False)]
    if runner_df.empty:
        return

    minutes = runner_df.time.dt.seconds / 60.0
    model = pm.auto_arima(minutes, seasonal=False, suppress_warnings=True)
    pred, conf_int = model.predict(n_forecasts, return_conf_int=True)
    cv_score = cross_val_score(model, minutes, scoring='mean_absolute_error')
    mean_cv_score = np.mean(cv_score)

    def formatter(num: float) -> str:
        # Render minutes as H:MM:SS, dropping fractional seconds
        return str(datetime.timedelta(minutes=num)).split('.')[0]

    pred_format = formatter(pred[0])
    conf_int_format = [formatter(x) for x in conf_int[0]]
    mean_cv_score_format = formatter(mean_cv_score)
    pred_string = (f'Results for {runner_df.name.unique()[0]}\n'
                   f'The prediction for the next 42 km race'
                   f' is {pred_format} with 95 % confidence'
                   f' interval ({conf_int_format[0]},'
                   f' {conf_int_format[1]})\n'
                   f'The average cross validation error score is'
                   f' {mean_cv_score_format}.\n')
    return pred_string
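# Hypothetical call to the reconstructed function above, on a toy race log.
# Assumes a pmdarima version whose predict() returns a plain numpy array,
# as the pred[0] indexing in the function expects. The data is illustrative.
import pandas as pd

races = pd.DataFrame({
    'name': ['Jane Doe'] * 12,
    'time': pd.to_timedelta(['%d min' % (185 - i) for i in range(12)]),
})
print(predict_race_time('jane', races))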
import numpy as np
from pmdarima import model_selection


def calculate_cv_metrics(model, endog, metric, cv):
    # Average the per-fold scores, skipping folds that failed to fit (NaN)
    cv_metric = model_selection.cross_val_score(model, endog, cv=cv,
                                                scoring=metric, verbose=0)
    return cv_metric[~np.isnan(cv_metric)].mean()
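# Minimal usage sketch for calculate_cv_metrics; the series, model order,
# and splitter settings below are illustrative only.
import pmdarima as pm
from pmdarima import model_selection

y = pm.datasets.load_wineind()
arima = pm.ARIMA(order=(1, 1, 1), seasonal_order=(0, 1, 1, 12))
cv_splitter = model_selection.RollingForecastCV(initial=120, step=12, h=12)
print(calculate_cv_metrics(arima, y, 'smape', cv_splitter))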