def build_ff_model():
    """Train a simple feed-forward forecaster on the AMZN tweet-volume series.

    Reads the CSV from the NAB repository, trains on the observations up to
    "2015-04-05 00:00:00", then plots the last 100 training points together
    with the forecast, passing ``output_file='ff-model.png'`` to the forecast
    plot.
    """
    frame = pd.read_csv(
        "https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv",
        header=0,
        index_col=0)

    # GluonTS consumes one dict per series:
    #   start  - first index value of the dataframe
    #   target - the values to model
    # The sampling frequency is given once for the whole dataset.
    series_entry = {
        "start": frame.index[0],
        "target": frame.value[:"2015-04-05 00:00:00"],
    }
    train_data = common.ListDataset([series_entry], freq="5min")

    # The Trainer instance customises how the Estimator is fitted.
    trainer = Trainer(ctx="cpu", epochs=100, learning_rate=1e-3)
    estimator = simple_feedforward.SimpleFeedForwardEstimator(
        freq="5min", prediction_length=100, trainer=trainer)

    # Fitting the Estimator yields a Predictor.
    predictor = estimator.train(training_data=train_data)

    # Walk over (training series, forecast) pairs for the forecast horizon.
    for history, forecast in zip(train_data, predictor.predict(train_data)):
        # only the last 100 timestamps of the training data are shown
        to_pandas(history)[-100:].plot()
        forecast.plot(output_file='ff-model.png', color='r')
def prepare_output(self, forecasts, horizon):
    """Assemble aligned date, target and prediction lists for output.

    Only the last ``horizon`` forecasts are kept. The prediction list is
    padded with NaN for every training timestamp and then extended with the
    kept forecasts.

    For ``self.type == 'deepar'`` the first entry of ``self.y_train`` and
    ``self.y_test`` is converted with ``to_pandas`` and dates/targets come
    from the test series only. Otherwise dates/targets are the training
    values followed by the test values; for ``'prophet'`` any non-Series
    ``y_train``/``y_test`` is first replaced (on ``self``) by the first
    column of the frame indexed by ``'ds'``.

    Returns:
        Tuple ``(date, target, prediction)`` of plain lists.
    """
    kept_forecasts = forecasts[-horizon:]

    if self.type == 'deepar':
        # The datasets are iterables of entries; take the first one of each.
        train_series = to_pandas(next(iter(self.y_train)))
        test_series = to_pandas(next(iter(self.y_test)))
        date = test_series.index.astype(str).tolist()
        target = test_series.tolist()
        n_padding = len(train_series.index)
    else:
        if self.type == 'prophet':
            if not isinstance(self.y_train, pd.Series):
                self.y_train = self.y_train.set_index('ds').iloc[:, 0]
            if not isinstance(self.y_test, pd.Series):
                self.y_test = self.y_test.set_index('ds').iloc[:, 0]
        date = self.y_train.index.astype(str).tolist()
        date += self.y_test.index.astype(str).tolist()
        target = self.y_train.tolist()
        target += self.y_test.tolist()
        n_padding = len(self.y_train.index)

    # No prediction exists for the training span, hence the NaN padding.
    prediction = np.full(n_padding, np.nan).tolist()
    prediction.extend(kept_forecasts)
    return date, target, prediction
def build_deepar_model():
    """Train DeepAR on the "exchange_rate" dataset and plot a forecast.

    Saves a plot of the first training series to ``dataset.png`` and a plot
    of the first test series with its forecast to ``deepar-model.png``.
    """
    gluon_data = get_dataset("exchange_rate", regenerate=True)
    first_train = next(iter(gluon_data.train))
    first_test = next(iter(gluon_data.test))
    meta_data = gluon_data.metadata

    # Plot the first training series.
    fig, ax = plt.subplots(1, 1, figsize=(11, 8))
    to_pandas(first_train).plot(ax=ax)
    ax.grid(which="both")
    ax.legend(["train data"], loc="upper left")
    plt.savefig("dataset.png")

    # Show what the dataset entries and the metadata contain.
    print(first_train.keys())
    print(first_test.keys())
    print(meta_data)

    # Wrap both splits in ListDataset objects.
    training_data = common.ListDataset(gluon_data.train, freq=meta_data.freq)
    testing_data = common.ListDataset(gluon_data.test, freq=meta_data.freq)

    # The Trainer instance customises how the Estimator is fitted.
    trainer = Trainer(ctx="cpu", epochs=100, learning_rate=1e-4)
    estimator = deepar.DeepAREstimator(
        freq=meta_data.freq,
        prediction_length=meta_data.prediction_length,
        trainer=trainer)

    # Fitting the Estimator yields a Predictor.
    predictor = estimator.train(training_data=training_data)

    forecasts, test_series = make_evaluation_predictions(
        dataset=testing_data, predictor=predictor, num_samples=10)

    # Legend entries: the interval labels are listed in reverse order.
    prediction_intervals = (50.0, 90.0)
    interval_labels = [
        f"{k}% forecast interval" for k in reversed(prediction_intervals)
    ]
    legend = ["actual data", "median forecast"] + interval_labels

    fig, ax = plt.subplots(1, 1, figsize=(11, 8))
    first_actual = list(test_series)[0]
    first_actual[-150:].plot(ax=ax)  # last 150 points of the series
    first_forecast = list(forecasts)[0]
    first_forecast.plot(prediction_intervals=prediction_intervals, color='r')
    plt.grid(which="both")
    plt.legend(legend, loc="upper left")
    plt.savefig("deepar-model.png")
def predict(self, dataset: Iterable[Dict]) -> Iterator[SampleForecast]:
    """Yield one forecast per dataset entry.

    For each entry a placeholder series of ``self.prediction_length`` ``None``
    values is built, starting one frequency step after the entry's last
    timestamp. It is turned into a dataframe by ``get_prediction_dataframe``
    and passed to ``self.ag_model.predict``.
    """
    for entry in dataset:
        history = to_pandas(entry)
        # First timestamp after the observed data.
        next_step = history.index[-1] + pd.tseries.frequencies.to_offset(
            self.freq)
        start_timestamp = pd.Timestamp(next_step, freq=self.freq)

        placeholder_entry = {
            "start": start_timestamp,
            "target": np.array([None] * self.prediction_length),
        }
        features = get_prediction_dataframe(to_pandas(placeholder_entry))
        ag_output = self.ag_model.predict(features)

        yield self.to_forecast(
            ag_output,
            start_timestamp,
            entry.get(FieldName.ITEM_ID, None),
        )
def get_dataset(**kw):
    """Load a table from disk and wrap its first rows in a ``ListDataset``.

    Keyword arguments read by this function:
        train: truthy selects ``train_data_path``, falsy ``test_data_path``.
        train_data_path / test_data_path: location of the data file.
        uri_type: ``"pickle"`` loads with ``pd.read_pickle``; anything else
            (or absent) loads with ``pd.read_csv``.
        num_series: number of leading rows, each used as one series target.
        start: start value shared by every series.
        freq: frequency passed to ``ListDataset``.
        save_fig: figure path; only read when the module-level ``VERBOSE``
            flag is truthy.

    Returns:
        The constructed ``ListDataset``.
    """
    # Choose the path for the requested split.
    path_key = 'train_data_path' if kw['train'] else 'test_data_path'
    data_path = kw[path_key]

    # NOTE(review): read_pickle deserialises pickled objects - presumably
    # these paths are trusted; confirm against callers.
    if kw.get("uri_type") == "pickle":
        data_set = pd.read_pickle(data_path)
    else:
        data_set = pd.read_csv(data_path)

    # One GluonTS entry per row of the table.
    entries = []
    for row in range(kw['num_series']):
        entries.append({
            FieldName.TARGET: data_set.iloc[row].values,
            FieldName.START: kw['start'],
        })
    gluonts_ds = ListDataset(entries, freq=kw['freq'])

    if VERBOSE:
        # Plot the first series and save the figure.
        first_series = to_pandas(next(iter(gluonts_ds)))
        first_series.plot()
        plt.savefig(kw['save_fig'])

    return gluonts_ds
def run_example():
    """Fit a TabularEstimator on five electricity series and plot forecasts."""
    dataset = get_dataset("electricity")

    estimator = TabularEstimator(
        freq="H",
        prediction_length=24,
        time_limits=2 * 60,  # two minutes for training
        # makes prediction faster, but potentially less accurate
        disable_auto_regression=True,
    )

    # Only the first few training series are used.
    n_train = 5
    training_data = list(islice(dataset.train, n_train))

    predictor = estimator.train(training_data=training_data)
    forecasts = list(predictor.predict(training_data))

    for entry, forecast in zip(training_data, forecasts):
        history = to_pandas(entry)
        lookback = 7 * predictor.prediction_length
        plt.figure()
        plt.plot(history[-lookback:], label="target")
        forecast.plot()
        plt.show()
def _predict_batch_autoreg(self, dataset: Iterable[Dict],
                           **kwargs) -> Iterator[SampleForecast]:
    """Forecast a batch of series step by step, feeding predictions back.

    Every entry is scaled with ``self.scaling`` and extended by
    ``self.prediction_length`` empty slots. For each forecast step a single
    feature row per series is built, all rows are predicted in one call to
    ``self.ag_model.predict``, and the outputs are written back into the
    extended series before the next step's features are built.

    Yields:
        One forecast per input entry, in input order, with the scaling
        factor multiplied back in.
    """
    # TODO clean up
    # TODO optimize
    batch_ids = []
    batch_scales = []
    batch_series = []
    for entry in dataset:
        batch_ids.append(entry.get(FieldName.ITEM_ID, None))
        series, scale = self.scaling(to_pandas(entry))
        batch_scales.append(scale)
        batch_series.append(series)

    # Timestamps to be forecast, one index per series.
    batch_forecast_indices = [
        pd.date_range(
            series.index[-1] + series.index.freq,
            freq=series.index.freq,
            periods=self.prediction_length,
        ) for series in batch_series
    ]

    # Observed values followed by empty slots for the forecast horizon.
    # pd.concat replaces Series.append, which pandas deprecated in 1.4 and
    # removed in 2.0.
    batch_full_series = [
        pd.concat([
            series,
            pd.Series(
                [None] * self.prediction_length,
                index=forecast_index,
            ),
        ]) for series, forecast_index in zip(batch_series,
                                             batch_forecast_indices)
    ]

    output = np.zeros((len(batch_series), self.prediction_length),
                      dtype=self.dtype)

    for k in range(self.prediction_length):
        dfs = []
        for fs, idx in zip(batch_full_series, batch_forecast_indices):
            idx_k = idx[k]
            dfs.append(
                get_features_dataframe(
                    fs[idx_k:idx_k],  # the single row being predicted
                    time_features=self.time_features,
                    lag_indices=self.lag_indices,
                    past_data=fs[:idx_k][:-1],  # everything before idx_k
                ))
        df = pd.concat(dfs)
        out_k = self.ag_model.predict(df)
        output[:, k] = out_k
        # Store this step's predictions for use by the following steps.
        for fs, idx, v in zip(batch_full_series, batch_forecast_indices,
                              out_k):
            fs.at[idx[k]] = v

    for arr, scale, forecast_index, item_id in zip(output, batch_scales,
                                                   batch_forecast_indices,
                                                   batch_ids):
        yield self._to_forecast(
            scale * arr,
            forecast_index[0],
            item_id=item_id,
        )
def run_example():
    """Train a TabularEstimator, round-trip it through disk, plot forecasts.

    The trained predictor is serialized to ``GluonTSTabularPredictor``,
    dropped, and read back with ``Predictor.deserialize`` before predicting.
    """
    dataset = get_dataset("electricity")
    serialize_path = Path("GluonTSTabularPredictor")

    estimator = TabularEstimator(
        freq="H",
        prediction_length=24,
        # NOTE(review): the original comment said "two minutes for training"
        # but the value is 10; units are not visible here - confirm.
        time_limit=10,
        # makes prediction faster, but potentially less accurate
        disable_auto_regression=True,
        # split the last 24 targets from each time series to be the
        # validation data
        last_k_for_val=24,
        quantiles_to_predict=None,
    )

    # Only the first few training series are used.
    n_train = 5
    training_data = list(islice(dataset.train, n_train))
    predictor = estimator.train(training_data=training_data)

    # Persist the predictor, discard it, and reload it from disk.
    os.makedirs(serialize_path, exist_ok=True)
    predictor.serialize(serialize_path)
    predictor = None
    predictor = Predictor.deserialize(serialize_path)

    forecasts = list(predictor.predict(training_data))

    for entry, forecast in zip(training_data, forecasts):
        history = to_pandas(entry)
        lookback = 7 * predictor.prediction_length
        plt.figure()
        plt.plot(history[-lookback:], label="target")
        forecast.plot()
        plt.show()
def check_consistency(entry, f1, f2):
    """Assert that two forecasts for ``entry`` agree with each other.

    Each forecast must hold samples of shape ``(1, prediction_length)`` and
    start one step after the entry's last index value; the samples of both
    must be numerically close. ``prediction_length`` comes from the
    enclosing scope.
    """
    expected_start = to_pandas(entry).index[-1] + 1
    for forecast in (f1, f2):
        assert forecast.samples.shape == (1, prediction_length)
        assert forecast.start_date == expected_start
    assert np.allclose(f1.samples, f2.samples)
def train(self, training_data: Dataset) -> TabularPredictor:
    """Fit the tabular model on all entries of ``training_data``.

    Each entry is converted to a dataframe with
    ``get_prediction_dataframe``; the frames are concatenated and passed to
    ``self.task.fit`` with ``"target"`` as the label column.

    Returns:
        A ``TabularPredictor`` wrapping the fitted model.
    """
    frames = []
    for entry in training_data:
        frames.append(get_prediction_dataframe(to_pandas(entry)))
    df = pd.concat(frames)
    ag_model = self.task.fit(df, label="target")
    return TabularPredictor(ag_model, self.freq, self.prediction_length)
def multivar_df(ds):
    """Turn a multivariate entry into a dataframe with one column per row.

    For each row ``i`` of ``ds["target"]`` a copy of the entry is built whose
    target is that single row; it is converted with ``to_pandas`` and stored
    as column ``ts_{i}``. Columns are joined on their index, and the index is
    finally exposed as a regular column named ``time``.
    """
    combined = pd.DataFrame()
    n_rows = ds["target"].shape[0]
    for i in range(n_rows):
        # Same fields as the input, but with a single target row.
        single = {
            key: (ds["target"][i] if key == "target" else ds[key])
            for key in ds
        }
        column = to_pandas(single).to_frame().rename(columns={0: f"ts_{i}"})
        combined = pd.concat([combined, column], axis=1, sort=True)
    return combined.reset_index().rename(columns={"index": "time"})
def train(self, training_data: Dataset) -> TabularPredictor:
    """Fit a regression model on all entries of ``training_data``.

    Each entry is converted with ``get_prediction_dataframe``; the
    concatenated frame is passed to ``self.task.fit`` with label ``"target"``,
    ``problem_type="regression"`` and the extra options in ``self.kwargs``.

    Returns:
        A ``TabularPredictor`` wrapping the fitted model.
    """
    frames = []
    for entry in training_data:
        frames.append(get_prediction_dataframe(to_pandas(entry)))
    df = pd.concat(frames)
    ag_model = self.task.fit(
        df,
        label="target",
        problem_type="regression",
        **self.kwargs,
    )
    return TabularPredictor(ag_model, self.freq, self.prediction_length)
def plot_train_test_dataset_first(dataset):
    """Plot the first train and test series of ``dataset`` and print a summary.

    The test plot marks the last training timestamp with a red vertical
    line. The printed summary covers the difference in length between the
    first test and train series, and the dataset's ``learning_length``,
    ``prediction_length`` and ``freq`` attributes.
    """
    train_series = to_pandas(next(iter(dataset.train_ds)))
    train_series.plot()
    plt.grid(which="both")
    plt.legend(["train series"], loc="upper left")
    plt.show()

    test_series = to_pandas(next(iter(dataset.test_ds)))
    test_series.plot()
    plt.axvline(train_series.index[-1], color='r')  # end of train dataset
    plt.grid(which="both")
    plt.legend(["test series", "end of train series"], loc="upper left")
    plt.show()

    forecast_window = len(test_series) - len(train_series)
    print(f"Length of forecasting window in test dataset: {forecast_window}")
    print(f"Learning length: {dataset.learning_length}")
    print(f"Recommended prediction horizon: {dataset.prediction_length}")
    print(f"Frequency of the time series: {dataset.freq}")
def evaluate_optimal_rec(
    predictions: pd.DataFrame,
    test_data: ListDataset,
    hierarchy_dict: Dict[int, List[int]],
) -> Dict[str, Dict[str, float]]:
    """Aggregate error metrics per hierarchy level from point predictions.

    Works on a data frame of point estimates (for example, those returned
    after optimal reconciliation) instead of probabilistic forecast objects.

    Arguments:
        predictions {pd.DataFrame} -- data frame of point predictions; its
            transposed values are indexed by series position
        test_data {ListDataset} -- test dataset
        hierarchy_dict {Dict[int, List[int]]} -- mapping from hierarchy level
            to the series indices included in that level

    Returns:
        Dict[str, Dict[str, float]] -- one evaluator result per key of
            ``hierarchy_dict``, plus an ``'all'`` entry computed over every
            series
    """
    evaluator = PointEstimateEvaluator()

    evaluations = {}
    for level, idxs in hierarchy_dict.items():
        members = np.array(idxs)
        # Select the dataset entries that belong to this level.
        level_entries = np.array(list(test_data))[members]
        level_series = [to_pandas(series) for series in level_entries]
        evaluations[level] = evaluator(
            level_series,
            predictions.values.T[members],
        )

    every_series = [to_pandas(series) for series in np.array(list(test_data))]
    evaluations['all'] = evaluator(every_series, predictions.values.T)
    return evaluations
def _predict_batch_one_shot(
    self, dataset: Iterable[Dict], **kwargs
) -> Iterator[SampleForecast]:
    """Forecast a batch of series with a single model call.

    For every entry the scaled series is used as past data to build feature
    rows for all ``self.prediction_length`` future timestamps. The rows of
    all entries are concatenated, predicted at once, and the output is split
    back into equal parts, one per entry.

    Yields:
        One forecast per input entry, in input order, with the scaling
        factor multiplied back in.
    """
    # TODO clean up
    # TODO optimize
    item_ids, scales, forecast_start_timestamps, dfs = [], [], [], []

    for entry in dataset:
        item_ids.append(entry.get(FieldName.ITEM_ID, None))

        series, scale = self.scaling(to_pandas(entry))
        scales.append(scale)

        # The forecast begins one frequency step after the last observation.
        forecast_start = series.index[-1] + series.index.freq
        forecast_start_timestamps.append(forecast_start)

        horizon_index = pd.date_range(
            forecast_start,
            freq=series.index.freq,
            periods=self.prediction_length,
        )
        # Empty placeholders for the values to be predicted.
        horizon_series = pd.Series(
            [None] * self.prediction_length,
            index=horizon_index,
        )
        features = get_features_dataframe(
            horizon_series,
            time_features=self.time_features,
            lag_indices=self.lag_indices,
            past_data=series,
        )
        dfs.append(features)

    df = pd.concat(dfs)
    output = self.ag_model.predict(df)

    per_series_output = np.split(output, len(dfs))
    for arr, scale, forecast_start, item_id in zip(
        per_series_output,
        scales,
        forecast_start_timestamps,
        item_ids,
    ):
        yield self._to_forecast(
            scale * arr,
            forecast_start,
            item_id=item_id,
        )
def compute_time_features(
    entry: Dict,
    time_features: List[TimeFeature],
    pred_length: int = 0,
    dtype=np.float32,
):
    """Evaluate time features over an entry's index, optionally extended.

    The index is taken from ``to_pandas(entry, freq=entry["start"].freq)``.
    When ``pred_length`` is positive the index is united with a period range
    covering the next ``pred_length`` steps.

    Returns:
        The feature arrays stacked vertically (one row per feature) and cast
        to ``dtype``.
    """
    assert pred_length >= 0

    index = to_pandas(entry, freq=entry["start"].freq).index

    if pred_length > 0:
        last = index[-1]
        future = pd.period_range(
            last + 1, last + pred_length, freq=index.freq)
        index = index.union(future)

    stacked = np.vstack([feat(index) for feat in time_features])
    return stacked.astype(dtype)
def fit_predict_arima(
    training_data: ListDataset,
    horizon: int = 12,
    output_file: str = None,
    output_residuals: bool = True,
) -> pd.DataFrame:
    """Fit one auto-ARIMA model per series and predict ``horizon`` steps.

    For each time series in ``training_data`` individually:
        1) ``pm.auto_arima`` selects and fits a model
        2) the fitted model predicts ``horizon`` periods ahead

    Arguments:
        training_data {ListDataset} -- training data

    Keyword Arguments:
        horizon {int} -- prediction length (default: {12})
        output_file {str} -- file to save predictions to (default: {None})
        output_residuals {bool} -- whether to prepend the in-sample residuals
            (in-sample prediction minus target) to the out-of-sample
            predictions. This only takes effect when ``output_file`` is
            given. With 24 in-sample and 6 out-of-sample timesteps the data
            frame then has 30 rows (default: {True})

    Returns:
        pd.DataFrame -- point predictions from the individually fitted
            models; each column is a series and each row a point in time
    """
    fits = []
    for train_series in list(training_data):
        fits.append(
            pm.auto_arima(
                to_pandas(train_series),
                suppress_warnings=True,
                error_action='ignore',
            ))

    # One row per model, transposed so each series becomes a column.
    preds = pd.DataFrame([fit.predict(n_periods=horizon) for fit in fits]).T

    if not output_file:
        return preds

    if output_residuals:
        residual_rows = [
            fit.predict_in_sample() - series['target']
            for fit, series in zip(fits, training_data)
        ]
        residuals = pd.DataFrame(residual_rows).T
        preds = pd.concat([residuals, preds])

    preds.to_csv(output_file, index=False)
    return preds
def _predict_serial(
    self, dataset: Iterable[Dict], **kwargs
) -> Iterator[SampleForecast]:
    """Forecast the entries of ``dataset`` one at a time.

    Each entry is scaled with ``self.scaling`` and extended by
    ``self.prediction_length`` empty slots. Without auto-regression all
    future rows are predicted in one call, using only the observed series as
    past data. With auto-regression each step is predicted separately and
    written back so that later steps can use it.

    Yields:
        One forecast per entry, with the scaling factor multiplied back in.
    """
    for entry in dataset:
        series, scale = self.scaling(to_pandas(entry))

        forecast_index = pd.date_range(
            series.index[-1] + series.index.freq,
            freq=series.index.freq,
            periods=self.prediction_length,
        )
        # Empty placeholders for the values to be predicted.
        forecast_series = pd.Series(
            [None] * len(forecast_index),
            index=forecast_index,
        )

        # pd.concat replaces Series.append, which pandas deprecated in 1.4
        # and removed in 2.0.
        full_series = pd.concat([series, forecast_series])

        if not self.auto_regression:
            # predict all at once
            df = get_features_dataframe(
                forecast_series,
                time_features=self.time_features,
                lag_indices=self.lag_indices,
                past_data=series,
            )
            full_series[forecast_series.index] = self.ag_model.predict(df)
        else:
            # predict step by step
            for idx in forecast_series.index:
                df = get_features_dataframe(
                    forecast_series[idx:idx],  # the single row for idx
                    time_features=self.time_features,
                    lag_indices=self.lag_indices,
                    past_data=full_series[:idx][:-1],  # everything before idx
                )
                full_series[idx] = self.ag_model.predict(df).item()

        yield self._to_forecast(
            scale * full_series[forecast_index].values.astype(self.dtype),
            forecast_index[0],
            item_id=entry.get(FieldName.ITEM_ID, None),
        )
def train(self, training_data: Dataset) -> TabularPredictor:
    """Fit a regression model on feature frames built from the training data.

    Each entry is scaled with ``self.scaling`` (only the scaled series is
    used) and converted by ``get_features_dataframe``. The concatenated
    frame is passed to ``self.task.fit`` with label ``"target"``,
    ``problem_type="regression"`` and the extra options in ``self.kwargs``.

    Returns:
        A ``TabularPredictor`` configured with the fitted model and this
        estimator's settings.
    """
    frames = []
    for entry in training_data:
        scaled_series = self.scaling(to_pandas(entry))[0]
        frames.append(
            get_features_dataframe(
                series=scaled_series,
                time_features=self.time_features,
                lag_indices=self.lag_indices,
            ))
    df = pd.concat(frames)

    ag_model = self.task.fit(
        df,
        label="target",
        problem_type="regression",
        **self.kwargs,
    )

    return TabularPredictor(
        ag_model=ag_model,
        freq=self.freq,
        prediction_length=self.prediction_length,
        time_features=self.time_features,
        lag_indices=self.lag_indices,
        scaling=self.scaling,
        batch_size=self.batch_size,
    )
def test_autogluon_tabular():
    """Check forecast length and start date of ``LocalTabularPredictor``.

    Builds a two-series weekly dataset, predicts two steps ahead, and checks
    each forecast's sample width and start date. Returns the forecasts.
    """
    # create a dataset
    first_target = np.array(
        [1089.2, 1078.91, 1099.88, 35790.55, 34096.95, 34906.95],
    )
    second_target = np.array(
        [1099.2, 1098.91, 1069.88, 35990.55, 34076.95, 34766.95],
    )
    entries = [
        {
            "start": pd.Timestamp("1750-01-04 00:00:00", freq="W-SUN"),
            "target": first_target,
        },
        {
            "start": pd.Timestamp("1750-01-04 00:00:00", freq="W-SUN"),
            "target": second_target,
        },
    ]
    dataset = ListDataset(entries, freq="W-SUN")

    prediction_length = 2
    freq = "W-SUN"
    predictor = LocalTabularPredictor(
        freq=freq,
        prediction_length=prediction_length,
    )

    forecasts = list(predictor.predict(dataset))

    step = None
    for entry, forecast in zip(dataset, forecasts):
        history = to_pandas(entry)
        step = pd.tseries.frequencies.to_offset(freq)
        # The forecast must start one step after the last observation.
        expected_start = history.index[-1] + step
        assert forecast.samples.shape[1] == prediction_length
        assert forecast.start_date == expected_start

    return forecasts
def __init__(self,
             dataset=None,
             custom_dataset=None,
             start=None,
             freq=None,
             prediction_length=None,
             learning_length=None,
             context_length=100,
             cardinality=None):
    """Set up train/test datasets from a ready dataset or a raw array.

    If ``dataset`` is given, its ``train``/``test`` splits and metadata are
    used, the learning length is the length of the first training series,
    and cardinality is set to ``[1]``.

    Otherwise, if ``custom_dataset`` is given, each row becomes one series
    starting at ``start``: the train set drops the last
    ``prediction_length`` columns and the test set keeps all columns.

    If both are ``None`` no attribute is set.
    """
    if dataset is not None:
        first_train_entry = next(iter(dataset.train))
        self.learning_length = len(to_pandas(first_train_entry))
        self.prediction_length = dataset.metadata.prediction_length
        self.freq = dataset.metadata.freq
        self.test_ds = dataset.test
        self.train_ds = dataset.train
        self.context_length = context_length
        self.cardinality = [1]
    elif custom_dataset is not None:
        self.freq = freq
        self.start = start
        self.learning_length = learning_length
        self.prediction_length = prediction_length
        self.context_length = context_length
        self.cardinality = cardinality

        # train dataset: cut the last window of length "prediction_length",
        # add "target" and "start" fields
        train_rows = custom_dataset[:, :-prediction_length]
        self.train_ds = ListDataset(
            [{'target': row, 'start': start} for row in train_rows],
            freq=freq)

        # test dataset: use the whole dataset, add "target" and "start"
        self.test_ds = ListDataset(
            [{'target': row, 'start': start} for row in custom_dataset],
            freq=freq)
import pandas as pd
import matplotlib.pyplot as plt

# Load the tweet-volume CSV and index it by parsed timestamp.
csv_path = '/Users/seenli/Documents/workspace/code/pytorch_learn2/time_series_DL/Twitter_volume_AMZN.csv'
df = pd.read_csv(csv_path, header=0, sep=',')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index(['timestamp'], inplace=True)

# Notes on label slicing such as df.value[:"2015-04-22 20:47:53"]:
#   - the final timestamp is included in the slice;
#   - if the given timestamp lies beyond the data range, the available data
#     is returned.
# df.index[0] is the starting timestamp.

# "start" is the starting timestamp and "target" holds the series values for
# the corresponding timestamps; this entry format is fixed.
# Here df.index holds the timestamps and df.value the matching values.
series_entry = {
    'start': df.index[0],
    'target': df.value[:"2015-04-22 21:00:00"],
}
data = common.ListDataset([series_entry], freq='H')

estimator = deepar.DeepAREstimator(
    freq='H',
    prediction_length=24,
    trainer=Trainer(epochs=50),
)
predictor = estimator.train(training_data=data)
predictor.serialize(Path("/Users/seenli/Documents/workspace/code/pytorch_learn2/time_series_DL/model_save"))

# Plot the last 60 training points together with each forecast.
for history_entry, forecast in zip(data, predictor.predict(data)):
    to_pandas(history_entry)[-60:].plot(linewidth=2)
    forecast.plot(color='g', prediction_intervals=[50.0, 90.0])
plt.grid(which='both')
plt.show()

# Output the prediction result.
prediction = next(predictor.predict(data))
print(prediction.mean)
prediction.plot(output_file='graph.png')
def generate_rolling_dataset(
    dataset: Dataset,
    strategy,
    start_time: pd.Timestamp,
    end_time: Optional[pd.Timestamp] = None,
) -> Dataset:
    """
    Return an augmented version of the input dataset in which every
    timeseries has been rolled according to the supplied parameters.

    Rolling happens on the data available in the window between
    *start_time* and *end_time* of each timeseries. If *end_time* is
    omitted, rolling happens on all datapoints from *start_time* until the
    end of the timeseries. How the data is rolled is governed by the
    strategy.

    The examples below are based on this dataset of a single timeseries

    >>> ds = [{
    ...     "target": np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]),
    ...     "start": pd.Timestamp('2000-1-1-01', freq='1H')
    ... }]

    Applying generate_rolling_dataset to it like

    >>> rolled = generate_rolling_dataset(
    ...     dataset=ds,
    ...     strategy = StepStrategy(prediction_length=2),
    ...     start_time = pd.Timestamp('2000-1-1-06', '1H'),
    ...     end_time = pd.Timestamp('2000-1-1-10', '1H')
    ... )

    results in a new dataset as follows (only target values shown for
    brevity)::

        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        [1, 2, 3, 4, 5, 6, 7, 8, 9]
        [1, 2, 3, 4, 5, 6, 7, 8]
        [1, 2, 3, 4, 5, 6, 7]

    i.e. the maximum number of rolls possible between *start_time* and
    *end_time*. The StepStrategy only cuts the last value of the target for
    as long as there are enough values after *start_time* to perform
    predictions on.

    When no end time is provided, all datapoints from *start_time* onwards
    are rolled over::

        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        [1, 2, 3, 4, 5, 6, 7, 8, 9]
        [1, 2, 3, 4, 5, 6, 7, 8]
        [1, 2, 3, 4, 5, 6, 7]

    The step_size of the strategy can be changed:

    >>> strategy = StepStrategy(prediction_length=2, step_size=2)

    This produces fewer items and, when prediction_length matches
    step_size, ensures each prediction is done on unique/new data. With the
    end time set as above the output is::

        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        [1, 2, 3, 4, 5, 6, 7, 8]

    Without an end time and with step_size=2 the output is::

        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
        [1, 2, 3, 4, 5, 6, 7, 8, 9]
        [1, 2, 3, 4, 5, 6, 7]

    Parameters
    ----------
    dataset
        Dataset to generate the rolling forecasting datasets from
    strategy
        The strategy that is to be used when rolling
    start_time
        The start of the window where rolling forecasts should be applied
    end_time
        The end time of the window where rolling should be applied

    Returns
    ----------
    Dataset
        The augmented dataset
    """
    assert dataset, "a dataset to perform rolling evaluation on is needed"
    assert start_time, "a pandas Timestamp object is needed for the start time"
    assert strategy, """a strategy to use when rolling is needed, for example gluonts.dataset.rolling_dataset.StepStrategy"""
    if end_time:
        assert end_time > start_time, "end time has to be after the start time"

    rolled_items = []
    for item in dataset:
        series = to_pandas(item, start_time.freq)
        # Values strictly before start_time are kept for every rolled item.
        history = series[:start_time][:-1].to_numpy()
        prediction_window = series[start_time:end_time]

        for window in strategy.get_windows(prediction_window):
            rolled = item.copy()
            rolled[FieldName.TARGET] = np.concatenate(
                [history, window.to_numpy()])
            # Cut the item's features down to the new target length.
            rolled = truncate_features(rolled, len(rolled[FieldName.TARGET]))
            rolled_items.append(rolled)

    return rolled_items
import mxnet as mx
from mxnet import gluon
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from gluonts.dataset.repository.datasets import get_dataset, dataset_recipes
from gluonts.dataset.util import to_pandas

# List the datasets provided by the repository.
available = list(dataset_recipes.keys())
print(f"Available datasets: {available}")

dataset = get_dataset("m4_hourly", regenerate=False)

# Plot the first training series.
entry = next(iter(dataset.train))
train_series = to_pandas(entry)
train_series.plot()
plt.grid(which="both")
plt.legend(["train series"], loc="upper left")
plt.show()

# Plot the first test series and mark where the training series ends.
entry = next(iter(dataset.test))
test_series = to_pandas(entry)
test_series.plot()
plt.axvline(train_series.index[-1], color='r')  # end of train dataset
plt.grid(which="both")
plt.legend(["test series", "end of train series"], loc="upper left")
plt.show()

# Imports kept at their original position in the script.
from gluonts.model.simple_feedforward import SimpleFeedForwardEstimator
from gluonts.trainer import Trainer
def train(
    self,
    training_data: Dataset,
    validation_data: Optional[Dataset] = None,
) -> TabularPredictor:
    """Fit an AutoGluon tabular model, optionally with a validation set.

    Validation data comes from one of three places, in priority order:
    ``validation_data`` if given; else the last ``self.last_k_for_val`` rows
    of each training feature frame if that attribute is set; else none, in
    which case the underlying predictor gets ``tuning_data=None``. In the
    first two cases ``auto_stack=False`` overrides the fit options.

    The model is a quantile predictor when ``self.quantiles_to_predict`` is
    set and a regression predictor using ``self.eval_metric`` otherwise.

    Returns:
        A ``TabularPredictor`` configured with the fitted model and this
        estimator's settings.
    """

    def _feature_frames(data):
        # One feature frame per entry; only the scaled series is used.
        return [
            get_features_dataframe(
                series=self.scaling(to_pandas(entry))[0],
                time_features=self.time_features,
                lag_indices=self.lag_indices,
            ) for entry in data
        ]

    kwargs_override = {}
    dfs = _feature_frames(training_data)

    if validation_data is not None or self.last_k_for_val is not None:
        kwargs_override["auto_stack"] = False
        logger.warning(
            "Auto Stacking is turned off "
            "as validation dataset is provided before input into Tabular Predictor."
        )

    if validation_data is not None:
        logger.log(20, "Validation dataset is directly provided.")
        validation_dfs = _feature_frames(validation_data)
        train_df = pd.concat(dfs)
        val_df = pd.concat(validation_dfs)
    elif self.last_k_for_val is not None:
        logger.log(
            20,
            f"last_k_for_val is provided, choosing last {self.last_k_for_val} of each time series as validation set.",
        )
        # Split every frame into its head (train) and its last k rows.
        head_dfs = [frame.iloc[:-self.last_k_for_val, :] for frame in dfs]
        tail_dfs = [frame.iloc[-self.last_k_for_val:, :] for frame in dfs]
        train_df = pd.concat(head_dfs)
        val_df = pd.concat(tail_dfs)
    else:
        logger.log(
            20,
            "No validation dataset is provided, will let TabularPredictor do the splitting automatically,"
            "Note that this might break the time order of time series data.",
        )
        train_df = pd.concat(dfs)
        val_df = None

    # Only the constructor arguments differ between the two model kinds.
    if self.quantiles_to_predict is not None:
        model_options = {
            "problem_type": "quantile",
            "quantile_levels": self.quantiles_to_predict,
        }
    else:
        model_options = {
            "problem_type": "regression",
            "eval_metric": self.eval_metric,
        }

    ag_model = AutogluonTabularPredictor(
        label="target",
        **model_options,
    ).fit(
        train_df,
        tuning_data=val_df,
        **{
            **self.kwargs,
            **kwargs_override
        },
    )

    return TabularPredictor(
        ag_model=ag_model,
        freq=self.freq,
        prediction_length=self.prediction_length,
        time_features=self.time_features,
        lag_indices=self.lag_indices,
        scaling=self.scaling,
        batch_size=self.batch_size,
        quantiles_to_predict=self.quantiles_to_predict,
    )
def quick_start_tutorial():
    """Run the quick-start walkthrough on the "m4_hourly" dataset.

    Steps: plot the first train/test series, build a random custom dataset,
    train a ``SimpleFeedForwardEstimator``, produce evaluation forecasts,
    print details of the first forecast, plot it, and print and plot the
    evaluation metrics.
    """
    # Provided datasets.
    print(f"Available datasets: {list(dataset_recipes.keys())}")
    dataset = get_dataset("m4_hourly", regenerate=True)

    plt.figure()
    train_series = to_pandas(next(iter(dataset.train)))
    train_series.plot()
    plt.grid(which="both")
    plt.legend(["train series"], loc="upper left")

    plt.figure()
    test_series = to_pandas(next(iter(dataset.test)))
    test_series.plot()
    plt.axvline(train_series.index[-1], color="r")  # End of train dataset.
    plt.grid(which="both")
    plt.legend(["test series", "end of train series"], loc="upper left")
    plt.show()

    # --------------------
    # Custom datasets.
    N = 10  # Number of time series.
    T = 100  # Number of timesteps.
    prediction_length = 24
    freq = "1H"
    custom_dataset = np.random.normal(size=(N, T))
    # Can be different for each time series.
    start = pd.Timestamp("01-01-2019", freq=freq)

    # Train dataset: cut the last window of length "prediction_length",
    # add "target" and "start" fields.
    train_rows = custom_dataset[:, :-prediction_length]
    train_ds = ListDataset(
        [{"target": row, "start": start} for row in train_rows],
        freq=freq,
    )
    # Test dataset: use the whole dataset, add "target" and "start" fields.
    test_ds = ListDataset(
        [{"target": row, "start": start} for row in custom_dataset],
        freq=freq,
    )

    # --------------------
    # Training an existing model (Estimator).
    trainer = Trainer(
        ctx="cpu",
        epochs=5,
        learning_rate=1e-3,
        num_batches_per_epoch=100,
    )
    estimator = SimpleFeedForwardEstimator(
        num_hidden_dimensions=[10],
        prediction_length=dataset.metadata.prediction_length,
        context_length=100,
        freq=dataset.metadata.freq,
        trainer=trainer,
    )
    predictor = estimator.train(dataset.train)

    # --------------------
    # Visualize and evaluate forecasts.
    forecast_it, ts_it = make_evaluation_predictions(
        dataset=dataset.test,  # Test dataset.
        predictor=predictor,  # Predictor.
        num_samples=100,  # Number of sample paths we want for evaluation.
    )
    forecasts = list(forecast_it)
    tss = list(ts_it)

    # First entry of the time series list.
    ts_entry = tss[0]
    # First 5 values of the time series (convert from pandas to numpy).
    print(np.array(ts_entry[:5]).reshape(-1,))

    # First entry of dataset.test.
    dataset_test_entry = next(iter(dataset.test))
    # First 5 values.
    print(dataset_test_entry["target"][:5])

    # First entry of the forecast list.
    forecast_entry = forecasts[0]
    print(f"Number of sample paths: {forecast_entry.num_samples}")
    print(f"Dimension of samples: {forecast_entry.samples.shape}")
    print(f"Start date of the forecast window: {forecast_entry.start_date}")
    print(f"Frequency of the time series: {forecast_entry.freq}")
    print(f"Mean of the future window:\n {forecast_entry.mean}")
    print(f"0.5-quantile (median) of the future window:\n {forecast_entry.quantile(0.5)}")

    def plot_prob_forecasts(ts_entry, forecast_entry):
        """Plot the series tail together with the forecast intervals."""
        plot_length = 150
        prediction_intervals = (50.0, 90.0)
        interval_labels = [
            f"{k}% prediction interval"
            for k in reversed(prediction_intervals)
        ]
        legend = ["observations", "median prediction"] + interval_labels

        fig, ax = plt.subplots(1, 1, figsize=(10, 7))
        ts_entry[-plot_length:].plot(ax=ax)  # Plot the time series.
        forecast_entry.plot(
            prediction_intervals=prediction_intervals, color="g")
        plt.grid(which="both")
        plt.legend(legend, loc="upper left")
        plt.show()

    plot_prob_forecasts(ts_entry, forecast_entry)

    evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
    agg_metrics, item_metrics = evaluator(
        iter(tss), iter(forecasts), num_series=len(dataset.test))

    print(json.dumps(agg_metrics, indent=4))
    print(item_metrics.head())

    item_metrics.plot(x="MSIS", y="MASE", kind="scatter")
    plt.grid(which="both")
    plt.show()
def cv_harness_trainer(self):
    """Run the cross-validation loop for every configured model.

    The models configured under ``self.model[self.type_time_series]`` are
    walked in order. Each one is handed to ``self._internal_cv_trainer``
    together with the values of the first training series and a fresh
    ``TimeSeriesSplit``. The shared ``list_y_pred`` / ``list_score`` lists
    are passed to the helper as tracking lists. Afterwards their frames are
    concatenated column-wise, de-duplicated by column name, and written as
    CSV files under ``<out_path>/saved_models``.

    The helper is called with ``X=`` because its signature accepts a single
    array ``X`` and splits it itself. ``self.val_data`` is therefore not
    consumed here.

    Returns:
        Tuple ``(y_pred_list_df, score_list_df)`` of the frames written to
        disk. ``y_pred_list_df`` is indexed by its ``"index"`` column.
    """
    list_y_pred = []
    list_score = []

    type_time_series = self.type_time_series
    n_splits = self.n_splits
    out_path = self.out_path

    # The first training entry is the same for every model, so convert it
    # once.
    train_entry = next(iter(self.train_data))
    X_train = to_pandas(train_entry).values

    for models_ in self.model.get(type_time_series):
        for model_name, value in models_.items():
            model = value[0]
            model_kwarg = value[1]
            self.__log.info(
                "Starting generic cv train loop for Type_of_time_series={}, mlmodel={}, modelkwarg={}"
                .format(type_time_series, model_name, model_kwarg))
            self.__log.info("Regression")

            splits = TimeSeriesSplit(n_splits=n_splits)

            # start the cross validation loop
            self._internal_cv_trainer(
                models=model,
                model_name=model_name,
                X=X_train,
                ss=splits,
                n_splits=n_splits,
                list_y_pred=list_y_pred,
                list_score=list_score,
                out_path=out_path,
                start_date=self.start_date,
                prediction_length=self.prediction_length,
                freq=self.freq,
                type_time_series=type_time_series,
            )

    # output the dataframe of predicted vals with index as sample numbers
    y_pred_list_df = pd.concat(list_y_pred, axis=1)
    # remove duplicate columns
    y_pred_list_df = y_pred_list_df.loc[:, ~y_pred_list_df.columns.duplicated()]
    y_pred_list_df.set_index("index", inplace=True)
    y_pred_list_df_path_csv = os.path.join(
        out_path,
        "saved_models",
        "y_pred_list_df.csv",
    )
    y_pred_list_df.to_csv(y_pred_list_df_path_csv)

    # output the dataframe of scores with index as sample numbers
    score_list_df = pd.concat(list_score, axis=1)
    # remove duplicate columns
    score_list_df = score_list_df.loc[:, ~score_list_df.columns.duplicated()]
    # The score file name uses the last model processed by the loop above.
    score_list_df_path_csv = os.path.join(
        out_path,
        "saved_models",
        type_time_series + "_" + model_name + "_regression_score_list_df.csv",
    )
    score_list_df.to_csv(score_list_df_path_csv)

    return y_pred_list_df, score_list_df
def _internal_cv_trainer(
        self,
        model_name: str,
        models: object,
        type_time_series: str,
        X: np.array,
        ss: object,
        n_splits: int,
        list_y_pred: list,
        list_score: list,
        out_path: str,
        freq: str,
        prediction_length: int,
        start_date: str,
):
    """
    Cross validation training loop for an individual regression model.

    For every split produced by ``ss`` only the training slice of ``X`` is
    used: the model is trained on that slice minus its last
    ``prediction_length`` points and evaluated on the full slice, so the
    held-out tail serves as the ground truth.  Predictions, true values and
    the two scores returned by ``_cv_train_model_other`` are collected per
    fold, summarised, written to csv and appended to the tracking lists.

    Parameters
    ----------
    :param: ``X`` : ``np.array``
        Array of time series data.
    :param: ``model_name`` : ``str``
        Machine learning model type.
    :param: ``models`` : ``regression model class object``
        Within the training loop, an instantiated model is passed to this method.
    :param: ``type_time_series`` : ``str``
        Type of time series to train model. eg teams or by position
    :param: ``prediction_length`` : ``int``
        Length of the prediction horizon
    :param: ``start_date`` : ``str``
        Start date of the time series
    :param: ``freq`` : ``str eg. '1D', '2H', '3S'...``
        Frequency of the data to train on and predict
    :param: ``ss`` : ``scikit-learn split iterator object``
        This is an instantiated split iterator object to control time series
        cross validation splitting within the cross validator.
    :param: ``n_splits`` : ``int``
        The number of splits to divide the data into during cross validation
        (only used for logging here).
    :param: ``list_y_pred`` : ``list``
        A tracking list entity for the predicted values within each model
        cross-validation loop.  Mutated in place.
    :param: ``list_score`` : ``list``
        A tracking list entity for the scored values within each model
        cross-validation loop.  Mutated in place.
    :param: ``out_path`` : ``str``
        This is the filepath of generated models and scores.

    Returns
    -------
    :return: ``None``
    """
    y_test_indices = []
    y_pred_ = []
    y_true_ = []
    score_1 = []
    score_2 = []

    # stays 0 when the splitter yields nothing, exactly like a manual counter
    split_num = 0
    for split_num, (train_index, _) in enumerate(ss.split(X=X), start=1):
        self.__log.info("%%--%%")
        self.__log.info("Cross fold: %i of %i", split_num, n_splits)

        # a workaround made here as the test set after split is not
        # utilised: only the training slice is kept and its tail is held out
        dataset = X[train_index]
        # NOTE(review): passing ``freq`` positionally relies on the
        # ``pd.Timestamp(ts_input, freq)`` signature -- confirm it matches
        # the pinned pandas version.
        start = pd.Timestamp(start_date, freq)
        series = dataset.reshape(1, -1)

        X_train = ListDataset(
            [{
                'target': x,
                'start': start
            } for x in series[:, :-prediction_length]],
            freq=freq)
        X_test = ListDataset(
            [{
                'target': x,
                'start': start
            } for x in series],
            freq=freq)

        y_pred_temp, score_1_temp, score_2_temp = self._cv_train_model_other(
            models=models,
            X_train=X_train,
            X_test=X_test,
            split_num=split_num,
            out_path=out_path,
            model_name=model_name,
            type_time_series=type_time_series)
        self.__log.info("Score_1_temp={}".format(score_1_temp))

        # the last ``prediction_length`` timestamps of the evaluation series
        # are the ones the model had to forecast
        dataset_pd = to_pandas(next(iter(X_test)))
        y_test = dataset_pd[-prediction_length:].index

        y_test_indices.append(y_test.values)
        y_pred_.append(y_pred_temp)
        y_true_.append(dataset_pd.loc[y_test].values)

        # keep track of the scores during loops
        score_1.append([score_1_temp])
        score_2.append([score_2_temp])

    prefix = model_name + "_" + "regression"

    y_pred_df = pd.DataFrame()
    y_pred_df[prefix] = np.concatenate(y_pred_).ravel()
    y_pred_df["index"] = np.concatenate(y_test_indices).ravel()
    y_pred_df["y_true_val"] = np.concatenate(y_true_).ravel()

    score_df = pd.DataFrame()
    score_df[prefix + "_mse"] = np.concatenate(score_1).ravel()
    score_df[prefix + "_rmse"] = np.concatenate(score_2).ravel()
    # global summary columns: each statistic is derived from the per-fold
    # column of the SAME metric (mse first, then rmse)
    for metric in ("mse", "rmse"):
        per_fold = score_df[prefix + "_" + metric]
        score_df[prefix + "_global_" + metric + "_ave"] = per_fold.mean()
        score_df[prefix + "_global_" + metric + "_med"] = per_fold.median()
        score_df[prefix + "_global_" + metric + "_std"] = per_fold.std()

    # the double underscore before "cv_score_list" is kept so existing
    # consumers of the file name keep working
    path_score_list_path_csv = os.path.join(
        out_path,
        "saved_models",
        str(
            str(type_time_series) + "_" + str(model_name) + "_" +
            str(split_num) + "_" + "regression" + "_" +
            "_cv_score_list.csv"),
    )
    score_df.to_csv(path_score_list_path_csv)

    list_y_pred.append(y_pred_df)
    list_score.append(score_df)
) # Create testing datatset testing_data_plots = ListDataset( [{"start": test_data[target_asset].index[0], "target": test_data[target_asset]}], freq = "1d" ) #Create the estimator and train estimator = DeepAREstimator(freq="1d", prediction_length=pred_len, trainer=Trainer(epochs=100)) predictor = estimator.train(training_data=training_data) ### OPTIONAL PLOT PREDICTION RESULTS #Forecast for test_entry, forecast in zip(testing_data_plots, predictor.predict(testing_data)): to_pandas(test_entry)[-60:].plot(linewidth=2) forecast.plot(color='g', prediction_intervals=[50.0, 90.0]) ### GENERATE FORECASTS from gluonts.evaluation import Evaluator from gluonts.evaluation.backtest import make_evaluation_predictions forecast_it, ts_it = make_evaluation_predictions( dataset=testing_data, # test dataset predictor=predictor, # predictor num_samples=100, # number of sample paths we want for evaluation ) ###### VERY SIMPLE TRADING STRATEGY # Signal: If Forecast > Current Price, Buy " < " , Short
freq="H") estimator = deepar.DeepAREstimator(prediction_length=24, context_length=100, use_feat_static_cat=True, use_feat_dynamic_real=True, num_parallel_samples=100, cardinality=[2, 1], freq="H", trainer=Trainer(ctx="cpu", epochs=200, learning_rate=1e-3)) predictor = estimator.train(training_data=train_data) for test_entry, forecast in zip(test_data, predictor.predict(test_data)): to_pandas(test_entry)[-100:].plot(figsize=(12, 5), linewidth=2) forecast.plot(color='g', prediction_intervals=[50.0, 90.0]) plt.grid(which='both') plt.legend([ "past observations", "median prediction", "90% prediction interval", "50% prediction interval" ]) plt.show() prediction = next(predictor.predict(test_data)) print(prediction.mean) prediction.plot(output_file='graph.png') predictor.serialize( Path("/home/root/mxnetTS/GluonTS-Learning-in-Action/chapter-2/model")) # predictor = Predictor.deserialize(Path("/home/root/mxnetTS/GluonTS-Learning-in-Action/chapter-2/model"))