Example #1
def build_ff_model():
    # get the csv file as a dataframe
    raw_data = pd.read_csv(
        "https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_AMZN.csv",
        header=0,
        index_col=0)

    # convert the raw data into an object recognised by GluonTS
    # start: the starting index of the dataframe
    # target: the actual time-series data that we want to model
    # freq: the frequency with which the data is collected
    train_data = common.ListDataset(
        [{
            "start": raw_data.index[0],
            "target": raw_data.value[:"2015-04-05 00:00:00"]
        }],
        freq="5min")

    # create an Estimator with simple feed forward model
    # an object of Trainer() class is used to customize Estimator
    estimator = simple_feedforward.SimpleFeedForwardEstimator(
        freq="5min",
        prediction_length=100,
        trainer=Trainer(ctx="cpu", epochs=100, learning_rate=1e-3))

    # create a Predictor by training the Estimator with training dataset
    predictor = estimator.train(training_data=train_data)

    # get predictions for the whole forecast horizon
    for model_train_data, predictions in zip(train_data,
                                             predictor.predict(train_data)):
        # plot only the last 100 timestamps of the training dataset
        to_pandas(model_train_data)[-100:].plot()
        # plot the forecasts from the model
        predictions.plot(output_file='ff-model.png', color='r')
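A possible follow-up (not part of the original example, and assuming build_ff_model() is modified to return its predictor): persist the trained model with GluonTS' serialize/deserialize API so it can be reloaded later without retraining.

from pathlib import Path
from gluonts.model.predictor import Predictor

predictor = build_ff_model()   # assumes the function above returns its predictor
model_dir = Path("ff-model")   # illustrative output directory
model_dir.mkdir(exist_ok=True)
predictor.serialize(model_dir)
restored_predictor = Predictor.deserialize(model_dir)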
Example #2
    def prepare_output(self, forecasts, horizon):
        # Get forecasts depending on horizon
        forecasts = forecasts[-horizon:]

        if self.type == 'deepar':
            # From iterator to pandas
            train = to_pandas(next(iter(self.y_train)))
            test = to_pandas(next(iter(self.y_test)))

            date = test.index.astype(str).tolist()

            target = test.tolist()

            prediction = np.full(len(train.index), np.nan).tolist()
            prediction.extend(forecasts)

        else:
            if self.type == 'prophet':
                if not isinstance(self.y_train, pd.Series):
                    self.y_train = self.y_train.set_index('ds').iloc[:, 0]
                if not isinstance(self.y_test, pd.Series):
                    self.y_test = self.y_test.set_index('ds').iloc[:, 0]

            date = self.y_train.index.astype(str).tolist()
            date.extend(self.y_test.index.astype(str).tolist())

            target = self.y_train.tolist()
            target.extend(self.y_test.tolist())

            prediction = np.full(len(self.y_train.index), np.nan).tolist()
            prediction.extend(forecasts)

        return date, target, prediction
Example #3
def build_deepar_model():
    # get the financial data "exchange_rate"
    gluon_data = get_dataset("exchange_rate", regenerate=True)
    train_data = next(iter(gluon_data.train))
    test_data = next(iter(gluon_data.test))
    meta_data = gluon_data.metadata

    # data set visualisation
    fig, ax = plt.subplots(1, 1, figsize=(11, 8))
    to_pandas(train_data).plot(ax=ax)
    ax.grid(which="both")
    ax.legend(["train data"], loc="upper left")
    plt.savefig("dataset.png")

    # visualize various members of the 'gluon_data.*'
    print(train_data.keys())
    print(test_data.keys())
    print(meta_data)

    # convert dataset into an object recognised by GluonTS
    training_data = common.ListDataset(gluon_data.train, freq=meta_data.freq)
    testing_data = common.ListDataset(gluon_data.test, freq=meta_data.freq)

    # create an Estimator with DeepAR
    # an object of Trainer() class is used to customize Estimator
    estimator = deepar.DeepAREstimator(
        freq=meta_data.freq,
        prediction_length=meta_data.prediction_length,
        trainer=Trainer(ctx="cpu", epochs=100, learning_rate=1e-4))

    # create a Predictor by training the Estimator with training dataset
    predictor = estimator.train(training_data=training_data)

    # make predictions
    forecasts, test_series = make_evaluation_predictions(dataset=testing_data,
                                                         predictor=predictor,
                                                         num_samples=10)

    # visualise forecasts
    prediction_intervals = (50.0, 90.0)
    legend = ["actual data", "median forecast"] + [
        f"{k}% forecast interval" for k in prediction_intervals
    ][::-1]
    fig, ax = plt.subplots(1, 1, figsize=(11, 8))
    list(test_series)[0][-150:].plot(ax=ax)  # plot the time series
    list(forecasts)[0].plot(prediction_intervals=prediction_intervals,
                            color='r')
    plt.grid(which="both")
    plt.legend(legend, loc="upper left")
    plt.savefig("deepar-model.png")
Example #4
 def predict(self, dataset: Iterable[Dict]) -> Iterator[SampleForecast]:
     for entry in dataset:
         ts = to_pandas(entry)
         start = ts.index[-1] + pd.tseries.frequencies.to_offset(self.freq)
         start_timestamp = pd.Timestamp(start, freq=self.freq)
         future_entry = {
             "start": start_timestamp,
             "target": np.array([None] * self.prediction_length),
         }
         future_ts = to_pandas(future_entry)
         df = get_prediction_dataframe(future_ts)
         ag_output = self.ag_model.predict(df)
         yield self.to_forecast(ag_output, start_timestamp,
                                entry.get(FieldName.ITEM_ID, None))
Example #5
def get_dataset(**kw):
    # check whether the dataset is the train split or the test split
    data_path = kw['train_data_path'] if kw['train'] else kw['test_data_path']

    # read the data from a pickle or csv file
    if kw.get("uri_type") == "pickle":
        data_set = pd.read_pickle(data_path)
    else:
        data_set = pd.read_csv(data_path)

    # convert to the GluonTS ListDataset format
    gluonts_ds = ListDataset([{
        FieldName.TARGET: data_set.iloc[i].values,
        FieldName.START: kw['start']
    } for i in range(kw['num_series'])],
                             freq=kw['freq'])

    if VERBOSE:
        entry = next(iter(gluonts_ds))
        train_series = to_pandas(entry)
        train_series.plot()
        save_fig = kw['save_fig']
        plt.savefig(save_fig)

    return gluonts_ds
Example #6
def run_example():
    dataset = get_dataset("electricity")

    estimator = TabularEstimator(
        freq="H",
        prediction_length=24,
        time_limits=2 * 60,  # two minutes for training
        disable_auto_regression=True,  # makes prediction faster, but potentially less accurate
    )

    n_train = 5

    training_data = list(islice(dataset.train, n_train))

    predictor = estimator.train(training_data=training_data)

    forecasts = list(predictor.predict(training_data))

    for entry, forecast in zip(training_data, forecasts):
        ts = to_pandas(entry)
        plt.figure()
        plt.plot(ts[-7 * predictor.prediction_length:], label="target")
        forecast.plot()
        plt.show()
Example #7
    def _predict_batch_autoreg(self, dataset: Iterable[Dict],
                               **kwargs) -> Iterator[SampleForecast]:
        # TODO clean up
        # TODO optimize
        batch_ids = []
        batch_scales = []
        batch_series = []

        for entry in dataset:
            batch_ids.append(entry.get(FieldName.ITEM_ID, None))
            series, scale = self.scaling(to_pandas(entry))
            batch_scales.append(scale)
            batch_series.append(series)

        batch_forecast_indices = [
            pd.date_range(
                series.index[-1] + series.index.freq,
                freq=series.index.freq,
                periods=self.prediction_length,
            ) for series in batch_series
        ]

        batch_full_series = [
            series.append(
                pd.Series(
                    [None] * self.prediction_length,
                    index=forecast_index,
                )) for series, forecast_index in zip(batch_series,
                                                     batch_forecast_indices)
        ]

        output = np.zeros((len(batch_series), self.prediction_length),
                          dtype=self.dtype)

        for k in range(self.prediction_length):
            dfs = []
            for fs, idx in zip(batch_full_series, batch_forecast_indices):
                idx_k = idx[k]
                dfs.append(
                    get_features_dataframe(
                        fs[idx_k:idx_k],
                        time_features=self.time_features,
                        lag_indices=self.lag_indices,
                        past_data=fs[:idx_k][:-1],
                    ))
            df = pd.concat(dfs)
            out_k = self.ag_model.predict(df)
            output[:, k] = out_k
            for fs, idx, v in zip(batch_full_series, batch_forecast_indices,
                                  out_k):
                fs.at[idx[k]] = v

        for arr, scale, forecast_index, item_id in zip(output, batch_scales,
                                                       batch_forecast_indices,
                                                       batch_ids):
            yield self._to_forecast(
                scale * arr,
                forecast_index[0],
                item_id=item_id,
            )
Example #8
def run_example():
    dataset = get_dataset("electricity")
    serialize_path = Path("GluonTSTabularPredictor")
    estimator = TabularEstimator(
        freq="H",
        prediction_length=24,
        time_limit=10,  # ten seconds for training
        disable_auto_regression=True,  # makes prediction faster, but potentially less accurate
        last_k_for_val=24,  # split the last 24 targets from each time series to be the validation data
        quantiles_to_predict=None,
    )

    n_train = 5

    training_data = list(islice(dataset.train, n_train))

    predictor = estimator.train(training_data=training_data)

    os.makedirs(serialize_path, exist_ok=True)
    predictor.serialize(serialize_path)
    predictor = None
    predictor = Predictor.deserialize(serialize_path)
    forecasts = list(predictor.predict(training_data))

    for entry, forecast in zip(training_data, forecasts):
        ts = to_pandas(entry)
        plt.figure()
        plt.plot(ts[-7 * predictor.prediction_length :], label="target")
        forecast.plot()
        plt.show()
Example #9
 def check_consistency(entry, f1, f2):
     ts = to_pandas(entry)
     start_timestamp = ts.index[-1] + 1
     assert f1.samples.shape == (1, prediction_length)
     assert f1.start_date == start_timestamp
     assert f2.samples.shape == (1, prediction_length)
     assert f2.start_date == start_timestamp
     assert np.allclose(f1.samples, f2.samples)
Example #10
 def train(self, training_data: Dataset) -> TabularPredictor:
     # only one time series is passed in at a time,
     # so list(training_data)[0] is essentially that single time series
     dfs = [
         get_prediction_dataframe(to_pandas(entry))
         for entry in training_data
     ]
     df = pd.concat(dfs)
     ag_model = self.task.fit(df, label="target")
     return TabularPredictor(ag_model, self.freq, self.prediction_length)
Example #11
def multivar_df(ds):
    df = pd.DataFrame()
    for i in range(ds["target"].shape[0]):
        tmp = {}
        for k in ds:
            if k == "target":
                tmp["target"] = ds["target"][i]
            else:
                tmp[k] = ds[k]
        tmp_df = to_pandas(tmp).to_frame().rename(columns={0: f"ts_{i}"})
        df = pd.concat([df, tmp_df], axis=1, sort=True)

    return df.reset_index().rename(columns={"index": "time"})
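A hypothetical call illustrating the output shape (names and values are made up): an entry whose "target" is a (3, T) array becomes a DataFrame with a "time" column plus one column per component series.

import numpy as np
import pandas as pd

entry = {
    "start": pd.Timestamp("2021-01-01", freq="D"),
    "target": np.random.rand(3, 30),
}
df = multivar_df(entry)
print(df.columns.tolist())  # ['time', 'ts_0', 'ts_1', 'ts_2']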
Example #12
    def train(self, training_data: Dataset) -> TabularPredictor:
        dfs = [
            get_prediction_dataframe(to_pandas(entry))
            for entry in training_data
        ]
        df = pd.concat(dfs)

        ag_model = self.task.fit(df,
                                 label="target",
                                 problem_type="regression",
                                 **self.kwargs)

        return TabularPredictor(ag_model, self.freq, self.prediction_length)
Example #13
def plot_train_test_dataset_first(dataset):
    entry = next(iter(dataset.train_ds))
    train_series = to_pandas(entry)
    train_series.plot()
    plt.grid(which="both")
    plt.legend(["train series"], loc="upper left")
    plt.show()

    entry = next(iter(dataset.test_ds))
    test_series = to_pandas(entry)
    test_series.plot()
    plt.axvline(train_series.index[-1], color='r')  # end of train dataset
    plt.grid(which="both")
    plt.legend(["test series", "end of train series"], loc="upper left")
    plt.show()

    print(f"Length of forecasting window in test dataset: "
          f"{len(test_series) - len(train_series)}")
    print(f"Learning length: "
          f"{dataset.learning_length}")
    print(f"Recommended prediction horizon: "
          f"{dataset.prediction_length}")
    print(f"Frequency of the time series: {dataset.freq}")
Example #14
def evaluate_optimal_rec(
    predictions: pd.DataFrame,
    test_data: ListDataset,
    hierarchy_dict: Dict[int, List[int]],
) -> Dict[str, Dict[str, float]]:
    """ aggregates error metrics for each level of the hierarchy, calculated over data frame of point
        estimates (for example, those returned after optimal reconciliation) instead of probabilistic 
        forecast objects

    Arguments:
        predictions {pd.DataFrame} -- data frame of point predictions
        test_data {ListDataset} -- test dataset
        hierarchy_dict {Dict[int, List[int]]} -- mapping from hierarchy level to series prediction idxs included
            in that level of hierarchy

    Returns:
        Dict[str, Dict[str, float]] -- mapping of hierarchy level (0-indexed) to dictionaries of aggregated metrics 
            for that level of the hierarchy
    """

    evaluator = PointEstimateEvaluator()
    evaluations = {
        level: evaluator(
            [
                to_pandas(series)
                for series in np.array(list(test_data))[np.array(idxs)]
            ],
            predictions.values.T[np.array(idxs)],
        )
        for level, idxs in hierarchy_dict.items()
    }
    evaluations['all'] = evaluator(
        [to_pandas(series) for series in np.array(list(test_data))],
        predictions.values.T,
    )
    return evaluations
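A hypothetical call (all names below are placeholders): a three-series hierarchy where series 0 is the total and series 1-2 are its children, point_preds is a (horizon, 3) DataFrame of reconciled point forecasts, and test_ds is the matching ListDataset.

hierarchy = {0: [0], 1: [1, 2]}
metrics_by_level = evaluate_optimal_rec(point_preds, test_ds, hierarchy)
print(metrics_by_level[0])      # aggregated metrics for the top level
print(metrics_by_level['all'])  # aggregated metrics across all series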
Example #15
    def _predict_batch_one_shot(
        self, dataset: Iterable[Dict], **kwargs
    ) -> Iterator[SampleForecast]:
        # TODO clean up
        # TODO optimize
        item_ids = []
        scales = []
        forecast_start_timestamps = []
        dfs = []

        for entry in dataset:
            item_ids.append(entry.get(FieldName.ITEM_ID, None))
            series, scale = self.scaling(to_pandas(entry))
            scales.append(scale)
            forecast_start = series.index[-1] + series.index.freq
            forecast_start_timestamps.append(forecast_start)
            forecast_index = pd.date_range(
                forecast_start,
                freq=series.index.freq,
                periods=self.prediction_length,
            )
            forecast_series = pd.Series(
                [None] * self.prediction_length,
                index=forecast_index,
            )
            dfs.append(
                get_features_dataframe(
                    forecast_series,
                    time_features=self.time_features,
                    lag_indices=self.lag_indices,
                    past_data=series,
                )
            )

        df = pd.concat(dfs)
        output = self.ag_model.predict(df)

        for arr, scale, forecast_start, item_id in zip(
            np.split(output, len(dfs)),
            scales,
            forecast_start_timestamps,
            item_ids,
        ):
            yield self._to_forecast(
                scale * arr,
                forecast_start,
                item_id=item_id,
            )
Example #16
def compute_time_features(
    entry: Dict,
    time_features: List[TimeFeature],
    pred_length: int = 0,
    dtype=np.float32,
):
    assert pred_length >= 0
    index = to_pandas(entry, freq=entry["start"].freq).index

    if pred_length > 0:
        index = index.union(
            pd.period_range(index[-1] + 1,
                            index[-1] + pred_length,
                            freq=index.freq))

    feature_arrays = [feat(index) for feat in time_features]
    return np.vstack(feature_arrays).astype(dtype)
Example #17
def fit_predict_arima(
    training_data: ListDataset,
    horizon: int = 12,
    output_file: str = None,
    output_residuals: bool = True,
) -> pd.DataFrame:
    """ for each time series in the training_data individually:
            1) automatically discovers the optimal order for a seasonal ARIMA model 
            2) fits discovered model
            3) makes predictions horizon length into the future

        optionally writes predictions/in-sample residuals to output file

    Arguments:
        training_data {ListDataset} -- training data
    
    Keyword Arguments:
        horizon {int} -- prediction length (default: {12})
        output_file {str} -- output_file to save predictions (default: {None})
        output_residuals {bool} -- whether to output the residuals of in-sample predictions. If True, 
            the in-sample residuals will be prepended to the out-of-sample predictions. Thus, 
            if the in-sample data contains 24 timesteps, and the out-of-sample data contains 6 timesteps,
            the output data frame will contain 30 rows (timesteps) (default: {True})

    Returns:
        pd.DataFrame -- dataframe of point predictions from individually fitted ARIMA models,
            each column represents a series and each row a future point in time
    """

    fits = [
        pm.auto_arima(to_pandas(train_series), suppress_warnings=True, error_action='ignore')
        for train_series in list(training_data)
    ]

    preds = pd.DataFrame([fit.predict(n_periods=horizon) for fit in fits]).T   

    if output_file:
        if output_residuals:
            residuals = pd.DataFrame([
                fit.predict_in_sample() - series['target'] for fit, series in zip(fits, training_data)
            ]).T
            preds = pd.concat([residuals, preds])
        preds.to_csv(output_file, index=False)

    return preds   
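A hypothetical usage of the helper above (synthetic data, purely illustrative): fit per-series ARIMA models on two short monthly series and forecast six steps ahead.

import numpy as np
from gluonts.dataset.common import ListDataset

toy_data = ListDataset(
    [{"start": "2020-01-01", "target": np.random.rand(36)},
     {"start": "2020-01-01", "target": np.random.rand(36)}],
    freq="M")
preds = fit_predict_arima(toy_data, horizon=6)
print(preds.shape)  # one row per forecast step, one column per series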
Example #18
    def _predict_serial(
        self, dataset: Iterable[Dict], **kwargs
    ) -> Iterator[SampleForecast]:
        for entry in dataset:
            series, scale = self.scaling(to_pandas(entry))

            forecast_index = pd.date_range(
                series.index[-1] + series.index.freq,
                freq=series.index.freq,
                periods=self.prediction_length,
            )

            forecast_series = pd.Series(
                [None] * len(forecast_index),
                index=forecast_index,
            )

            full_series = series.append(forecast_series)

            if not self.auto_regression:  # predict all at once
                df = get_features_dataframe(
                    forecast_series,
                    time_features=self.time_features,
                    lag_indices=self.lag_indices,
                    past_data=series,
                )
                full_series[forecast_series.index] = self.ag_model.predict(df)

            else:  # predict step by step
                for idx in forecast_series.index:
                    df = get_features_dataframe(
                        forecast_series[idx:idx],
                        time_features=self.time_features,
                        lag_indices=self.lag_indices,
                        past_data=full_series[:idx][:-1],
                    )
                    full_series[idx] = self.ag_model.predict(df).item()

            yield self._to_forecast(
                scale * full_series[forecast_index].values.astype(self.dtype),
                forecast_index[0],
                item_id=entry.get(FieldName.ITEM_ID, None),
            )
Example #19
 def train(self, training_data: Dataset) -> TabularPredictor:
     dfs = [
         get_features_dataframe(
             series=self.scaling(to_pandas(entry))[0],
             time_features=self.time_features,
             lag_indices=self.lag_indices,
         ) for entry in training_data
     ]
     df = pd.concat(dfs)
     ag_model = self.task.fit(df,
                              label="target",
                              problem_type="regression",
                              **self.kwargs)
     return TabularPredictor(
         ag_model=ag_model,
         freq=self.freq,
         prediction_length=self.prediction_length,
         time_features=self.time_features,
         lag_indices=self.lag_indices,
         scaling=self.scaling,
         batch_size=self.batch_size,
     )
Example #20
def test_autogluon_tabular():
    # create a dataset
    dataset = ListDataset(
        [
            {
                "start": pd.Timestamp("1750-01-04 00:00:00", freq="W-SUN"),
                "target": np.array(
                    [1089.2, 1078.91, 1099.88, 35790.55, 34096.95, 34906.95]),
            },
            {
                "start": pd.Timestamp("1750-01-04 00:00:00", freq="W-SUN"),
                "target": np.array(
                    [1099.2, 1098.91, 1069.88, 35990.55, 34076.95, 34766.95]),
            },
        ],
        freq="W-SUN",
    )
    prediction_length = 2
    freq = "W-SUN"
    predictor = LocalTabularPredictor(
        freq=freq,
        prediction_length=prediction_length,
    )
    forecasts_it = predictor.predict(dataset)
    forecasts = list(forecasts_it)

    for entry, forecast in zip(dataset, forecasts):
        ts = to_pandas(entry)
        start_timestamp = ts.index[-1] + pd.tseries.frequencies.to_offset(freq)
        assert forecast.samples.shape[1] == prediction_length
        assert forecast.start_date == start_timestamp

    return forecasts
Example #21
 def __init__(self,
              dataset=None,
              custom_dataset=None,
              start=None,
              freq=None,
              prediction_length=None,
              learning_length=None,
              context_length=100,
              cardinality=None):
     if dataset is not None:
         self.learning_length = len(to_pandas(next(iter(dataset.train))))
         self.prediction_length = dataset.metadata.prediction_length
         self.freq = dataset.metadata.freq
         self.test_ds = dataset.test
         self.train_ds = dataset.train
         self.context_length = context_length
         self.cardinality = list([1])
     elif custom_dataset is not None:
         self.freq = freq
         self.start = start
         self.learning_length = learning_length
         self.prediction_length = prediction_length
         self.context_length = context_length
         self.cardinality = cardinality
         # train dataset: cut the last window of length "prediction_length",
         # add "target" and "start" fields
         self.train_ds = ListDataset([{
             'target': x,
             'start': start
         } for x in custom_dataset[:, :-prediction_length]],
                                     freq=freq)
         # test dataset: use the whole dataset, add "target" and "start"
         self.test_ds = ListDataset([{
             'target': x,
             'start': start
         } for x in custom_dataset],
                                    freq=freq)
Example #22
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from gluonts.dataset import common
from gluonts.dataset.util import to_pandas
from gluonts.model import deepar
from gluonts.trainer import Trainer

csv_path = '/Users/seenli/Documents/workspace/code/pytorch_learn2/time_series_DL/Twitter_volume_AMZN.csv'
df = pd.read_csv(csv_path, header=0, sep=',')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index(['timestamp'], inplace=True)

# print(df.value[:"2015-04-22 20:47:53"])  # the final timestamp, [2015-04-22 20:47:53], is included in the slice
# print(df.value[:"2015-04-23 20:47:53"])  # if the given timestamp is beyond the range of the data, only the available data is returned
# print("start timestamp", df.index[0])  # 'start' is the first timestamp; 'target' holds the series values for each timestamp
data = common.ListDataset([{'start': df.index[0], 'target': df.value[:"2015-04-22 21:00:00"]}], freq='H')  # this data format is fixed
# here df.index holds the timestamps and df.value the corresponding values

estimator = deepar.DeepAREstimator(
    freq='H',
    prediction_length=24,
    trainer=Trainer(epochs=50)
)

predictor = estimator.train(training_data=data)

predictor.serialize(Path("/Users/seenli/Documents/workspace/code/pytorch_learn2/time_series_DL/model_save"))
for train_entry, predict_result in zip(data, predictor.predict(data)):
    to_pandas(train_entry)[-60:].plot(linewidth=2)
    predict_result.plot(color='g', prediction_intervals=[50.0, 90.0])
plt.grid(which='both')
plt.show()
## print the prediction results
prediction = next(predictor.predict(data))
print(prediction.mean)
prediction.plot(output_file='graph.png')
Example #23
def generate_rolling_dataset(
    dataset: Dataset,
    strategy,
    start_time: pd.Timestamp,
    end_time: Optional[pd.Timestamp] = None,
) -> Dataset:
    """
    Returns an augmented version of the input dataset where each timeseries has
    been rolled upon based on the parameters supplied. Below follows an
    explanation and examples of how the different parameters can be used to generate
    differently rolled datasets.

    The *rolling* happens on the data available in the provided window between the
    *start_time* and the *end_time* for each timeseries. If *end_time* is omitted, rolling
    happens on all datapoints from *start_time* until the end of the timeseries.
    The way the data is rolled is governed by the strategy used.

    Below examples will be based on this one timeseries long dataset

    >>> ds = [{
    ...     "target": np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]),
    ...     "start": pd.Timestamp('2000-1-1-01', freq='1H')
    ... }]

    applying generate_rolling_dataset on this dataset like:

    >>> rolled = generate_rolling_dataset(
    ...     dataset=ds,
    ...     strategy = StepStrategy(prediction_length=2),
    ...     start_time = pd.Timestamp('2000-1-1-06', '1H'),
    ...     end_time = pd.Timestamp('2000-1-1-10', '1H')
    ... )

    Results in a new dataset as follows (only target values shown for brevity):

        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9]\n
        [1, 2, 3, 4, 5, 6, 7, 8]\n
        [1, 2, 3, 4, 5, 6, 7]\n

    i.e. maximum amount of rolls possible between the *end_time* and *start_time*.
    The StepStrategy only cuts the last value of the target for as long as
    there is enough values after *start_time* to perform predictions on.

    When no end time is provided the output is as below since all datapoints
    from *start_time* will be rolled over.

        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9]\n
        [1, 2, 3, 4, 5, 6, 7, 8]\n
        [1, 2, 3, 4, 5, 6, 7]

    One can change the step_size of the strategy as below:

    >>> strategy = StepStrategy(prediction_length=2, step_size=2)


    This causes fewer values to be in the output which,
    when prediction_length matches step_size, ensures that each prediction
    will be done on unique/new data. Below is the output when the above strategy is used.

        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n
        [1, 2, 3, 4, 5, 6, 7, 8]

    Not setting an end time and using the step_size=2 results in
    the below dataset.

        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n
        [1, 2, 3, 4, 5, 6, 7, 8, 9]\n
        [1, 2, 3, 4, 5, 6, 7]

    Parameters
    ----------
    dataset
        Dataset to generate the rolling forecasting datasets from
    strategy
        The strategy that is to be used when rolling
    start_time
        The start of the window where rolling forecasts should be applied
    end_time
        The end time of the window where rolling should be applied

    Returns
    ----------
    Dataset
        The augmented dataset


    """
    assert dataset, "a dataset to perform rolling evaluation on is needed"
    assert start_time, "a pandas Timestamp object is needed for the start time"
    assert strategy, """a strategy to use when rolling is needed, for example
        gluonts.dataset.rolling_dataset.StepStrategy"""
    if end_time:
        assert end_time > start_time, "end time has to be after the start time"

    ds = []
    for item in dataset:
        series = to_pandas(item, start_time.freq)
        base = series[:start_time][:-1].to_numpy()
        prediction_window = series[start_time:end_time]

        for window in strategy.get_windows(prediction_window):
            new_item = item.copy()
            new_item[FieldName.TARGET] = np.concatenate(
                [base, window.to_numpy()]
            )
            new_item = truncate_features(
                new_item, len(new_item[FieldName.TARGET])
            )
            ds.append(new_item)

    return ds
Example #24
import mxnet as mx
from mxnet import gluon
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

from gluonts.dataset.repository.datasets import get_dataset, dataset_recipes
from gluonts.dataset.util import to_pandas

print(f"Available datasets: {list(dataset_recipes.keys())}")

dataset = get_dataset("m4_hourly", regenerate=False)

entry = next(iter(dataset.train))
train_series = to_pandas(entry)
train_series.plot()
plt.grid(which="both")
plt.legend(["train series"], loc="upper left")
plt.show()

entry = next(iter(dataset.test))
test_series = to_pandas(entry)
test_series.plot()
plt.axvline(train_series.index[-1], color='r')  # end of train dataset
plt.grid(which="both")
plt.legend(["test series", "end of train series"], loc="upper left")
plt.show()

from gluonts.model.simple_feedforward import SimpleFeedForwardEstimator
from gluonts.trainer import Trainer
Example #25
    def train(
        self,
        training_data: Dataset,
        validation_data: Optional[Dataset] = None,
    ) -> TabularPredictor:

        kwargs_override = {}

        dfs = [
            get_features_dataframe(
                series=self.scaling(to_pandas(entry))[0],
                time_features=self.time_features,
                lag_indices=self.lag_indices,
            ) for entry in training_data
        ]
        if validation_data is not None or self.last_k_for_val is not None:
            kwargs_override["auto_stack"] = False
            logger.warning(
                "Auto Stacking is turned off "
                "as validation dataset is provided before input into Tabular Predictor."
            )

        if validation_data is not None:
            logger.log(20, "Validation dataset is directly provided.")
            validation_dfs = [
                get_features_dataframe(
                    series=self.scaling(to_pandas(entry))[0],
                    time_features=self.time_features,
                    lag_indices=self.lag_indices,
                ) for entry in validation_data
            ]
            train_df = pd.concat(dfs)
            val_df = pd.concat(validation_dfs)
        elif self.last_k_for_val is not None:
            logger.log(
                20,
                f"last_k_for_val is provided, choosing last {self.last_k_for_val} of each time series as validation set.",
            )
            train_dfs = [
                tmp_df.iloc[:-self.last_k_for_val, :] for tmp_df in dfs
            ]
            validation_dfs = [
                tmp_df.iloc[-self.last_k_for_val:, :] for tmp_df in dfs
            ]
            train_df = pd.concat(train_dfs)
            val_df = pd.concat(validation_dfs)
        else:
            logger.log(
                20,
                "No validation dataset is provided, will let TabularPredictor do the splitting automatically,"
                "Note that this might break the time order of time series data.",
            )
            train_df = pd.concat(dfs)
            val_df = None

        if self.quantiles_to_predict is not None:
            ag_model = AutogluonTabularPredictor(
                label="target",
                problem_type="quantile",
                quantile_levels=self.quantiles_to_predict,
            ).fit(
                train_df,
                tuning_data=val_df,
                **{
                    **self.kwargs,
                    **kwargs_override
                },
            )
        else:
            ag_model = AutogluonTabularPredictor(
                label="target",
                problem_type="regression",
                eval_metric=self.eval_metric,
            ).fit(
                train_df,
                tuning_data=val_df,
                **{
                    **self.kwargs,
                    **kwargs_override
                },
            )

        return TabularPredictor(
            ag_model=ag_model,
            freq=self.freq,
            prediction_length=self.prediction_length,
            time_features=self.time_features,
            lag_indices=self.lag_indices,
            scaling=self.scaling,
            batch_size=self.batch_size,
            quantiles_to_predict=self.quantiles_to_predict,
        )
Example #26
def quick_start_tutorial():
	# Provided datasets.

	print(f"Available datasets: {list(dataset_recipes.keys())}")

	dataset = get_dataset("m4_hourly", regenerate=True)

	entry = next(iter(dataset.train))

	plt.figure()
	train_series = to_pandas(entry)
	train_series.plot()
	plt.grid(which="both")
	plt.legend(["train series"], loc="upper left")

	entry = next(iter(dataset.test))

	plt.figure()
	test_series = to_pandas(entry)
	test_series.plot()
	plt.axvline(train_series.index[-1], color="r")  # End of train dataset.
	plt.grid(which="both")
	plt.legend(["test series", "end of train series"], loc="upper left")

	plt.show()

	#--------------------
	# Custom datasets.

	N = 10  # Number of time series.
	T = 100  # Number of timesteps.
	prediction_length = 24
	freq = "1H"
	custom_dataset = np.random.normal(size=(N, T))
	start = pd.Timestamp("01-01-2019", freq=freq)  # Can be different for each time series.

	# Train dataset: cut the last window of length "prediction_length", add "target" and "start" fields.
	train_ds = ListDataset(
		[{"target": x, "start": start} for x in custom_dataset[:, :-prediction_length]],
		freq=freq
	)
	# Test dataset: use the whole dataset, add "target" and "start" fields.
	test_ds = ListDataset(
		[{"target": x, "start": start} for x in custom_dataset],
		freq=freq
	)

	#--------------------
	# Training an existing model (Estimator).

	estimator = SimpleFeedForwardEstimator(
		num_hidden_dimensions=[10],
		prediction_length=dataset.metadata.prediction_length,
		context_length=100,
		freq=dataset.metadata.freq,
		trainer=Trainer(
			ctx="cpu",
			epochs=5,
			learning_rate=1e-3,
			num_batches_per_epoch=100
		)
	)

	predictor = estimator.train(dataset.train)

	#--------------------
	# Visualize and evaluate forecasts.

	forecast_it, ts_it = make_evaluation_predictions(
		dataset=dataset.test,  # Test dataset.
		predictor=predictor,  # Predictor.
		num_samples=100,  # Number of sample paths we want for evaluation.
	)

	forecasts = list(forecast_it)
	tss = list(ts_it)

	# First entry of the time series list.
	ts_entry = tss[0]

	# First 5 values of the time series (convert from pandas to numpy).
	print(np.array(ts_entry[:5]).reshape(-1,))

	# First entry of dataset.test.
	dataset_test_entry = next(iter(dataset.test))

	# First 5 values.
	print(dataset_test_entry["target"][:5])

	# First entry of the forecast list.
	forecast_entry = forecasts[0]

	print(f"Number of sample paths: {forecast_entry.num_samples}")
	print(f"Dimension of samples: {forecast_entry.samples.shape}")
	print(f"Start date of the forecast window: {forecast_entry.start_date}")
	print(f"Frequency of the time series: {forecast_entry.freq}")

	print(f"Mean of the future window:\n {forecast_entry.mean}")
	print(f"0.5-quantile (median) of the future window:\n {forecast_entry.quantile(0.5)}")

	def plot_prob_forecasts(ts_entry, forecast_entry):
		plot_length = 150
		prediction_intervals = (50.0, 90.0)
		legend = ["observations", "median prediction"] + [f"{k}% prediction interval" for k in prediction_intervals][::-1]

		fig, ax = plt.subplots(1, 1, figsize=(10, 7))
		ts_entry[-plot_length:].plot(ax=ax)  # Plot the time series.
		forecast_entry.plot(prediction_intervals=prediction_intervals, color="g")
		plt.grid(which="both")
		plt.legend(legend, loc="upper left")
		plt.show()

	plot_prob_forecasts(ts_entry, forecast_entry)

	evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
	agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=len(dataset.test))

	print(json.dumps(agg_metrics, indent=4))
	print(item_metrics.head())

	item_metrics.plot(x="MSIS", y="MASE", kind="scatter")
	plt.grid(which="both")

	plt.show()
Example #27
    def cv_harness_trainer(self):

        list_y_pred = []
        list_score = []

        models = self.model
        type_time_series = self.type_time_series

        train_data = self.train_data
        val_data = self.val_data
        n_splits = self.n_splits
        out_path = self.out_path
        freq = self.freq
        prediction_length = self.prediction_length
        start_date = self.start_date
        train_entry = next(iter(train_data))
        val_entry = next(iter(val_data))

        for models_ in models.get(type_time_series):

            for key, value in models_.items():
                model_name = key
                model = value[0]
                model_kwarg = value[1]

                self.__log.info(
                    "Starting generic cv train loop for Type_of_time_series={}, mlmodel={}, modelkwarg={}"
                    .format(type_time_series, model_name, model_kwarg))

                self.__log.info("Regression")
                splits = TimeSeriesSplit(n_splits=n_splits)

                X_train = to_pandas(train_entry).values
                X_val = to_pandas(val_entry).values

                # start the cross validation loop
                self._internal_cv_trainer(
                    models=model,
                    model_name=model_name,
                    X_train=X_train,
                    X_test=X_val,
                    ss=splits,
                    n_splits=n_splits,
                    list_y_pred=list_y_pred,
                    list_score=list_score,
                    out_path=out_path,
                    start_date=start_date,
                    prediction_length=prediction_length,
                    freq=freq,
                    type_time_series=type_time_series,
                )

                # output the dataframe of predicted vals with index as sample numbers
                y_pred_list_df = pd.concat(list_y_pred, axis=1)

                # one liner to remove duplicate columns
                y_pred_list_df = y_pred_list_df.loc[:, ~y_pred_list_df.columns.
                                                    duplicated()]
                y_pred_list_df.set_index("index", inplace=True)

                y_pred_list_df_path_csv = os.path.join(
                    out_path,
                    "saved_models",
                    "y_pred_list_df.csv",
                )

                y_pred_list_df.to_csv(y_pred_list_df_path_csv)

                # output the dataframe of scores with index as sample numbers
                score_list_df = pd.concat(list_score, axis=1)

                # one liner to remove duplicate columns
                score_list_df = score_list_df.loc[:, ~score_list_df.columns.
                                                  duplicated()]

                score_list_df_path_csv = os.path.join(
                    out_path, "saved_models",
                    str(type_time_series + "_" + model_name + "_" +
                        "regression" + "_score_list_df.csv"))
                # print(score_list_df)
                # print(score_list_df_path_csv)
                score_list_df.to_csv(score_list_df_path_csv)

        return y_pred_list_df, score_list_df
Example #28
    def _internal_cv_trainer(
        self,
        model_name: str,
        models: object,
        type_time_series: str,
        X: np.array,
        ss: object,
        n_splits: int,
        list_y_pred: list,
        list_score: list,
        out_path: str,
        freq: str,
        prediction_length: int,
        start_date: str,
    ):
        """ Cross validation training loop for an individual regression models.

        Parameters
        ----------
        :param: ``X`` : ``np.array``
            Array of time series data.

        :param: ``model_name`` : ``str``
            Machine learning model type.

        :param: ``models`` : ``regression model class object``
            Within the training loop, an instantiated model is passed to this method.

        :param: ``type_time_series`` : ``dict``
            Type of time series to train the model on, e.g., teams or by position.

        :param: ``prediction_length`` : ``int``
            Length of the prediction horizon.

        :param: ``start_date`` : ``str``
            Start date of the time series.

        :param: ``freq`` : ``str, e.g. '1D', '2H', '3S'``
            Frequency of the data to train on and predict.

        :param: ``ss`` : ``scikit-learn split iterator object``
            This is an instantiated split iterator object to control time series cross validation
            splitting within the cross validator.

        :param: ``n_splits`` : ``int``
            The number of splits to divide the data into during cross validation.

        :param: ``list_y_pred`` : ``list``
            A tracking list entity for the predicted values within each model cross-validation loop.

        :param: ``list_score`` : ``list``
            A tracking list entity for the scored values within each model cross-validation loop.

        :param: ``out_path`` : ``str``
            This is the file path for generated models and scores.


        Returns
        -------
        :return: ``None``

        """

        split_num = 0
        y_test_indices = []

        y_pred_df = pd.DataFrame()
        y_pred_ = []
        y_true_ = []

        score_df = pd.DataFrame()
        score_1 = []
        score_2 = []

        for train_index, test_index in ss.split(X=X):

            split_num += 1
            self.__log.info("%%--%%")
            self.__log.info("Cross fold: %i of %i", split_num, n_splits)

            # workaround: the held-out fold from the split is not used directly; the training fold is re-split below into train/test ListDatasets
            dataset, X_test = X[train_index], X[test_index]
            start = pd.Timestamp(start_date, freq=freq)
            X_train = ListDataset([{
                'target': x,
                'start': start
            } for x in dataset.reshape(1, -1)[:, :-prediction_length]],
                                  freq='1H')
            X_test = ListDataset([{
                'target': x,
                'start': start
            } for x in dataset.reshape(1, -1)],
                                 freq='1H')
            y_pred_temp, score_1_temp, score_2_temp = self._cv_train_model_other(
                models=models,
                #                     model_kwarg=model_kwarg,
                X_train=X_train,
                X_test=X_test,
                split_num=split_num,
                out_path=out_path,
                model_name=model_name,
                type_time_series=type_time_series)

            self.__log.info("Score_1_temp={}".format(score_1_temp))

            dataset_ = next(iter(X_test))
            dataset_pd = to_pandas(dataset_)
            y_test = dataset_pd[-prediction_length:].index
            y_test_indices.append(y_test.values)
            y_pred_.append(y_pred_temp)
            y_true_.append(dataset_pd.loc[y_test].values)

            # keep track of the scores during loops
            score_1.append([score_1_temp])
            score_2.append([score_2_temp])

        y_pred_df[str(model_name + "_" +
                      "regression")] = np.concatenate(y_pred_).ravel()

        y_pred_df["index"] = np.concatenate(y_test_indices).ravel()

        y_pred_df[str("y_true_val")] = np.concatenate(y_true_).ravel()

        score_df[str(model_name + "_" + "regression" + "_" +
                     "mse")] = np.concatenate(score_1).ravel()

        score_df[str(model_name + "_" + "regression" + "_" +
                     "rmse")] = np.concatenate(score_2).ravel()

        score_df[str(model_name + "_" + "regression" + "_" +
                     "global_mse_ave")] = score_df[str(model_name + "_" +
                                                       "regression" + "_" +
                                                       "mse")].mean()

        score_df[str(model_name + "_" + "regression" + "_" +
                     "global_mse_med")] = score_df[str(model_name + "_" +
                                                       "regression" + "_" +
                                                       "rmse")].median()

        score_df[str(model_name + "_" + "regression" + "_" +
                     "global_mse_std")] = score_df[str(model_name + "_" +
                                                       "regression" + "_" +
                                                       "mse")].std()

        score_df[str(model_name + "_" + "regression" + "_" +
                     "global_rmse_ave")] = score_df[str(model_name + "_" +
                                                        "regression" + "_" +
                                                        "rmse")].mean()

        score_df[str(model_name + "_" + "regression" + "_" +
                     "global_rmse_med")] = score_df[str(model_name + "_" +
                                                        "regression" + "_" +
                                                        "rmse")].median()

        score_df[str(model_name + "_" + "regression" + "_" +
                     "global_rmse_std")] = score_df[str(model_name + "_" +
                                                        "regression" + "_" +
                                                        "rmse")].std()

        path_score_list_path_csv = os.path.join(
            out_path,
            "saved_models",
            str(
                str(type_time_series) + "_" + str(model_name) + "_" +
                str(split_num) + "_" + "regression" + "_" +
                "_cv_score_list.csv"),
        )

        score_df.to_csv(path_score_list_path_csv)

        list_y_pred.append(y_pred_df)
        list_score.append(score_df)
Example #29
)

# Create testing dataset
testing_data_plots = ListDataset(
    [{"start": test_data[target_asset].index[0],
      "target": test_data[target_asset]}],
    freq = "1d"
)
#Create the estimator and train
estimator = DeepAREstimator(freq="1d", prediction_length=pred_len, trainer=Trainer(epochs=100))
predictor = estimator.train(training_data=training_data)

### OPTIONAL PLOT PREDICTION RESULTS
#Forecast
for test_entry, forecast in zip(testing_data_plots, predictor.predict(testing_data)):
    to_pandas(test_entry)[-60:].plot(linewidth=2)
    forecast.plot(color='g', prediction_intervals=[50.0, 90.0])
    
### GENERATE FORECASTS  
from gluonts.evaluation import Evaluator
from gluonts.evaluation.backtest import make_evaluation_predictions
forecast_it, ts_it = make_evaluation_predictions(
    dataset=testing_data,  # test dataset
    predictor=predictor,  # predictor
    num_samples=100,  # number of sample paths we want for evaluation
)


###### VERY SIMPLE TRADING STRATEGY
#      Signal: If Forecast > Current Price, Buy
                     "     <      "       , Short
Example #30
    freq="H")

estimator = deepar.DeepAREstimator(prediction_length=24,
                                   context_length=100,
                                   use_feat_static_cat=True,
                                   use_feat_dynamic_real=True,
                                   num_parallel_samples=100,
                                   cardinality=[2, 1],
                                   freq="H",
                                   trainer=Trainer(ctx="cpu",
                                                   epochs=200,
                                                   learning_rate=1e-3))
predictor = estimator.train(training_data=train_data)

for test_entry, forecast in zip(test_data, predictor.predict(test_data)):
    to_pandas(test_entry)[-100:].plot(figsize=(12, 5), linewidth=2)
    forecast.plot(color='g', prediction_intervals=[50.0, 90.0])
plt.grid(which='both')
plt.legend([
    "past observations", "median prediction", "90% prediction interval",
    "50% prediction interval"
])
plt.show()

prediction = next(predictor.predict(test_data))
print(prediction.mean)
prediction.plot(output_file='graph.png')

predictor.serialize(
    Path("/home/root/mxnetTS/GluonTS-Learning-in-Action/chapter-2/model"))
# predictor = Predictor.deserialize(Path("/home/root/mxnetTS/GluonTS-Learning-in-Action/chapter-2/model"))