Example #1
0
    def fit(self,
            X_df,
            y_df,
            X_test_df=None,
            y_test_df=None,
            y_hat_benchmark='y_hat_naive2',
            warm_start=False,
            shuffle=True,
            verbose=True):
        """
        Fit ESRNN model.

        Parameters
        ----------
        X_df : pandas dataframe
            Train dataframe in long format with columns 'unique_id', 'ds'
            and 'x'.
            - 'unique_id' identifies each independent time series.
            - 'ds' is a datetime (or integer) column.
            - 'x' is a single exogenous variable.
        y_df : pandas dataframe
            Train dataframe in long format with columns 'unique_id', 'ds'
            and 'y', where 'y' holds the target values.
        X_test_df : pandas dataframe, optional
            Test dataframe with columns 'unique_id', 'ds' and 'x'.  If
            provided together with y_test_df, intermediate performance is
            evaluated within training epochs.
        y_test_df : pandas dataframe, optional
            Test dataframe with columns 'unique_id', 'ds', 'y' and the
            y_hat_benchmark column.
        y_hat_benchmark : str
            Name of the benchmark column used to compute the relative
            improvement of the model.
        warm_start : bool
            If True, continue training from the current model weights.
        shuffle : bool
            If True, shuffle the training batches each epoch.
        verbose : bool
            If True, print training progress.

        Returns
        -------
        self : returns an instance of self.
        """
        # Transform long dfs to wide numpy.
        # isinstance (instead of type ==) also accepts DataFrame subclasses.
        assert isinstance(X_df, pd.DataFrame)
        assert isinstance(y_df, pd.DataFrame)
        assert all(col in X_df for col in ['unique_id', 'ds', 'x'])
        assert all(col in y_df for col in ['unique_id', 'ds', 'y'])
        if y_test_df is not None:
            assert y_hat_benchmark in y_test_df.columns, 'benchmark is not present in y_test_df, use y_hat_benchmark to define it'

        # Storing dfs for OWA evaluation, initializing min_owa
        self.y_train_df = y_df
        self.X_test_df = X_test_df
        self.y_test_df = y_test_df
        self.min_owa = 4.0
        self.min_epoch = 0

        # np.int was removed in NumPy 1.24; np.integer covers every numpy
        # integer dtype.  .iloc[0] is positional, so this also works when
        # the dataframe index is not a default RangeIndex.
        self.int_ds = isinstance(self.y_train_df['ds'].iloc[0],
                                 (int, np.integer))

        self.y_hat_benchmark = y_hat_benchmark

        X, y = self.long_to_wide(X_df, y_df)
        assert len(X) == len(y)
        assert X.shape[1] >= 3

        # Exogenous variables: map each category to a contiguous index.
        unique_categories = np.unique(X[:, 1])
        self.mc.category_to_idx = {
            word: index for index, word in enumerate(unique_categories)}
        exogenous_size = len(unique_categories)

        # Create batches (device in mc)
        self.train_dataloader = Iterator(mc=self.mc, X=X, y=y)

        # Random Seeds (model initialization)
        torch.manual_seed(self.mc.random_seed)
        np.random.seed(self.mc.random_seed)

        # Initialize model
        n_series = self.train_dataloader.n_series
        self.instantiate_esrnn(exogenous_size, n_series)

        # Validating frequencies: all provided dataframes must share a
        # single inferred frequency.
        X_train_frequency = pd.infer_freq(X_df.head()['ds'])
        y_train_frequency = pd.infer_freq(y_df.head()['ds'])
        self.frequencies = [X_train_frequency, y_train_frequency]

        if (X_test_df is not None) and (y_test_df is not None):
            X_test_frequency = pd.infer_freq(X_test_df.head()['ds'])
            y_test_frequency = pd.infer_freq(y_test_df.head()['ds'])
            self.frequencies += [X_test_frequency, y_test_frequency]

        assert len(set(self.frequencies)) <= 1, \
          "Match the frequencies of the dataframes {}".format(self.frequencies)

        self.mc.frequency = self.frequencies[0]
        print("Infered frequency: {}".format(self.mc.frequency))

        # Train model
        self._fitted = True
        self.train(dataloader=self.train_dataloader,
                   max_epochs=self.mc.max_epochs,
                   warm_start=warm_start,
                   shuffle=shuffle,
                   verbose=verbose)
Example #2
0
    def predict(self, X_df):
        """
        Predict using the ESRNN ensemble.

        Parameters
        ----------
        X_df : pandas dataframe
            Dataframe in long format containing at least 'unique_id'.
            When a 'ds' column is present, forecasts are merged on
            ('unique_id', 'ds'); otherwise on 'unique_id' alone.

        Returns
        -------
        Y_hat_panel : pandas dataframe
            Long-format dataframe with columns 'unique_id', 'ds' and
            'y_hat', left-merged onto X_df.
        """
        assert type(X_df) == pd.core.frame.DataFrame
        assert 'unique_id' in X_df
        assert self._fitted, "Model not fitted yet"

        dataloader = Iterator(mc=self.mc, X=self.X, y=self.y)

        horizon = self.mc.output_size
        n_series = len(dataloader.sort_key['unique_id'])

        # One (n_series x horizon) forecast slab per ensemble member.
        member_forecasts = np.zeros((self.n_models, n_series, horizon))

        for member_id, member in enumerate(self.esrnn_ensemble):
            member.esrnn.eval()

            # Forecast every series, batch by batch, filling the slab.
            offset = 0
            for _ in range(dataloader.n_batches):
                batch = dataloader.get_batch()
                n_in_batch = batch.y.shape[0]
                batch_y_hat = member.esrnn.predict(batch)
                batch_y_hat = batch_y_hat.data.cpu().numpy()
                member_forecasts[member_id,
                                 offset:offset + n_in_batch, :] = batch_y_hat
                offset += n_in_batch

        # Weighted average of the n_top best models per series:
        # (n_models x n_series x horizon) contracted with
        # (n_series x n_models) membership weights.
        y_hat = np.einsum('ijk,ji->jk', member_forecasts,
                          self.series_models_map) / self.n_top
        y_hat = y_hat.flatten()

        ids = pd.Series(dataloader.sort_key['unique_id']).repeat(horizon)
        last_ds = pd.Series(dataloader.X[:, 2]).repeat(horizon)

        # Forecast dates: 1..horizon steps past each series' last date.
        steps = pd.to_timedelta(list(range(1, horizon + 1)) * n_series,
                                unit=self.mc.frequency)
        forecast_ds = last_ds + steps

        assert len(forecast_ds) == len(y_hat) == len(ids)

        Y_hat_panel = pd.DataFrame.from_dict({
            'unique_id': ids,
            'ds': forecast_ds,
            'y_hat': y_hat
        })

        merge_cols = ['unique_id', 'ds'] if 'ds' in X_df else ['unique_id']
        Y_hat_panel = X_df.merge(Y_hat_panel, on=merge_cols, how='left')

        return Y_hat_panel
    def predict(self, X_df):
        """
        Predict with the fitted ESRNN ensemble for all stored time series.

        Parameters
        ----------
        X_df : pandas dataframe
            Dataframe in long format containing at least 'unique_id'.
            If a 'ds' column is present, forecasts are merged on
            ('unique_id', 'ds'); otherwise on 'unique_id' alone.

        Returns
        -------
        Y_hat_panel : pandas dataframe
            Long-format dataframe with columns 'unique_id', 'ds' and
            'y_hat' (ensemble forecast), left-merged onto X_df.
        """
        assert type(X_df) == pd.core.frame.DataFrame
        assert 'unique_id' in X_df
        assert self._fitted, "Model not fitted yet"

        # Rebuild an iterator over the training data stored at fit time.
        dataloader = Iterator(mc=self.mc, X=self.X, y=self.y)

        output_size = self.mc.output_size
        n_unique_id = len(dataloader.sort_key['unique_id'])

        # One (n_unique_id x output_size) forecast slab per ensemble member.
        ensemble_y_hat = np.zeros((self.n_models, n_unique_id, output_size))

        for model_id, esrnn in enumerate(self.esrnn_ensemble):
            esrnn.esrnn.eval()

            # Predict ALL series
            count = 0
            for j in range(dataloader.n_batches):
                batch = dataloader.get_batch()
                batch_size = batch.y.shape[0]

                y_hat = esrnn.esrnn.predict(batch)

                y_hat = y_hat.data.cpu().numpy()

                ensemble_y_hat[model_id, count:count + batch_size, :] = y_hat
                count += batch_size

        # Weighted average of prediction for n_top best models per series
        # (n_models x n_unique_id x output_size) (n_unique_id x n_models)
        y_hat = np.einsum('ijk,ji->jk', ensemble_y_hat,
                          self.series_models_map) / self.n_top
        y_hat = y_hat.flatten()

        # Build the forecast index: each series id repeated output_size
        # times, with dates 1..output_size steps past its last timestamp.
        panel_unique_id = pd.Series(
            dataloader.sort_key['unique_id']).repeat(output_size)
        panel_last_ds = pd.Series(dataloader.X[:, 2]).repeat(output_size)

        panel_delta = list(range(1, output_size + 1)) * n_unique_id
        panel_delta = pd.to_timedelta(panel_delta, unit=self.mc.frequency)
        panel_ds = panel_last_ds + panel_delta

        assert len(panel_ds) == len(y_hat) == len(panel_unique_id)

        Y_hat_panel_dict = {
            'unique_id': panel_unique_id,
            'ds': panel_ds,
            'y_hat': y_hat
        }

        Y_hat_panel = pd.DataFrame.from_dict(Y_hat_panel_dict)

        if 'ds' in X_df:
            Y_hat_panel = X_df.merge(Y_hat_panel,
                                     on=['unique_id', 'ds'],
                                     how='left')
        else:
            Y_hat_panel = X_df.merge(Y_hat_panel, on=['unique_id'], how='left')

        return Y_hat_panel
Example #4
0
    def train(self):
        """
        Auxiliary function, pytorch train procedure for the ESRNN ensemble.

        Each epoch trains every ensemble member on the subset of series
        currently assigned to it, scores every member on every series,
        and reassigns each series to its n_top best-performing members.

        Returns
        -------
        self : returns an instance of self.
        """
        # Initial performance matrix: (n_series x n_models) losses, seeded
        # with a large value so unscored entries are never selected as best.
        self.performance_matrix = np.ones(
            (self.mc.n_series, self.n_models)) * self.big_float
        warm_start = False
        # training_percentile is a percentage; pinball loss wants a fraction.
        train_tau = self.mc.training_percentile / 100
        criterion = DisaggregatedPinballLoss(train_tau)

        # Train epoch loop
        for epoch in range(self.mc.max_epochs):
            start = time.time()

            # Solve degenerate models: a member with no assigned series
            # gets a random half of all series so it keeps training.
            for model_id in range(self.n_models):
                if np.sum(self.series_models_map[:, model_id]) == 0:
                    print('Reassigning random series to model ', model_id)
                    n_sample_series = int(self.mc.n_series / 2)
                    index_series = np.random.choice(self.mc.n_series,
                                                    n_sample_series,
                                                    replace=False)
                    self.series_models_map[index_series, model_id] = 1

            # Model loop
            for model_id, esrnn in enumerate(self.esrnn_ensemble):
                # Train model with subset data: the weights column restricts
                # this member's batches to its assigned series.
                dataloader = Iterator(mc=self.mc,
                                      X=self.X,
                                      y=self.y,
                                      weights=self.series_models_map[:,
                                                                     model_id])
                esrnn.train(dataloader,
                            max_epochs=1,
                            warm_start=warm_start,
                            shuffle=self.shuffle,
                            verbose=False)

                # Compute model performance for each series (unweighted
                # iterator: every member is scored on every series).
                dataloader = Iterator(mc=self.mc, X=self.X, y=self.y)
                per_series_evaluation = esrnn.per_series_evaluation(
                    dataloader, criterion=criterion)
                self.performance_matrix[:, model_id] = per_series_evaluation

            # Reassign series to models: argpartition picks, per series,
            # the n_top members with the smallest loss.
            self.series_models_map = np.zeros(
                (self.mc.n_series, self.n_models))
            top_models = np.argpartition(self.performance_matrix,
                                         self.n_top)[:, :self.n_top]
            for i in range(self.mc.n_series):
                self.series_models_map[i, top_models[i, :]] = 1

            # After the first epoch, members continue from their weights.
            warm_start = True

            print("========= Epoch {} finished =========".format(epoch))
            print("Training time: {}".format(round(time.time() - start, 5)))
            # Mean over series of the average loss of each series' members.
            self.train_loss = np.einsum('ij,ij->i', self.performance_matrix,
                                        self.series_models_map) / self.n_top
            self.train_loss = np.mean(self.train_loss)
            print("Training loss ({} prc): {:.5f}".format(
                self.mc.training_percentile, self.train_loss))
            print('Models num series', np.sum(self.series_models_map, axis=0))

            # Periodic test-set evaluation (freq_of_test <= 0 disables it).
            if (epoch % self.mc.freq_of_test
                    == 0) and (self.mc.freq_of_test > 0):
                if self.y_test_df is not None:
                    self.evaluate_model_prediction(self.y_train_df,
                                                   self.X_test_df,
                                                   self.y_test_df,
                                                   epoch=epoch)
        print('Train finished! \n')
Example #5
0
    def fit(self,
            X_df,
            y_df,
            X_test_df=None,
            y_test_df=None,
            y_hat_benchmark='y_hat_naive2',
            warm_start=False,
            shuffle=True,
            verbose=True):
        """
        Fit ESRNN model.

        Parameters
        ----------
        X_df : pandas dataframe
            Train dataframe in long format with columns 'unique_id', 'ds'
            and 'x' (a single exogenous variable).
        y_df : pandas dataframe
            Train dataframe in long format with columns 'unique_id', 'ds'
            and 'y' (the target values).
        X_test_df : pandas dataframe, optional
            Test dataframe with columns 'unique_id', 'ds' and 'x'.
        y_test_df : pandas dataframe, optional
            Test dataframe with columns 'unique_id', 'ds', 'y' and the
            y_hat_benchmark column; enables intermediate evaluation.
        y_hat_benchmark : str
            Name of the benchmark column used for relative-improvement
            comparison.
        warm_start : bool
            If True, continue training from the current model weights.
        shuffle : bool
            If True, shuffle the training batches each epoch.
        verbose : bool
            If True, print training progress.

        Returns
        -------
        self : returns an instance of self.
        """
        # Transform long dfs to wide numpy.
        # isinstance (instead of type ==) also accepts DataFrame subclasses.
        assert isinstance(X_df, pd.DataFrame)
        assert isinstance(y_df, pd.DataFrame)
        assert all(col in X_df for col in ['unique_id', 'ds', 'x'])
        assert all(col in y_df for col in ['unique_id', 'ds', 'y'])
        if y_test_df is not None:
            assert y_hat_benchmark in y_test_df.columns, 'benchmark is not present in y_test_df, use y_hat_benchmark to define it'

        # Storing dfs for OWA evaluation, initializing min_owa
        self.y_train_df = y_df
        self.X_test_df = X_test_df
        self.y_test_df = y_test_df
        self.min_owa = 4.0
        self.min_epoch = 0

        # np.int was removed in NumPy 1.24; np.integer covers every numpy
        # integer dtype.  .iloc[0] is positional, so this also works when
        # the dataframe index is not a default RangeIndex.
        self.int_ds = isinstance(self.y_train_df['ds'].iloc[0],
                                 (int, np.integer))

        self.y_hat_benchmark = y_hat_benchmark

        X, y = self.long_to_wide(X_df, y_df)
        assert len(X) == len(y)
        assert X.shape[1] >= 3

        # Exogenous variables: map each category to a contiguous index.
        unique_categories = np.unique(X[:, 1])
        self.mc.category_to_idx = {
            word: index for index, word in enumerate(unique_categories)}
        exogenous_size = len(unique_categories)

        # Create batches (device in mc)
        self.train_dataloader = Iterator(mc=self.mc, X=X, y=y)

        # Random Seeds (model initialization)
        torch.manual_seed(self.mc.random_seed)
        np.random.seed(self.mc.random_seed)

        # Initialize model
        n_series = self.train_dataloader.n_series
        self.instantiate_esrnn(exogenous_size, n_series)

        # Infer freq of model (only when not explicitly configured).
        if self.mc.frequency is None:
            self.mc.frequency = pd.infer_freq(X_df.head()['ds'])
            print("Infered frequency: {}".format(self.mc.frequency))

        # Train model
        self._fitted = True
        self.train(dataloader=self.train_dataloader,
                   max_epochs=self.mc.max_epochs,
                   warm_start=warm_start,
                   shuffle=shuffle,
                   verbose=verbose)