Exemple #1
0
    def test_roll_timeseries_dataframe(self):
        """Check the array shapes returned by ``roll_timeseries_dataframe``.

        Covers a list-valued horizon, an integer horizon, a zero horizon
        (where ``y`` must be ``None``), and the same calls after the first
        value of column ``"A"`` has been overwritten with ``None``.
        """
        def rolled(**options):
            # The fixture attribute is re-read on every call, so the in-place
            # edit made further down is seen by the later calls.
            return roll_timeseries_dataframe(self.easy_data, **options)

        # Horizon given as a list: two sampled steps per window.
        inputs, labels = rolled(lookback=self.lookback,
                                horizon=[1, 3],
                                feature_col=["A"],
                                target_col=["B"])
        expected_samples = 8 - self.lookback
        assert inputs.shape == (expected_samples, self.lookback, 2)
        assert labels.shape == (expected_samples, 2, 1)

        # Integer horizon with two feature columns plus one target column.
        inputs, labels = rolled(lookback=self.lookback,
                                horizon=4,
                                feature_col=["A", "C"],
                                target_col=["B"])
        expected_samples = 7 - self.lookback
        assert inputs.shape == (expected_samples, self.lookback, 3)
        assert labels.shape == (expected_samples, 4, 1)

        # horizon=0 means no ground truth is produced.
        inputs, labels = rolled(lookback=2,
                                horizon=0,
                                feature_col=[],
                                target_col=["A"])
        assert inputs.shape == (9, 2, 1)
        assert labels is None

        # Blank out the first value of "A"; the assertions below expect one
        # sample fewer than before (presumably windows containing the missing
        # value are dropped -- confirm against roll_timeseries_dataframe).
        self.easy_data["A"][0] = None
        inputs, labels = rolled(lookback=2,
                                horizon=0,
                                feature_col=[],
                                target_col=["A"])
        assert inputs.shape == (8, 2, 1)
        assert labels is None

        inputs, labels = rolled(lookback=2,
                                horizon=2,
                                feature_col=["C"],
                                target_col=["A"])
        assert inputs.shape == (6, 2, 2)
        assert labels.shape == (6, 2, 1)
Exemple #2
0
    def roll(self,
             lookback,
             horizon,
             feature_col=None,
             target_col=None,
             id_sensitive=False):
        '''
        Sampling by rolling for machine learning/deep learning models.

        The rolled arrays are stored on the instance as `numpy_x` and `numpy_y`;
        the columns actually used are recorded in `roll_feature` and `roll_target`.

        :param lookback: int, lookback value.
        :param horizon: int or list,
               if `horizon` is an int, we will sample `horizon` steps
               continuously after the forecasting point.
               if `horizon` is a list, we will sample discretely according
               to the input list.
               specially, when `horizon` is set to 0, ground truth will be generated as None.
        :param feature_col: str or list, indicates the feature col name. Default to None,
               where we will take all available features in rolling.
        :param target_col: str or list, indicates the target col name. Default to None,
               where we will take all targets in rolling. It should be a subset of the
               target_col you used to initialize the tsdataset.
        :param id_sensitive: bool,
               if `id_sensitive` is False, we will roll on each id's sub dataframe
               and fuse the samplings by concatenating them along the sample axis.
               The shape of rolling will be
               x: (num_sample, lookback, num_feature_col + num_target_col)
               y: (num_sample, horizon, num_target_col)
               where num_sample is the summation of sample number of each dataframe.

               if `id_sensitive` is True, the per-id results are concatenated along
               the last axis and the columns of x are reordered so that the target
               columns of every id come first, followed by the feature columns.
               The shape of rolling will be
               x: (num_sample, lookback, new_num_feature_col + new_num_target_col)
               y: (num_sample, horizon, new_num_target_col)
               where new_num_feature_col is the product of the number of ids and the
               number of feature_col, and new_num_target_col is the product of the
               number of ids and the number of target_col.

        :return: the tsdataset instance.
        '''
        feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
            else self.feature_col
        target_col = _to_list(target_col, "target_col") if target_col is not None \
            else self.target_col
        if self.roll_addional_feature:
            # Split the requested features into plain columns and columns that live in
            # roll_feature_df. Order-preserving de-duplication keeps the column layout
            # reproducible; iterating a set of strings is not stable across processes.
            rolled_names = set(self.roll_addional_feature)
            unique_cols = list(dict.fromkeys(feature_col))
            additional_feature_col = [col for col in unique_cols if col in rolled_names]
            feature_col = [col for col in unique_cols if col not in rolled_names]
            self.roll_feature = feature_col + additional_feature_col
        else:
            additional_feature_col = None
            self.roll_feature = feature_col

        self.roll_target = target_col
        num_id = len(self._id_list)
        num_feature_col = len(self.roll_feature)
        num_target_col = len(self.roll_target)
        self.id_sensitive = id_sensitive
        roll_feature_df = None if self.roll_feature_df is None \
            else self.roll_feature_df[additional_feature_col]

        # get rolling result for each sub dataframe
        rolling_result = [roll_timeseries_dataframe(df=self.df[self.df[self.id_col] == id_name],
                                                    roll_feature_df=roll_feature_df,
                                                    lookback=lookback,
                                                    horizon=horizon,
                                                    feature_col=feature_col,
                                                    target_col=target_col)
                          for id_name in self._id_list]

        # concat the result on required axis
        concat_axis = 2 if id_sensitive else 0
        self.numpy_x = np.concatenate([result[0] for result in rolling_result],
                                      axis=concat_axis)
        if horizon != 0:
            self.numpy_y = np.concatenate([result[1] for result in rolling_result],
                                          axis=concat_axis)
        else:
            self.numpy_y = None

        # target first
        if self.id_sensitive:
            # destination[j] is the position that concatenated column j must end up at:
            # targets of id i go to block i of the target area, features of id i go to
            # block i of the feature area that starts right after all targets.
            # NOTE(review): this relies on each per-id x listing its target columns
            # before its feature columns -- confirm against roll_timeseries_dataframe.
            feature_start_idx = num_target_col * num_id
            destination = []
            for i in range(num_id):
                destination.extend(range(i * num_target_col, (i + 1) * num_target_col))
                destination.extend(range(feature_start_idx + i * num_feature_col,
                                         feature_start_idx + (i + 1) * num_feature_col))
            # Fancy indexing gathers (new[k] = old[index[k]]), so the inverse of the
            # destination map is required; indexing with `destination` itself would
            # scatter columns to the wrong places whenever the map is not self-inverse.
            sorted_index = sorted(range(len(destination)), key=destination.__getitem__)
            self.numpy_x = self.numpy_x[:, :, sorted_index]

        return self
    def roll(self,
             lookback,
             horizon,
             feature_col=None,
             target_col=None,
             id_sensitive=False):
        '''
        Sampling by rolling for machine learning/deep learning models.

        The rolled arrays are stored on the instance as `numpy_x` and `numpy_y`
        (both cast to float64), and `scaler_index` records, for every rolled target
        column, its position inside the dataset's `target_col`.

        :param lookback: int, lookback value.
        :param horizon: int or list,
               if `horizon` is an int, we will sample `horizon` steps
               continuously after the forecasting point.
               if `horizon` is a list, we will sample discretely according
               to the input list.
               specially, when `horizon` is set to 0, ground truth will be generated as None.
        :param feature_col: str or list, indicates the feature col name. Default to None,
               where we will take all available features in rolling.
        :param target_col: str or list, indicates the target col name. Default to None,
               where we will take all targets in rolling. It should be a subset of the
               target_col you used to initialize the tsdataset.
        :param id_sensitive: bool,
               if `id_sensitive` is False, we will roll on each id's sub dataframe
               and fuse the samplings.
               The shape of rolling will be
               x: (num_sample, lookback, num_feature_col + num_target_col)
               y: (num_sample, horizon, num_target_col)
               where num_sample is the summation of sample number of each dataframe.

               if `id_sensitive` is True, we will roll on the wide dataframe whose
               columns are the cartesian product of id_col and feature_col.
               The shape of rolling will be
               x: (num_sample, lookback, new_num_feature_col + new_num_target_col)
               y: (num_sample, horizon, new_num_target_col)
               where num_sample is the sample number of the wide dataframe,
               new_num_feature_col is the product of the number of ids and the number of
               feature_col, and new_num_target_col is the product of the number of ids
               and the number of target_col.

        :return: the tsdataset instance.

        roll() can be called by:

        >>> # Here is a df example:
        >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
        >>> # 00        2019-01-01    1.9     1                   2
        >>> # 01        2019-01-01    2.3     0                   9
        >>> # 00        2019-01-02    2.4     3                   4
        >>> # 01        2019-01-02    2.6     0                   2
        >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
        ...                                   target_col="value", id_col="id",
        ...                                   extra_feature_col=["extra feature 1",
        ...                                                      "extra feature 2"])
        >>> horizon, lookback = 1, 1
        >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=False)
        >>> x, y = tsdataset.to_numpy()
        >>> print(x, y) # x = [[[1.9, 1, 2 ]], [[2.3, 0, 9 ]]] y = [[[ 2.4 ]], [[ 2.6 ]]]
        >>> print(x.shape, y.shape) # x.shape = (2, 1, 3) y.shape = (2, 1, 1)
        >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=True)
        >>> x, y = tsdataset.to_numpy()
        >>> print(x, y) # x = [[[ 1.9, 2.3, 1, 2, 0, 9 ]]] y = [[[ 2.4, 2.6]]]
        >>> print(x.shape, y.shape) # x.shape = (1, 1, 6) y.shape = (1, 1, 2)

        '''
        feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
            else self.feature_col
        target_col = _to_list(target_col, "target_col") if target_col is not None \
            else self.target_col
        if self.roll_addional_feature:
            # Split the requested features into plain columns and columns that live in
            # roll_feature_df. Order-preserving de-duplication keeps the column layout
            # reproducible; iterating a set of strings is not stable across processes.
            rolled_names = set(self.roll_addional_feature)
            unique_cols = list(dict.fromkeys(feature_col))
            additional_feature_col = [col for col in unique_cols if col in rolled_names]
            feature_col = [col for col in unique_cols if col not in rolled_names]
            self.roll_feature = feature_col + additional_feature_col
        else:
            additional_feature_col = None
            self.roll_feature = feature_col

        self.roll_target = target_col
        num_id = len(self._id_list)
        num_feature_col = len(self.roll_feature)
        num_target_col = len(self.roll_target)
        self.id_sensitive = id_sensitive
        roll_feature_df = None if self.roll_feature_df is None \
            else self.roll_feature_df[additional_feature_col]

        # roll every id's sub dataframe; the result is looked up by id below
        rolling_result =\
            self.df.groupby([self.id_col])\
                   .apply(lambda df: roll_timeseries_dataframe(df=df,
                                                               roll_feature_df=roll_feature_df,
                                                               lookback=lookback,
                                                               horizon=horizon,
                                                               feature_col=feature_col,
                                                               target_col=target_col))

        # concat the result on required axis
        concat_axis = 2 if id_sensitive else 0
        self.numpy_x = np.concatenate(
            [rolling_result[id_name][0] for id_name in self._id_list],
            axis=concat_axis).astype(np.float64)
        if horizon != 0:
            self.numpy_y = np.concatenate(
                [rolling_result[id_name][1] for id_name in self._id_list],
                axis=concat_axis).astype(np.float64)
        else:
            self.numpy_y = None

        # target first
        if self.id_sensitive:
            # destination[j] is the position that concatenated column j must end up at:
            # targets of id i go to block i of the target area, features of id i go to
            # block i of the feature area that starts right after all targets.
            feature_start_idx = num_target_col * num_id
            destination = []
            for i in range(num_id):
                destination.extend(range(i * num_target_col, (i + 1) * num_target_col))
                destination.extend(range(feature_start_idx + i * num_feature_col,
                                         feature_start_idx + (i + 1) * num_feature_col))
            # Fancy indexing gathers (new[k] = old[index[k]]), so index with the
            # inverse of the destination map.
            sorted_index = sorted(range(len(destination)), key=destination.__getitem__)
            self.numpy_x = self.numpy_x[:, :, sorted_index]

        # scaler index: position of each rolled target inside self.target_col,
        # repeated once per id when the ids are laid out side by side
        repeat_factor = num_id if self.id_sensitive else 1
        self.scaler_index = [
            self.target_col.index(target) for target in self.roll_target
        ] * repeat_factor

        return self