def __init__(self, df, lookback, horizon, feature_col, target_col, id_col=None):
    """
    Prepare a rolling-window dataset over a time series dataframe.

    The selected columns are stored as a 2-D numpy array (target columns first,
    then feature columns) together with the start indexes of every valid window.

    :param df: the dataframe to roll on. It may contain a single id or several
           ids. With several ids, rows of the same id must be consecutive, and
           each id must already be ordered by timestamp. Note that the index of
           this dataframe is reset in place.
    :param lookback: the length of the past sequence.
    :param horizon: int or list. If an int, `horizon` consecutive steps after
           the forecasting point are sampled. If a list, the listed steps are
           sampled discretely, where 1 means the timestamp right after the
           observed data.
    :param feature_col: list, the feature column names.
    :param target_col: list, the target column names.
    :param id_col: (optional) a str, the name of the id column.
    """
    df.reset_index(drop=True, inplace=True)

    feature_col = _to_list(feature_col, "feature_col")
    target_col = _to_list(target_col, "target_col")
    _check_cols_no_na(df, col_names=target_col + feature_col)

    # Target columns always come first in the stored array.
    selected = target_col + feature_col
    if len(selected) == 1:
        # A single label yields a Series, hence a 1-D array.
        values = df.loc[:, selected[0]].to_numpy()
    else:
        values = df.loc[:, selected].to_numpy()
    if values.ndim == 1:
        # Keep a trailing column axis so the array is always 2-D.
        values = np.expand_dims(values, axis=1)
    self.arr = values

    # A window must hold the lookback plus the furthest forecast step.
    if isinstance(horizon, int):
        furthest_step = horizon
    else:
        furthest_step = max(horizon)
    self.roll_start_idxes = get_roll_start_idx(
        df, id_col, window_size=lookback + furthest_step)

    self.lookback = lookback
    self.horizon = horizon
    self.target_num = len(target_col)
def roll(self, lookback, horizon, feature_col=None, target_col=None, id_sensitive=False):
    """
    Sample by rolling, shard by shard, for machine learning/deep learning models.

    :param lookback: int, lookback value.
    :param horizon: int or list. If an int, `horizon` consecutive steps after
           the forecasting point are sampled. If a list, the listed steps are
           sampled discretely. Specially, when `horizon` is set to 0, ground
           truth will be generated as None.
    :param feature_col: str or list, the feature column names. Defaults to
           None, in which case all available features are used.
    :param target_col: str or list, the target column names. Defaults to None,
           in which case all targets are used. It should be a subset of the
           target_col used to initialize the xshardtsdataset.
    :param id_sensitive: bool. Only False is supported: rolling is done on each
           id's sub dataframe and the samples are fused, giving

           | x: (num_sample, lookback, num_feature_col + num_target_col)
           | y: (num_sample, horizon, num_target_col)

           where num_sample is the sum of the sample numbers of each dataframe.
           True is not implemented and raises NotImplementedError.

    :return: the xshardtsdataset instance.
    """
    if id_sensitive:
        raise NotImplementedError(
            "id_sensitive option has not been implemented.")

    # Fall back to the columns stored on the dataset when none are given.
    if feature_col is None:
        feature_col = self.feature_col
    else:
        feature_col = _to_list(feature_col, "feature_col")

    if target_col is None:
        target_col = self.target_col
    else:
        target_col = _to_list(target_col, "target_col")

    # The second positional argument (None) is forwarded as the first extra
    # argument of roll_timeseries_dataframe.
    rolled_shards = self.shards.transform_shard(
        roll_timeseries_dataframe, None, lookback, horizon, feature_col, target_col)
    self.numpy_shards = rolled_shards
    return self
def from_pandas(df, dt_col, target_col, id_col=None, extra_feature_col=None,
                with_split=False, val_ratio=0, test_ratio=0.1,
                largest_look_back=0, largest_horizon=1):
    """
    Initialize tsdataset(s) from a pandas dataframe.

    The input dataframe is deep-copied, so it is never modified.

    :param df: a pandas dataframe holding the raw time series data.
    :param dt_col: a str, the name of the datetime column.
    :param target_col: a str or list, the name(s) of the target column(s).
    :param id_col: (optional) a str, the name of the id column. If omitted,
           the data is treated as containing a single id and a default id
           column is added.
    :param extra_feature_col: (optional) a str or list, the name(s) of extra
           feature columns used to predict the target column.
    :param with_split: (optional) bool, whether to split the dataframe into
           train, validation and test sets. Defaults to False.
    :param val_ratio: (optional) float, validation ratio. Only effective when
           with_split is True. Defaults to 0.
    :param test_ratio: (optional) float, test ratio. Only effective when
           with_split is True. Defaults to 0.1.
    :param largest_look_back: (optional) int, the largest length to look back.
           Only effective when with_split is True. Defaults to 0.
    :param largest_horizon: (optional) int, the largest number of steps to
           look forward. Only effective when with_split is True. Defaults to 1.

    :return: a TSDataset instance when with_split is False, a list of three
             TSDataset instances when with_split is True.

    Create a tsdataset instance by:

    >>> # Here is a df example:
    >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
    >>> # 00        2019-01-01    1.9     1                   2
    >>> # 01        2019-01-01    2.3     0                   9
    >>> # 00        2019-01-02    2.4     3                   4
    >>> # 01        2019-01-02    2.6     0                   2
    >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
    >>>                                   target_col="value", id_col="id",
    >>>                                   extra_feature_col=["extra feature 1",
    >>>                                                      "extra feature 2"])
    """
    _check_type(df, "df", pd.DataFrame)

    working_df = df.copy(deep=True)
    target_col = _to_list(target_col, name="target_col")
    feature_col = _to_list(extra_feature_col, name="extra_feature_col")

    if id_col is None:
        # No id column supplied: tag every row with the placeholder id.
        working_df[_DEFAULT_ID_COL_NAME] = _DEFAULT_ID_PLACEHOLDER
        id_col = _DEFAULT_ID_COL_NAME

    def _wrap(frame):
        # Every returned dataset shares the same column configuration.
        return TSDataset(data=frame,
                         id_col=id_col,
                         dt_col=dt_col,
                         target_col=target_col,
                         feature_col=feature_col)

    if not with_split:
        return _wrap(working_df)

    parts = split_timeseries_dataframe(df=working_df,
                                       id_col=id_col,
                                       val_ratio=val_ratio,
                                       test_ratio=test_ratio,
                                       look_back=largest_look_back,
                                       horizon=largest_horizon)
    return [_wrap(parts[position]) for position in range(3)]
def to_torch_data_loader(
    self,
    batch_size=32,
    roll=False,
    lookback=None,
    horizon=None,
    feature_col=None,
    target_col=None,
    shuffle=True,
):
    """
    Convert TSDataset to a PyTorch DataLoader with or without rolling.

    We recommend to_torch_data_loader(roll=True) if you don't need the rolled
    numpy array as an output. It is much more efficient than rolling
    separately, especially when the dataframe or lookback is large.

    :param batch_size: int, the batch_size for the PyTorch DataLoader.
           Defaults to 32.
    :param roll: bool, whether to roll the dataframe before converting to a
           DataLoader. If True, lookback and horizon must also be given. If
           False, tsdataset.roll() must have been called beforehand.
           Defaults to False.
    :param lookback: int, lookback value.
    :param horizon: int or list. If an int, `horizon` consecutive steps after
           the forecasting point are sampled. If a list, the listed steps are
           sampled discretely. Specially, when `horizon` is set to 0, ground
           truth will be generated as None.
    :param feature_col: str or list, the feature column names. Defaults to
           None, in which case all available features are used.
    :param target_col: str or list, the target column names. Defaults to None,
           in which case all targets are used. It should be a subset of the
           target_col used to initialize the tsdataset.
    :param shuffle: bool, forwarded to the DataLoader. Defaults to True; pass
           False to iterate samples in order (e.g. for validation or test).

    :return: a PyTorch DataLoader instance. With roll=False, batches are
             (x, y) pairs, or contain only x when the rolled ground truth is
             None (e.g. after roll(horizon=0)).

    :raises ValueError: if roll is True and lookback or horizon is None.
    :raises RuntimeError: if roll is False and roll() has not been called.

    to_torch_data_loader() can be called by:

    >>> # Here is a df example:
    >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
    >>> # 00        2019-01-01    1.9     1                   2
    >>> # 01        2019-01-01    2.3     0                   9
    >>> # 00        2019-01-02    2.4     3                   4
    >>> # 01        2019-01-02    2.6     0                   2
    >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
    >>>                                   target_col="value", id_col="id",
    >>>                                   extra_feature_col=["extra feature 1",
    >>>                                                      "extra feature 2"])
    >>> horizon, lookback = 1, 1
    >>> data_loader = tsdataset.to_torch_data_loader(batch_size=32,
    >>>                                              roll=True,
    >>>                                              lookback=lookback,
    >>>                                              horizon=horizon)
    >>> # or roll outside. That might be less efficient than the way above.
    >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=False)
    >>> x, y = tsdataset.to_numpy()
    >>> print(x, y) # x = [[[1.9, 1, 2 ]], [[2.3, 0, 9 ]]] y = [[[ 2.4 ]], [[ 2.6 ]]]
    >>> data_loader = tsdataset.to_torch_data_loader(batch_size=32)
    """
    # Imported lazily so torch is only required when this method is used.
    from torch.utils.data import TensorDataset, DataLoader
    import torch

    if roll:
        if lookback is None:
            raise ValueError("You must input lookback if roll is True")
        if horizon is None:
            raise ValueError("You must input horizon if roll is True")
        from zoo.chronos.data.utils.roll_dataset import RollDataset

        feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
            else self.feature_col
        target_col = _to_list(target_col, "target_col") if target_col is not None \
            else self.target_col

        # set scaler index for unscale_numpy
        self.scaler_index = [self.target_col.index(t) for t in target_col]

        torch_dataset = RollDataset(self.df,
                                    lookback=lookback,
                                    horizon=horizon,
                                    feature_col=feature_col,
                                    target_col=target_col,
                                    id_col=self.id_col)
        return DataLoader(torch_dataset,
                          batch_size=batch_size,
                          shuffle=shuffle)

    if self.numpy_x is None:
        raise RuntimeError(
            "Please call \"roll\" method before transforming a TSDataset to "
            "torch DataLoader without rolling (default roll=False)!")

    x, y = self.to_numpy()
    tensors = [torch.from_numpy(x).float()]
    # y is None when there is no ground truth (e.g. rolled with horizon=0);
    # torch.from_numpy(None) would raise, so build an x-only dataset instead.
    if y is not None:
        tensors.append(torch.from_numpy(y).float())
    return DataLoader(TensorDataset(*tensors),
                      batch_size=batch_size,
                      shuffle=shuffle)
def roll(self, lookback, horizon, feature_col=None, target_col=None, id_sensitive=False):
    """
    Sample by rolling for machine learning/deep learning models.

    The rolled arrays are stored on the instance (``numpy_x``/``numpy_y``),
    together with ``roll_feature``, ``roll_target``, ``id_sensitive`` and
    ``scaler_index``.

    :param lookback: int, lookback value.
    :param horizon: int or list. If an int, `horizon` consecutive steps after
           the forecasting point are sampled. If a list, the listed steps are
           sampled discretely. Specially, when `horizon` is set to 0, ground
           truth will be generated as None.
    :param feature_col: str or list, the feature column names. Defaults to
           None, in which case all available features are used. The given
           order is preserved (duplicates are dropped); additional rolling
           features, if any, are placed after the regular ones.
    :param target_col: str or list, the target column names. Defaults to None,
           in which case all targets are used. It should be a subset of the
           target_col used to initialize the tsdataset.
    :param id_sensitive: bool.
           If `id_sensitive` is False, rolling is done on each id's sub
           dataframe and the samples are fused. The shapes are

           | x: (num_sample, lookback, num_feature_col + num_target_col)
           | y: (num_sample, horizon, num_target_col)

           where num_sample is the sum of the sample numbers of each dataframe.

           If `id_sensitive` is True, rolling is done on the wide dataframe
           whose columns are the cartesian product of id_col and feature_col.
           The shapes are

           | x: (num_sample, lookback, new_num_feature_col + new_num_target_col)
           | y: (num_sample, horizon, new_num_target_col)

           where num_sample is the sample number of the wide dataframe,
           new_num_feature_col is the number of ids times the number of
           feature_col, and new_num_target_col is the number of ids times the
           number of target_col.

    :return: the tsdataset instance.

    roll() can be called by:

    >>> # Here is a df example:
    >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
    >>> # 00        2019-01-01    1.9     1                   2
    >>> # 01        2019-01-01    2.3     0                   9
    >>> # 00        2019-01-02    2.4     3                   4
    >>> # 01        2019-01-02    2.6     0                   2
    >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
    >>>                                   target_col="value", id_col="id",
    >>>                                   extra_feature_col=["extra feature 1",
    >>>                                                      "extra feature 2"])
    >>> horizon, lookback = 1, 1
    >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=False)
    >>> x, y = tsdataset.to_numpy()
    >>> print(x, y) # x = [[[1.9, 1, 2 ]], [[2.3, 0, 9 ]]] y = [[[ 2.4 ]], [[ 2.6 ]]]
    >>> print(x.shape, y.shape) # x.shape = (2, 1, 3) y.shape = (2, 1, 1)
    >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=True)
    >>> x, y = tsdataset.to_numpy()
    >>> print(x, y) # x = [[[ 1.9, 2.3, 1, 2, 0, 9 ]]] y = [[[ 2.4, 2.6]]]
    >>> print(x.shape, y.shape) # x.shape = (1, 1, 6) y.shape = (1, 1, 2)
    """
    feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
        else self.feature_col
    target_col = _to_list(target_col, "target_col") if target_col is not None \
        else self.target_col

    if self.roll_additional_feature:
        # Split the requested features into regular and additional ones while
        # keeping the caller's order. Going through list(set(...)) would make
        # the column order of the rolled array depend on string hashing, which
        # can differ between interpreter runs.
        additional_names = set(self.roll_additional_feature)
        unique_feature_col = list(dict.fromkeys(feature_col))
        additional_feature_col = [col for col in unique_feature_col
                                  if col in additional_names]
        feature_col = [col for col in unique_feature_col
                       if col not in additional_names]
        self.roll_feature = feature_col + additional_feature_col
    else:
        additional_feature_col = None
        self.roll_feature = feature_col

    self.roll_target = target_col
    num_id = len(self._id_list)
    num_feature_col = len(self.roll_feature)
    num_target_col = len(self.roll_target)
    self.id_sensitive = id_sensitive
    roll_feature_df = None if self.roll_feature_df is None \
        else self.roll_feature_df[additional_feature_col]

    # Roll every id's sub dataframe independently.
    rolling_result = \
        self.df.groupby([self.id_col]) \
            .apply(lambda group_df: roll_timeseries_dataframe(df=group_df,
                                                              roll_feature_df=roll_feature_df,
                                                              lookback=lookback,
                                                              horizon=horizon,
                                                              feature_col=feature_col,
                                                              target_col=target_col))

    # concat the result on required axis: along the column axis for the wide
    # (id sensitive) layout, along the sample axis otherwise
    concat_axis = 2 if id_sensitive else 0
    self.numpy_x = np.concatenate([rolling_result[i][0] for i in self._id_list],
                                  axis=concat_axis).astype(np.float32)
    if horizon != 0:
        self.numpy_y = np.concatenate([rolling_result[i][1] for i in self._id_list],
                                      axis=concat_axis).astype(np.float32)
    else:
        self.numpy_y = None

    # target first
    if self.id_sensitive:
        # After concatenation the columns are laid out per id as
        # [targets, features]. desired_position[p] is where column p has to go
        # so that all targets (grouped by id) precede all features.
        feature_start_idx = num_target_col * num_id
        desired_position = []
        for i in range(num_id):
            desired_position.extend(
                range(i * num_target_col, (i + 1) * num_target_col))
            desired_position.extend(
                range(feature_start_idx + i * num_feature_col,
                      feature_start_idx + (i + 1) * num_feature_col))
        # argsort: for every output position, the source column to take
        sorted_index = sorted(range(len(desired_position)),
                              key=desired_position.__getitem__)
        self.numpy_x = self.numpy_x[:, :, sorted_index]

    # scaler index: position of every rolled target within self.target_col,
    # repeated once per id in the wide layout
    repeat_factor = num_id if self.id_sensitive else 1
    self.scaler_index = [self.target_col.index(target)
                         for target in self.roll_target] * repeat_factor

    return self
def from_parquet(path, dt_col, target_col, id_col=None, extra_feature_col=None,
                 with_split=False, val_ratio=0, test_ratio=0.1,
                 largest_look_back=0, largest_horizon=1, **kwargs):
    """
    Initialize tsdataset(s) from the path of a parquet file.

    Only the datetime, target, id and extra feature columns are requested from
    the parquet reader; the resulting dataframe is handed to
    TSDataset.from_pandas.

    :param path: a string path to the parquet file. The string could be a URL.
           Valid URL schemes include hdfs, http, ftp, s3, gs, and file. For
           file URLs, a host is expected. A local file could be:
           file://localhost/path/to/table.parquet.
           A file URL can also be a path to a directory that contains multiple
           partitioned parquet files.
    :param dt_col: a str, the name of the datetime column.
    :param target_col: a str or list, the name(s) of the target column(s).
    :param id_col: (optional) a str, the name of the id column. If omitted,
           the data is treated as containing a single id.
    :param extra_feature_col: (optional) a str or list, the name(s) of extra
           feature columns used to predict the target column.
    :param with_split: (optional) bool, whether to split the dataframe into
           train, validation and test sets. Defaults to False.
    :param val_ratio: (optional) float, validation ratio. Only effective when
           with_split is True. Defaults to 0.
    :param test_ratio: (optional) float, test ratio. Only effective when
           with_split is True. Defaults to 0.1.
    :param largest_look_back: (optional) int, the largest length to look back.
           Only effective when with_split is True. Defaults to 0.
    :param largest_horizon: (optional) int, the largest number of steps to
           look forward. Only effective when with_split is True. Defaults to 1.
    :param kwargs: any additional kwargs are passed to pd.read_parquet
           and pyarrow.parquet.read_table.

    :return: a TSDataset instance when with_split is False, three TSDataset
             instances when with_split is True.

    Create a tsdataset instance by:

    >>> # Here is a df example:
    >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
    >>> # 00        2019-01-01    1.9     1                   2
    >>> # 01        2019-01-01    2.3     0                   9
    >>> # 00        2019-01-02    2.4     3                   4
    >>> # 01        2019-01-02    2.6     0                   2
    >>> tsdataset = TSDataset.from_parquet("hdfs://path/to/table.parquet", dt_col="datetime",
    >>>                                   target_col="value", id_col="id",
    >>>                                   extra_feature_col=["extra feature 1",
    >>>                                                      "extra feature 2"])
    """
    from zoo.chronos.data.utils.file import parquet2pd

    # Gather the columns to load, in the order: datetime, target, id, features.
    needed_columns = []
    for value, label in ((dt_col, "dt_col"),
                         (target_col, "target_col"),
                         (id_col, "id_col"),
                         (extra_feature_col, "extra_feature_col")):
        needed_columns = needed_columns + _to_list(value, name=label)

    loaded_df = parquet2pd(path, columns=needed_columns, **kwargs)

    split_options = dict(with_split=with_split,
                         val_ratio=val_ratio,
                         test_ratio=test_ratio,
                         largest_look_back=largest_look_back,
                         largest_horizon=largest_horizon)
    return TSDataset.from_pandas(loaded_df,
                                 dt_col=dt_col,
                                 target_col=target_col,
                                 id_col=id_col,
                                 extra_feature_col=extra_feature_col,
                                 **split_options)
def from_xshards(shards, dt_col, target_col, id_col=None, extra_feature_col=None,
                 with_split=False, val_ratio=0, test_ratio=0.1,
                 largest_look_back=0, largest_horizon=1):
    """
    Initialize xshardtsdataset(s) from an xshards pandas dataframe.

    The shards are repartitioned by id, using one partition per unique id.

    :param shards: an xshards pandas dataframe holding the raw time series data.
    :param dt_col: a str, the name of the datetime column.
    :param target_col: a str or list, the name(s) of the target column(s).
    :param id_col: (optional) a str, the name of the id column. If omitted,
           the data is treated as containing a single id and a default id
           column is added.
    :param extra_feature_col: (optional) a str or list, the name(s) of extra
           feature columns used to predict the target column.
    :param with_split: (optional) bool, whether to split the data into train,
           validation and test sets. Defaults to False.
    :param val_ratio: (optional) float, validation ratio. Only effective when
           with_split is True. Defaults to 0.
    :param test_ratio: (optional) float, test ratio. Only effective when
           with_split is True. Defaults to 0.1.
    :param largest_look_back: (optional) int, the largest length to look back.
           Only effective when with_split is True. Defaults to 0.
    :param largest_horizon: (optional) int, the largest number of steps to
           look forward. Only effective when with_split is True. Defaults to 1.

    :return: a XShardsTSDataset instance when with_split is False, a list of
             three XShardsTSDataset instances when with_split is True.

    Create a xshardtsdataset instance by:

    >>> # Here is a df example:
    >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
    >>> # 00        2019-01-01    1.9     1                   2
    >>> # 01        2019-01-01    2.3     0                   9
    >>> # 00        2019-01-02    2.4     3                   4
    >>> # 01        2019-01-02    2.6     0                   2
    >>> from zoo.orca.data.pandas import read_csv
    >>> shards = read_csv(csv_path)
    >>> tsdataset = XShardsTSDataset.from_xshards(shards, dt_col="datetime",
    >>>                                           target_col="value", id_col="id",
    >>>                                           extra_feature_col=["extra feature 1",
    >>>                                                              "extra feature 2"])
    """
    _check_type(shards, "shards", SparkXShards)

    target_col = _to_list(target_col, name="target_col")
    feature_col = _to_list(extra_feature_col, name="extra_feature_col")

    if id_col is None:
        # No id column supplied: add the placeholder id to every shard.
        shards = shards.transform_shard(add_row,
                                        _DEFAULT_ID_COL_NAME,
                                        _DEFAULT_ID_PLACEHOLDER)
        id_col = _DEFAULT_ID_COL_NAME

    # repartition to id
    id_count = len(shards[id_col].unique())
    shards = shards.partition_by(cols=id_col, num_partitions=id_count)

    def _wrap(piece):
        # Every returned dataset shares the same column configuration.
        return XShardsTSDataset(shards=piece,
                                id_col=id_col,
                                dt_col=dt_col,
                                target_col=target_col,
                                feature_col=feature_col)

    if not with_split:
        return _wrap(shards)

    split_shards = shards.transform_shard(split_timeseries_dataframe,
                                          id_col,
                                          val_ratio,
                                          test_ratio,
                                          largest_look_back,
                                          largest_horizon).split()
    return [_wrap(split_shards[position]) for position in range(3)]