Example #1
0
    def with_new_start_times(
            self, start_times: Union[np.ndarray,
                                     Sequence]) -> 'TimeSeriesDataset':
        """
        Build a new dataset in which each group begins at its own (later) start time.

        For every tensor and every group, the time-steps that fall before the group's new start time are dropped, and
        so are any trailing time-steps in which every measure is nan.

        :param start_times: An array/list of new datetimes; these are paired, in order, with the groups.
        :return: A new dataset of the same class, with the same group-names, measures and dt_unit.
        :raises ValueError: If a group's new start time is at/after all of that group's times, or before all of them.
        """
        rebuilt_tensors = []
        for tensor_num, tensor in enumerate(self.tensors):
            trimmed_groups = []
            paired = zip(start_times, self.times(tensor_num))
            for g, (new_time, old_times) in enumerate(paired):
                # the new start must fall inside the range of times this group already has:
                if (old_times <= new_time).all():
                    raise ValueError(
                        f"{new_time} is later than all the times for group {self.group_names[g]}"
                    )
                if (old_times > new_time).all():
                    raise ValueError(
                        f"{new_time} is earlier than all the times for group {self.group_names[g]}"
                    )

                # keep only the time-steps at or after the new start:
                on_or_after = true1d_idx(old_times >= new_time)
                kept = tensor[g, on_or_after, :]

                # a row is fully missing when isnan is True for every measure (min over booleans acts as `all`):
                fully_missing = torch.min(torch.isnan(kept), 1)[0]

                # cut everything past the last row that still has at least one non-nan value:
                n_keep = true1d_idx(~fully_missing).max() + 1
                trimmed_groups.append(kept[:n_keep].unsqueeze(0))

            # groups can end up with different lengths, so they are joined with ragged_cat along dim 0:
            rebuilt_tensors.append(
                ragged_cat(trimmed_groups, ragged_dim=1, cat_dim=0))

        dataset_cls = type(self)
        return dataset_cls(*rebuilt_tensors,
                           group_names=self.group_names,
                           start_times=start_times,
                           measures=self.measures,
                           dt_unit=self.dt_unit)
Example #2
0
    def tensor_to_dataframe(tensor: Tensor,
                            times: np.ndarray,
                            group_names: Sequence,
                            group_colname: str,
                            time_colname: str,
                            measures: Sequence[str]) -> 'DataFrame':
        """
        Convert a 3D tensor into a long-format DataFrame with one row per (group, time-step).

        Trailing time-steps in which every measure is nan are not stored. A group whose values are all nan is
        skipped with a warning.

        :param tensor: A tensor whose dimensions are (group, time, measure); its sizes must agree with
          `group_names`, `times` and `measures`.
        :param times: A 2D array of times, one row per group, at least as wide as the tensor's time dimension.
        :param group_names: The name of each group, in the same order as the tensor's first dimension.
        :param group_colname: Name of the output column that holds the group name.
        :param time_colname: Name of the output column that holds the time.
        :param measures: Names for the measure columns, in the same order as the tensor's last dimension.
        :return: A DataFrame with the measure columns followed by `group_colname` and `time_colname`. If no group
          has any non-missing values (or there are no groups), an empty DataFrame with those columns is returned.
        """
        from pandas import DataFrame, concat

        tensor = tensor.data.numpy()
        assert tensor.shape[0] == len(group_names)
        assert tensor.shape[0] == len(times)
        assert tensor.shape[1] <= times.shape[1]
        assert tensor.shape[2] == len(measures)

        dfs = []
        for g, group_name in enumerate(group_names):
            # get values, don't store trailing nans:
            values = tensor[g]
            all_nan_per_row = np.isnan(values).all(axis=1)
            if all_nan_per_row.all():
                warn(f"Group {group_name} has only missing values.")
                continue
            end_idx = true1d_idx(~all_nan_per_row).max() + 1
            # convert to dataframe:
            df = DataFrame(data=values[:end_idx, :], columns=measures)
            df[group_colname] = group_name
            df[time_colname] = times[g, 0:len(df.index)]
            dfs.append(df)

        if not dfs:
            # `concat` raises on an empty list; since all-missing groups are deliberately skipped above, return an
            # empty frame with the expected columns instead of failing.
            return DataFrame(columns=[*measures, group_colname, time_colname])

        return concat(dfs)
Example #3
0
 def get_groups(self, groups: Sequence[Any]) -> 'TimeSeriesDataset':
     """
     Select the part of this dataset whose group-names appear in `groups`.

     Selection is done by membership, so the result keeps the dataset's own group ordering (not the ordering of
     `groups`), and listing a group more than once in `groups` has no additional effect.

     :param groups: The group-names to keep.
     :return: The result of indexing this dataset with the positions of the matching groups.
     """
     is_requested = np.isin(self.group_names, groups)
     matching_positions = true1d_idx(is_requested)
     return self[matching_positions]
Example #4
0
    def train_val_split(
        self,
        train_frac: float = None,
        dt: Union[np.datetime64, dict] = None
    ) -> Tuple['TimeSeriesDataset', 'TimeSeriesDataset']:
        """
        Split the dataset in time into a training dataset and a validation dataset.

        :param train_frac: The proportion of the data to keep for training. This is calculated on a per-group basis,
        relative to the last observation for each group (i.e., the last time-step that has a non-nan value on any
        measure). If neither `train_frac` nor `dt` are passed, `train_frac=.75` is used.
        :param dt: A datetime to use in dividing train/validation (first datetime for validation), or a dictionary of
        group-names : date-times.
        :return: Two TimeSeriesDatasets, one with data before the split, the other with >= the split.
        :raises TypeError: If both `train_frac` and `dt` are passed.
        """

        # work out one split time per group:
        if dt is not None:
            if train_frac is not None:
                raise TypeError("Can pass only one of `train_frac`, `dt`.")
            if isinstance(dt, dict):
                per_group = [dt[group_name] for group_name in self.group_names]
                split_times = np.array(per_group, dtype='datetime64[ns]')
            else:
                if not isinstance(dt, np.datetime64):
                    dt = np.datetime64(dt, self.dt_unit)
                # a single datetime applies to every group:
                split_times = np.full(shape=len(self.group_names),
                                      fill_value=dt)
        else:
            if train_frac is None:
                train_frac = .75
            assert 0 < train_frac < 1
            # scale each group's last-measured index by `train_frac` (truncating) to get its split position:
            split_positions = np.array(
                [int(idx * train_frac) for idx in self._last_measured_idx()],
                dtype='int')
            first_tensor_times = self.times(0)
            # look up the time at each group's split position:
            split_times = np.array([
                first_tensor_times[row, col]
                for row, col in enumerate(split_positions)
            ])

        # validation: everything from the split time onwards
        val_dataset = self.with_new_start_times(split_times)

        # training: blank out everything from the split time onwards, then trim the padding
        train_tensors = []
        for i, tens in enumerate(self.tensors):
            train = tens.clone()
            held_out = np.where(self.times(i) >= split_times[:, None])
            train[held_out] = float('nan')
            if i == 0:
                # the width is decided by the first tensor only: find the last time-step that still has any
                # non-nan value (across all groups and measures), and reuse that cut for the other tensors.
                n_observed = (~torch.isnan(train)).sum((0, 2))
                last_good_idx = true1d_idx(n_observed).max()
            train_tensors.append(train[:, :(last_good_idx + 1), :])
        # TODO: replace padding nans for all but first tensor?
        # TODO: reduce width of 0> tensors based on width of 0 tensor?
        train_dataset = self.with_new_tensors(*train_tensors)

        return train_dataset, val_dataset
Example #5
0
 def _last_measured_idx(self) -> np.ndarray:
     """
     Find, for each group, the position of its last measurement in the first tensor.

     :return: An integer array with one entry per group-name: the index of the last time-step in the first tensor
      with a non-nan value in at least one dimension. Groups with no measurements at all get 0.
     """
     first_tensor, *_ = self.tensors
     # a time-step counts as measured unless every entry along the last axis is nan:
     entirely_missing = np.isnan(first_tensor.numpy()).all(2)
     has_measurement = ~entirely_missing

     last_positions = []
     for g in range(len(self.group_names)):
         measured_positions = true1d_idx(has_measurement[g])
         # `initial=0` keeps the max well-defined when a group has no measured time-steps:
         last_positions.append(np.max(measured_positions, initial=0))
     return np.array(last_positions, dtype='int')