Example #1
0
    def __init__(
        self,
        train_df,
        categorical_input: List,
        numerical_input: List,
        target: str,
        valid_df=None,
        test_df=None,
        batch_size=2,
        num_workers: Optional[int] = None,
    ):
        """Build the train/valid/test datasets from pandas DataFrames.

        The frames are imputed, normalization statistics are computed from the
        training frame, category codes are generated over all supplied frames,
        and the transformed frames are wrapped in ``PandasDataset`` objects that
        are handed to the parent initializer.

        Args:
            train_df: Training frame; must contain the ``target`` column.
            categorical_input: Columns treated as categorical (stored on
                ``self.cat_cols``).
            numerical_input: Columns treated as numerical (stored on
                ``self.num_cols``).
            target: Name of the target column.
            valid_df: Optional validation frame.
            test_df: Optional test frame. A copy *without* the ``target``
                column is kept on ``self._test_df``.
            batch_size: Forwarded to the parent initializer.
            num_workers: Forwarded to the parent initializer.

        Raises:
            KeyError: If ``test_df`` is given but has no ``target`` column
                (raised by ``DataFrame.drop``).
        """
        dfs = [train_df]
        self._test_df = None

        if valid_df is not None:
            dfs.append(valid_df)

        if test_df is not None:
            # Save for the predict function. ``DataFrame.drop`` returns a new
            # frame rather than mutating in place, so the result must be
            # assigned; ``test_df`` itself keeps its target column for the
            # test dataset built below.
            self._test_df = test_df.drop(target, axis=1)
            dfs.append(test_df)

        # impute missing values
        dfs = _impute(dfs, numerical_input)

        # compute normalization stats from the train frame only
        self.mean, self.std = _compute_normalization(dfs[0], numerical_input)

        if dfs[0][target].dtype == object:
            # if the target is a category, not an int
            self.target_codes = _generate_codes(dfs, [target])
        else:
            self.target_codes = None

        self.codes = _generate_codes(dfs, categorical_input)

        # apply the codes and normalization stats to every frame
        dfs = _pre_transform(dfs, numerical_input, categorical_input,
                             self.codes, self.mean, self.std, target,
                             self.target_codes)

        self.cat_cols = categorical_input
        self.num_cols = numerical_input

        self._num_classes = len(train_df[target].unique())

        train_ds = PandasDataset(dfs[0], categorical_input, numerical_input,
                                 target)
        # When present, the validation frame is always at index 1 and the test
        # frame is always last, regardless of whether the other one was given.
        valid_ds = PandasDataset(dfs[1], categorical_input, numerical_input,
                                 target) if valid_df is not None else None
        test_ds = PandasDataset(dfs[-1], categorical_input, numerical_input,
                                target) if test_df is not None else None
        super().__init__(train_ds,
                         valid_ds,
                         test_ds,
                         batch_size=batch_size,
                         num_workers=num_workers)
Example #2
0
 def before_collate(self, samples: Any) -> Any:
     """Turn raw tabular input into samples before collation.

     Input that already contains a ``Tensor`` or ``np.ndarray`` is returned
     unchanged. A string is read with ``pd.read_csv``, and a single
     ``DataFrame`` is wrapped in a list before being pre-transformed and
     converted to samples.
     """
     already_converted = _contains_any_tensor(samples, dtype=(Tensor, np.ndarray))
     if already_converted:
         return samples

     if isinstance(samples, str):
         samples = pd.read_csv(samples)

     # The transform helper is given a list of frames, so wrap a lone frame.
     frames = [samples] if isinstance(samples, DataFrame) else samples

     transformed = _pre_transform(
         frames,
         self._numerical_input,
         self._categorical_input,
         self._codes,
         self._mean,
         self._std,
     )
     return _dfs_to_samples(transformed, self._categorical_input, self._numerical_input)
Example #3
0
    def common_load_data(self, df: DataFrame, dataset: AutoDataset):
        """Pre-transform ``df`` and split it into categorical/numerical arrays.

        Args:
            df: Frame to transform.
            dataset: Dataset object whose ``num_samples`` attribute is set to
                the number of rows of the transformed frame.

        Returns:
            A tuple ``(df, cat_vars, num_vars)``: the transformed frame and the
            categorical and numerical variables stacked along axis 1. When a
            group has no variables, an empty ``(len(df), 0)`` array is returned
            for it instead.
        """
        # Apply the stored codes and normalization stats to the incoming frame.
        dfs = _pre_transform([df], self.num_cols, self.cat_cols, self.codes, self.mean, self.std, self.target_col,
                             self.target_codes)

        df = dfs[0]
        num_rows = len(df)

        dataset.num_samples = num_rows
        cat_vars = _to_cat_vars_numpy(df, self.cat_cols)
        num_vars = _to_num_vars_numpy(df, self.num_cols)

        # The empty fallback must have one row per sample of the transformed
        # frame; sizing it with ``len(self)`` was wrong (and fails outright if
        # the enclosing object defines no ``__len__``).
        cat_vars = np.stack(cat_vars, 1) if len(cat_vars) else np.zeros((num_rows, 0))
        num_vars = np.stack(num_vars, 1) if len(num_vars) else np.zeros((num_rows, 0))
        return df, cat_vars, num_vars