def __init__(
    self,
    train_df,
    categorical_input: List,
    numerical_input: List,
    target: str,
    valid_df=None,
    test_df=None,
    batch_size=2,
    num_workers: Optional[int] = None,
):
    """Build the train/valid/test datasets from pandas DataFrames.

    Args:
        train_df: Training frame. Normalization statistics are computed
            from it and the number of classes is taken from its ``target``
            column.
        categorical_input: Names of the categorical columns.
        numerical_input: Names of the numerical columns.
        target: Name of the target column.
        valid_df: Optional validation frame.
        test_df: Optional test frame. A copy without the ``target`` column
            is kept on ``self._test_df``.
        batch_size: Forwarded to the parent constructor.
        num_workers: Forwarded to the parent constructor.
    """
    dfs = [train_df]
    self._test_df = None

    if valid_df is not None:
        dfs.append(valid_df)

    if test_df is not None:
        # Save a target-free frame for the predict function.
        # ``DataFrame.drop`` is not in-place by default and returns a new
        # frame, so the result has to be assigned; calling it as a bare
        # statement would leave the target column in the saved frame.
        self._test_df = test_df.drop(target, axis=1)
        dfs.append(test_df)

    # impute missing values
    dfs = _impute(dfs, numerical_input)

    # compute train dataset stats (dfs[0] is always the training frame)
    self.mean, self.std = _compute_normalization(dfs[0], numerical_input)

    if dfs[0][target].dtype == object:
        # the target is a category, not an int: it needs its own codes
        self.target_codes = _generate_codes(dfs, [target])
    else:
        self.target_codes = None

    self.codes = _generate_codes(dfs, categorical_input)

    # normalize / encode every frame using the stats and codes from above
    dfs = _pre_transform(
        dfs,
        numerical_input,
        categorical_input,
        self.codes,
        self.mean,
        self.std,
        target,
        self.target_codes,
    )

    self.cat_cols = categorical_input
    self.num_cols = numerical_input

    self._num_classes = len(train_df[target].unique())

    train_ds = PandasDataset(dfs[0], categorical_input, numerical_input, target)
    # When present, the validation frame always sits right after the training
    # frame and the test frame is always last.
    valid_ds = (
        PandasDataset(dfs[1], categorical_input, numerical_input, target)
        if valid_df is not None
        else None
    )
    test_ds = (
        PandasDataset(dfs[-1], categorical_input, numerical_input, target)
        if test_df is not None
        else None
    )

    super().__init__(train_ds, valid_ds, test_ds, batch_size=batch_size, num_workers=num_workers)
def before_collate(self, samples: Any) -> Any:
    """Turn raw tabular input into samples before collation.

    Input for which ``_contains_any_tensor`` reports tensors or numpy arrays
    is returned untouched. Otherwise a string is read with ``pd.read_csv``,
    a single ``DataFrame`` is wrapped in a list, and the result is passed
    through ``_pre_transform`` and ``_dfs_to_samples``.
    """
    already_tensors = _contains_any_tensor(samples, dtype=(Tensor, np.ndarray))
    if already_tensors:
        return samples

    # A string is treated as a CSV location and loaded first ...
    frames = pd.read_csv(samples) if isinstance(samples, str) else samples
    # ... and a lone frame (including one just loaded) becomes a 1-element list.
    if isinstance(frames, DataFrame):
        frames = [frames]

    transformed = _pre_transform(
        frames,
        self._numerical_input,
        self._categorical_input,
        self._codes,
        self._mean,
        self._std,
    )
    return _dfs_to_samples(transformed, self._categorical_input, self._numerical_input)
def common_load_data(self, df: DataFrame, dataset: AutoDataset):
    """Pre-transform ``df`` and extract its categorical/numerical arrays.

    Args:
        df: Frame to load. It is passed through ``_pre_transform`` together
            with the column lists, codes and statistics stored on ``self``.
        dataset: Dataset object whose ``num_samples`` attribute is set to the
            number of rows of the transformed frame.

    Returns:
        A ``(df, cat_vars, num_vars)`` tuple: the transformed frame and the
        categorical and numerical variables stacked along axis 1. When there
        are no columns of a kind, the corresponding array is an empty
        ``(len(df), 0)`` array.
    """
    dfs = _pre_transform(
        [df],
        self.num_cols,
        self.cat_cols,
        self.codes,
        self.mean,
        self.std,
        self.target_col,
        self.target_codes,
    )
    df = dfs[0]

    num_rows = len(df)
    dataset.num_samples = num_rows

    cat_vars = _to_cat_vars_numpy(df, self.cat_cols)
    num_vars = _to_num_vars_numpy(df, self.num_cols)

    # The empty fallback must have one row per sample of *this* frame so it
    # lines up with the stacked branch; sizing it from ``len(self)`` relied on
    # the owner defining ``__len__`` and did not reflect the frame's row count.
    cat_vars = np.stack(cat_vars, 1) if len(cat_vars) else np.zeros((num_rows, 0))
    num_vars = np.stack(num_vars, 1) if len(num_vars) else np.zeros((num_rows, 0))
    return df, cat_vars, num_vars