def load_training_data(self):
    """Load the preprocessed training and validation splits for this dataset.

    Fetches the '<dataset_name>_preprocessed' dataset (from local ``data_dir``,
    optionally backed by S3 when ``self.infra_s3`` is configured), initializes
    the feature list from the training split, and populates:

    - ``self.training`` / ``self.validation``: the raw DataFrames
    - ``self.training_x`` / ``self.validation_x``: feature matrices (ndarray)
    - ``self.training_y`` / ``self.validation_y``: flattened target vectors
    - ``self.n_classes``: number of distinct classes in the training target

    NOTE(review): a second ``load_training_data`` is defined later in this
    file and will shadow this one at class-creation time — confirm which
    definition is intended to survive.
    """
    logging.info('Loading training/validation datasets...')
    postfixed_dataset_name = self.dataset_name + '_preprocessed'
    # S3 config is optional; fall back to local-only loading when absent.
    s3_bucket = None if self.infra_s3 is None else self.infra_s3['s3_bucket']
    s3_folder = None if self.infra_s3 is None else self.infra_s3['s3_folder_data']
    training, validation, _, _ = load_preproc_dataset(
        postfixed_dataset_name, self.data_dir, s3_bucket, s3_folder,
        self.clean, True, self.target)
    self._init_features(training)
    self.training = training
    self.validation = validation
    # .values already returns an ndarray; the original trailing [:] was a
    # full-slice view and had no effect, so it is dropped.
    self.training_x = self.training[self.features].values
    self.validation_x = self.validation[self.features].values
    self.training_y = np.ravel(self.training[self.target])
    self.validation_y = np.ravel(self.validation[self.target])
    self.n_classes = len(set(self.training_y))
    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logging.info('training_x shape: %s', self.training_x.shape)
    logging.info('training_y shape: %s', self.training_y.shape)
    logging.info('validation_x shape: %s', self.validation_x.shape)
    logging.info('validation_y shape: %s', self.validation_y.shape)
def load_testing_data(self):
    """Load the preprocessed testing split for this dataset.

    Fetches the '<dataset_name>_preprocessed' dataset and populates:

    - ``self.testing``: the raw DataFrame
    - ``self.testing_x``: feature matrix (ndarray)
    - ``self.testing_y``: flattened target vector
    """
    logging.info('Loading testing dataset...')
    postfixed_dataset_name = self.dataset_name + '_preprocessed'
    # Bug fix: the original indexed self.infra_s3 unconditionally, raising
    # TypeError when infra_s3 is None. Guard the same way the training
    # loaders do so local-only (no-S3) configurations work here too.
    s3_bucket = None if self.infra_s3 is None else self.infra_s3['s3_bucket']
    s3_folder = None if self.infra_s3 is None else self.infra_s3['s3_folder_data']
    _, _, testing, _ = load_preproc_dataset(
        postfixed_dataset_name, self.data_dir, s3_bucket, s3_folder,
        self.clean, True, self.target)
    self.testing = testing
    self._init_features(testing)
    # .values already returns an ndarray; the original trailing [:] was a
    # full-slice view and had no effect, so it is dropped.
    self.testing_x = self.testing[self.features].values
    self.testing_y = np.ravel(self.testing[self.target])
    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logging.info('testing_x shape: %s', self.testing_x.shape)
    logging.info('testing_y shape: %s', self.testing_y.shape)
def load_training_data(self):
    """Load all three preprocessed splits (training/validation/testing).

    Fetches the '<dataset_name>_preprocessed' dataset and populates:

    - ``self.training`` / ``self.validation`` / ``self.testing``: DataFrames
    - ``self.feature_types``: dtype map restricted to used features + target
    - ``self.training_y`` / ``self.validation_y`` / ``self.testing_y``:
      flattened target vectors
    - ``self.n_classes``: number of distinct classes in the training target

    NOTE(review): this redefines ``load_training_data`` and shadows the
    earlier definition in this file — confirm the earlier one is obsolete.
    """
    logging.info('Loading preprocessed datasets...')
    postfixed_dataset_name = self.dataset_name + '_preprocessed'
    # S3 config is optional; fall back to local-only loading when absent.
    s3_bucket = None if self.infra_s3 is None else self.infra_s3['s3_bucket']
    s3_folder = None if self.infra_s3 is None else self.infra_s3['s3_folder_data']
    training, validation, testing, self.feature_types = load_preproc_dataset(
        postfixed_dataset_name, self.data_dir, s3_bucket, s3_folder,
        self.clean, False, self.target)
    self._init_features(training)
    # Keep only the dtypes of columns we actually use. Build the membership
    # set once instead of scanning a list per feature.
    kept_columns = set(self.features) | {self.target}
    self.feature_types = {
        feature: _type
        for feature, _type in self.feature_types.items()
        if feature in kept_columns
    }
    self.training = training
    self.validation = validation
    self.testing = testing
    self.training_y = np.ravel(self.training[self.target])
    self.validation_y = np.ravel(self.validation[self.target])
    self.testing_y = np.ravel(self.testing[self.target])
    self.n_classes = len(set(self.training_y))
    # Lazy %-style args: formatting is skipped when INFO is disabled.
    logging.info('training_y shape: %s\nClass distribution: %s',
                 self.training_y.shape,
                 Counter(self.training_y).most_common())
    logging.info('validation_y shape: %s\nClass distribution: %s',
                 self.validation_y.shape,
                 Counter(self.validation_y).most_common())
    logging.info('testing_y shape: %s\nClass distribution: %s',
                 self.testing_y.shape,
                 Counter(self.testing_y).most_common())