Esempio n. 1
0
    def load_training_data(self):
        """Populate the training and validation splits on this instance.

        Fetches ``<dataset_name>_preprocessed`` through
        ``load_preproc_dataset`` (the S3 bucket and folder are passed as
        ``None`` when ``self.infra_s3`` is ``None``), keeps the first two
        elements of the returned 4-tuple, and derives from them:

        * ``self.training`` / ``self.validation`` -- the loaded splits,
        * ``self.training_x`` / ``self.validation_x`` -- values of the
          ``self.features`` columns,
        * ``self.training_y`` / ``self.validation_y`` -- the flattened
          ``self.target`` column,
        * ``self.n_classes`` -- number of distinct values in ``training_y``.

        The shape of each derived array is logged at INFO level.
        """
        logging.info('Loading training/validation datasets...')
        dataset = self.dataset_name + '_preprocessed'

        # Without S3 settings the loader receives None for both S3 arguments.
        if self.infra_s3 is None:
            bucket = folder = None
        else:
            bucket = self.infra_s3['s3_bucket']
            folder = self.infra_s3['s3_folder_data']

        train_split, valid_split, _, _ = load_preproc_dataset(
            dataset, self.data_dir, bucket, folder,
            self.clean, True, self.target)

        # NOTE(review): _init_features presumably sets self.features from the
        # training split -- confirm against its definition.
        self._init_features(train_split)
        self.training, self.validation = train_split, valid_split

        # Feature matrices.
        feature_cols = self.features
        self.training_x = self.training[feature_cols].values[:]
        self.validation_x = self.validation[feature_cols].values[:]

        # Flattened label vectors and the number of distinct training labels.
        target_col = self.target
        self.training_y = np.ravel(self.training[target_col])
        self.validation_y = np.ravel(self.validation[target_col])
        distinct_labels = set(self.training_y)
        self.n_classes = len(distinct_labels)

        shape_reports = (
            ('training_x shape: {}', self.training_x),
            ('training_y shape: {}', self.training_y),
            ('validation_x shape: {}', self.validation_x),
            ('validation_y shape: {}', self.validation_y),
        )
        for template, array in shape_reports:
            logging.info(template.format(array.shape))
Esempio n. 2
0
    def load_testing_data(self):
        """Load the preprocessed testing split and cache its features/labels.

        Calls ``load_preproc_dataset`` for ``<dataset_name>_preprocessed`` and
        keeps only the third element of the returned 4-tuple as
        ``self.testing``. It then sets ``self.testing_x`` (values of the
        ``self.features`` columns) and ``self.testing_y`` (the flattened
        ``self.target`` column) and logs both shapes at INFO level.

        ``self.infra_s3`` may be ``None``; in that case ``None`` is passed for
        both the S3 bucket and the S3 folder, the same way
        ``load_training_data`` does, instead of failing on ``None[...]``.
        """
        logging.info('Loading testing dataset...')
        postfixed_dataset_name = self.dataset_name + '_preprocessed'

        # Tolerate a missing S3 configuration rather than raising TypeError.
        if self.infra_s3 is None:
            s3_bucket = None
            s3_folder = None
        else:
            s3_bucket = self.infra_s3['s3_bucket']
            s3_folder = self.infra_s3['s3_folder_data']

        _, _, testing, _ = load_preproc_dataset(
            postfixed_dataset_name, self.data_dir, s3_bucket, s3_folder,
            self.clean, True, self.target)
        self.testing = testing
        # NOTE(review): _init_features presumably sets self.features from the
        # given split -- confirm against its definition.
        self._init_features(testing)
        self.testing_x = self.testing[self.features].values[:]
        self.testing_y = np.ravel(self.testing[self.target])

        logging.info('testing_x shape: {}'.format(self.testing_x.shape))
        logging.info('testing_y shape: {}'.format(self.testing_y.shape))
Esempio n. 3
0
    def load_training_data(self):
        """Load all three preprocessed splits and their label vectors.

        Calls ``load_preproc_dataset`` for ``<dataset_name>_preprocessed``
        (the S3 bucket and folder are passed as ``None`` when
        ``self.infra_s3`` is ``None``) and stores:

        * ``self.training`` / ``self.validation`` / ``self.testing`` -- the
          first three elements of the returned 4-tuple,
        * ``self.feature_types`` -- the fourth element, reduced to the entries
          whose key is in ``self.features`` or equals ``self.target``,
        * ``self.training_y`` / ``self.validation_y`` / ``self.testing_y`` --
          the flattened ``self.target`` column of each split,
        * ``self.n_classes`` -- number of distinct values in ``training_y``.

        The shape and class distribution of each label vector is logged at
        INFO level.
        """
        logging.info('Loading preprocessed datasets...')
        postfixed_dataset_name = self.dataset_name + '_preprocessed'
        if self.infra_s3 is None:
            s3_bucket = None
            s3_folder = None
        else:
            s3_bucket = self.infra_s3['s3_bucket']
            s3_folder = self.infra_s3['s3_folder_data']

        # self.feature_types is assigned here, before _init_features runs, in
        # case that helper reads it (its definition is not visible here).
        training, validation, testing, self.feature_types = load_preproc_dataset(
            postfixed_dataset_name, self.data_dir, s3_bucket, s3_folder,
            self.clean, False, self.target)
        self._init_features(training)

        # Build the list of retained column names once instead of once per
        # entry of feature_types.
        retained_columns = self.features + [self.target]
        self.feature_types = {
            feature: _type
            for feature, _type in self.feature_types.items()
            if feature in retained_columns
        }
        self.training = training
        self.validation = validation
        self.testing = testing

        self.training_y = np.ravel(self.training[self.target])
        self.validation_y = np.ravel(self.validation[self.target])
        self.testing_y = np.ravel(self.testing[self.target])
        self.n_classes = len(set(self.training_y))

        logging.info('training_y shape: {}\nClass distribution: {}'.format(
            self.training_y.shape,
            Counter(self.training_y).most_common()))
        logging.info('validation_y shape: {}\nClass distribution: {}'.format(
            self.validation_y.shape,
            Counter(self.validation_y).most_common()))
        logging.info('testing_y shape: {}\nClass distribution: {}'.format(
            self.testing_y.shape,
            Counter(self.testing_y).most_common()))