Example 1
    def load_data(self):
        """
        Fetch the CIFAR-10 dataset and load it into memory.

        The raw archive is cached under self.path (defaults to the current
        directory) and downloaded from self.url if it is not already present.

        Returns:
            tuple: Both training and test sets are returned.
        """
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.size)

        batchdir = os.path.join(workdir, 'cifar-10-batches-py')
        if not os.path.exists(os.path.join(batchdir, 'data_batch_1')):
            assert os.path.exists(filepath), "Must have cifar-10-python.tar.gz"
            with tarfile.open(filepath, 'r:gz') as f:
                f.extractall(workdir)

        train_batches = [os.path.join(batchdir, 'data_batch_' + str(i)) for i in range(1, 6)]
        Xlist, ylist = [], []
        for batch in train_batches:
            with open(batch, 'rb') as f:
                d = pickle_load(f)
                Xlist.append(d['data'])
                ylist.append(d['labels'])

        X_train = np.vstack(Xlist).reshape(-1, 3, 32, 32)
        y_train = np.vstack(ylist).ravel()

        with open(os.path.join(batchdir, 'test_batch'), 'rb') as f:
            d = pickle_load(f)
            X_test, y_test = d['data'], d['labels']
            X_test = X_test.reshape(-1, 3, 32, 32)

        self.train_set = {'image': {'data': X_train,
                                    'axes': ('batch', 'channel', 'height', 'width')},
                          'label': {'data': y_train,
                                    'axes': ('batch',)}}
        self.valid_set = {'image': {'data': X_test,
                                    'axes': ('batch', 'channel', 'height', 'width')},
                          'label': {'data': np.array(y_test),
                                    'axes': ('batch',)}}

        return self.train_set, self.valid_set
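The loader returns each split as a dictionary of named arrays together with their axis names. A minimal usage sketch follows; the CIFAR10 class name and its path constructor argument are assumptions for illustration, since only the method body is shown above.

# Hypothetical usage (class name and constructor are assumed)
train_set, valid_set = CIFAR10(path='data').load_data()
print(train_set['image']['data'].shape)   # (50000, 3, 32, 32)
print(train_set['image']['axes'])         # ('batch', 'channel', 'height', 'width')
print(valid_set['label']['data'].shape)   # (10000,)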
Example 2
    def load_data(self):
        """
        Fetch the MNIST dataset and load it into memory.

        The raw archive is cached under self.path (defaults to the current
        directory) and downloaded from self.url if it is not already present.

        Returns:
            tuple: Both training and test sets are returned.
        """
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.size)

        with gzip.open(filepath, 'rb') as f:
            self.train_set, self.valid_set = pickle_load(f)

        self.train_set = {'image': {'data': self.train_set[0].reshape(60000, 28, 28),
                                    'axes': ('batch', 'height', 'width')},
                          'label': {'data': self.train_set[1],
                                    'axes': ('batch',)}}
        self.valid_set = {'image': {'data': self.valid_set[0].reshape(10000, 28, 28),
                                    'axes': ('batch', 'height', 'width')},
                          'label': {'data': self.valid_set[1],
                                    'axes': ('batch',)}}

        return self.train_set, self.valid_set
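As above, a minimal usage sketch; the MNIST class name and its path constructor argument are assumptions for illustration.

# Hypothetical usage (class name and constructor are assumed)
train_set, valid_set = MNIST(path='data').load_data()
print(train_set['image']['data'].shape)  # (60000, 28, 28)
print(valid_set['image']['data'].shape)  # (10000, 28, 28)
print(train_set['label']['axes'])        # ('batch',)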
Example 3
    def load_data(self, test_split=0.2):
        """
        Fetch the review dataset, preprocess and pad the sentences, and
        split the result into training and validation sets.

        Arguments:
            test_split (float, optional): Fraction of the data held out for
                                          validation.  Defaults to 0.2.

        Returns:
            dict: Dictionary holding the 'train' and 'valid' splits.
        """
        self.data_dict = {}
        self.vocab = None
        workdir, filepath = valid_path_append(self.path, '', self.filename)
        if not os.path.exists(filepath):
            fetch_file(self.url, self.filename, filepath, self.filesize)

        with open(filepath, 'rb') as f:
            X, y = pickle_load(f)

        X = preprocess_text(X, self.vocab_size)
        X = pad_sentences(X,
                          pad_idx=self.pad_idx,
                          pad_to_len=self.sentence_length,
                          pad_from='left')

        if self.shuffle:
            indices = np.arange(len(y))
            np.random.shuffle(indices)
            X = X[indices]
            y = np.asarray(y)[indices]

        # split the data into training and held-out validation portions
        split_idx = int(len(X) * (1 - test_split))
        X_train, y_train = X[:split_idx], y[:split_idx]
        X_test, y_test = X[split_idx:], y[split_idx:]

        y_train = np.array(y_train)
        y_test = np.array(y_test)

        self.nclass = 1 + max(np.max(y_train), np.max(y_test))

        self.data_dict['train'] = {
            'review': {
                'data': X_train,
                'axes': ('batch', 'REC')
            },
            'label': {
                'data': y_train,
                'axes': ('batch', )
            }
        }
        self.data_dict['valid'] = {
            'review': {
                'data': X_test,
                'axes': ('batch', 'REC')
            },
            'label': {
                'data': y_test,
                'axes': ('batch', )
            }
        }
        return self.data_dict
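A minimal usage sketch for this loader; the IMDB class name and its constructor arguments (path, sentence_length, vocab_size, shuffle) are assumptions for illustration, since only the method body is shown above.

# Hypothetical usage (class name and constructor are assumed)
dataset = IMDB(path='data', sentence_length=128, vocab_size=20000, shuffle=True)
data = dataset.load_data(test_split=0.2)
print(data['train']['review']['data'].shape)   # (n_train, sentence_length)
print(data['train']['review']['axes'])         # ('batch', 'REC')
print(dataset.nclass)                          # 1 + the largest label value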