def load_data(self): """ Fetch the CIFAR-10 dataset and load it into memory. Arguments: path (str, optional): Local directory in which to cache the raw dataset. Defaults to current directory. normalize (bool, optional): Whether to scale values between 0 and 1. Defaults to True. Returns: tuple: Both training and test sets are returned. """ workdir, filepath = valid_path_append(self.path, '', self.filename) if not os.path.exists(filepath): fetch_file(self.url, self.filename, filepath, self.size) batchdir = os.path.join(workdir, 'cifar-10-batches-py') if not os.path.exists(os.path.join(batchdir, 'data_batch_1')): assert os.path.exists(filepath), "Must have cifar-10-python.tar.gz" with tarfile.open(filepath, 'r:gz') as f: f.extractall(workdir) train_batches = [os.path.join(batchdir, 'data_batch_' + str(i)) for i in range(1, 6)] Xlist, ylist = [], [] for batch in train_batches: with open(batch, 'rb') as f: d = pickle_load(f) Xlist.append(d['data']) ylist.append(d['labels']) X_train = np.vstack(Xlist).reshape(-1, 3, 32, 32) y_train = np.vstack(ylist).ravel() with open(os.path.join(batchdir, 'test_batch'), 'rb') as f: d = pickle_load(f) X_test, y_test = d['data'], d['labels'] X_test = X_test.reshape(-1, 3, 32, 32) self.train_set = {'image': {'data': X_train, 'axes': ('batch', 'channel', 'height', 'width')}, 'label': {'data': y_train, 'axes': ('batch',)}} self.valid_set = {'image': {'data': X_test, 'axes': ('batch', 'channel', 'height', 'width')}, 'label': {'data': np.array(y_test), 'axes': ('batch',)}} return self.train_set, self.valid_set
def load_data(self): """ Fetch the MNIST dataset and load it into memory. Arguments: path (str, optional): Local directory in which to cache the raw dataset. Defaults to current directory. Returns: tuple: Both training and test sets are returned. """ workdir, filepath = valid_path_append(self.path, '', self.filename) if not os.path.exists(filepath): fetch_file(self.url, self.filename, filepath, self.size) with gzip.open(filepath, 'rb') as f: self.train_set, self.valid_set = pickle_load(f) self.train_set = {'image': {'data': self.train_set[0].reshape(60000, 28, 28), 'axes': ('batch', 'height', 'width')}, 'label': {'data': self.train_set[1], 'axes': ('batch',)}} self.valid_set = {'image': {'data': self.valid_set[0].reshape(10000, 28, 28), 'axes': ('batch', 'height', 'width')}, 'label': {'data': self.valid_set[1], 'axes': ('batch',)}} return self.train_set, self.valid_set
def load_data(self, test_split=0.2):
    """
    Fetch the review dataset and load it into memory.

    Arguments:
        test_split (float, optional): Fraction of the data to hold out
                                      for validation. Defaults to 0.2.

    Returns:
        dict: Dictionary holding the train and valid splits.
    """
    self.data_dict = {}
    self.vocab = None
    workdir, filepath = valid_path_append(self.path, '', self.filename)
    if not os.path.exists(filepath):
        fetch_file(self.url, self.filename, filepath, self.filesize)

    with open(filepath, 'rb') as f:
        X, y = pickle_load(f)

    # Map tokens to vocabulary indices, then left-pad every review to a
    # fixed sentence length so the batch stacks into a dense array.
    X = preprocess_text(X, self.vocab_size)
    X = pad_sentences(X,
                      pad_idx=self.pad_idx,
                      pad_to_len=self.sentence_length,
                      pad_from='left')

    if self.shuffle:
        indices = np.arange(len(y))
        np.random.shuffle(indices)
        X = X[indices]
        y = np.asarray(y)[indices]

    # Split the data into train and validation sets.
    split_idx = int(len(X) * (1 - test_split))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train = np.array(y[:split_idx])
    y_test = np.array(y[split_idx:])

    self.nclass = 1 + max(np.max(y_train), np.max(y_test))

    self.data_dict['train'] = {'review': {'data': X_train,
                                          'axes': ('batch', 'REC')},
                               'label': {'data': y_train,
                                         'axes': ('batch',)}}
    self.data_dict['valid'] = {'review': {'data': X_test,
                                          'axes': ('batch', 'REC')},
                               'label': {'data': y_test,
                                         'axes': ('batch',)}}
    return self.data_dict
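# pad_sentences is assumed above to clip or pad each token sequence to a
# fixed length; the sketch below is an illustrative implementation of that
# contract (names and defaults are assumptions, not the project's actual
# code). pad_from='left' prepends padding so the real tokens sit at the end
# of each row, which suits recurrent models that read left to right.
import numpy as np


def pad_sentences(sentences, pad_idx=0, pad_to_len=128, pad_from='left'):
    """Clip/pad each sequence of token ids to exactly pad_to_len entries."""
    padded = np.full((len(sentences), pad_to_len), pad_idx, dtype=np.int32)
    for i, sent in enumerate(sentences):
        trunc = sent[-pad_to_len:]  # keep the trailing tokens when too long
        if not len(trunc):
            continue
        if pad_from == 'left':
            padded[i, -len(trunc):] = trunc
        else:
            padded[i, :len(trunc)] = trunc
    return padded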