def load_data(self):
    """
    Fetch the CIFAR-10 dataset and load it into memory.

    Downloads and unpacks the archive on first use, then reads the five
    pickled training batches and the single test batch.  Optional contrast
    normalization, [0, 1] scaling, and ZCA whitening are applied according
    to the corresponding instance flags.

    Returns:
        tuple: ((X_train, y_train), (X_test, y_test), nclass) where nclass
               is the number of classes (10).
    """
    workdir, filepath = self._valid_path_append(self.path, '', self.filename)
    batchdir = os.path.join(workdir, 'cifar-10-batches-py')

    # Unpack the archive only if the extracted batches are not present yet.
    if not os.path.exists(os.path.join(batchdir, 'data_batch_1')):
        if not os.path.exists(filepath):
            self.fetch_dataset(self.url, self.filename, filepath, self.size)
        # NOTE(review): extractall trusts member paths inside a downloaded
        # archive; consider a safe-extraction filter.
        with tarfile.open(filepath, 'r:gz') as f:
            f.extractall(workdir)

    # Read the five training batches and stack them into single arrays.
    data_parts, label_parts = [], []
    for i in range(1, 6):
        with open(os.path.join(batchdir, 'data_batch_' + str(i)), 'rb') as f:
            batch = pickle_load(f)
        data_parts.append(batch['data'])
        label_parts.append(batch['labels'])
    X_train = np.vstack(data_parts)
    y_train = np.vstack(label_parts).reshape(-1, 1)

    with open(os.path.join(batchdir, 'test_batch'), 'rb') as f:
        batch = pickle_load(f)
    X_test = batch['data']
    y_test = np.array(batch['labels']).reshape(-1, 1)

    if self.contrast_normalize:
        norm_scale = 55.0  # Goodfellow
        X_train = self.global_contrast_normalize(X_train, scale=norm_scale)
        X_test = self.global_contrast_normalize(X_test, scale=norm_scale)

    if self.normalize:
        X_train = X_train / 255.
        X_test = X_test / 255.

    if self.whiten:
        zca_cache = os.path.join(workdir, 'cifar-10-zca-cache.pkl')
        X_train, X_test = self.zca_whiten(X_train, X_test, cache=zca_cache)

    return (X_train, y_train), (X_test, y_test), 10
def load_data(self):
    """
    Fetch the CIFAR-10 dataset and load it into memory.

    The raw archive is downloaded and extracted on first use.  The five
    training batches are concatenated into one training set; the test
    batch is loaded separately.  Optional preprocessing (global contrast
    normalization, [0, 1] scaling, ZCA whitening) is driven by instance
    flags.

    Returns:
        tuple: ((X_train, y_train), (X_test, y_test), nclass) where nclass
               is the number of classes (10).
    """
    def _read_pickle(path):
        # Each CIFAR-10 batch is a pickled dict with 'data' and 'labels'.
        with open(path, 'rb') as fobj:
            return pickle_load(fobj)

    workdir, filepath = self._valid_path_append(self.path, '', self.filename)
    batchdir = os.path.join(workdir, 'cifar-10-batches-py')

    if not os.path.exists(os.path.join(batchdir, 'data_batch_1')):
        if not os.path.exists(filepath):
            self.fetch_dataset(self.url, self.filename, filepath, self.size)
        with tarfile.open(filepath, 'r:gz') as f:
            f.extractall(workdir)

    batches = [_read_pickle(os.path.join(batchdir, 'data_batch_' + str(i)))
               for i in range(1, 6)]
    X_train = np.vstack([b['data'] for b in batches])
    y_train = np.vstack([b['labels'] for b in batches]).reshape(-1, 1)

    test_dict = _read_pickle(os.path.join(batchdir, 'test_batch'))
    X_test = test_dict['data']
    y_test = np.array(test_dict['labels']).reshape(-1, 1)

    if self.contrast_normalize:
        norm_scale = 55.0  # Goodfellow
        X_train = self.global_contrast_normalize(X_train, scale=norm_scale)
        X_test = self.global_contrast_normalize(X_test, scale=norm_scale)

    if self.normalize:
        X_train = X_train / 255.
        X_test = X_test / 255.

    if self.whiten:
        zca_cache = os.path.join(workdir, 'cifar-10-zca-cache.pkl')
        X_train, X_test = self.zca_whiten(X_train, X_test, cache=zca_cache)

    return (X_train, y_train), (X_test, y_test), 10
def load_obj(load_path):
    """
    Loads a saved on-disk representation to a python data structure. We
    currently support the following file formats:

        * python pickle (.pkl), optionally gzip compressed (.gz)

    Arguments:
        load_path (str): where to the load the serialized object (full
                         path and file name).  An already-open file-like
                         object may also be passed, in which case the
                         caller retains ownership and it is not closed
                         here.

    Returns:
        object: the deserialized python data structure.

    Raises:
        AttributeError: if the object's interface has changed since it was
                        serialized, making deserialization impossible.
    """
    # Track whether we opened the handle ourselves so we close only what
    # we own -- the original code leaked this handle on every call.
    own_handle = False
    if isinstance(load_path, str):
        load_path = os.path.expandvars(os.path.expanduser(load_path))
        if load_path.endswith('.gz'):
            import gzip
            load_path = gzip.open(load_path, 'rb')
        else:
            load_path = open(load_path, 'rb')
        own_handle = True
    fname = load_path.name
    logger.debug("deserializing object from: %s", fname)
    try:
        return pickle_load(load_path)
    except AttributeError:
        msg = ("Problems deserializing: %s. Its possible the interface "
               "for this object has changed since being serialized. You "
               "may need to remove and recreate it." % load_path)
        logger.error(msg)
        raise AttributeError(msg)
    finally:
        # Close the handle even when pickle_load raises, but never close a
        # caller-supplied file object.
        if own_handle:
            load_path.close()
def load_data(self):
    """
    Fetch the MNIST dataset and load it into memory.

    Arguments:
        path (str, optional): Local directory in which to cache the raw
                              dataset.  Defaults to current directory.
        normalize (bool, optional): Whether to scale values between 0 and 1.
                                    Defaults to True.

    Returns:
        tuple: Both training and test sets are returned, along with the
               number of classes (10).
    """
    filepath = self._valid_path_append(self.path, self.filename)
    if not os.path.exists(filepath):
        self.fetch_dataset(self.url, self.filename, filepath, self.size)

    with gzip.open(filepath, 'rb') as mnist:
        train_set, test_set = pickle_load(mnist)
    X_train, y_train = train_set
    X_test, y_test = test_set

    # Flatten each 28x28 digit into a 784-dimensional row vector.
    X_train = X_train.reshape(-1, 784)
    X_test = X_test.reshape(-1, 784)

    if self.normalize:
        X_train = X_train / 255.
        X_test = X_test / 255.

    return (X_train, y_train), (X_test, y_test), 10
def load_data(self):
    """
    Fetch the MNIST dataset and load it into memory.

    Arguments:
        path (str, optional): Local directory in which to cache the raw
                              dataset.  Defaults to current directory.
        normalize (bool, optional): Whether to scale values between 0 and 1.
                                    Defaults to True.

    Returns:
        tuple: Both training and test sets are returned, along with the
               number of classes (10).
    """
    filepath = self._valid_path_append(self.path, self.filename)
    if not os.path.exists(filepath):
        self.fetch_dataset(self.url, self.filename, filepath, self.size)

    with gzip.open(filepath, 'rb') as mnist:
        (X_train, y_train), (X_test, y_test) = pickle_load(mnist)

    if self.subset_pct < 100:
        # subset_pct is already expressed in percent (e.g. 10 -> 10%), as
        # the slicing below shows.
        X_train = X_train[:int(X_train.shape[0] * self.subset_pct / 100.)]
        y_train = y_train[:int(y_train.shape[0] * self.subset_pct / 100.)]
        X_test = X_test[:int(X_test.shape[0] * self.subset_pct / 100.)]
        y_test = y_test[:int(y_test.shape[0] * self.subset_pct / 100.)]
        # BUG FIX: was logged as subset_pct * 100, overstating by 100x.
        logger.debug("subset %d%% of data", self.subset_pct)

    if self.size > 28:
        # Zero-pad each 28x28 digit into the top-left of a size x size frame.
        n_train, n_test = X_train.shape[0], X_test.shape[0]
        X_train_ = np.zeros(shape=(n_train, self.size, self.size))
        X_test_ = np.zeros(shape=(n_test, self.size, self.size))
        X_train_[:, :28, :28] = X_train
        X_test_[:, :28, :28] = X_test
    else:
        # Crop down to the top-left size x size corner (no-op at size == 28).
        X_train_ = X_train[:, :self.size, :self.size]
        X_test_ = X_test[:, :self.size, :self.size]
    X_train = X_train_.reshape(-1, self.size * self.size)
    X_test = X_test_.reshape(-1, self.size * self.size)

    if self.normalize:
        X_train = X_train / 255.
        X_test = X_test / 255.
    if self.sym_range:
        # Map [0, 1] into the symmetric range [-1, 1].
        X_train = X_train * 2. - 1.
        X_test = X_test * 2. - 1.

    if self.shuffle:
        # BUG FIX: the original shuffled X_train alone, destroying the X/y
        # label correspondence.  Apply one permutation to both; under the
        # same seed, np.random.permutation(n) consumes the RNG stream the
        # same way np.random.shuffle did, so the X_train row order is
        # unchanged and y_train is now kept aligned with it.
        np.random.seed(0)
        perm = np.random.permutation(X_train.shape[0])
        X_train = X_train[perm]
        y_train = y_train[perm]

    return (X_train, y_train), (X_test, y_test), 10
def load_data(self):
    """
    Fetch the MNIST dataset and load it into memory.

    Arguments:
        path (str, optional): Local directory in which to cache the raw
                              dataset.  Defaults to current directory.
        normalize (bool, optional): Whether to scale values between 0 and 1.
                                    Defaults to True.

    Returns:
        tuple: Both training and test sets are returned, along with the
               number of classes (10).
    """
    filepath = self._valid_path_append(self.path, self.filename)
    if not os.path.exists(filepath):
        self.fetch_dataset(self.url, self.filename, filepath, self.size)

    with gzip.open(filepath, 'rb') as mnist:
        (X_train, y_train), (X_test, y_test) = pickle_load(mnist)

    if self.subset_pct < 100:
        # subset_pct is a percentage (e.g. 10 -> keep 10%), as the slicing
        # below shows.
        X_train = X_train[:int(X_train.shape[0] * self.subset_pct / 100.)]
        y_train = y_train[:int(y_train.shape[0] * self.subset_pct / 100.)]
        X_test = X_test[:int(X_test.shape[0] * self.subset_pct / 100.)]
        y_test = y_test[:int(y_test.shape[0] * self.subset_pct / 100.)]
        # BUG FIX: previously logged subset_pct * 100, which overstated the
        # retained fraction by a factor of 100.
        logger.debug("subset %d%% of data", self.subset_pct)

    if self.size > 28:
        # Zero-pad each 28x28 digit into the top-left of a size x size frame.
        n_train, n_test = X_train.shape[0], X_test.shape[0]
        X_train_ = np.zeros(shape=(n_train, self.size, self.size))
        X_test_ = np.zeros(shape=(n_test, self.size, self.size))
        X_train_[:, :28, :28] = X_train
        X_test_[:, :28, :28] = X_test
    else:
        # Crop down to the top-left size x size corner (no-op at size == 28).
        X_train_ = X_train[:, :self.size, :self.size]
        X_test_ = X_test[:, :self.size, :self.size]
    X_train = X_train_.reshape(-1, self.size * self.size)
    X_test = X_test_.reshape(-1, self.size * self.size)

    if self.normalize:
        X_train = X_train / 255.
        X_test = X_test / 255.
    if self.sym_range:
        # Map [0, 1] into the symmetric range [-1, 1].
        X_train = X_train * 2. - 1.
        X_test = X_test * 2. - 1.

    if self.shuffle:
        # BUG FIX: the original shuffled X_train alone, breaking the X/y
        # label correspondence.  Use a single permutation for both arrays;
        # with the same seed, np.random.permutation(n) reproduces the row
        # order np.random.shuffle produced, so X_train is unchanged while
        # y_train stays aligned.
        np.random.seed(0)
        perm = np.random.permutation(X_train.shape[0])
        X_train = X_train[perm]
        y_train = y_train[perm]

    return (X_train, y_train), (X_test, y_test), 10
def zca_whiten(train, test, cache=None):
    """
    Apply the ZCA whitening transform, fit on the training set, to both
    the training and test sets.

    Arguments:
        train (ndarray): training data, one example per row.
        test (ndarray): test data, one example per row.
        cache (str, optional): path of a pickle file used to cache the
                               fitted (mean, transform) pair.  Defaults to
                               None (recompute every call, no caching).

    Returns:
        tuple: whitened (train, test) arrays.
    """
    use_cached = cache and os.path.isfile(cache)
    if use_cached:
        # Reuse the previously fitted statistics.
        with open(cache, 'rb') as f:
            (meanX, W) = pickle_load(f)
    else:
        meanX, W = CIFAR10._compute_zca_transform(train)
        if cache:
            logger.info("Caching ZCA transform matrix")
            with open(cache, 'wb') as f:
                pickle.dump((meanX, W), f, 2)
    logger.info("Applying ZCA whitening transform")
    # Center with the training mean, then project through the transform.
    return np.dot(train - meanX, W), np.dot(test - meanX, W)