# Module-level imports implied by the loaders below (the underscore aliases
# match how the functions reference them). `_download`, `df.zoo.download`,
# `imread` and `ntest` are project helpers defined elsewhere in the repo.
import pickle as _pickle
import os.path as _p
import numpy as _np
from tarfile import open as _taropen


def data():
    fname = _download('http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')

    with _taropen(fname, 'r') as f:
        # The first four batches are used as training set...
        datas, labels = [], []
        for i in range(1, 5):
            with f.extractfile('cifar-10-batches-py/data_batch_' + str(i)) as b:
                batch = _pickle.load(b, encoding='latin1')
                datas.append(_np.array(batch['data'], dtype=_np.float32))
                labels.append(_np.array(batch['labels']))
        Xtr = _np.concatenate(datas)
        ytr = _np.concatenate(labels)
        Xtr /= 255

        # ... and the fifth as validation set, as described in cuda-convnet:
        # https://code.google.com/p/cuda-convnet/wiki/Methodology
        with f.extractfile('cifar-10-batches-py/data_batch_5') as b:
            batch = _pickle.load(b, encoding='latin1')
            Xva = _np.array(batch['data'], dtype=_np.float32)
            yva = _np.array(batch['labels'])
            Xva /= 255

        with f.extractfile('cifar-10-batches-py/test_batch') as b:
            batch = _pickle.load(b, encoding='latin1')
            Xte = _np.array(batch['data'], dtype=_np.float32)
            yte = _np.array(batch['labels'])
            Xte /= 255

    return (Xtr, ytr), (Xva, yva), (Xte, yte)
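
# Usage sketch (illustrative, not part of the loader): how the CIFAR-10
# splits above are typically consumed. The reshape to (N, 3, 32, 32) follows
# the channel-major row layout of the pickled batches; the demo name is
# hypothetical.
def _demo_cifar10():
    (Xtr, ytr), (Xva, yva), (Xte, yte) = data()
    # Batches 1-4 give 40k training rows, batch 5 the 10k validation rows.
    assert Xtr.shape == (40000, 3072) and Xva.shape == (10000, 3072)
    # Each row is a flattened 3x32x32 image with values scaled to [0, 1].
    return Xtr.reshape(-1, 3, 32, 32), ytr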

def data(fold=False):
    fname = df.zoo.download('http://dags.stanford.edu/data/iccv09Data.tar.gz')

    # Extracting files one-by-one in memory is unfortunately WAY too slow
    # for this dataset, so we bite the bullet and extract the full tgz.
    where = _p.dirname(fname)
    imgdir = 'iccv09Data/images/'
    with _taropen(fname, 'r') as f:
        f.extractall(where)
        ids = [_p.basename(n)[:-4] for n in f.getnames() if n.startswith(imgdir)]

    X = [imread(_p.join(where, imgdir, i) + '.jpg') for i in ids]

    # I personally don't believe in the other label types.
    y = [_np.loadtxt(_p.join(where, 'iccv09Data/labels', i) + '.regions.txt',
                     dtype=_np.int32) for i in ids]

    le = _np.array(['sky', 'tree', 'road', 'grass', 'water', 'building',
                    'mountain', 'foreground', 'object'])
    try:
        # If sklearn is available, wrap the class names in a LabelEncoder.
        from sklearn.preprocessing import LabelEncoder
        le, classes = LabelEncoder(), le
        le.classes_ = classes
    except ImportError:
        pass

    if fold is False:
        return X, y, le

    # `ntest()` is a module helper giving the per-fold test-set size.
    lo, hi = fold*ntest(), (fold+1)*ntest()
    Xtr = X[:lo] + X[hi:]
    ytr = y[:lo] + y[hi:]
    Xte = X[lo:hi]
    yte = y[lo:hi]
    return (Xtr, ytr), (Xte, yte), le
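
# Usage sketch (illustrative, not part of the loader): the two calling
# conventions of data() above. An integer fold selects one contiguous test
# slice, which is why the loader checks `fold is False` rather than
# truthiness (fold=0 is a valid fold). The demo name is hypothetical.
def _demo_stanford_bg():
    X, y, le = data()                          # full dataset
    (Xtr, ytr), (Xte, yte), le = data(fold=0)  # first train/test split
    # `le` is a LabelEncoder if sklearn is installed, else a plain array.
    return le.classes_ if hasattr(le, 'classes_') else le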

def data():
    fname = _download('http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz')

    with _taropen(fname, 'r') as f:
        with f.extractfile('cifar-100-python/train') as train:
            train = _pickle.load(train, encoding='latin1')
            Xtr = _np.array(train['data'], dtype=_np.float32)
            ytr_c = _np.array(train['coarse_labels'])
            ytr_f = _np.array(train['fine_labels'])
            Xtr /= 255

        # There is no "official" validation set here that I know of!
        # But the maxout paper uses the last 10k samples as validation.
        Xtr, Xva = Xtr[:-10000], Xtr[-10000:]
        ytr_c, yva_c = ytr_c[:-10000], ytr_c[-10000:]
        ytr_f, yva_f = ytr_f[:-10000], ytr_f[-10000:]

        with f.extractfile('cifar-100-python/test') as test:
            test = _pickle.load(test, encoding='latin1')
            Xte = _np.array(test['data'], dtype=_np.float32)
            yte_c = _np.array(test['coarse_labels'])
            yte_f = _np.array(test['fine_labels'])
            Xte /= 255

        # Get the label names additionally.
        with f.extractfile('cifar-100-python/meta') as m:
            m = _pickle.load(m, encoding='latin1')
            try:
                from sklearn.preprocessing import LabelEncoder
                le_c = LabelEncoder()
                le_c.classes_ = _np.array(m['coarse_label_names'])
                le_f = LabelEncoder()
                le_f.classes_ = _np.array(m['fine_label_names'])
            except ImportError:
                le_c = _np.array(m['coarse_label_names'])
                le_f = _np.array(m['fine_label_names'])

    return (Xtr, ytr_c, ytr_f), (Xva, yva_c, yva_f), (Xte, yte_c, yte_f), (le_c, le_f)
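
# Usage sketch (illustrative, not part of the loader): CIFAR-100 carries
# both coarse (20 superclass) and fine (100 class) labels. The
# inverse_transform calls assume sklearn is installed; the demo name is
# hypothetical.
def _demo_cifar100():
    (Xtr, ytr_c, ytr_f), va, te, (le_c, le_f) = data()
    # 50k training images minus the 10k validation tail.
    assert Xtr.shape == (40000, 3072)
    # The encoders map integer labels back to human-readable names.
    if hasattr(le_f, 'inverse_transform'):
        return le_c.inverse_transform(ytr_c[:3]), le_f.inverse_transform(ytr_f[:3])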