def _create_norb(download_dir):
    '''Small NORB dataset from www.cs.nyu.edu/~ylclab/data/norb-v1.0-small/ '''
    urlbase = "http://www.cs.nyu.edu/~ylclab/data/norb-v1.0-small/"
    dst = os.path.join(download_dir, "raw")
    x_tr = _read_norb_data(download_file(
        urlbase, dst, 'smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat.gz'))
    y_tr = _read_norb_data(download_file(
        urlbase, dst, 'smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat.gz'))
    i_tr = _read_norb_data(download_file(
        urlbase, dst, 'smallnorb-5x46789x9x18x6x2x96x96-training-info.mat.gz'))
    x_te = _read_norb_data(download_file(
        urlbase, dst, 'smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat.gz'))
    y_te = _read_norb_data(download_file(
        urlbase, dst, 'smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat.gz'))

    # Instead of assigning the validation set randomly, we hold out one of the
    # object "instances" of the training set. A random split would put nearly
    # identical views of the same objects into both sets, so this gives a much
    # more meaningful validation set.
    fold = i_tr[:, 0].ravel()
    vi = (fold == 4)  # use instance 4 as the validation instance
    x_va, x_tr = x_tr[vi], x_tr[~vi]
    y_va, y_tr = y_tr[vi], y_tr[~vi]

    data = [['train', x_tr, y_tr],
            ['valid', x_va, y_va],
            ['test', x_te, y_te]]
    _process_and_store(data, os.path.join(download_dir, "norb.h5"))
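
# _read_norb_data is assumed to parse the NYU "binary matrix" files used by
# small NORB: a gzipped stream of little-endian int32 header fields (magic
# number, number of dimensions, then at least three dimension sizes) followed
# by the raw array data. The following is only a minimal sketch of such a
# reader under that assumption; the actual helper may differ in details.
def _read_norb_matrix_sketch(filename):
    '''Hypothetical reader for a gzipped small-NORB binary matrix file.'''
    import gzip
    with gzip.open(filename, "rb") as f:
        magic = int(np.frombuffer(f.read(4), "<i4")[0])
        ndim = int(np.frombuffer(f.read(4), "<i4")[0])
        # the format always stores at least 3 dimension fields
        dims = np.frombuffer(f.read(4 * max(ndim, 3)), "<i4")[:ndim]
        # 0x1E3D4C55 marks a byte matrix (the -dat files); the -cat and -info
        # files are int32 matrices
        dtype = np.uint8 if magic == 0x1E3D4C55 else np.dtype("<i4")
        data = np.frombuffer(f.read(), dtype).reshape(dims)
    # flatten everything but the sample dimension, as _create_norb expects
    return data.reshape(dims[0], -1)
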
def _handle_larochelle_icml2007(download_dir, fn, train_data_file,
                                test_data_file, rotate_images=True):
    '''Basic procedure to load the datasets from Larochelle et al., ICML 2007.

    Unfortunately the structure of the archives differs between datasets,
    so we need this abstraction.

    fn:              name of the zip file (without extension)
    train_data_file: name of the training set file within the archive
    test_data_file:  name of the test set file within the archive
    rotate_images:   rotate the images (needed if the file is in
                     column-major format)
    '''
    import zipfile
    urlbase = "http://www.iro.umontreal.ca/~lisa/icml2007data/"
    dst = os.path.join(download_dir, "raw")
    f = download_file(urlbase, dst, '%s.zip' % fn)
    with zipfile.ZipFile(f) as zf:
        tmp = np.loadtxt(zf.open(train_data_file))
        x_tr, y_tr = tmp[:, :-1].copy(), tmp[:, -1].copy()
        tmp = np.loadtxt(zf.open(test_data_file))
        x_te, y_te = tmp[:, :-1].copy(), tmp[:, -1].copy()

    y_tr = y_tr.reshape((-1, 1))
    y_te = y_te.reshape((-1, 1))

    if rotate_images:
        n = int(np.sqrt(x_tr.shape[1]))
        x_tr = np.rollaxis(x_tr.reshape(x_tr.shape[0], n, n), 2, 1)
        x_tr = x_tr.reshape(-1, n * n)
        x_te = np.rollaxis(x_te.reshape(x_te.shape[0], n, n), 2, 1)
        x_te = x_te.reshape(-1, n * n)
    return x_tr, y_tr, x_te, y_te
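
# A dataset-specific creator built on top of this helper could look like the
# sketch below. The archive name, the .amat member names, and the 80/20
# train/validation split are assumptions about the ICML 2007 "rectangles"
# data and may need to be adjusted; _split_dataset and _process_and_store are
# the same helpers used by the other creators in this module.
def _create_rectangles_sketch(download_dir):
    x_tr, y_tr, x_te, y_te = _handle_larochelle_icml2007(
        download_dir, 'rectangles',
        'rectangles_train.amat', 'rectangles_test.amat')
    x_tr, y_tr, x_va, y_va = _split_dataset(x_tr, y_tr, 0.8)
    data = [['train', x_tr, y_tr],
            ['valid', x_va, y_va],
            ['test', x_te, y_te]]
    _process_and_store(data, os.path.join(download_dir, "rectangles.h5"))
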
def create_cifar10(download_dir=_DATA_DIRECTORY):
    logger = logging.getLogger(__name__)
    logger.info('reading CIFAR10 data...')
    url = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
    fname = download_file(url, os.path.join(download_dir, "raw"))
    import tarfile
    with tarfile.open(fname) as tf:
        filemembers = tf.getmembers()
        files = [f.name for f in filemembers if "data_batch" in f.name]
        files.sort()

        def _read_file(fn):
            f = tf.extractfile(fn)
            tmp = np.frombuffer(f.read(), np.uint8).reshape(-1, 3073)
            return tmp[:, 0].reshape(-1, 1), tmp[:, 1:].reshape(-1, 3 * 32 * 32)

        # keep the last batch as the validation set
        traindata = [_read_file(fn) for fn in files[:-1]]
        y_tr = np.vstack([t[0] for t in traindata])
        x_tr = np.vstack([t[1] for t in traindata])
        y_va, x_va = _read_file(files[-1])
        y_te, x_te = _read_file('cifar-10-batches-bin/test_batch.bin')
    return x_tr, y_tr.ravel(), x_va, y_va.ravel(), x_te, y_te.ravel()
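
# Each 3073-byte CIFAR-10 record is one label byte followed by 3072 pixel
# bytes stored plane-wise: 1024 red, 1024 green and 1024 blue values, each
# plane a 32x32 image in row-major order. A small illustrative helper (not
# part of the loader itself) to turn one flat row back into an HWC image:
def _cifar_row_to_image(row):
    '''Reshape one flat CIFAR-10 row (3072 uint8 values) into a 32x32x3 image.'''
    return row.reshape(3, 32, 32).transpose(1, 2, 0)
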
def _create_mnist(download_dir):
    '''MNIST dataset from yann.lecun.com/exdb/mnist/ '''
    from os.path import join
    logger = logging.getLogger(__name__)
    logger.info("reading data...")
    urlbase = 'http://yann.lecun.com/exdb/mnist/'
    files = ['train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz',
             't10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz']
    destdir = join(download_dir, "raw")
    for fname in files:
        download_file(urlbase, destdir, fname)
    x_tr = _read_mnist_image(join(destdir, "train-images-idx3-ubyte.gz"))
    y_tr = _read_mnist_label(join(destdir, "train-labels-idx1-ubyte.gz"))
    x_te = _read_mnist_image(join(destdir, "t10k-images-idx3-ubyte.gz"))
    y_te = _read_mnist_label(join(destdir, "t10k-labels-idx1-ubyte.gz"))
    x_tr, y_tr, x_va, y_va = _split_dataset(x_tr, y_tr, 5 / 6.0)
    data = [['train', x_tr, y_tr],
            ['valid', x_va, y_va],
            ['test', x_te, y_te]]
    _process_and_store(data, join(download_dir, "mnist.h5"))
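
# _read_mnist_image and _read_mnist_label are assumed to parse the gzipped IDX
# files: a big-endian int32 header (magic number, item count, and for images
# the row/column counts) followed by raw unsigned bytes. Minimal sketches of
# such readers are shown below; the actual helpers may differ, e.g. by scaling
# the pixel values to [0, 1].
def _read_idx_images_sketch(filename):
    '''Hypothetical IDX image reader: returns an (n, rows*cols) uint8 array.'''
    import gzip
    with gzip.open(filename, "rb") as f:
        magic, n, rows, cols = np.frombuffer(f.read(16), ">i4")
        assert magic == 2051
        return np.frombuffer(f.read(), np.uint8).reshape(n, rows * cols)


def _read_idx_labels_sketch(filename):
    '''Hypothetical IDX label reader: returns an (n, 1) uint8 label array.'''
    import gzip
    with gzip.open(filename, "rb") as f:
        magic, n = np.frombuffer(f.read(8), ">i4")
        assert magic == 2049
        return np.frombuffer(f.read(), np.uint8).reshape(n, 1)
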
def _create_covertype(download_dir):
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/'
    destdir = os.path.join(download_dir, "raw")
    fn = download_file(url, destdir, 'covtype.data.gz')
    import gzip
    import pandas as pd
    with gzip.open(fn, "rb") as gzfile:
        x = pd.read_csv(gzfile, header=None).values
    x, y = x[:, :-1].astype(np.float64), x[:, -1]
    y -= 1  # make classes 0-based

    # split off a test set, then split the rest into training and validation
    from sklearn.model_selection import train_test_split
    x, x_te, y, y_te = train_test_split(x, y, test_size=0.1)
    x_tr, x_va, y_tr, y_va = train_test_split(x, y, test_size=0.25)

    from sklearn.preprocessing import LabelBinarizer
    lb = LabelBinarizer()
    lb.fit(y)
    y_tr = lb.transform(y_tr)
    y_va = lb.transform(y_va)
    y_te = lb.transform(y_te)

    # most features are binary indicators; standardize only the others
    quant_idx = [0, 1, 2, 3, 4, 5, 9]  # real-valued features
    int_idx = [6, 7, 8]                # integers from [0, 255)
    idx = quant_idx + int_idx
    from sklearn.preprocessing import StandardScaler as Scaler
    scaler = Scaler()
    x_tr[:, idx] = scaler.fit_transform(x_tr[:, idx])
    x_va[:, idx] = scaler.transform(x_va[:, idx])
    x_te[:, idx] = scaler.transform(x_te[:, idx])

    data = [['train', x_tr, y_tr],
            ['valid', x_va, y_va],
            ['test', x_te, y_te]]
    m = np.zeros(x_tr.shape[1])
    m[idx] = scaler.mean_
    s = np.ones(x_tr.shape[1])
    s[idx] = scaler.scale_
    other = {'center': m, "scale": s}
    _store(data, os.path.join(download_dir, "covertype.h5"), other)
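
# The 'center' and 'scale' vectors stored alongside the data make it possible
# to undo the standardization later (binary columns are unaffected, since
# their center is 0 and their scale is 1). A small illustrative helper, not
# used by the loader itself:
def _unstandardize_covertype(x_std, center, scale):
    '''Recover original feature values from standardized covertype features.'''
    return x_std * scale + center
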
def create_tox21(sparsity_cutoff, va_folds, dtype=np.float32,
                 download_dir=_DATA_DIRECTORY):
    '''Creates a preprocessed version of the tox21 dataset.

    va_folds is a list of folds that are to be put into the validation set.
    '''
    from scipy import io
    import pandas as pd

    urlbase = "http://www.bioinf.jku.at/research/deeptox/"
    dst = os.path.join(download_dir, "raw")
    fn_x_tr_d = download_file(urlbase, dst, 'tox21_dense_train.csv.gz')
    fn_x_tr_s = download_file(urlbase, dst, 'tox21_sparse_train.mtx.gz')
    fn_y_tr = download_file(urlbase, dst, 'tox21_labels_train.csv.gz')
    fn_x_te_d = download_file(urlbase, dst, 'tox21_dense_test.csv.gz')
    fn_x_te_s = download_file(urlbase, dst, 'tox21_sparse_test.mtx.gz')
    fn_y_te = download_file(urlbase, dst, 'tox21_labels_test.csv.gz')
    cpd = download_file(urlbase, dst, 'tox21_compoundData.csv')

    y_tr = pd.read_csv(fn_y_tr, index_col=0)
    y_te = pd.read_csv(fn_y_te, index_col=0)
    x_tr_dense = pd.read_csv(fn_x_tr_d, index_col=0).values
    x_te_dense = pd.read_csv(fn_x_te_d, index_col=0).values
    x_tr_sparse = io.mmread(fn_x_tr_s).tocsc()
    x_te_sparse = io.mmread(fn_x_te_s).tocsc()

    # filter out very sparse features
    sparse_col_idx = ((x_tr_sparse > 0).mean(0) >= sparsity_cutoff).A.ravel()
    x_tr_sparse = x_tr_sparse[:, sparse_col_idx].A
    x_te_sparse = x_te_sparse[:, sparse_col_idx].A

    # filter out low-variance features
    dense_col_idx = np.where(x_tr_dense.var(0) > 1e-6)[0]
    x_tr_dense = x_tr_dense[:, dense_col_idx]
    x_te_dense = x_te_dense[:, dense_col_idx]

    # log-transform very large / exponentially distributed features
    # (experimentally, this doesn't seem to make a difference)
    xm = np.minimum(x_tr_dense.min(0), x_te_dense.min(0))
    log_x_tr = np.log10(x_tr_dense - xm + 1e-8)  # shift to avoid negative arguments
    log_x_te = np.log10(x_te_dense - xm + 1e-8)
    exp_cols = np.where(x_tr_dense.ptp(0) > 10.0)[0]
    x_tr_dense[:, exp_cols] = log_x_tr[:, exp_cols]
    x_te_dense[:, exp_cols] = log_x_te[:, exp_cols]

    # find the indices of the validation items
    info = pd.read_csv(cpd, index_col=0)
    folds = info.CVfold[info.set != 'test'].values
    idx_va = np.zeros(folds.shape[0], dtype=bool)
    for fid in va_folds:
        idx_va |= (folds == float(fid))

    # normalize features
    from sklearn.preprocessing import StandardScaler, RobustScaler
    x_tr = np.hstack([x_tr_dense, x_tr_sparse])
    x_te = np.hstack([x_te_dense, x_te_sparse])
    s = RobustScaler()
    s.fit(x_tr[~idx_va])
    x_tr = s.transform(x_tr)
    x_te = s.transform(x_te)
    x_tr = np.tanh(x_tr)
    x_te = np.tanh(x_te)
    s = StandardScaler()
    s.fit(x_tr[~idx_va])
    x_tr = s.transform(x_tr)
    x_te = s.transform(x_te)

    return (x_tr[~idx_va].astype(dtype, order='C'),
            y_tr[~idx_va].values.astype(dtype, order='C'),
            x_tr[idx_va].astype(dtype, order='C'),
            y_tr[idx_va].values.astype(dtype, order='C'),
            x_te.astype(dtype, order='C'),
            y_te.values.astype(dtype, order='C'))
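
# Example usage (the cutoff value and fold choice below are illustrative only;
# the fold numbers follow the CVfold column of tox21_compoundData.csv):
#
#   x_tr, y_tr, x_va, y_va, x_te, y_te = create_tox21(
#       sparsity_cutoff=0.05, va_folds=[5])
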
def _create_enwik8(download_dir):
    '''Prepares the enwik8/Hutter Prize data: an extract from Wikipedia.'''
    import pandas as pd
    urlbase = 'http://mattmahoney.net/dc/'
    destdir = os.path.join(download_dir, "raw")
    fn = download_file(urlbase, destdir, 'enwik8.zip')

    # we first read the text as UTF-8 and then map each character that occurs
    # in it to a number, instead of using the UTF-8 bytes directly
    import zipfile
    with zipfile.ZipFile(fn, "r") as zf:
        with zf.open("enwik8") as z:
            text_train = z.read(96 * 10**6).decode("utf8")
            text_valid = z.read(2 * 10**6).decode("utf8")
            text_test = z.read(2 * 10**6).decode("utf8")
            assert len(z.read()) == 0  # make sure we read everything

    # ignore "uncommon" characters.
    # In "Generating Sequences With Recurrent Neural Networks", Alex Graves
    # reports 205 distinct single-byte characters; the following only yields
    # 196, and it is unclear where the remaining ones come from.
    dt = np.uint8
    data_tr = np.array([ord(c) for c in text_train if ord(c) < 256], dtype=dt)
    data_va = np.array([ord(c) for c in text_valid if ord(c) < 256], dtype=dt)
    data_te = np.array([ord(c) for c in text_test if ord(c) < 256], dtype=dt)
    cnt = pd.value_counts(data_tr)
    del text_train, text_valid, text_test
    import gc
    gc.collect()

    # remove characters with <= 10 occurrences (there are 16 of those);
    # we use a lookup table, otherwise this takes forever
    count_lookup = np.zeros(256, np.int64)
    count_lookup[cnt.index.values] = cnt.values
    occ = count_lookup[data_tr]
    data_tr = data_tr[occ > 10]
    data_va = data_va[count_lookup[data_va] > 10]
    data_te = data_te[count_lookup[data_te] > 10]

    # build encode/decode tables that map the remaining characters to a
    # contiguous range of codes (and back)
    decode_lookup = 255 * np.ones(256, np.uint8)
    u = np.unique(data_tr)
    decode_lookup[:len(u)] = u
    encode_lookup = np.iinfo(np.uint16).max * np.ones(256, np.uint16)
    for c, e in enumerate(u):
        encode_lookup[e] = c
    code_tr = encode_lookup[data_tr]
    code_va = encode_lookup[data_va]
    code_te = encode_lookup[data_te]
    assert np.all(decode_lookup[code_tr] == data_tr)
    assert np.all(code_tr <= 255)
    assert np.all(code_va <= 255)
    assert np.all(code_te <= 255)
    del data_tr, data_va, data_te
    gc.collect()

    fname = os.path.join(download_dir, "enwik8.h5")
    with h5py.File(fname, "w") as f:
        f.create_dataset('train', data=code_tr)
        f.create_dataset('valid', data=code_va)
        f.create_dataset('test', data=code_te)
        f.create_dataset('encode', data=encode_lookup)
        f.create_dataset('decode', data=decode_lookup)
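
# The stored 'decode' table maps each symbol id back to the original character
# ordinal (< 256), so sequences sampled from a model trained on these codes
# can be turned back into text. A small illustrative helper, not used by the
# preparation code itself:
def _decode_enwik8_codes(codes, decode_lookup):
    '''Map an array of enwik8 symbol ids back to a text string.'''
    return ''.join(chr(b) for b in decode_lookup[codes])
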