Code Example #1
def _create_norb(download_dir):
    '''Small NORB dataset from www.cs.nyu.edu/~ylclab/data/norb-v1.0-small/ '''

    urlbase = "http://www.cs.nyu.edu/~ylclab/data/norb-v1.0-small/"
    dst = os.path.join(download_dir, "raw")
    x_tr = _read_norb_data(
        download_file(urlbase, dst,
                      'smallnorb-5x46789x9x18x6x2x96x96-training-dat.mat.gz'))
    y_tr = _read_norb_data(
        download_file(urlbase, dst,
                      'smallnorb-5x46789x9x18x6x2x96x96-training-cat.mat.gz'))
    i_tr = _read_norb_data(
        download_file(urlbase, dst,
                      'smallnorb-5x46789x9x18x6x2x96x96-training-info.mat.gz'))
    x_te = _read_norb_data(
        download_file(urlbase, dst,
                      'smallnorb-5x01235x9x18x6x2x96x96-testing-dat.mat.gz'))
    y_te = _read_norb_data(
        download_file(urlbase, dst,
                      'smallnorb-5x01235x9x18x6x2x96x96-testing-cat.mat.gz'))

    # Instead of assigning the validation set randomly, we hold out one of
    # the "instances" (object identities) of the training set; this gives a
    # much more meaningful validation split than a random one.
    fold = i_tr[:, 0].ravel()
    vi = (fold == 4)  # let's make instance 4 the validation-instance
    x_va, x_tr = x_tr[vi], x_tr[~vi]
    y_va, y_tr = y_tr[vi], y_tr[~vi]
    data = [['train', x_tr, y_tr], ['valid', x_va, y_va], ['test', x_te, y_te]]
    _process_and_store(data, os.path.join(download_dir, "norb.h5"))
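All of these examples rely on a download_file helper that is not shown here. Judging from the call sites (sometimes a base URL, destination directory, and file name; sometimes just a full URL and a destination directory), a minimal hypothetical sketch could look like the following; the caching behaviour is an assumption:

import os
import urllib.request


def download_file(urlbase, destdir, fname=None):
    # Hypothetical re-implementation of the helper assumed by these examples:
    # download urlbase + fname (or urlbase alone if fname is omitted) into
    # destdir, skip the download if the file is already present, and return
    # the local path.
    if fname is None:
        url = urlbase
        fname = os.path.basename(urlbase)
    else:
        url = urlbase + fname
    os.makedirs(destdir, exist_ok=True)
    dst = os.path.join(destdir, fname)
    if not os.path.exists(dst):
        urllib.request.urlretrieve(url, dst)
    return dst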
Code Example #2
def _handle_larochelle_icml2007(download_dir,
                                fn,
                                train_data_file,
                                test_data_file,
                                rotate_images=True):
    '''Basic procedure to load the datasets from Larochelle et al., ICML 2007.
    The structure of the individual datasets differs from file to file,
    so we need this abstraction.

    fn: name of the zip file (without extension)
    train_data_file: name of the training set file within the archive
    test_data_file: name of the test set file within the archive
    rotate_images: transpose each image (needed if the file stores images
        in column-major order)
    '''
    import zipfile
    urlbase = "http://www.iro.umontreal.ca/~lisa/icml2007data/"
    dst = os.path.join(download_dir, "raw")
    f = download_file(urlbase, dst, '%s.zip' % fn)
    with zipfile.ZipFile(f) as zf:
        tmp = np.loadtxt(zf.open(train_data_file))
        x_tr, y_tr = tmp[:, :-1].copy(), tmp[:, -1].copy()
        tmp = np.loadtxt(zf.open(test_data_file))
        x_te, y_te = tmp[:, :-1].copy(), tmp[:, -1].copy()
        y_tr = y_tr.reshape((-1, 1))
        y_te = y_te.reshape((-1, 1))
        if rotate_images:
            n = int(np.sqrt(x_tr.shape[1]))
            x_tr = np.rollaxis(x_tr.reshape(x_tr.shape[0], n, n), 2, 1)
            x_tr = x_tr.reshape(-1, n * n)
            x_te = np.rollaxis(x_te.reshape(x_te.shape[0], n, n), 2, 1)
            x_te = x_te.reshape(-1, n * n)
        return x_tr, y_tr, x_te, y_te
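The rotate_images branch above is effectively a per-image transpose: np.rollaxis(a, 2, 1) applied to an array of shape (N, n, n) swaps the last two axes, converting column-major images to row-major. A small self-contained check with made-up data:

import numpy as np

# Verify that the rollaxis call used above transposes each n x n image.
x = np.arange(2 * 3 * 3, dtype=float).reshape(2, 9)  # two flat 3x3 "images"
n = 3
rolled = np.rollaxis(x.reshape(x.shape[0], n, n), 2, 1)
transposed = x.reshape(x.shape[0], n, n).transpose(0, 2, 1)
assert np.array_equal(rolled, transposed)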
Code Example #3
def create_cifar10(download_dir=_DATA_DIRECTORY):
    logger = logging.getLogger(__name__)
    logger.info('reading CIFAR10 data...')
    url = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
    fname = download_file(url, os.path.join(download_dir, "raw"))
    import tarfile
    with tarfile.open(fname) as tf:
        filemembers = tf.getmembers()
        files = [f.name for f in filemembers if "data_batch" in f.name]
        files.sort()

        def _read_file(fn):
            f = tf.extractfile(fn)
            tmp = np.frombuffer(f.read(), np.uint8).reshape(-1, 3073)
            return (tmp[:, 0].reshape(-1, 1),
                    tmp[:, 1:].reshape(-1, 3 * 32 * 32))

        # use the last data_batch file as the validation set
        traindata = [_read_file(fn) for fn in files[:-1]]
        y_tr = np.vstack([t[0] for t in traindata])
        x_tr = np.vstack([t[1] for t in traindata])

        y_va, x_va = _read_file(files[-1])
        y_te, x_te = _read_file('cifar-10-batches-bin/test_batch.bin')
        return x_tr, y_tr.ravel(), x_va, y_va.ravel(), x_te, y_te.ravel()
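Each feature row produced above follows the CIFAR-10 binary layout: 1024 red, 1024 green and 1024 blue values of a 32x32 image, preceded in the raw file by the label byte that _read_file splits off. A small helper (hypothetical, not part of the original code) to turn one row back into an H x W x C image for plotting:

import numpy as np


def row_to_image(row):
    # One CIFAR-10 row holds 3 * 32 * 32 channel-major pixel values;
    # reshape to (3, 32, 32) and move the channel axis last for plotting.
    return row.reshape(3, 32, 32).transpose(1, 2, 0)

# Hypothetical usage, assuming create_cifar10() has been run:
# x_tr, y_tr, x_va, y_va, x_te, y_te = create_cifar10()
# img = row_to_image(x_tr[0])  # shape (32, 32, 3), dtype uint8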
Code Example #4
def _create_mnist(download_dir):
    ''' MNIST dataset from yann.lecun.com/exdb/mnist/  '''
    from os.path import join
    logger = logging.getLogger(__name__)
    logger.info("reading data...")
    urlbase = 'http://yann.lecun.com/exdb/mnist/'
    files = [
        'train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz',
        't10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz'
    ]
    destdir = join(download_dir, "raw")
    for fname in files:
        download_file(urlbase, destdir, fname)
    x_tr = _read_mnist_image(join(destdir, "train-images-idx3-ubyte.gz"))
    y_tr = _read_mnist_label(join(destdir, "train-labels-idx1-ubyte.gz"))
    x_te = _read_mnist_image(join(destdir, "t10k-images-idx3-ubyte.gz"))
    y_te = _read_mnist_label(join(destdir, "t10k-labels-idx1-ubyte.gz"))

    x_tr, y_tr, x_va, y_va = _split_dataset(x_tr, y_tr, 5 / 6.0)
    data = [['train', x_tr, y_tr], ['valid', x_va, y_va], ['test', x_te, y_te]]
    _process_and_store(data, join(download_dir, "mnist.h5"))
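The _read_mnist_image and _read_mnist_label helpers are not shown above. A minimal sketch, assuming they simply parse the standard gzipped IDX files (big-endian 32-bit header fields followed by uint8 data); the exact return shapes of the original helpers are an assumption:

import gzip
import numpy as np


def _read_mnist_image(fname):
    # IDX image file: magic, image count, rows, cols as big-endian int32,
    # followed by the uint8 pixel values.
    with gzip.open(fname, "rb") as f:
        magic, n, rows, cols = np.frombuffer(f.read(16), dtype=">i4")
        assert magic == 2051  # IDX magic number for image files
        pixels = np.frombuffer(f.read(), dtype=np.uint8)
    return pixels.reshape(n, rows * cols)


def _read_mnist_label(fname):
    # IDX label file: magic and item count as big-endian int32,
    # followed by the uint8 labels.
    with gzip.open(fname, "rb") as f:
        magic, n = np.frombuffer(f.read(8), dtype=">i4")
        assert magic == 2049  # IDX magic number for label files
        labels = np.frombuffer(f.read(), dtype=np.uint8)
    return labels.reshape(n, 1)  # column vector; the original may differ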
Code Example #5
def _create_covertype(download_dir):
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/'
    destdir = os.path.join(download_dir, "raw")
    fn = download_file(url, destdir, 'covtype.data.gz')
    import gzip
    import pandas as pd
    with gzip.open(fn, "rb") as gzfile:
        x = pd.read_csv(gzfile, header=None).values

    x, y = x[:, :-1].astype(np.float64), x[:, -1]
    y -= 1  # make classes 0-based

    # split off the test and validation sets
    from sklearn.model_selection import train_test_split
    x, x_te, y, y_te = train_test_split(x, y, test_size=0.1)  # create test set
    x_tr, x_va, y_tr, y_va = train_test_split(x, y, test_size=0.25)

    from sklearn.preprocessing import LabelBinarizer
    lb = LabelBinarizer()
    lb.fit(y)  # fit on all train+validation labels so every class is seen
    y_tr = lb.transform(y_tr)
    y_va = lb.transform(y_va)
    y_te = lb.transform(y_te)

    # Most values are binary, except for these, so let's standardize them
    quant_idx = [0, 1, 2, 3, 4, 5, 9]  # real numbers
    int_idx = [6, 7, 8]  # integers from [0, 255)
    idx = quant_idx + int_idx
    from sklearn.preprocessing import StandardScaler as Scaler
    scaler = Scaler()
    x_tr[:, idx] = scaler.fit_transform(x_tr[:, idx])
    x_va[:, idx] = scaler.transform(x_va[:, idx])
    x_te[:, idx] = scaler.transform(x_te[:, idx])
    data = [['train', x_tr, y_tr], ['valid', x_va, y_va], ['test', x_te, y_te]]
    m = np.zeros(x_tr.shape[1])
    m[idx] = scaler.mean_
    s = np.ones(x_tr.shape[1])
    s[idx] = scaler.scale_
    other = {'center': m, "scale": s}
    _store(data, os.path.join(download_dir, "covertype.h5"), other)
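The _store helper (and the _process_and_store variant used by the other examples) is not shown either. A plausible minimal sketch, assuming it just writes each split plus the extra arrays into an HDF5 file; the dataset names inside the file are made up here:

import h5py


def _store(data, fname, other=None):
    # Hypothetical sketch: write each (name, x, y) split into an HDF5 file,
    # plus any additional arrays such as the centering/scaling vectors.
    with h5py.File(fname, "w") as f:
        for name, x, y in data:
            f.create_dataset(name + "/inputs", data=x)
            f.create_dataset(name + "/targets", data=y)
        if other is not None:
            for key, value in other.items():
                f.create_dataset(key, data=value)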
Code Example #6
def create_tox21(sparsity_cutoff,
                 va_folds,
                 dtype=np.float32,
                 download_dir=_DATA_DIRECTORY):
    '''Creates a preprocessed version of the Tox21 dataset.

    sparsity_cutoff: sparse features present in less than this fraction of
        the training compounds are dropped.
    va_folds: list of cross-validation folds that are to be put into the
        validation set.
    '''
    from scipy import io
    import pandas as pd
    urlbase = "http://www.bioinf.jku.at/research/deeptox/"
    dst = os.path.join(download_dir, "raw")
    fn_x_tr_d = download_file(urlbase, dst, 'tox21_dense_train.csv.gz')
    fn_x_tr_s = download_file(urlbase, dst, 'tox21_sparse_train.mtx.gz')
    fn_y_tr = download_file(urlbase, dst, 'tox21_labels_train.csv.gz')
    fn_x_te_d = download_file(urlbase, dst, 'tox21_dense_test.csv.gz')
    fn_x_te_s = download_file(urlbase, dst, 'tox21_sparse_test.mtx.gz')
    fn_y_te = download_file(urlbase, dst, 'tox21_labels_test.csv.gz')
    cpd = download_file(urlbase, dst, 'tox21_compoundData.csv')

    y_tr = pd.read_csv(fn_y_tr, index_col=0)
    y_te = pd.read_csv(fn_y_te, index_col=0)
    x_tr_dense = pd.read_csv(fn_x_tr_d, index_col=0).values
    x_te_dense = pd.read_csv(fn_x_te_d, index_col=0).values
    x_tr_sparse = io.mmread(fn_x_tr_s).tocsc()
    x_te_sparse = io.mmread(fn_x_te_s).tocsc()

    # filter out very sparse features
    sparse_col_idx = ((x_tr_sparse > 0).mean(0) >= sparsity_cutoff).A.ravel()
    x_tr_sparse = x_tr_sparse[:, sparse_col_idx].A
    x_te_sparse = x_te_sparse[:, sparse_col_idx].A

    # filter out low-variance features
    dense_col_idx = np.where(x_tr_dense.var(0) > 1e-6)[0]
    x_tr_dense = x_tr_dense[:, dense_col_idx]
    x_te_dense = x_te_dense[:, dense_col_idx]

    # handle very large and exponentially distributed features
    # (note: experimentally, this does not seem to make a difference)
    xm = np.minimum(x_tr_dense.min(0),
                    x_te_dense.min(0))  # avoid negative numbers
    log_x_tr = np.log10(x_tr_dense - xm + 1e-8)
    log_x_te = np.log10(x_te_dense - xm + 1e-8)
    exp_cols = np.where(np.ptp(x_tr_dense, axis=0) > 10.0)[0]
    x_tr_dense[:, exp_cols] = log_x_tr[:, exp_cols]
    x_te_dense[:, exp_cols] = log_x_te[:, exp_cols]

    # find the index of the validation items
    info = pd.read_csv(cpd, index_col=0)
    folds = info.CVfold[info.set != 'test'].values
    idx_va = np.zeros(folds.shape[0], dtype=bool)
    for fid in va_folds:
        idx_va |= (folds == float(fid))

    # normalize features
    from sklearn.preprocessing import StandardScaler, RobustScaler

    x_tr = np.hstack([x_tr_dense, x_tr_sparse])
    x_te = np.hstack([x_te_dense, x_te_sparse])

    s = RobustScaler()
    s.fit(x_tr[~idx_va])
    x_tr = s.transform(x_tr)
    x_te = s.transform(x_te)

    x_tr = np.tanh(x_tr)
    x_te = np.tanh(x_te)

    s = StandardScaler()
    s.fit(x_tr[~idx_va])
    x_tr = s.transform(x_tr)
    x_te = s.transform(x_te)

    return (x_tr[~idx_va].astype(dtype, order='C'),
            y_tr[~idx_va].values.astype(dtype, order='C'),
            x_tr[idx_va].astype(dtype, order='C'),
            y_tr[idx_va].values.astype(dtype, order='C'),
            x_te.astype(dtype, order='C'),
            y_te.values.astype(dtype, order='C'))
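A hypothetical call, dropping sparse features present in fewer than 5% of the training compounds and holding out fold 5 for validation; the concrete cutoff and fold choice are illustrative only:

# Hypothetical usage of create_tox21; parameter values are illustrative.
x_tr, y_tr, x_va, y_va, x_te, y_te = create_tox21(sparsity_cutoff=0.05,
                                                  va_folds=[5])
print(x_tr.shape, x_va.shape, x_te.shape)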
Code Example #7
def _create_enwik8(download_dir):
    '''Prepares the enwik8/Hutter Prize data: an extract from Wikipedia.'''
    import pandas as pd
    urlbase = 'http://mattmahoney.net/dc/'
    destdir = os.path.join(download_dir, "raw")
    fn = download_file(urlbase, destdir, 'enwik8.zip')

    # we first read the text as UTF-8, and then map each present character
    # to a number, instead of using UTF-8 bytes directly
    import zipfile
    with zipfile.ZipFile(fn, "r") as zf:
        with zf.open("enwik8") as z:
            text_train = z.read(96 * 10**6).decode("utf8")
            text_valid = z.read(2 * 10**6).decode("utf8")
            text_test = z.read(2 * 10**6).decode("utf8")
            assert (len(z.read()) == 0)  # make sure we read everything

    # ignore "uncommon" characters:
    # In "Generating Sequences With Recurrent Neural Networks",
    # Alex Graves states that there are 205 distinct single-byte characters.
    # However, the following only yields 196; it is unclear where the
    # remaining characters come from.
    dt = np.uint8
    data_tr = np.array([ord(c) for c in text_train if ord(c) < 256], dtype=dt)
    data_va = np.array([ord(c) for c in text_valid if ord(c) < 256], dtype=dt)
    data_te = np.array([ord(c) for c in text_test if ord(c) < 256], dtype=dt)
    cnt = pd.Series(data_tr).value_counts()

    del (text_train, text_valid, text_test)
    import gc
    gc.collect()

    # remove characters with <= 10 occurrences (there are 16 of those)
    # (we use a lookup table, otherwise it takes forever)
    count_lookup = np.zeros(256, np.int64)
    count_lookup[cnt.index.values] = cnt.values
    occ = count_lookup[data_tr]
    data_tr = data_tr[occ > 10]
    data_va = data_va[count_lookup[data_va] > 10]
    data_te = data_te[count_lookup[data_te] > 10]

    decode_lookup = 255 * np.ones(256, np.uint8)
    u = np.unique(data_tr)
    decode_lookup[:len(u)] = u
    encode_lookup = np.iinfo(np.uint16).max * np.ones(256, np.uint16)
    for c, e in enumerate(u):
        encode_lookup[e] = c
    code_tr = encode_lookup[data_tr]
    code_va = encode_lookup[data_va]
    code_te = encode_lookup[data_te]
    assert (np.all(decode_lookup[code_tr] == data_tr))
    assert (np.all(code_tr <= 255))
    assert (np.all(code_va <= 255))
    assert (np.all(code_te <= 255))
    del (data_tr, data_va, data_te)
    gc.collect()

    fname = os.path.join(download_dir, "enwik8.h5")
    with h5py.File(fname, "w") as f:
        f.create_dataset('train', data=code_tr)
        f.create_dataset('valid', data=code_va)
        f.create_dataset('test', data=code_te)
        f.create_dataset('encode', data=encode_lookup)
        f.create_dataset('decode', data=decode_lookup)
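To turn the stored codes back into text, the 'decode' table maps each code back to its original character ordinal. A short hypothetical usage sketch, assuming the enwik8.h5 file created above:

import h5py

# Hypothetical usage: decode the first 100 validation symbols back into text.
with h5py.File("enwik8.h5", "r") as f:  # path assumed; adjust as needed
    decode = f["decode"][:]
    codes = f["valid"][:100]
print("".join(chr(c) for c in decode[codes]))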