Example #1
0
def arxiv(root, ids, new_name=None):
    """Download one paper from https://arxiv.org as a pdf file.

    Data storage directory:
    root = `/user/.../mydata`
    The pdf is written to `root/arxiv/<ids>.pdf`, or to
    `root/arxiv/<new_name>.pdf` when `new_name` is given.
    `gfile.remove` is called on the target path before downloading.

    Args:
        root: str, absolute path of an existing data directory.
              example: if you want the pdf under `/user/.../mydata/arxiv`,
              root should be `/user/.../mydata`.
        ids: str, arxiv paper id.
             example: ids = '1605.09782' refers to the paper at
             https://arxiv.org/abs/1605.09782.
        new_name: str, default None. When not None the file is stored as
                  `root/arxiv/new_name.pdf` instead of `root/arxiv/ids.pdf`.
    Returns:
        The path of the pdf file that was requested.
    """
    start = time.time()
    assert gfile.isdir(root), '`root` should be directory.'
    assert isinstance(ids, str), '`ids` type should be str.'
    # The paper id doubles as the file name unless the caller overrides it.
    file_stem = ids if new_name is None else new_name
    task_path = gfile.path_join(root, 'arxiv', file_stem + '.pdf')
    gfile.makedirs(gfile.path_join(root, 'arxiv'))
    gfile.remove(task_path)
    url = 'https://arxiv.org/pdf/' + str(ids) + '.pdf'
    rq.files(url, task_path)
    elapsed = divmod((time.time() - start), 60)
    print('arxiv paper download completed, run time %d min %.2f sec' % elapsed)
    return task_path
Example #2
0
def mnist_kuzushiji_kanji(root):
    """Kuzushiji-Kanji dataset from https://github.com/rois-codh/kmnist.

    Kuzushiji-Kanji is a large and highly imbalanced 64x64 dataset
    of 3832 Kanji characters, containing 140,426 images
    of both common and rare characters.

    Attention: `root/mnist_kuzushiji_kanji` is prepared through
    `assert_dirs(..., make_root_dir=False)`; an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji_kanji data:
    `root/mnist_kuzushiji_kanji/train/U+55C7/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+7F8E/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+9593/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/mnist_kuzushiji_kanji`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/mnist_kuzushiji_kanji`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji_kanji', make_root_dir=False)
    url = "http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar"
    # The archive is staged next to the dataset directory, directly in `root`.
    archive_path = gfile.path_join(root, url.split('/')[-1])
    rq.files(url, archive_path)
    un_tar(archive_path, task_path)
    # The extracted `kkanji2` folder becomes the `train` split.
    extracted_dir = gfile.path_join(task_path, 'kkanji2')
    train_dir = gfile.path_join(task_path, 'train')
    gfile.rename(extracted_dir, train_dir)
    gfile.remove(gfile.path_join(root, 'kkanji.tar'))
    elapsed = divmod((time.time()-start), 60)
    print('mnist_kuzushiji_kanji dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
Example #3
0
def mnist_fashion(root):
    """Download Fashion-MNIST and store it as png files grouped by label.

    Fashion mnist datasets is a drop-in replacement of the original MNIST dataset
    from https://github.com/zalandoresearch/fashion-mnist.
    Each sample is saved from an array reshaped to (28, 28, 1).

    Attention: `root/mnist_fashion` is prepared through `assert_dirs`;
    an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_fashion data:
    `root/mnist_fashion/train/0/xx.png`
    `root/mnist_fashion/train/2/xx.png`
    `root/mnist_fashion/train/6/xx.png`
    `root/mnist_fashion/test/0/xx.png`
    `root/mnist_fashion/test/2/xx.png`
    `root/mnist_fashion/test/6/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/mnist_fashion`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/mnist_fashion`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist_fashion')
    base_url = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
    archive_names = ['train-labels-idx1-ubyte.gz',
                     'train-images-idx3-ubyte.gz',
                     't10k-labels-idx1-ubyte.gz',
                     't10k-images-idx3-ubyte.gz']
    for name in archive_names:
        rq.files(base_url + name, gfile.path_join(task_path, name))

    def read_payload(name, header_bytes):
        # Decompress the file and skip its first `header_bytes` bytes.
        with gzip.open(gfile.path_join(task_path, name), 'rb') as fin:
            return np.frombuffer(fin.read(), np.uint8, offset=header_bytes)

    train_label = read_payload('train-labels-idx1-ubyte.gz', 8)
    train = read_payload('train-images-idx3-ubyte.gz', 16).reshape(len(train_label), 28, 28)
    test_label = read_payload('t10k-labels-idx1-ubyte.gz', 8)
    test = read_payload('t10k-images-idx3-ubyte.gz', 16).reshape(len(test_label), 28, 28)

    splits = (('train', train, train_label), ('test', test, test_label))
    # First create one folder per label for both splits, then write the images.
    for split, _, labels in splits:
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
    for split, images, labels in splits:
        for idx in range(images.shape[0]):
            target = gfile.path_join(task_path, split, str(labels[idx]), str(idx)+'.png')
            save_image(target, array_to_image(images[idx].reshape(28, 28, 1)))
    # The downloaded archives are no longer needed once the pngs exist.
    for name in archive_names:
        gfile.remove(gfile.path_join(task_path, name))
    elapsed = divmod((time.time()-start), 60)
    print('mnist_fashion dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
Example #4
0
def cifar100(root, fine_label=True):
    """CIFAR100 image classification dataset from https://www.cs.toronto.edu/~kriz/cifar.html

    Each sample is saved from an array of shape (32, 32, 3).

    Attention: `root/cifar100` is prepared through `assert_dirs`;
    an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    cifar100 data:
    `root/cifar100/train/0/xx.png`
    `root/cifar100/train/2/xx.png`
    `root/cifar100/train/6/xx.png`
    `root/cifar100/test/0/xx.png`
    `root/cifar100/test/2/xx.png`
    `root/cifar100/test/6/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/cifar100`,
              root should be `/user/.../mydata`.
        fine_label: bool, default True.
                    Whether to use the fine-grained (100 classes) or
                    coarse-grained (20 super-classes) labels. It selects
                    byte 1 (True) or byte 0 (False) of every record.
    Returns:
        The data directory path, i.e. `root/cifar100`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'cifar100')
    url = 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/cifar100/cifar-100-binary.tar.gz'
    archive_path = gfile.path_join(task_path, url.split('/')[-1])
    rq.files(url, archive_path)
    with tarfile.open(archive_path) as archive:
        archive.extractall(task_path)
    # Snapshot taken before any split folder exists: everything listed here
    # (archive plus extracted files) is deleted at the end.
    leftovers = gfile.listdir(task_path)
    # `0 + fine_label` turns the bool into an integer column index.
    label_column = 0 + fine_label
    record_size = 3072 + 2
    for split in ('train', 'test'):
        with open(gfile.path_join(task_path, split + '.bin'), 'rb') as fin:
            records = np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, record_size)
            images = records[:, 2:].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
            labels = records[:, label_column].astype(np.int32)
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for idx in range(images.shape[0]):
            target = gfile.path_join(task_path, split, str(labels[idx]),
                                     str(idx) + '.png')
            save_image(target, array_to_image(images[idx]))
    for name in leftovers:
        gfile.remove(gfile.path_join(task_path, name))
    elapsed = divmod((time.time() - start), 60)
    print('cifar100 dataset download completed, run time %d min %.2f sec' %
          elapsed)
    return task_path
Example #5
0
def mnist(root):
    """MNIST handwritten digits dataset from http://yann.lecun.com/exdb/mnist

    Each sample is saved from an array reshaped to (28, 28, 1).

    Attention: `root/mnist` is prepared through `assert_dirs`;
    an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    mnist data:
    `root/mnist/train/0/xx.png`
    `root/mnist/train/2/xx.png`
    `root/mnist/train/6/xx.png`
    `root/mnist/test/0/xx.png`
    `root/mnist/test/2/xx.png`
    `root/mnist/test/6/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/mnist`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/mnist`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist')
    mirror = 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/'
    label_files = {'train': 'train-labels-idx1-ubyte.gz', 'test': 't10k-labels-idx1-ubyte.gz'}
    image_files = {'train': 'train-images-idx3-ubyte.gz', 'test': 't10k-images-idx3-ubyte.gz'}
    downloads = [label_files['train'], image_files['train'],
                 label_files['test'], image_files['test']]
    for file_name in downloads:
        rq.files(mirror + file_name, gfile.path_join(task_path, file_name))

    arrays = {}
    for split in ('train', 'test'):
        # Labels start after 8 bytes, pixels after 16 bytes of the unzipped file.
        with gzip.open(gfile.path_join(task_path, label_files[split]), 'rb') as lbpath:
            labels = np.frombuffer(lbpath.read(), np.uint8, offset=8)
        with gzip.open(gfile.path_join(task_path, image_files[split]), 'rb') as imgpath:
            pixels = np.frombuffer(imgpath.read(), np.uint8, offset=16)
        arrays[split] = (pixels.reshape(len(labels), 28, 28), labels)

    # Folders for every label of both splits come first, images afterwards.
    for split in ('train', 'test'):
        for label in set(arrays[split][1]):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
    for split in ('train', 'test'):
        images, labels = arrays[split]
        for idx in range(images.shape[0]):
            png_path = gfile.path_join(task_path, split, str(labels[idx]), str(idx)+'.png')
            save_image(png_path, array_to_image(images[idx].reshape(28, 28, 1)))
    for file_name in downloads:
        gfile.remove(gfile.path_join(task_path, file_name))
    elapsed = divmod((time.time()-start), 60)
    print('mnist dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
Example #6
0
def mnist_kuzushiji10(root):
    """Kuzushiji-MNIST from https://github.com/rois-codh/kmnist.

    Kuzushiji-MNIST is a drop-in replacement for the
    MNIST dataset (28x28 grayscale, 70,000 images),
    provided in the original MNIST format as well as a NumPy format.
    Since MNIST restricts us to 10 classes, one character was chosen to
    represent each of the 10 rows of Hiragana when creating Kuzushiji-MNIST.

    Each sample is saved from an array reshaped to (28, 28, 1).

    Attention: `root/mnist_kuzushiji10` is prepared through `assert_dirs`;
    an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji10 data:
    `root/mnist_kuzushiji10/train/0/xx.png`
    `root/mnist_kuzushiji10/train/2/xx.png`
    `root/mnist_kuzushiji10/train/6/xx.png`
    `root/mnist_kuzushiji10/test/0/xx.png`
    `root/mnist_kuzushiji10/test/2/xx.png`
    `root/mnist_kuzushiji10/test/6/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/mnist_kuzushiji10`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/mnist_kuzushiji10`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji10')
    prefix = 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/'
    npz_names = ['kmnist-train-imgs.npz',
                 'kmnist-train-labels.npz',
                 'kmnist-test-imgs.npz',
                 'kmnist-test-labels.npz']
    for npz_name in npz_names:
        rq.files(prefix + npz_name, gfile.path_join(task_path, npz_name))

    def load_array(npz_name):
        # Each archive is read through its `arr_0` entry.
        return np.load(gfile.path_join(task_path, npz_name))['arr_0']

    train, train_label, test, test_label = [load_array(npz_name) for npz_name in npz_names]

    splits = (('train', train, train_label), ('test', test, test_label))
    for split, _, labels in splits:
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
    for split, images, labels in splits:
        for idx in range(images.shape[0]):
            png_path = gfile.path_join(task_path, split, str(labels[idx]), str(idx)+'.png')
            save_image(png_path, array_to_image(images[idx].reshape(28, 28, 1)))
    # Drop the downloaded npz files, keeping only the png tree.
    for npz_name in npz_names:
        gfile.remove(gfile.path_join(task_path, npz_name))
    elapsed = divmod((time.time()-start), 60)
    print('mnist_kuzushiji10 dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
Example #7
0
def mnist_kuzushiji49(root):
    """Kuzushiji-49 from https://github.com/rois-codh/kmnist.

    Kuzushiji-49, as the name suggests, has 49 classes (28x28 grayscale, 270,912 images),
    is a much larger, but imbalanced dataset containing 48 Hiragana
    characters and one Hiragana iteration mark.

    Each sample is saved from an array reshaped to (28, 28, 1).

    Attention: `root/mnist_kuzushiji49` is prepared through `assert_dirs`;
    an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji49 data:
    `root/mnist_kuzushiji49/train/0/xx.png`
    `root/mnist_kuzushiji49/train/2/xx.png`
    `root/mnist_kuzushiji49/train/6/xx.png`
    `root/mnist_kuzushiji49/test/0/xx.png`
    `root/mnist_kuzushiji49/test/2/xx.png`
    `root/mnist_kuzushiji49/test/6/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/mnist_kuzushiji49`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/mnist_kuzushiji49`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji49')
    site = 'http://codh.rois.ac.jp/kmnist/dataset/k49/'
    # (split, images file, labels file) in the order the files are fetched.
    layout = (('train', 'k49-train-imgs.npz', 'k49-train-labels.npz'),
              ('test', 'k49-test-imgs.npz', 'k49-test-labels.npz'))
    file_names = [name for _, imgs_file, labels_file in layout
                  for name in (imgs_file, labels_file)]
    for name in file_names:
        rq.files(site + name, gfile.path_join(task_path, name))

    loaded = []
    for split, imgs_file, labels_file in layout:
        images = np.load(gfile.path_join(task_path, imgs_file))['arr_0']
        labels = np.load(gfile.path_join(task_path, labels_file))['arr_0']
        loaded.append((split, images, labels))

    # Create all label folders before writing any image.
    for split, _, labels in loaded:
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
    for split, images, labels in loaded:
        for idx in range(images.shape[0]):
            png_path = gfile.path_join(task_path, split, str(labels[idx]), str(idx)+'.png')
            save_image(png_path, array_to_image(images[idx].reshape(28, 28, 1)))
    for name in file_names:
        gfile.remove(gfile.path_join(task_path, name))
    elapsed = divmod((time.time()-start), 60)
    print('mnist_kuzushiji49 dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
Example #8
0
def assert_dirs(root, root_dir=None, delete=True, make_root_dir=True):
    """Validate `root` and prepare the `root/root_dir` working directory.

    Args:
        root: path that must already be a directory (checked with an assert).
        root_dir: str or None. Name of the sub-directory to prepare inside
                  `root`. When None, `root` itself is returned.
        delete: bool, default True. If `root/root_dir` already exists, remove
                it and create it again. When False an existing path is kept.
        make_root_dir: bool, default True. If `root/root_dir` does not exist,
                       create it. Has no effect when the path already exists.
    Returns:
        `root/root_dir` when `root_dir` is given, otherwise `root`.
    """
    assert gfile.isdir(root), '{} should be directory.'.format(root)
    if root_dir is None:
        if not gfile.exists(root):
            gfile.makedirs(root)
        return root

    assert isinstance(root_dir, str), '{} should be str.'.format(root_dir)
    task_path = gfile.path_join(root, root_dir)
    if not gfile.exists(task_path):
        if make_root_dir:
            gfile.makedirs(task_path)
    elif delete:
        # Start from a clean directory: drop the old content, then recreate.
        gfile.remove(task_path)
        gfile.makedirs(task_path)
    return task_path
Example #9
0
def mnist_kannada(root):
    """kannada-MNIST from https://github.com/vinayprabhu/Kannada_MNIST.

    The Kannada-MNIST dataset was created as a drop-in substitute for the standard MNIST dataset.

    Each sample is saved from an array reshaped to (28, 28, 1).

    Attention: `root/mnist_kannada` is prepared through `assert_dirs`;
    an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kannada data:
    `root/mnist_kannada/train/0/xx.png`
    `root/mnist_kannada/train/2/xx.png`
    `root/mnist_kannada/train/6/xx.png`
    `root/mnist_kannada/test/0/xx.png`
    `root/mnist_kannada/test/2/xx.png`
    `root/mnist_kannada/test/6/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/mnist_kannada`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/mnist_kannada`.
    """
    start = time.time()
    url = 'https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip'
    print('Downloading data from ' + url)
    task_path = assert_dirs(root, 'mnist_kannada')
    archive = task_path+'/kannada_MNIST.zip'
    zip_path = rq.files(url, archive)
    unzip_path = un_zip(archive)
    # Column 0 of each csv is the label, the remaining columns are the pixels.
    train = pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_train.csv'), header=None, dtype='uint8')
    test = pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_test.csv'), header=None, dtype='uint8')
    # Label folders for both splits are derived from the training labels only.
    for label in set(train[0]):
        for split in ('train', 'test'):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
    for split, frame in (('train', train), ('test', test)):
        for row in range(len(frame)):
            png_path = gfile.path_join(task_path, split, str(frame.iat[row, 0]), str(row)+'.png')
            pixels = frame.iloc[row, 1:].values.reshape(28, 28, 1)
            save_image(png_path, array_to_image(pixels))
    gfile.remove(zip_path)
    gfile.remove(unzip_path)
    elapsed = divmod((time.time()-start), 60)
    print('mnist_kannada dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
Example #10
0
def coil20(root):
    """COIL20 dataset from http://www.cs.columbia.edu/CAVE/software/softlib/coil-20.php

    "Columbia Object Image Library (COIL-20),"
    S. A. Nene, S. K. Nayar and H. Murase,
    Technical Report CUCS-005-96, February 1996.

    The images are copied unchanged into one folder per label. The label is
    the part of the file name before `__` with its first three characters
    dropped.
    Attention: `root/coil20` is prepared through `assert_dirs`;
    an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    coil20 data:
    `root/coil20/train/0/xx.png`
    `root/coil20/train/2/xx.png`
    `root/coil20/train/6/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/coil20`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/coil20`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'coil20')
    url = "http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-20/coil-20-proc.zip"
    zip_file = gfile.path_join(task_path, 'coil20.zip')
    rq.files(url, zip_file)
    un_zip(zip_file)
    file_names = list(gfile.listdir(gfile.path_join(task_path, 'coil20', 'coil-20-proc')))
    labels = [file_name.split('__')[0][3:] for file_name in file_names]
    # dict.fromkeys keeps the first-seen order while dropping duplicates.
    for label in dict.fromkeys(labels):
        gfile.makedirs(gfile.path_join(task_path, 'train', label))
    for file_name, label in zip(file_names, labels):
        source = gfile.path_join(task_path, 'coil20', 'coil-20-proc', file_name)
        target = gfile.path_join(task_path, 'train', label+'/'+file_name)
        gfile.copy(source, target)
    # Remove the archive and the extracted tree once the copies exist.
    gfile.remove(gfile.path_join(task_path, 'coil20.zip'))
    gfile.remove(gfile.path_join(task_path, 'coil20'))
    elapsed = divmod((time.time()-start), 60)
    print('coil20 dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
Example #11
0
def caltech101(root):
    """Caltech101 dataset from http://www.vision.caltech.edu/Image_Datasets/Caltech101

    Pictures of objects belonging to 101 categories.
    About 40 to 800 images per category.
    Most categories have about 50 images.
    Collected in September 2003 by Fei-Fei Li, Marco Andreetto,
    and Marc 'Aurelio Ranzato.
    The size of each image is roughly 300 x 200 pixels.

    Only the `101_ObjectCategories` archive is downloaded here; its extracted
    folder is renamed to `train`.

    Attention: `root/caltech101` is prepared through
    `assert_dirs(..., make_root_dir=False)`; an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    caltech101 data:
    `root/caltech101/train/accordion/xx.jpg`
    `root/caltech101/train/brain/xx.jpg`
    `root/caltech101/train/panda/xx.jpg`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/caltech101`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/caltech101`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'caltech101', make_root_dir=False)
    url = 'http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz'
    # The archive is staged in `root`, next to the dataset directory.
    gz_path = gfile.path_join(root, url.split('/')[-1])
    rq.files(url, gz_path)
    tar_path = un_gz(gz_path)
    un_tar(tar_path, task_path)
    extracted_dir = gfile.path_join(task_path, '101_ObjectCategories')
    train_dir = gfile.path_join(task_path, 'train')
    gfile.rename(extracted_dir, train_dir)
    # Both the downloaded .tar.gz and the intermediate .tar are removed.
    for leftover in ('101_ObjectCategories.tar.gz', '101_ObjectCategories.tar'):
        gfile.remove(gfile.path_join(root, leftover))
    elapsed = divmod((time.time() - start), 60)
    print('caltech101 dataset download completed, run time %d min %.2f sec' %
          elapsed)
    return task_path
Example #12
0
def stl10(root):
    """Stl10 dataset from http://ai.stanford.edu/~acoates/stl10

    The STL-10 dataset is an image recognition dataset for developing
    unsupervised feature learning, deep learning, self-taught learning algorithms.
    It is inspired by the CIFAR-10 dataset but with some modifications.
    In particular, each class has fewer labeled training examples than in CIFAR-10,
    but a very large set of unlabeled examples is provided to learn image models
    prior to supervised training. The images have a resolution of 96x96.

    The labeled `test` and `train` splits are written into one folder per
    label; the unlabeled images are written flat into `unlabeled`.

    Attention: `root/stl10` is prepared through `assert_dirs`;
    an existing directory is replaced.
    Data storage directory:
    root = `/user/.../mydata`
    stl10 data:
    `root/stl10/train/1/xx.png`
    `root/stl10/train/4/xx.png`
    `root/stl10/train/8/xx.png`
    `root/stl10/test/1/xx.png`
    `root/stl10/test/4/xx.png`
    `root/stl10/test/8/xx.png`
    `root/stl10/unlabeled/xx.png`
    Args:
        root: str, absolute path of an existing data directory.
              example: if you want data path is `/user/.../mydata/stl10`,
              root should be `/user/.../mydata`.
    Returns:
        The data directory path, i.e. `root/stl10`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'stl10')
    url = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"
    rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))
    un_tar(un_gz(gfile.path_join(task_path, url.split('/')[-1])))

    def read_images(file_name):
        # Raw uint8 pixels are viewed as (N, 3, 96, 96) and the axes are then
        # reordered to (N, 96, 96, 3) with the last two spatial axes swapped.
        bin_path = gfile.path_join(task_path, 'stl10_binary/stl10_binary/' + file_name)
        with open(bin_path, 'rb') as fin:
            raw = np.frombuffer(fin.read(), dtype=np.uint8)
        return raw.reshape(-1, 3, 96, 96).transpose((0, 3, 2, 1))

    def read_labels(file_name):
        # One uint8 label per image.
        bin_path = gfile.path_join(task_path, 'stl10_binary/stl10_binary/' + file_name)
        with open(bin_path, 'rb') as fin:
            return np.frombuffer(fin.read(), dtype=np.uint8)

    # Labeled splits, processed in the same order as before: test, then train.
    for split in ('test', 'train'):
        data = read_images(split + '_X.bin')
        data_label = read_labels(split + '_y.bin')
        for label in set(data_label):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for idx in range(data.shape[0]):
            save_image(gfile.path_join(task_path, split, str(data_label[idx]), str(idx)+'.png'),
                       array_to_image(data[idx]))

    data = read_images('unlabeled_X.bin')
    gfile.makedirs(gfile.path_join(task_path, 'unlabeled'))
    for idx in range(data.shape[0]):
        save_image(gfile.path_join(task_path, 'unlabeled', str(idx)+'.png'), array_to_image(data[idx]))

    gfile.remove(gfile.path_join(task_path, 'stl10_binary.tar.gz'))
    gfile.remove(gfile.path_join(task_path, 'stl10_binary.tar'))
    # Fix: the original called a bare `path_join` here, unlike every other
    # call in this function, which goes through `gfile.path_join`.
    gfile.remove(gfile.path_join(task_path, 'stl10_binary'))
    print('stl10 dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path