Example #1
0
def mnist_kuzushiji_kanji(root):
    """Download the Kuzushiji-Kanji dataset (https://github.com/rois-codh/kmnist).

    Kuzushiji-Kanji is a large, highly imbalanced 64x64 dataset of 3832
    Kanji characters with 140,426 images of both common and rare characters.

    Attention: if the directory `root/mnist_kuzushiji_kanji` already exists,
    this api deletes it and recreates it.
    Data storage layout:
    root = `/user/.../mydata`
    mnist_kuzushiji_kanji data:
    `root/mnist_kuzushiji_kanji/train/U+55C7/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+7F8E/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+9593/xx.png`
    Args:
        root: str, absolute path of the parent data directory.
              example: for data path `/user/.../mydata/mnist_kuzushiji_kanji`,
              root should be `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, i.e. `root/mnist_kuzushiji_kanji`.
    """
    started = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji_kanji', make_root_dir=False)
    url = "http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar"
    # Download the tar archive next to the dataset directory, then unpack it.
    archive_path = gfile.path_join(root, url.split('/')[-1])
    rq.files(url, archive_path)
    un_tar(archive_path, task_path)
    # The archive's top-level folder is 'kkanji2'; expose it as 'train'.
    gfile.rename(gfile.path_join(task_path, 'kkanji2'),
                 gfile.path_join(task_path, 'train'))
    gfile.remove(gfile.path_join(root, 'kkanji.tar'))
    print('mnist_kuzushiji_kanji dataset download completed, run time %d min %.2f sec' % divmod((time.time() - started), 60))
    return task_path
Example #2
0
def caltech101(root):
    """Download the Caltech101 dataset (http://www.vision.caltech.edu/Image_Datasets/Caltech101).

    Pictures of objects belonging to 101 categories, roughly 40 to 800
    images per category (most have about 50). Collected in September 2003
    by Fei-Fei Li, Marco Andreetto, and Marc 'Aurelio Ranzato.
    Each image is roughly 300 x 200 pixels.

    Carefully clicked outlines of each object are included under
    'Annotations.tar', with a matlab script 'show_annotations.m' to view
    the annotations.

    Attention: if the directory `root/caltech101` already exists, this api
    deletes it and recreates it.
    Data storage layout:
    root = `/user/.../mydata`
    caltech101 data:
    `root/caltech101/train/accordion/xx.jpg`
    `root/caltech101/train/brain/xx.jpg`
    `root/caltech101/train/panda/xx.jpg`
    Args:
        root: str, absolute path of the parent data directory.
              example: for data path `/user/.../mydata/caltech101`,
              root should be `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, i.e. `root/caltech101`.
    """
    started = time.time()
    assert tf.gfile.IsDirectory(root), '`root` should be directory.'
    task_path = os.path.join(root, 'caltech101')
    # Start from a clean slate: remove any previous download.
    if tf.gfile.Exists(task_path):
        tf.gfile.DeleteRecursively(task_path)
    url = 'http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz'
    archive_path = os.path.join(root, url.split('/')[-1])
    tf.keras.utils.get_file(archive_path, url)
    # Decompress the .gz, then unpack the resulting .tar into task_path.
    un_tar(un_gz(archive_path), task_path)
    # The archive's top-level folder is '101_ObjectCategories'; expose it as 'train'.
    tf.gfile.Rename(os.path.join(task_path, '101_ObjectCategories'),
                    os.path.join(task_path, 'train'))
    # Clean up both intermediate archive files.
    tf.gfile.Remove(os.path.join(root, '101_ObjectCategories.tar.gz'))
    tf.gfile.Remove(os.path.join(root, '101_ObjectCategories.tar'))
    print('caltech101 dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - started), 60))
    return task_path
Example #3
0
def caltech256(root):
    """Download the Caltech256 dataset (http://www.vision.caltech.edu/Image_Datasets/Caltech256).

    Pictures of objects belonging to 256 categories, roughly 80 to 800
    images per category. Collected in September 2003 by Fei-Fei Li,
    Marco Andreetto, and Marc 'Aurelio Ranzato.
    Each image is roughly 300 x 200 pixels.

    Carefully clicked outlines of each object are included under
    'Annotations.tar', with a matlab script 'show_annotations.m' to view
    the annotations.

    Attention: if the directory `root/caltech256` already exists, this api
    deletes it and recreates it.
    Data storage layout:
    root = `/user/.../mydata`
    caltech256 data:
    `root/caltech256/train/007.bat/xx.jpg`
    `root/caltech256/train/010.beer-mug/xx.jpg`
    `root/caltech256/train/064.elephant-101/xx.jpg`
    Args:
        root: str, absolute path of the parent data directory.
              example: for data path `/user/.../mydata/caltech256`,
              root should be `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, i.e. `root/caltech256`.
    """
    started = time.time()
    task_path = assert_dirs(root, 'caltech256', make_root_dir=False)
    url = "http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar"
    # Download the tar archive next to the dataset directory, then unpack it.
    archive_path = gfile.path_join(root, url.split('/')[-1])
    rq.files(url, archive_path)
    un_tar(archive_path, task_path)
    # The archive's top-level folder is '256_ObjectCategories'; expose it as 'train'.
    gfile.rename(gfile.path_join(task_path, '256_ObjectCategories'),
                 gfile.path_join(task_path, 'train'))
    gfile.remove(gfile.path_join(root, '256_ObjectCategories.tar'))
    print('caltech256 dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - started), 60))
    return task_path
Example #4
0
def mnist_kuzushiji_kanji(root):
    """Download the Kuzushiji-Kanji dataset (https://github.com/rois-codh/kmnist).

    Kuzushiji-Kanji is a large, highly imbalanced 64x64 dataset of 3832
    Kanji characters with 140,426 images of both common and rare characters.

    Attention: if the directory `root/mnist_kuzushiji_kanji` already exists,
    this api deletes it and recreates it.
    Data storage layout:
    root = `/user/.../mydata`
    mnist_kuzushiji_kanji data:
    `root/mnist_kuzushiji_kanji/train/U+55C7/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+7F8E/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+9593/xx.png`
    Args:
        root: str, absolute path of the parent data directory.
              example: for data path `/user/.../mydata/mnist_kuzushiji_kanji`,
              root should be `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, i.e. `root/mnist_kuzushiji_kanji`.
    """
    started = time.time()
    assert tf.gfile.IsDirectory(root), '`root` should be directory.'
    task_path = os.path.join(root, 'mnist_kuzushiji_kanji')
    # Start from a clean slate: remove any previous download.
    if tf.gfile.Exists(task_path):
        tf.gfile.DeleteRecursively(task_path)
    url = "http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar"
    archive_path = os.path.join(root, url.split('/')[-1])
    tf.keras.utils.get_file(archive_path, url)
    un_tar(archive_path, task_path)
    # The archive's top-level folder is 'kkanji2'; expose it as 'train'.
    tf.gfile.Rename(os.path.join(task_path, 'kkanji2'),
                    os.path.join(task_path, 'train'))
    tf.gfile.Remove(os.path.join(root, 'kkanji.tar'))
    print(
        'mnist_kuzushiji_kanji dataset download completed, run time %d min %.2f sec'
        % divmod((time.time() - started), 60))
    return task_path
Example #5
0
def stl10(root):
    """Download the STL-10 dataset (http://ai.stanford.edu/~acoates/stl10).

    The STL-10 dataset is an image recognition dataset for developing
    unsupervised feature learning, deep learning, self-taught learning
    algorithms. It is inspired by the CIFAR-10 dataset but with some
    modifications. In particular, each class has fewer labeled training
    examples than in CIFAR-10, but a very large set of unlabeled examples is
    provided to learn image models prior to supervised training. The primary
    challenge is to make use of the unlabeled data (which comes from a
    similar but different distribution from the labeled data) to build a
    useful prior. The higher resolution of this dataset (96x96) makes it a
    challenging benchmark for developing more scalable unsupervised learning
    methods.

    Attention: if the directory `root/stl10` already exists, this api
    deletes it and recreates it.
    Data storage layout:
    root = `/user/.../mydata`
    stl10 data:
    `root/stl10/train/1/xx.png`
    `root/stl10/train/4/xx.png`
    `root/stl10/train/8/xx.png`
    `root/stl10/test/1/xx.png`
    `root/stl10/test/4/xx.png`
    `root/stl10/test/8/xx.png`
    Args:
        root: str, absolute path of the parent data directory.
              example: for data path `/user/.../mydata/stl10`,
              root should be `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, i.e. `root/stl10`.
    """
    start = time.time()
    assert tf.gfile.IsDirectory(root), '`root` should be directory.'
    task_path = os.path.join(root, 'stl10')
    # Start from a clean slate: remove any previous download.
    if tf.gfile.Exists(task_path):
        tf.gfile.DeleteRecursively(task_path)
    tf.gfile.MakeDirs(task_path)
    url = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"
    tf.keras.utils.get_file(os.path.join(task_path, url.split('/')[-1]), url)
    # Decompress the .gz, then unpack the .tar in place inside task_path.
    un_tar(un_gz(os.path.join(task_path, url.split('/')[-1])))

    # Directory containing the raw .bin files after extraction.
    binary_dir = os.path.join(task_path, 'stl10_binary/stl10_binary')

    # Labeled splits share identical read/write logic — handle both via one helper.
    _stl10_write_split(task_path, binary_dir, 'test')
    _stl10_write_split(task_path, binary_dir, 'train')

    # The unlabeled split has images but no label file: one flat directory.
    data = _stl10_read_images(os.path.join(binary_dir, 'unlabeled_X.bin'))
    tf.gfile.MakeDirs(os.path.join(task_path, 'unlabeled'))
    for idx in range(data.shape[0]):
        imageio.imsave(os.path.join(task_path, 'unlabeled',
                                    str(idx) + '.png'), data[idx])

    # Remove intermediate archives and the raw extraction directory.
    tf.gfile.Remove(os.path.join(task_path, 'stl10_binary.tar.gz'))
    tf.gfile.Remove(os.path.join(task_path, 'stl10_binary.tar'))
    tf.gfile.DeleteRecursively(os.path.join(task_path, 'stl10_binary'))
    print('stl10 dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path


def _stl10_read_images(path):
    """Read an STL-10 `*_X.bin` file into a (N, 96, 96, 3) uint8 array.

    The raw file is a flat stream of uint8 pixels; reshape to (N, 3, 96, 96)
    and transpose to channel-last HWC order, matching the original decode.
    """
    with tf.gfile.GFile(path, 'rb') as fin:
        return np.frombuffer(fin.read(),
                             dtype=np.uint8).reshape(-1, 3, 96, 96).transpose(
                                 (0, 3, 2, 1))


def _stl10_write_split(task_path, binary_dir, split):
    """Write one labeled split ('train' or 'test') as per-class PNG folders.

    Reads `<split>_X.bin` / `<split>_y.bin` from binary_dir and saves each
    image to `task_path/<split>/<label>/<index>.png`.
    """
    data = _stl10_read_images(os.path.join(binary_dir, split + '_X.bin'))
    with tf.gfile.GFile(os.path.join(binary_dir, split + '_y.bin'),
                        'rb') as fin:
        data_label = np.frombuffer(fin.read(), dtype=np.uint8)
    # One directory per distinct class label.
    for i in set(data_label):
        tf.gfile.MakeDirs(os.path.join(task_path, split, str(i)))
    for idx in range(data.shape[0]):
        imageio.imsave(
            os.path.join(task_path, split, str(data_label[idx]),
                         str(idx) + '.png'), data[idx])