def mnist_kuzushiji_kanji(root):
    """Kuzushiji-Kanji dataset from https://github.com/rois-codh/kmnist.

    Kuzushiji-Kanji is a large and highly imbalanced 64x64 dataset of 3832
    Kanji characters, containing 140,426 images of both common and rare
    characters.

    Attention: if exist dirs `root/mnist_kuzushiji_kanji`, api will delete
    it and create it.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji_kanji data:
    `root/mnist_kuzushiji_kanji/train/U+55C7/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+7F8E/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+9593/xx.png`

    Args:
        root: str, Store the absolute path of the data directory.
            example: if you want data path is
            `/user/.../mydata/mnist_kuzushiji_kanji`,
            root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is
        `root/mnist_kuzushiji_kanji`.
    """
    started_at = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji_kanji', make_root_dir=False)
    url = "http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar"
    # Download the tarball next to (not inside) the task directory.
    archive = gfile.path_join(root, url.split('/')[-1])
    rq.files(url, archive)
    un_tar(archive, task_path)
    # The tarball unpacks into `kkanji2`; expose it as the `train` split.
    gfile.rename(gfile.path_join(task_path, 'kkanji2'),
                 gfile.path_join(task_path, 'train'))
    gfile.remove(gfile.path_join(root, 'kkanji.tar'))
    print('mnist_kuzushiji_kanji dataset download completed, run time %d min %.2f sec'
          % divmod((time.time() - started_at), 60))
    return task_path
def caltech101(root):
    """Caltech101 dataset from http://www.vision.caltech.edu/Image_Datasets/Caltech101

    Pictures of objects belonging to 101 categories. About 40 to 800 images
    per category. Most categories have about 50 images. Collected in
    September 2003 by Fei-Fei Li, Marco Andreetto, and Marc 'Aurelio
    Ranzato. The size of each image is roughly 300 x 200 pixels. We have
    carefully clicked outlines of each object in these pictures, these are
    included under the 'Annotations.tar'. There is also a matlab script to
    view the annotaitons, 'show_annotations.m'.

    Attention: if exist dirs `root/caltech101`, api will delete it and
    create it.

    Data storage directory:
    root = `/user/.../mydata`
    caltech101 data:
    `root/caltech101/train/accordion/xx.jpg`
    `root/caltech101/train/brain/xx.ipg`
    `root/caltech101/train/panda/xx.jpg`

    Args:
        root: str, Store the absolute path of the data directory.
            example: if you want data path is `/user/.../mydata/caltech101`,
            root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/caltech101`.
    """
    started_at = time.time()
    assert tf.gfile.IsDirectory(root), '`root` should be directory.'
    task_path = os.path.join(root, 'caltech101')
    # Start from a clean task directory.
    if tf.gfile.Exists(task_path):
        tf.gfile.DeleteRecursively(task_path)
    url = 'http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz'
    archive_gz = os.path.join(root, url.split('/')[-1])
    tf.keras.utils.get_file(archive_gz, url)
    # Decompress the .gz, then unpack the resulting .tar into task_path.
    un_tar(un_gz(archive_gz), task_path)
    tf.gfile.Rename(os.path.join(task_path, '101_ObjectCategories'),
                    os.path.join(task_path, 'train'))
    # Drop both intermediate archives left beside the task directory.
    for leftover in ('101_ObjectCategories.tar.gz', '101_ObjectCategories.tar'):
        tf.gfile.Remove(os.path.join(root, leftover))
    print('caltech101 dataset download completed, run time %d min %.2f sec'
          % divmod((time.time() - started_at), 60))
    return task_path
def caltech256(root):
    """Caltech256 dataset from http://www.vision.caltech.edu/Image_Datasets/Caltech256

    Pictures of objects belonging to 256 categories. About 80 to 800 images
    per category. Collected in September 2003 by Fei-Fei Li, Marco
    Andreetto, and Marc 'Aurelio Ranzato. The size of each image is roughly
    300 x 200 pixels. We have carefully clicked outlines of each object in
    these pictures, these are included under the 'Annotations.tar'. There is
    also a matlab script to view the annotaitons, 'show_annotations.m'.

    Attention: if exist dirs `root/caltech256`, api will delete it and
    create it.

    Data storage directory:
    root = `/user/.../mydata`
    caltech256 data:
    `root/caltech256/train/007.bat/xx.jpg`
    `root/caltech256/train/010.beer-mug/xx.ipg`
    `root/caltech256/train/064.elephant-101/xx.jpg`

    Args:
        root: str, Store the absolute path of the data directory.
            example: if you want data path is `/user/.../mydata/caltech256`,
            root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/caltech256`.
    """
    started_at = time.time()
    task_path = assert_dirs(root, 'caltech256', make_root_dir=False)
    url = "http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar"
    # Download the tarball next to (not inside) the task directory.
    archive = gfile.path_join(root, url.split('/')[-1])
    rq.files(url, archive)
    un_tar(archive, task_path)
    # Rename the unpacked category root to the conventional `train` split.
    gfile.rename(gfile.path_join(task_path, '256_ObjectCategories'),
                 gfile.path_join(task_path, 'train'))
    gfile.remove(gfile.path_join(root, '256_ObjectCategories.tar'))
    print('caltech256 dataset download completed, run time %d min %.2f sec'
          % divmod((time.time() - started_at), 60))
    return task_path
def mnist_kuzushiji_kanji(root):
    """Kuzushiji-Kanji dataset from https://github.com/rois-codh/kmnist.

    NOTE(review): this duplicates an earlier `mnist_kuzushiji_kanji`
    definition in this file (one uses `gfile`/`rq`, this one `tf.gfile`);
    at import time the later definition wins — confirm which is intended.

    Kuzushiji-Kanji is a large and highly imbalanced 64x64 dataset of 3832
    Kanji characters, containing 140,426 images of both common and rare
    characters.

    Attention: if exist dirs `root/mnist_kuzushiji_kanji`, api will delete
    it and create it.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji_kanji data:
    `root/mnist_kuzushiji_kanji/train/U+55C7/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+7F8E/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+9593/xx.png`

    Args:
        root: str, Store the absolute path of the data directory.
            example: if you want data path is
            `/user/.../mydata/mnist_kuzushiji_kanji`,
            root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is
        `root/mnist_kuzushiji_kanji`.
    """
    started_at = time.time()
    assert tf.gfile.IsDirectory(root), '`root` should be directory.'
    task_path = os.path.join(root, 'mnist_kuzushiji_kanji')
    # Start from a clean task directory.
    if tf.gfile.Exists(task_path):
        tf.gfile.DeleteRecursively(task_path)
    url = "http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar"
    archive = os.path.join(root, url.split('/')[-1])
    tf.keras.utils.get_file(archive, url)
    un_tar(archive, task_path)
    # The tarball unpacks into `kkanji2`; expose it as the `train` split.
    tf.gfile.Rename(os.path.join(task_path, 'kkanji2'),
                    os.path.join(task_path, 'train'))
    tf.gfile.Remove(os.path.join(root, 'kkanji.tar'))
    print(
        'mnist_kuzushiji_kanji dataset download completed, run time %d min %.2f sec'
        % divmod((time.time() - started_at), 60))
    return task_path
def stl10(root):
    """Stl10 dataset from http://ai.stanford.edu/~acoates/stl10

    The STL-10 dataset is an image recognition dataset for developing
    unsupervised feature learning, deep learning, self-taught learning
    algorithms. It is inspired by the CIFAR-10 dataset but with some
    modifications. In particular, each class has fewer labeled training
    examples than in CIFAR-10, but a very large set of unlabeled examples
    is provided to learn image models prior to supervised training. The
    primary challenge is to make use of the unlabeled data (which comes
    from a similar but different distribution from the labeled data) to
    build a useful prior. We also expect that the higher resolution of
    this dataset (96x96) will make it a challenging benchmark for
    developing more scalable unsupervised learning methods.

    Attention: if exist dirs `root/stl10`, api will delete it and create it.

    Data storage directory:
    root = `/user/.../mydata`
    stl10 data:
    `root/stl10/train/1/xx.png`
    `root/stl10/train/4/xx.png`
    `root/stl10/train/8/xx.png`
    `root/stl10/test/1/xx.png`
    `root/stl10/test/4/xx.png`
    `root/stl10/test/8/xx.png`

    Args:
        root: str, Store the absolute path of the data directory.
            example: if you want data path is `/user/.../mydata/stl10`,
            root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/stl10`.
    """
    start = time.time()
    assert tf.gfile.IsDirectory(root), '`root` should be directory.'
    task_path = os.path.join(root, 'stl10')
    # Start from a clean task directory.
    if tf.gfile.Exists(task_path):
        tf.gfile.DeleteRecursively(task_path)
    tf.gfile.MakeDirs(task_path)
    url = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"
    archive_gz = os.path.join(task_path, url.split('/')[-1])
    tf.keras.utils.get_file(archive_gz, url)
    un_tar(un_gz(archive_gz))
    binary_dir = os.path.join(task_path, 'stl10_binary/stl10_binary')
    # Labeled splits, in the original processing order: test, then train.
    for split in ('test', 'train'):
        data = _stl10_read_images(os.path.join(binary_dir, split + '_X.bin'))
        with tf.gfile.GFile(os.path.join(binary_dir, split + '_y.bin'),
                            'rb') as fin:
            data_label = np.frombuffer(fin.read(), dtype=np.uint8)
        _stl10_save_images(task_path, split, data, data_label)
    # Unlabeled images go into a flat `unlabeled` directory.
    data = _stl10_read_images(os.path.join(binary_dir, 'unlabeled_X.bin'))
    tf.gfile.MakeDirs(os.path.join(task_path, 'unlabeled'))
    for idx in range(data.shape[0]):
        imageio.imsave(os.path.join(task_path, 'unlabeled', str(idx) + '.png'),
                       data[idx])
    # Drop archives and the unpacked binary directory.
    tf.gfile.Remove(os.path.join(task_path, 'stl10_binary.tar.gz'))
    tf.gfile.Remove(os.path.join(task_path, 'stl10_binary.tar'))
    tf.gfile.DeleteRecursively(os.path.join(task_path, 'stl10_binary'))
    print('stl10 dataset download completed, run time %d min %.2f sec'
          % divmod((time.time() - start), 60))
    return task_path


def _stl10_read_images(path):
    """Read one STL-10 binary image file into an (N, 96, 96, 3) uint8 array.

    The raw file is parsed as (N, 3, 96, 96) and transposed with
    (0, 3, 2, 1) — same conversion the original inline code applied
    (presumably the binaries are column-major; verify against the STL-10
    format notes if changing this).
    """
    with tf.gfile.GFile(path, 'rb') as fin:
        raw = np.frombuffer(fin.read(), dtype=np.uint8)
    return raw.reshape(-1, 3, 96, 96).transpose((0, 3, 2, 1))


def _stl10_save_images(task_path, split, data, data_label):
    """Write each image of a labeled split to `task_path/split/<label>/<idx>.png`."""
    for i in set(data_label):
        tf.gfile.MakeDirs(os.path.join(task_path, split, str(i)))
    for idx in range(data.shape[0]):
        imageio.imsave(
            os.path.join(task_path, split, str(data_label[idx]),
                         str(idx) + '.png'), data[idx])