def mnist_fashion(root):
    """Download Fashion-MNIST and unpack it into per-class PNG folders.

    Fashion-MNIST is a dataset of Zalando's article images and a drop-in
    replacement of the original MNIST dataset, see
    https://github.com/zalandoresearch/fashion-mnist.

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if the directory `root/mnist_fashion` already exists, the api
    deletes it and creates it again.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_fashion data:
    `root/mnist_fashion/train/0/xx.png`
    `root/mnist_fashion/train/2/xx.png`
    `root/mnist_fashion/train/6/xx.png`
    `root/mnist_fashion/test/0/xx.png`
    `root/mnist_fashion/test/2/xx.png`
    `root/mnist_fashion/test/6/xx.png`

    Args:
        root: str, absolute path of the parent data directory.
              example: if you want the data path to be
              `/user/.../mydata/mnist_fashion`, root should be
              `/user/.../mydata`.
    Returns:
        The absolute path of the data directory, i.e. `root/mnist_fashion`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist_fashion')
    url_list = ['http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
                'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
                'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
                'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz']

    # Local path of every downloaded archive, in the same order as url_list.
    archives = [gfile.path_join(task_path, url.split('/')[-1]) for url in url_list]
    for url, archive in zip(url_list, archives):
        rq.files(url, archive)

    def read_idx(filename, header_size):
        # Decompress one idx file and skip its fixed-size header.
        with gzip.open(gfile.path_join(task_path, filename), 'rb') as stream:
            return np.frombuffer(stream.read(), np.uint8, offset=header_size)

    train_label = read_idx('train-labels-idx1-ubyte.gz', 8)
    train = read_idx('train-images-idx3-ubyte.gz', 16).reshape(len(train_label), 28, 28)
    test_label = read_idx('t10k-labels-idx1-ubyte.gz', 8)
    test = read_idx('t10k-images-idx3-ubyte.gz', 16).reshape(len(test_label), 28, 28)

    splits = (('train', train, train_label), ('test', test, test_label))

    # One folder per class label, for both splits, before any image is written.
    for split, _, labels in splits:
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))

    # The file name is the sample's position inside its split.
    for split, images, labels in splits:
        for idx, (image, label) in enumerate(zip(images, labels)):
            target = gfile.path_join(task_path, split, str(label), str(idx)+'.png')
            save_image(target, array_to_image(image.reshape(28, 28, 1)))

    # The raw archives are no longer needed once the PNG files exist.
    for archive in archives:
        gfile.remove(archive)
    print('mnist_fashion dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
def cifar100(root, fine_label=True):
    """CIFAR100 image classification dataset from https://www.cs.toronto.edu/~kriz/cifar.html

    Each sample is an image (in 3D NDArray) with shape (32, 32, 3).

    Attention: if the directory `root/cifar100` already exists, the api
    deletes it and creates it again.

    Data storage directory:
    root = `/user/.../mydata`
    cifar100 data:
    `root/cifar100/train/0/xx.png`
    `root/cifar100/train/2/xx.png`
    `root/cifar100/train/6/xx.png`
    `root/cifar100/test/0/xx.png`
    `root/cifar100/test/2/xx.png`
    `root/cifar100/test/6/xx.png`

    Args:
        root: str, absolute path of the parent data directory.
              example: if you want the data path to be
              `/user/.../mydata/cifar100`, root should be `/user/.../mydata`.
        fine_label: bool, default True.
              Whether to use the fine-grained (100 classes) or the
              coarse-grained (20 super-classes) labels as folder names.
              Any truthy value selects the fine-grained labels.
    Returns:
        The absolute path of the data directory, i.e. `root/cifar100`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'cifar100')
    url = 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/cifar100/cifar-100-binary.tar.gz'
    archive_path = gfile.path_join(task_path, url.split('/')[-1])
    rq.files(url, archive_path)
    # NOTE(review): extractall trusts the member names inside the archive;
    # the archive comes from the fixed URL above.
    with tarfile.open(archive_path) as t:
        t.extractall(task_path)
    # Snapshot the archive and the extracted files now, before the image
    # folders are created, so the cleanup below removes only the raw inputs.
    raw_files = gfile.listdir(task_path)

    # Each record has two label bytes followed by 3072 pixel bytes. Column 0
    # is used for coarse labels and column 1 for fine labels. Coerce the flag
    # so that a truthy non-bool can never index into the pixel bytes.
    label_column = 1 if fine_label else 0

    for split in ('train', 'test'):
        with open(gfile.path_join(task_path, split + '.bin'), 'rb') as fin:
            records = np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3072 + 2)
        # Pixels are stored channel-first; move the channel axis last.
        images = records[:, 2:].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
        labels = records[:, label_column].astype(np.int32)
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for idx in range(images.shape[0]):
            save_image(
                gfile.path_join(task_path, split, str(labels[idx]), str(idx) + '.png'),
                array_to_image(images[idx]))

    for name in raw_files:
        gfile.remove(gfile.path_join(task_path, name))
    print('cifar100 dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path
def mnist(root):
    """Download MNIST and unpack it into per-class PNG folders.

    MNIST handwritten digits dataset from http://yann.lecun.com/exdb/mnist

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if the directory `root/mnist` already exists, the api deletes
    it and creates it again.

    Data storage directory:
    root = `/user/.../mydata`
    mnist data:
    `root/mnist/train/0/xx.png`
    `root/mnist/train/2/xx.png`
    `root/mnist/train/6/xx.png`
    `root/mnist/test/0/xx.png`
    `root/mnist/test/2/xx.png`
    `root/mnist/test/6/xx.png`

    Args:
        root: str, absolute path of the parent data directory.
              example: if you want the data path to be
              `/user/.../mydata/mnist`, root should be `/user/.../mydata`.
    Returns:
        The absolute path of the data directory, i.e. `root/mnist`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist')
    url_list = ['https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-labels-idx1-ubyte.gz',
                'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-images-idx3-ubyte.gz',
                'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-labels-idx1-ubyte.gz',
                'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-images-idx3-ubyte.gz']

    # Local path of every downloaded archive, in the same order as url_list.
    downloads = [gfile.path_join(task_path, url.split('/')[-1]) for url in url_list]
    for url, local_file in zip(url_list, downloads):
        rq.files(url, local_file)

    def load_idx(filename, skip):
        # Decompress one idx file and drop its `skip`-byte header.
        with gzip.open(gfile.path_join(task_path, filename), 'rb') as handle:
            return np.frombuffer(handle.read(), np.uint8, offset=skip)

    train_label = load_idx('train-labels-idx1-ubyte.gz', 8)
    train = load_idx('train-images-idx3-ubyte.gz', 16).reshape(len(train_label), 28, 28)
    test_label = load_idx('t10k-labels-idx1-ubyte.gz', 8)
    test = load_idx('t10k-images-idx3-ubyte.gz', 16).reshape(len(test_label), 28, 28)

    subsets = (('train', train, train_label), ('test', test, test_label))

    # Create every class folder first, then write the images.
    for subset, _, targets in subsets:
        for digit in set(targets):
            gfile.makedirs(gfile.path_join(task_path, subset, str(digit)))

    for subset, pictures, targets in subsets:
        for idx, (picture, digit) in enumerate(zip(pictures, targets)):
            destination = gfile.path_join(task_path, subset, str(digit), str(idx)+'.png')
            save_image(destination, array_to_image(picture.reshape(28, 28, 1)))

    # Drop the downloaded archives; only the PNG tree is kept.
    for local_file in downloads:
        gfile.remove(local_file)
    print('mnist dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
def mnist_kuzushiji10(root):
    """Kuzushiji-MNIST from https://github.com/rois-codh/kmnist.

    Kuzushiji-MNIST is a drop-in replacement for the
    MNIST dataset (28x28 grayscale, 70,000 images),
    provided in the original MNIST format as well as a NumPy format.
    Since MNIST restricts us to 10 classes, we chose one character to
    represent each of the 10 rows of Hiragana when creating Kuzushiji-MNIST.

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if the directory `root/mnist_kuzushiji10` already exists, the
    api deletes it and creates it again.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji10 data:
    `root/mnist_kuzushiji10/train/0/xx.png`
    `root/mnist_kuzushiji10/train/2/xx.png`
    `root/mnist_kuzushiji10/train/6/xx.png`
    `root/mnist_kuzushiji10/test/0/xx.png`
    `root/mnist_kuzushiji10/test/2/xx.png`
    `root/mnist_kuzushiji10/test/6/xx.png`

    Args:
        root: str, absolute path of the parent data directory.
              example: if you want the data path to be
              `/user/.../mydata/mnist_kuzushiji10`,
              root should be `/user/.../mydata`.
    Returns:
        The absolute path of the data directory, i.e.
        `root/mnist_kuzushiji10`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji10')
    url_list = ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz',
                'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz',
                'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz',
                'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz']
    for url in url_list:
        rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))

    def load_array(filename):
        # np.load keeps an .npz archive open until it is closed. Close it
        # explicitly so the file can be removed during cleanup and no file
        # descriptor is leaked.
        with np.load(gfile.path_join(task_path, filename)) as archive:
            return archive['arr_0']

    train = load_array('kmnist-train-imgs.npz')
    train_label = load_array('kmnist-train-labels.npz')
    test = load_array('kmnist-test-imgs.npz')
    test_label = load_array('kmnist-test-labels.npz')

    # One folder per class label in each split.
    for i in set(train_label):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
    for i in set(test_label):
        gfile.makedirs(gfile.path_join(task_path, 'test', str(i)))

    # The file name is the sample's position inside its split.
    for idx in range(train.shape[0]):
        save_image(gfile.path_join(task_path, 'train', str(train_label[idx]), str(idx)+'.png'),
                   array_to_image(train[idx].reshape(28, 28, 1)))
    for idx in range(test.shape[0]):
        save_image(gfile.path_join(task_path, 'test', str(test_label[idx]), str(idx)+'.png'),
                   array_to_image(test[idx].reshape(28, 28, 1)))

    # Remove the downloaded .npz files; only the PNG tree is kept.
    for url in url_list:
        gfile.remove(gfile.path_join(task_path, url.split('/')[-1]))
    print('mnist_kuzushiji10 dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
def mnist_kuzushiji49(root):
    """Kuzushiji-49 from https://github.com/rois-codh/kmnist.

    Kuzushiji-49, as the name suggests, has 49 classes (28x28 grayscale,
    270,912 images), is a much larger, but imbalanced dataset containing
    48 Hiragana characters and one Hiragana iteration mark.

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if the directory `root/mnist_kuzushiji49` already exists, the
    api deletes it and creates it again.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji49 data:
    `root/mnist_kuzushiji49/train/0/xx.png`
    `root/mnist_kuzushiji49/train/2/xx.png`
    `root/mnist_kuzushiji49/train/6/xx.png`
    `root/mnist_kuzushiji49/test/0/xx.png`
    `root/mnist_kuzushiji49/test/2/xx.png`
    `root/mnist_kuzushiji49/test/6/xx.png`

    Args:
        root: str, absolute path of the parent data directory.
              example: if you want the data path to be
              `/user/.../mydata/mnist_kuzushiji49`,
              root should be `/user/.../mydata`.
    Returns:
        The absolute path of the data directory, i.e.
        `root/mnist_kuzushiji49`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji49')
    url_list = ['http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-imgs.npz',
                'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-labels.npz',
                'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-imgs.npz',
                'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-labels.npz']
    for url in url_list:
        rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))

    def load_array(filename):
        # np.load keeps an .npz archive open until it is closed. Close it
        # explicitly so the file can be removed during cleanup and no file
        # descriptor is leaked.
        with np.load(gfile.path_join(task_path, filename)) as archive:
            return archive['arr_0']

    train = load_array('k49-train-imgs.npz')
    train_label = load_array('k49-train-labels.npz')
    test = load_array('k49-test-imgs.npz')
    test_label = load_array('k49-test-labels.npz')

    # One folder per class label in each split.
    for i in set(train_label):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
    for i in set(test_label):
        gfile.makedirs(gfile.path_join(task_path, 'test', str(i)))

    # The file name is the sample's position inside its split.
    for idx in range(train.shape[0]):
        save_image(gfile.path_join(task_path, 'train', str(train_label[idx]), str(idx)+'.png'),
                   array_to_image(train[idx].reshape(28, 28, 1)))
    for idx in range(test.shape[0]):
        save_image(gfile.path_join(task_path, 'test', str(test_label[idx]), str(idx)+'.png'),
                   array_to_image(test[idx].reshape(28, 28, 1)))

    # Remove the downloaded .npz files; only the PNG tree is kept.
    for url in url_list:
        gfile.remove(gfile.path_join(task_path, url.split('/')[-1]))
    print('mnist_kuzushiji49 dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
def mnist_kannada(root):
    """Kannada-MNIST from https://github.com/vinayprabhu/Kannada_MNIST.

    The Kannada-MNIST dataset was created as a drop-in substitute for the
    standard MNIST dataset.

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if the directory `root/mnist_kannada` already exists, the api
    deletes it and creates it again.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kannada data:
    `root/mnist_kannada/train/0/xx.png`
    `root/mnist_kannada/train/2/xx.png`
    `root/mnist_kannada/train/6/xx.png`
    `root/mnist_kannada/test/0/xx.png`
    `root/mnist_kannada/test/2/xx.png`
    `root/mnist_kannada/test/6/xx.png`

    Args:
        root: str, absolute path of the parent data directory.
              example: if you want the data path to be
              `/user/.../mydata/mnist_kannada`,
              root should be `/user/.../mydata`.
    Returns:
        The absolute path of the data directory, i.e. `root/mnist_kannada`.
    """
    start = time.time()
    print('Downloading data from https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip')
    task_path = assert_dirs(root, 'mnist_kannada')
    zip_path = rq.files('https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip', task_path+'/kannada_MNIST.zip')
    unzip_path = un_zip(task_path+'/kannada_MNIST.zip')
    train = pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_train.csv'), header=None, dtype='uint8')
    test = pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_test.csv'), header=None, dtype='uint8')

    # Convert each frame to a NumPy array once; per-row DataFrame indexing
    # inside the save loops is far slower than plain array indexing.
    # Column 0 holds the label, the remaining columns hold the pixels.
    train_values = train.values
    test_values = test.values
    train_label, train_pixels = train_values[:, 0], train_values[:, 1:]
    test_label, test_pixels = test_values[:, 0], test_values[:, 1:]

    train_classes = set(train_label)
    for i in train_classes:
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
    # Keep a test folder for every train class, and also cover any label that
    # appears only in the test CSV so its images have a folder to go to.
    for i in train_classes | set(test_label):
        gfile.makedirs(gfile.path_join(task_path, 'test', str(i)))

    # The file name is the sample's row position inside its CSV.
    for idx in range(len(train_label)):
        save_image(gfile.path_join(task_path, 'train', str(train_label[idx]), str(idx)+'.png'),
                   array_to_image(train_pixels[idx].reshape(28, 28, 1)))
    for idx in range(len(test_label)):
        save_image(gfile.path_join(task_path, 'test', str(test_label[idx]), str(idx)+'.png'),
                   array_to_image(test_pixels[idx].reshape(28, 28, 1)))

    # Remove the downloaded archive and its extracted contents.
    gfile.remove(zip_path)
    gfile.remove(unzip_path)
    print('mnist_kannada dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
def mnist_tibetan(root):
    """Tibetan-MNIST from https://github.com/bat67/TibetanMNIST.

    Tibetan-MNIST is an MNIST-style dataset of 28x28 grayscale images.
    The data is fetched as CSV files from
    https://github.com/Hourout/datasets/tree/master/TibetanMNIST, where the
    first column of every row is the label and the remaining 784 columns
    are the pixel values.

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if the directory `root/mnist_tibetan` already exists, the api
    deletes it and creates it again.

    Only a `train` split is written; no `test` folder is created.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_tibetan data:
    `root/mnist_tibetan/train/0/xx.png`
    `root/mnist_tibetan/train/2/xx.png`
    `root/mnist_tibetan/train/6/xx.png`

    Args:
        root: str, absolute path of the parent data directory.
              example: if you want the data path to be
              `/user/.../mydata/mnist_tibetan`,
              root should be `/user/.../mydata`.
    Returns:
        The absolute path of the data directory, i.e. `root/mnist_tibetan`.
    Raises:
        requests.HTTPError: if one of the CSV downloads returns an HTTP
            error status.
    """
    start = time.time()
    print('Downloading data from https://github.com/Hourout/datasets/tree/master/TibetanMNIST')
    task_path = assert_dirs(root, 'mnist_tibetan')
    url_list = ['https://raw.githubusercontent.com/Hourout/datasets/master/TibetanMNIST/TibetanMNIST_28_28_01.csv',
                'https://raw.githubusercontent.com/Hourout/datasets/master/TibetanMNIST/TibetanMNIST_28_28_02.csv']

    frames = []
    for url in url_list:
        response = requests.get(url)
        # Fail loudly on 4xx/5xx instead of parsing an error page as CSV.
        response.raise_for_status()
        frames.append(pd.read_csv(io.StringIO(response.content.decode('utf-8')),
                                  header=None, dtype='uint8'))
    # Concatenate once. Growing a frame inside the loop copies the data every
    # time, and starting from an empty DataFrame relies on deprecated dtype
    # handling for empty frames.
    data = pd.concat(frames)

    train = data.loc[:, 1:].values.reshape(-1, 28, 28)
    train_label = data.loc[:, 0].values

    for i in set(train_label):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
    # The file name is the sample's position in the concatenated data.
    for idx in range(train.shape[0]):
        save_image(gfile.path_join(task_path, 'train', str(train_label[idx]), str(idx)+'.png'),
                   array_to_image(train[idx].reshape(28, 28, 1)))
    print('mnist_tibetan dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
def stl10(root):
    """Stl10 dataset from http://ai.stanford.edu/~acoates/stl10

    The STL-10 dataset is an image recognition dataset for developing
    unsupervised feature learning, deep learning, self-taught learning algorithms.
    It is inspired by the CIFAR-10 dataset but with some modifications.
    In particular, each class has fewer labeled training examples than in CIFAR-10,
    but a very large set of unlabeled examples is provided to learn image models
    prior to supervised training. The primary challenge is to make use of the
    unlabeled data (which comes from a similar but different distribution from
    the labeled data) to build a useful prior. We also expect that the higher
    resolution of this dataset (96x96) will make it a challenging benchmark
    for developing more scalable unsupervised learning methods.

    Attention: if the directory `root/stl10` already exists, the api deletes
    it and creates it again.

    Data storage directory:
    root = `/user/.../mydata`
    stl10 data:
    `root/stl10/train/1/xx.png`
    `root/stl10/train/4/xx.png`
    `root/stl10/train/8/xx.png`
    `root/stl10/test/1/xx.png`
    `root/stl10/test/4/xx.png`
    `root/stl10/test/8/xx.png`
    `root/stl10/unlabeled/xx.png`

    Args:
        root: str, absolute path of the parent data directory.
              example: if you want the data path to be
              `/user/.../mydata/stl10`, root should be `/user/.../mydata`.
    Returns:
        The absolute path of the data directory, i.e. `root/stl10`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'stl10')
    url = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"
    rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))
    un_tar(un_gz(gfile.path_join(task_path, url.split('/')[-1])))

    def read_images(relative_path):
        # Read a raw image file as (N, 3, 96, 96) bytes and reorder it to
        # (N, 96, 96, 3); the two spatial axes are swapped as well.
        with open(gfile.path_join(task_path, relative_path), 'rb') as fin:
            return np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3, 96, 96).transpose((0, 3, 2, 1))

    # Labeled splits: one folder per class label. Each split is handled
    # completely before the next one is read, so only one is held in memory.
    for split in ('test', 'train'):
        data = read_images('stl10_binary/stl10_binary/%s_X.bin' % split)
        with open(gfile.path_join(task_path, 'stl10_binary/stl10_binary/%s_y.bin' % split), 'rb') as fin:
            data_label = np.frombuffer(fin.read(), dtype=np.uint8)
        for i in set(data_label):
            gfile.makedirs(gfile.path_join(task_path, split, str(i)))
        for idx in range(data.shape[0]):
            save_image(gfile.path_join(task_path, split, str(data_label[idx]), str(idx)+'.png'),
                       array_to_image(data[idx]))

    # Unlabeled split: all images go into one flat folder.
    data = read_images('stl10_binary/stl10_binary/unlabeled_X.bin')
    gfile.makedirs(gfile.path_join(task_path, 'unlabeled'))
    for idx in range(data.shape[0]):
        save_image(gfile.path_join(task_path, 'unlabeled', str(idx)+'.png'), array_to_image(data[idx]))

    # Remove the downloaded archive, the intermediate tar and the extracted
    # folder. The original called a bare `path_join` for the last one; use
    # `gfile.path_join` like every other call in this file.
    gfile.remove(gfile.path_join(task_path, 'stl10_binary.tar.gz'))
    gfile.remove(gfile.path_join(task_path, 'stl10_binary.tar'))
    gfile.remove(gfile.path_join(task_path, 'stl10_binary'))
    print('stl10 dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path