def huajianji(root):
    """Download the Huajianji ("Among the Flowers") anthology.

    "Hua Jian Ji" is a collection of Chinese poetry compiled during the Five
    Dynasties and Ten Kingdoms period and the first anthology of literati ci
    poetry in Chinese literary history, edited by Zhao Chongzuo. It gathers
    classic works by Wen Tingyun, Wei Zhuang and others, reflecting the
    subject matter, aesthetics and style of early ci poetry.

    Data storage directory:
    root = `/user/.../mydata`
    huajianji data: `root/huajianji/huajianji.json`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/huajianji`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/huajianji`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'huajianji')
    url = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/huajianji.json"
    rq.files(url, path_join(data_dir, url.rsplit('/', 1)[-1]))
    mins, secs = divmod(time.time() - t0, 60)
    print('huajianji dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def ci_song(root):
    """Download the Song ci ("Complete Song Ci") dataset.

    The "Complete Song Ci" is one of the most important achievements of
    Chinese classical-text scholarship of the last century. Together with
    Tang poetry, Song ci marks an artistic peak of Chinese classical verse;
    the collection spans three hundred years of ci writing from the Song
    Dynasty across five volumes.

    Data storage directory:
    root = `/user/.../mydata`
    ci_song data: `root/ci_song/ci_song.json`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/ci_song`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/ci_song`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'ci_song')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/ci_song.json'
    rq.files(url, path_join(data_dir, url.rsplit('/', 1)[-1]))
    mins, secs = divmod(time.time() - t0, 60)
    print('ci_song dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def mnist_kuzushiji_kanji(root):
    """Download the Kuzushiji-Kanji dataset from https://github.com/rois-codh/kmnist.

    Kuzushiji-Kanji is a large, highly imbalanced 64x64 dataset of 3832
    Kanji characters, containing 140,426 images of both common and rare
    characters.

    Attention: if dirs `root/mnist_kuzushiji_kanji` exist, the api deletes
    and recreates them.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji_kanji data:
    `root/mnist_kuzushiji_kanji/train/U+55C7/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+7F8E/xx.png`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/mnist_kuzushiji_kanji`, pass
            `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/mnist_kuzushiji_kanji`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'mnist_kuzushiji_kanji', make_root_dir=False)
    url = "http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar"
    archive = gfile.path_join(root, url.rsplit('/', 1)[-1])
    rq.files(url, archive)
    un_tar(archive, data_dir)
    # The tarball unpacks into `kkanji2`; expose it under the standard `train` name.
    gfile.rename(gfile.path_join(data_dir, 'kkanji2'), gfile.path_join(data_dir, 'train'))
    gfile.remove(gfile.path_join(root, 'kkanji.tar'))
    mins, secs = divmod(time.time() - t0, 60)
    print('mnist_kuzushiji_kanji dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def poet_tang(root):
    """Download the Tang poetry ("Complete Tang Poems") dataset.

    "Complete Tang Poems" was compiled in the 44th year of Emperor Kangxi of
    the Qing Dynasty (1705) by Peng Dingqiu, Shen Sanzeng, Yang Zhongna,
    Wang Shizhen and others; it holds more than 48,900 poems by over 2,200
    authors in 900 volumes with a 12-volume catalogue.

    Data storage directory:
    root = `/user/.../mydata`
    poet_tang data: `root/poet_tang/poet_tang.json`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/poet_tang`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/poet_tang`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poet_tang')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/poetry_tang.json.bz2'
    archive = path_join(data_dir, 'poet_tang.json.bz2')
    rq.files(url, archive)
    # Decompress in place, then drop the compressed artifact.
    un_bz2(archive)
    remove(archive)
    mins, secs = divmod(time.time() - t0, 60)
    print('poet_tang dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def poet_song(root):
    """Download the Song poetry ("Complete Song Poems") dataset.

    After the great flourishing of Tang poetry, Song poetry developed new
    ideas in content and artistic expression, produced many outstanding
    writers and schools, and deeply influenced poetry of the Yuan, Ming and
    Qing dynasties.

    Data storage directory:
    root = `/user/.../mydata`
    poetry_song data: `root/poet_song/poet_song.json`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/poet_song`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/poet_song`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poet_song')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/poetry_song.json.bz2'
    archive = path_join(data_dir, 'poet_song.json.bz2')
    rq.files(url, archive)
    # Decompress in place, then drop the compressed artifact.
    un_bz2(archive)
    remove(archive)
    mins, secs = divmod(time.time() - t0, 60)
    print('poet_song dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def arrhythmia(root):
    """Arrhythmia dataset from http://archive.ics.uci.edu/ml/datasets/Arrhythmia.

    This database contains 279 attributes, 206 of which are linear valued
    and the rest are nominal.

    Data storage directory:
    root = `/user/.../mydata`
    arrhythmia data:
    `root/arrhythmia/arrhythmia.txt`
    `root/arrhythmia/introduce.txt`

    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/arrhythmia`,
              root should be `/user/.../mydata`.

    Returns:
        Store the absolute path of the data directory, is `root/arrhythmia`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'arrhythmia')
    url_introduce = 'http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.names'
    url_txt = 'http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'
    # The .names description is small; suppress its progress output.
    rq.files(url_introduce, gfile.path_join(task_path, 'introduce.txt'), verbose=0)
    rq.table(url_txt, gfile.path_join(task_path, 'arrhythmia.txt'))
    print('arrhythmia dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path
def shijing(root):
    """Download the Shijing ("Book of Songs") dataset.

    The Book of Songs is the earliest collection of poems in ancient Chinese
    poetry, gathering verse from the early Western Zhou Dynasty to the middle
    of the Spring and Autumn period (11th-6th century BC). It contains 311
    pieces, six of which are title-only "sheng poems" with no surviving text,
    reflecting roughly five hundred years of Zhou-era society.

    Data storage directory:
    root = `/user/.../mydata`
    shijing data: `root/shijing/shijing.json`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/shijing`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/shijing`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'shijing')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/shijing.json'
    rq.files(url, path_join(data_dir, url.rsplit('/', 1)[-1]))
    mins, secs = divmod(time.time() - t0, 60)
    print('shijing dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def poetry_SouthernTang(root):
    """Download the Southern Tang poetry dataset.

    "The Two Masters of the Southern Tang" collects the ci of Li Jing and
    Li Yu, rulers of the Southern Tang Dynasty. The book was compiled in the
    Southern Song Dynasty and re-edited by later generations in various
    editions.

    Data storage directory:
    root = `/user/.../mydata`
    poetry_SouthernTang data: `root/poetry_SouthernTang/poetry_SouthernTang.json`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/poetry_SouthernTang`, pass
            `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/poetry_SouthernTang`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poetry_SouthernTang')
    # Upstream file is named `nantang_erzhu_poetry.json`; store it under the dataset name.
    url = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/nantang_erzhu_poetry.json"
    rq.files(url, path_join(data_dir, 'poetry_SouthernTang.json'))
    mins, secs = divmod(time.time() - t0, 60)
    print('poetry_SouthernTang dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def economist(root, date, mode='pdf'):
    """Download one issue of The Economist from https://github.com/nailperry-zd/The-Economist.

    Data storage directory:
    root = `/user/.../mydata`
    economist data: `root/...(pdf or epub or mobi)`

    Args:
        root: str, Store the absolute path of the data directory.
        date: str, issue date, eg:'2019-01-01'. Must be an exact whole-week
            offset from 2017-05-06 (issues are weekly).
        mode: str, one of ['pdf', 'epub', 'mobi'].

    Returns:
        Store the absolute path of the downloaded file,
        `root/...(pdf or epub or mobi)`.
    """
    start = time.time()
    assert mode in ['pdf', 'epub', 'mobi'], "`mode` should be one of ['pdf', 'epub', 'mobi']."
    # Issues are published weekly from the 2017-05-06 anchor date:
    # t[0] < 0 means `date` is before the anchor, t[1] > 0 means it is not
    # an exact multiple of 7 days after it — both invalid.
    t = divmod((pd.to_datetime(date) - pd.to_datetime('2017-05-06')).days, 7)
    if t[0] < 0 or t[1] > 0:
        raise ValueError("No book that meets the date.")
    task_path = assert_dirs(root)
    # Fetch the GitHub listing page for the issue directory and keep only
    # the lines that link to a file of the requested format.
    t = 'https://github.com/nailperry-zd/The-Economist/raw/master/{}'.format(date)
    t = [i for i in requests.get(t).content.decode('utf-8').split('\n') if ('The-Economist/blob/master/{}'.format(date) in i) & (mode in i)]
    # NOTE(review): the filename is pulled out of a `title="..."` HTML
    # attribute by fixed slicing (i[7:-1]); this is fragile against any
    # change in GitHub's page markup — confirm it still matches.
    url = 'https://github.com/nailperry-zd/The-Economist/raw/master/' + date + '/' + [i[7:-1] for i in t[0].split(' ') if 'title' in i][0]
    task_path = path_join(task_path, url.split('/')[-1])
    rq.files(url, task_path, verbose=1)
    print('economist dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path
def arxiv(root, ids, new_name=None):
    """Download a paper from https://arxiv.org as a pdf file.

    Data storage directory:
    root = `/user/.../mydata`
    `ids`.pdf data: `root/arxiv/`ids`.pdf` or `root/arxiv/`new_name`.pdf`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/arxiv`, pass `/user/.../mydata`.
        ids: str, arxiv paper id. For example, ids='1605.09782' fetches the
            paper at https://arxiv.org/abs/1605.09782.
        new_name: str, default None. When given, the file is saved as
            `root/arxiv/new_name.pdf` instead of `root/arxiv/ids.pdf`.

    Returns:
        The absolute path of the downloaded pdf file.
    """
    t0 = time.time()
    assert gfile.isdir(root), '`root` should be directory.'
    assert isinstance(ids, str), '`ids` type should be str.'
    stem = ids if new_name is None else new_name
    target = gfile.path_join(root, 'arxiv', stem + '.pdf')
    gfile.makedirs(gfile.path_join(root, 'arxiv'))
    # Clear any stale copy before downloading.
    gfile.remove(target)
    rq.files('https://arxiv.org/pdf/' + str(ids) + '.pdf', target)
    mins, secs = divmod(time.time() - t0, 60)
    print('arxiv paper download completed, run time %d min %.2f sec' % (mins, secs))
    return target
def mnist_fashion(root):
    """Download Fashion-MNIST, Zalando's article-image dataset.

    Fashion-MNIST is a drop-in replacement for the original MNIST dataset,
    from https://github.com/zalandoresearch/fashion-mnist. Each sample is a
    gray image (3D NDArray) with shape (28, 28, 1).

    Attention: if dirs `root/mnist_fashion` exist, the api deletes and
    recreates them.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_fashion data:
    `root/mnist_fashion/train/0/xx.png`
    `root/mnist_fashion/test/0/xx.png`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/mnist_fashion`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/mnist_fashion`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'mnist_fashion')
    base = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
    names = ['train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
             't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz']
    for name in names:
        rq.files(base + name, gfile.path_join(data_dir, name))

    def _idx(name, offset):
        # Decompress one IDX file and return its payload as a flat uint8 array.
        with gzip.open(gfile.path_join(data_dir, name), 'rb') as f:
            return np.frombuffer(f.read(), np.uint8, offset=offset)

    train_label = _idx('train-labels-idx1-ubyte.gz', 8)
    train = _idx('train-images-idx3-ubyte.gz', 16).reshape(len(train_label), 28, 28)
    test_label = _idx('t10k-labels-idx1-ubyte.gz', 8)
    test = _idx('t10k-images-idx3-ubyte.gz', 16).reshape(len(test_label), 28, 28)
    # One folder per class label, then one png per sample.
    for split, labels in (('train', train_label), ('test', test_label)):
        for lab in set(labels):
            gfile.makedirs(gfile.path_join(data_dir, split, str(lab)))
    for split, images, labels in (('train', train, train_label), ('test', test, test_label)):
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(data_dir, split, str(labels[idx]), str(idx) + '.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    for name in names:
        gfile.remove(gfile.path_join(data_dir, name))
    mins, secs = divmod(time.time() - t0, 60)
    print('mnist_fashion dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def cifar100(root, fine_label=True):
    """CIFAR100 image classification dataset from https://www.cs.toronto.edu/~kriz/cifar.html

    Each sample is an image (in 3D NDArray) with shape (32, 32, 3).

    Attention: if exist dirs `root/cifar100`, api will delete it and create it.

    Data storage directory:
    root = `/user/.../mydata`
    cifar100 data:
    `root/cifar100/train/0/xx.png`
    `root/cifar100/train/2/xx.png`
    `root/cifar100/test/0/xx.png`
    `root/cifar100/test/2/xx.png`

    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/cifar100`,
              root should be `/user/.../mydata`.
        fine_label: bool, default True.
              Whether to load the fine-grained (100 classes) or
              coarse-grained (20 super-classes) labels.

    Returns:
        Store the absolute path of the data directory, is `root/cifar100`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'cifar100')
    url = 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/cifar100/cifar-100-binary.tar.gz'
    rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))
    with tarfile.open(gfile.path_join(task_path, url.split('/')[-1])) as t:
        t.extractall(task_path)
    # Snapshot of the archive and everything it extracted, removed once the
    # png files have been written.
    noise_files = gfile.listdir(task_path)
    # Each binary record is 2 label bytes (coarse, fine) followed by 3072
    # pixel bytes (3x32x32, channel-first).
    with open(gfile.path_join(task_path, 'train.bin'), 'rb') as fin:
        data = np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3072 + 2)
    train = data[:, 2:].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
    # Column 0 holds the coarse label, column 1 the fine label; the bool
    # `fine_label` selects between them.
    train_label = data[:, 0 + fine_label].astype(np.int32)
    for i in set(train_label):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
    for idx in range(train.shape[0]):
        save_image(gfile.path_join(task_path, 'train', str(train_label[idx]), str(idx) + '.png'),
                   array_to_image(train[idx]))
    with open(gfile.path_join(task_path, 'test.bin'), 'rb') as fin:
        data = np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3072 + 2)
    test = data[:, 2:].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
    test_label = data[:, 0 + fine_label].astype(np.int32)
    for i in set(test_label):
        gfile.makedirs(gfile.path_join(task_path, 'test', str(i)))
    for idx in range(test.shape[0]):
        save_image(gfile.path_join(task_path, 'test', str(test_label[idx]), str(idx) + '.png'),
                   array_to_image(test[idx]))
    for file in noise_files:
        gfile.remove(gfile.path_join(task_path, file))
    print('cifar100 dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path
def mnist(root):
    """Download the MNIST handwritten digits dataset from http://yann.lecun.com/exdb/mnist

    Each sample is a gray image (3D NDArray) with shape (28, 28, 1).

    Attention: if dirs `root/mnist` exist, the api deletes and recreates them.

    Data storage directory:
    root = `/user/.../mydata`
    mnist data:
    `root/mnist/train/0/xx.png`
    `root/mnist/test/0/xx.png`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/mnist`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/mnist`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'mnist')
    base = 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/'
    names = ['train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
             't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz']
    for name in names:
        rq.files(base + name, gfile.path_join(data_dir, name))

    def _idx(name, offset):
        # Decompress one IDX file and return its payload as a flat uint8 array.
        with gzip.open(gfile.path_join(data_dir, name), 'rb') as f:
            return np.frombuffer(f.read(), np.uint8, offset=offset)

    train_label = _idx('train-labels-idx1-ubyte.gz', 8)
    train = _idx('train-images-idx3-ubyte.gz', 16).reshape(len(train_label), 28, 28)
    test_label = _idx('t10k-labels-idx1-ubyte.gz', 8)
    test = _idx('t10k-images-idx3-ubyte.gz', 16).reshape(len(test_label), 28, 28)
    # One folder per digit label, then one png per sample.
    for split, labels in (('train', train_label), ('test', test_label)):
        for lab in set(labels):
            gfile.makedirs(gfile.path_join(data_dir, split, str(lab)))
    for split, images, labels in (('train', train, train_label), ('test', test, test_label)):
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(data_dir, split, str(labels[idx]), str(idx) + '.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    for name in names:
        gfile.remove(gfile.path_join(data_dir, name))
    mins, secs = divmod(time.time() - t0, 60)
    print('mnist dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def mnist_kuzushiji10(root):
    """Download Kuzushiji-MNIST from https://github.com/rois-codh/kmnist.

    Kuzushiji-MNIST is a drop-in replacement for MNIST (28x28 grayscale,
    70,000 images), provided in the original MNIST format as well as NumPy
    format. Since MNIST restricts us to 10 classes, one character was chosen
    to represent each of the 10 rows of Hiragana. Each sample is a gray
    image (3D NDArray) with shape (28, 28, 1).

    Attention: if dirs `root/mnist_kuzushiji10` exist, the api deletes and
    recreates them.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji10 data:
    `root/mnist_kuzushiji10/train/0/xx.png`
    `root/mnist_kuzushiji10/test/0/xx.png`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/mnist_kuzushiji10`, pass
            `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/mnist_kuzushiji10`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'mnist_kuzushiji10')
    base = 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/'
    names = ['kmnist-train-imgs.npz', 'kmnist-train-labels.npz',
             'kmnist-test-imgs.npz', 'kmnist-test-labels.npz']
    for name in names:
        rq.files(base + name, gfile.path_join(data_dir, name))

    def _arr(name):
        # Every npz holds a single unnamed array under the default key.
        return np.load(gfile.path_join(data_dir, name))['arr_0']

    train = _arr('kmnist-train-imgs.npz')
    train_label = _arr('kmnist-train-labels.npz')
    test = _arr('kmnist-test-imgs.npz')
    test_label = _arr('kmnist-test-labels.npz')
    # One folder per class label, then one png per sample.
    for split, labels in (('train', train_label), ('test', test_label)):
        for lab in set(labels):
            gfile.makedirs(gfile.path_join(data_dir, split, str(lab)))
    for split, images, labels in (('train', train, train_label), ('test', test, test_label)):
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(data_dir, split, str(labels[idx]), str(idx) + '.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    for name in names:
        gfile.remove(gfile.path_join(data_dir, name))
    mins, secs = divmod(time.time() - t0, 60)
    print('mnist_kuzushiji10 dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def mnist_kuzushiji49(root):
    """Download Kuzushiji-49 from https://github.com/rois-codh/kmnist.

    Kuzushiji-49, as the name suggests, has 49 classes (28x28 grayscale,
    270,912 images): a much larger but imbalanced dataset of 48 Hiragana
    characters plus one Hiragana iteration mark. Each sample is a gray image
    (3D NDArray) with shape (28, 28, 1).

    Attention: if dirs `root/mnist_kuzushiji49` exist, the api deletes and
    recreates them.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji49 data:
    `root/mnist_kuzushiji49/train/0/xx.png`
    `root/mnist_kuzushiji49/test/0/xx.png`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/mnist_kuzushiji49`, pass
            `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/mnist_kuzushiji49`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'mnist_kuzushiji49')
    base = 'http://codh.rois.ac.jp/kmnist/dataset/k49/'
    names = ['k49-train-imgs.npz', 'k49-train-labels.npz',
             'k49-test-imgs.npz', 'k49-test-labels.npz']
    for name in names:
        rq.files(base + name, gfile.path_join(data_dir, name))

    def _arr(name):
        # Every npz holds a single unnamed array under the default key.
        return np.load(gfile.path_join(data_dir, name))['arr_0']

    train = _arr('k49-train-imgs.npz')
    train_label = _arr('k49-train-labels.npz')
    test = _arr('k49-test-imgs.npz')
    test_label = _arr('k49-test-labels.npz')
    # One folder per class label, then one png per sample.
    for split, labels in (('train', train_label), ('test', test_label)):
        for lab in set(labels):
            gfile.makedirs(gfile.path_join(data_dir, split, str(lab)))
    for split, images, labels in (('train', train, train_label), ('test', test, test_label)):
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(data_dir, split, str(labels[idx]), str(idx) + '.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    for name in names:
        gfile.remove(gfile.path_join(data_dir, name))
    mins, secs = divmod(time.time() - t0, 60)
    print('mnist_kuzushiji49 dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def wine(root):
    """Wine recognition dataset (UCI), updated Sept 21, 1998 by C.Blake.

    These data are the results of a chemical analysis of wines grown in the
    same region in Italy but derived from three different cultivars. The
    analysis determined the quantities of 13 constituents found in each of
    the three types of wines.
    Number of Instances: class 1 -- 59, class 2 -- 71, class 3 -- 48.

    Data storage directory:
    root = `/user/.../mydata`
    wine data:
    `root/wine/wine.txt`
    `root/wine/introduce.txt`

    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/wine`,
              root should be `/user/.../mydata`.

    Returns:
        Store the absolute path of the data directory, is `root/wine`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'wine')
    url_introduce = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
    url_txt = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
    # The .names description is small; suppress its progress output.
    rq.files(url_introduce, gfile.path_join(task_path, 'introduce.txt'), verbose=0)
    # Column names follow the attribute list in wine.names (label first).
    rq.table(url_txt, gfile.path_join(task_path, 'wine.txt'), names=[
        'label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline'
    ])
    print('wine dataset download completed, run time %d min %.2f sec' % divmod(
        (time.time() - start), 60))
    return task_path
def coil100(root):
    """Download the COIL100 dataset from http://www.cs.columbia.edu/CAVE/software/softlib/coil-100.php

    "Columbia Object Image Library (COIL-100)," S. A. Nene, S. K. Nayar and
    H. Murase, Technical Report CUCS-006-96, February 1996. Each sample is a
    gray image (3D NDArray) with shape (128, 128, 1).

    Attention: if dirs `root/coil100` exist, the api deletes and recreates
    them.

    Data storage directory:
    root = `/user/.../mydata`
    coil100 data:
    `root/coil100/train/0/xx.png`
    `root/coil100/train/2/xx.png`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/coil100`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/coil100`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'coil100')
    url = "http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-100/coil-100.zip"
    archive = gfile.path_join(data_dir, 'coil100.zip')
    rq.files(url, archive)
    un_zip(archive)
    extracted = gfile.path_join(data_dir, 'coil100', 'coil-100')
    images = gfile.listdir(extracted)
    # File names look like `objNN__angle.png`; the object id NN is the label.
    labels = [name.split('__')[0][3:] for name in images]
    # dict.fromkeys keeps first-appearance order while de-duplicating.
    for lab in dict.fromkeys(labels):
        gfile.makedirs(gfile.path_join(data_dir, 'train', lab))
    for name, lab in zip(images, labels):
        gfile.copy(gfile.path_join(extracted, name),
                   gfile.path_join(data_dir, 'train', lab + '/' + name))
    gfile.remove(gfile.path_join(data_dir, 'coil100.zip'))
    gfile.remove(gfile.path_join(data_dir, 'coil100'))
    # The archive ships a helper perl script alongside the images; drop it.
    gfile.remove(gfile.path_join(data_dir, 'train', 'vertGroupppm2png.pl'))
    mins, secs = divmod(time.time() - t0, 60)
    print('coil100 dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def caltech101(root):
    """Download the Caltech101 dataset from http://www.vision.caltech.edu/Image_Datasets/Caltech101

    Pictures of objects belonging to 101 categories, about 40 to 800 images
    per category (most have about 50), roughly 300 x 200 pixels each.
    Collected in September 2003 by Fei-Fei Li, Marco Andreetto, and Marc
    'Aurelio Ranzato. Object outlines are distributed separately in
    'Annotations.tar' with a matlab viewer 'show_annotations.m'.

    Attention: if dirs `root/caltech101` exist, the api deletes and
    recreates them.

    Data storage directory:
    root = `/user/.../mydata`
    caltech101 data:
    `root/caltech101/train/accordion/xx.jpg`
    `root/caltech101/train/panda/xx.jpg`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/caltech101`, pass `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/caltech101`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'caltech101', make_root_dir=False)
    url = 'http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz'
    archive = gfile.path_join(root, url.rsplit('/', 1)[-1])
    rq.files(url, archive)
    # Decompress the .gz, untar the result into the dataset directory.
    un_tar(un_gz(archive), data_dir)
    gfile.rename(gfile.path_join(data_dir, '101_ObjectCategories'),
                 gfile.path_join(data_dir, 'train'))
    for leftover in ('101_ObjectCategories.tar.gz', '101_ObjectCategories.tar'):
        gfile.remove(gfile.path_join(root, leftover))
    mins, secs = divmod(time.time() - t0, 60)
    print('caltech101 dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def abbreviation(root):
    """Download the Chinese abbreviation dataset.

    datasets url:`https://github.com/zhangyics/Chinese-abbreviation-dataset`
    A corpus of Chinese abbreviations, released with the paper "A Chinese
    Dataset with Negative Full Forms for General Abbreviation Prediction".

    Data storage directory:
    root = `/user/.../mydata`
    Chinese abbreviation data:
    `root/chinese_abbreviation/train_set.txt`
    `root/chinese_abbreviation/test_set.txt`
    `root/chinese_abbreviation/dev_set.txt`

    Args:
        root: str, absolute path of the data directory. If the desired data
            path is `/user/.../mydata/chinese_abbreviation`, pass
            `/user/.../mydata`.

    Returns:
        The absolute path of the dataset directory, `root/chinese_abbreviation`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_abbreviation')
    base = "https://raw.githubusercontent.com/zhangyics/Chinese-abbreviation-dataset/master/"
    for split in ('train_set.txt', 'test_set.txt', 'dev_set.txt'):
        rq.files(base + split, path_join(data_dir, split))
    mins, secs = divmod(time.time() - t0, 60)
    print('chinese abbreviation dataset download completed, run time %d min %.2f sec' % (mins, secs))
    return data_dir
def abalone(root):
    """Predicting the age of abalone from physical measurements.

    The age of abalone is determined by cutting the shell through the cone,
    staining it, and counting the number of rings through a microscope -- a
    boring and time-consuming task. Other measurements, which are easier to
    obtain, are used to predict the age. Further information, such as
    weather patterns and location (hence food availability) may be required
    to solve the problem.

    Data storage directory:
    root = `/user/.../mydata`
    abalone data:
    `root/abalone/abalone.txt`
    `root/abalone/introduce.txt`

    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/abalone`,
              root should be `/user/.../mydata`.

    Returns:
        Store the absolute path of the data directory, is `root/abalone`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'abalone')
    url_introduce = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names'
    url_txt = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
    # The .names description is small; suppress its progress output.
    rq.files(url_introduce, gfile.path_join(task_path, 'introduce.txt'), verbose=0)
    # BUG FIX: the original list read `'Height' 'Whole_weight'` (no comma),
    # which Python concatenates into one name 'HeightWhole_weight', leaving
    # only 8 names for the 9 columns of abalone.data. The UCI attribute list
    # has 9 fields: Sex, Length, Diameter, Height, Whole weight, Shucked
    # weight, Viscera weight, Shell weight, Rings (the label).
    rq.table(url_txt, gfile.path_join(task_path, 'abalone.txt'), names=[
        'Sex', 'Length', 'Diameter', 'Height', 'Whole_weight',
        'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'label'
    ])
    print('abalone dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path
def lunyu(root):
    """Lunyu dataset from Chinese classical literature.

    The Chinese Confucian classics, "The Analects of Confucius" is a
    collection of quotations of Confucius and his disciples. It was written
    by Confucius disciples and re-transmission disciples, and was written in
    the early period of the Warring States Period. The book consists of 20
    chapters and 492 chapters. It is mainly composed of quotations and
    supplemented by narratives. It mainly records the words and deeds of
    Confucius and his disciples, and more concentratedly reflects Confucius'
    political opinions, ethical thoughts, moral concepts and educational
    principles. This book is one of the classic works of Confucianism.

    Data storage directory:
    root = `/user/.../mydata`
    lunyu data:
    `root/lunyu/lunyu.json`

    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/lunyu`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/lunyu`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'lunyu')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/lunyu.json'
    # Save under the same file name as the remote resource.
    filename = url.split('/')[-1]
    rq.files(url, path_join(task_path, filename))
    elapsed = divmod(time.time() - start, 60)
    print('lunyu dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
def youmengying(root):
    """Youmengying dataset from Chinese classical literature.

    "You Meng Ying" is an anthology of Zhang Chao's creations by Qing
    Dynasty writers.

    Data storage directory:
    root = `/user/.../mydata`
    youmengying data:
    `root/youmengying/youmengying.json`

    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/youmengying`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/youmengying`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'youmengying')
    url = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/youmengying.json"
    # Keep the remote file name when writing into the task directory.
    filename = url.split('/')[-1]
    rq.files(url, path_join(task_path, filename))
    elapsed = divmod(time.time() - start, 60)
    print('youmengying dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
def mnist_kannada(root):
    """kannada-MNIST from https://github.com/vinayprabhu/Kannada_MNIST.

    The Kannada-MNIST dataset was created as a drop-in substitute for the
    standard MNIST dataset. Each sample is a gray image (in 3D NDArray)
    with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist_kannada`, api will delete it and
    create it.

    Data storage directory:
    root = `/user/.../mydata`
    mnist_kannada data:
    `root/mnist_kannada/train/0/xx.png`
    `root/mnist_kannada/train/2/xx.png`
    `root/mnist_kannada/train/6/xx.png`
    `root/mnist_kannada/test/0/xx.png`
    `root/mnist_kannada/test/2/xx.png`
    `root/mnist_kannada/test/6/xx.png`

    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist_kannada`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_kannada`.
    """
    start = time.time()
    print('Downloading data from https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip')
    task_path = assert_dirs(root, 'mnist_kannada')
    zip_path = rq.files('https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip',
                        task_path + '/kannada_MNIST.zip')
    unzip_path = un_zip(task_path + '/kannada_MNIST.zip')
    # Column 0 holds the label, columns 1..784 the raw 28x28 pixel values.
    train = pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_train.csv'),
                        header=None, dtype='uint8')
    test = pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_test.csv'),
                       header=None, dtype='uint8')
    # ROBUSTNESS FIX: create label directories for the union of train and
    # test labels. The original used only the train labels, so a label that
    # appears only in the test split would make save_image fail on a
    # missing directory.
    for label in set(train[0]) | set(test[0]):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(label)))
        gfile.makedirs(gfile.path_join(task_path, 'test', str(label)))
    _mnist_kannada_export(task_path, 'train', train)
    _mnist_kannada_export(task_path, 'test', test)
    gfile.remove(zip_path)
    gfile.remove(unzip_path)
    print('mnist_kannada dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path


def _mnist_kannada_export(task_path, split, frame):
    """Write each row of `frame` as `task_path/split/<label>/<row_index>.png`."""
    for i in range(len(frame)):
        save_image(gfile.path_join(task_path, split, str(frame.iat[i, 0]), str(i) + '.png'),
                   array_to_image(frame.iloc[i, 1:].values.reshape(28, 28, 1)))
def _download(path):
    """Download one file described as '<directory>|<url>'.

    The file is saved into the directory under the URL's basename.
    """
    parts = path.split('|')
    dest_dir = parts[0]
    url = parts[1]
    basename = url.split('/')[-1]
    rq.files(url, gfile.path_join(dest_dir, basename))
def stl10(root):
    """Stl10 dataset from http://ai.stanford.edu/~acoates/stl10

    The STL-10 dataset is an image recognition dataset for developing
    unsupervised feature learning, deep learning, self-taught learning
    algorithms. It is inspired by the CIFAR-10 dataset but with some
    modifications. In particular, each class has fewer labeled training
    examples than in CIFAR-10, but a very large set of unlabeled examples
    is provided to learn image models prior to supervised training. The
    primary challenge is to make use of the unlabeled data (which comes
    from a similar but different distribution from the labeled data) to
    build a useful prior. We also expect that the higher resolution of
    this dataset (96x96) will make it a challenging benchmark for
    developing more scalable unsupervised learning methods.

    Attention: if exist dirs `root/stl10`, api will delete it and create it.

    Data storage directory:
    root = `/user/.../mydata`
    stl10 data:
    `root/stl10/train/1/xx.png`
    `root/stl10/train/4/xx.png`
    `root/stl10/train/8/xx.png`
    `root/stl10/test/1/xx.png`
    `root/stl10/test/4/xx.png`
    `root/stl10/test/8/xx.png`

    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/stl10`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/stl10`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'stl10')
    url = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"
    rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))
    un_tar(un_gz(gfile.path_join(task_path, url.split('/')[-1])))
    # The archive unpacks to a doubly-nested stl10_binary directory.
    bin_dir = gfile.path_join(task_path, 'stl10_binary/stl10_binary')
    # Export the labeled splits (test first, matching the original order),
    # then the unlabeled images.
    _stl10_export_labeled(task_path, bin_dir, 'test')
    _stl10_export_labeled(task_path, bin_dir, 'train')
    data = _stl10_read_images(gfile.path_join(bin_dir, 'unlabeled_X.bin'))
    gfile.makedirs(gfile.path_join(task_path, 'unlabeled'))
    for idx in range(data.shape[0]):
        save_image(gfile.path_join(task_path, 'unlabeled', str(idx) + '.png'),
                   array_to_image(data[idx]))
    gfile.remove(gfile.path_join(task_path, 'stl10_binary.tar.gz'))
    gfile.remove(gfile.path_join(task_path, 'stl10_binary.tar'))
    gfile.remove(path_join(task_path, 'stl10_binary'))
    print('stl10 dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path


def _stl10_read_images(path):
    """Read an STL-10 binary image file into an (N, 96, 96, 3) uint8 array.

    The raw file stores column-major 3x96x96 images; the transpose converts
    them to HWC order for saving as PNG.
    """
    with open(path, 'rb') as fin:
        return np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3, 96, 96).transpose((0, 3, 2, 1))


def _stl10_export_labeled(task_path, bin_dir, split):
    """Export one labeled STL-10 split ('train' or 'test') as per-label PNG dirs."""
    data = _stl10_read_images(gfile.path_join(bin_dir, split + '_X.bin'))
    with open(gfile.path_join(bin_dir, split + '_y.bin'), 'rb') as fin:
        labels = np.frombuffer(fin.read(), dtype=np.uint8)
    for label in set(labels):
        gfile.makedirs(gfile.path_join(task_path, split, str(label)))
    for idx in range(data.shape[0]):
        save_image(gfile.path_join(task_path, split, str(labels[idx]), str(idx) + '.png'),
                   array_to_image(data[idx]))