#!/usr/bin/env python3

# This demo shows how to load a libsvm file using napkinXC's load_libsvm_file function,
# which is easier to use, faster, and more memory efficient than Sklearn's load_svmlight_file.
# This example requires Sklearn to be installed.

from time import time
from sklearn.datasets import load_svmlight_file
from napkinxc.datasets import download_dataset, load_libsvm_file

# Use the download_dataset function to download one of the benchmark datasets
# from the XML Repository (http://manikvarma.org/downloads/XC/XMLRepository.html).
download_dataset("eurlex-4k", "train")
file = "data/Eurlex/eurlex_train.txt"

# Load using Sklearn.
# Because the Sklearn method cannot handle the header used by the XML Repository,
# the offset and the number of features need to be provided explicitly.
start = time()
X, Y = load_svmlight_file(file, multilabel=True, zero_based=True, n_features=5000, offset=1)
print("Sklearn's load_svmlight_file time:", time() - start)

# Load using napkinXC.
# It supports two output formats for labels: a list of tuples, as in the Sklearn version,
# and a sparse Scipy csr_matrix; the list format is the default.
start = time()
X, Y = load_libsvm_file(file, labels_format='list')
print("napkinXC's load_libsvm_file time:", time() - start)
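# The comment above also mentions the csr_matrix labels format; here is a minimal sketch
# of using it, reusing the file downloaded by this demo. The returned matrix has one row
# per example and one column per label, so the shape check below is illustration only.
X, Y_csr = load_libsvm_file(file, labels_format='csr_matrix')
print("Labels as sparse csr_matrix:", Y_csr.shape)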
"amazon": "Amazon-670K", "amazon-3M": "Amazon-3M", "deliciousLarge": "Delicious-200K", "eurlex": "EURLex-4K", "wiki10": "Wiki10-31K", "wikiLSHTC": "WikiLSHTC-325K", "WikipediaLarge-500K": "Wikipedia-500K", } if __name__ == "__main__": if len(sys.argv) < 2: print( "Usage: download_dataset.py [dataset name] [format (optional)] [root dir (optional)]" ) exit(1) dataset = old_aliases.get(sys.argv[1], sys.argv[1]) format = "bow" if len(sys.argv) >= 3: root = sys.argv[2] root = "data" if len(sys.argv) >= 4: root = sys.argv[3] dataset_meta = _get_data_meta(dataset, format=format) download_dataset(dataset, format=format, root=root, verbose=True) shutil.move(os.path.join(root, dataset_meta['dir']), os.path.join(root, sys.argv[1]))
# Assumed imports for this test module; TEST_DATA_PATH is expected to be defined at
# module level (it is also used by the pytest configuration below).
import os
from time import time

import numpy as np
from sklearn.datasets import load_svmlight_file
from napkinxc.datasets import download_dataset, load_libsvm_file


def test_load_libsvm():
    datasets = {
        "eurlex-4k": {
            "file": os.path.join(TEST_DATA_PATH, "Eurlex/eurlex_test.txt"),
            "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 5000, "offset": 1}
        },
        "amazonCat-13k": {
            "file": os.path.join(TEST_DATA_PATH, "AmazonCat/amazonCat_test.txt"),
            "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 203882, "offset": 1}
        },
        "amazonCat-14k": {
            "file": os.path.join(TEST_DATA_PATH, "AmazonCat-14K/amazonCat-14K_test.txt"),
            "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 597540, "offset": 1}
        },
        "wiki10-31k": {
            "file": os.path.join(TEST_DATA_PATH, "Wiki10/wiki10_test.txt"),
            "sklearn_args": {"multilabel": True, "zero_based": True, "n_features": 101938, "offset": 1}
        }
    }

    for d, v in datasets.items():
        download_dataset(d, subset='test', format='bow', root=TEST_DATA_PATH)
        print("\n{} time comparison:".format(d))

        t_start = time()
        sk_X, sk_Y = load_svmlight_file(v["file"], **v["sklearn_args"])
        print("\tsklearn.datasets.load_svmlight_file time: {}s".format(time() - t_start))

        t_start = time()
        nxc_X1, nxc_Y_list = load_libsvm_file(v["file"], labels_format="list")
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() - t_start))

        t_start = time()
        nxc_X2, nxc_Y_csrm = load_libsvm_file(v["file"], labels_format="csr_matrix")
        print("\tnapkinXC.datasets.load_libsvm_file time: {}s".format(time() - t_start))

        # Both label formats should produce identical feature matrices.
        assert np.array_equal(nxc_X1.indptr, nxc_X2.indptr)
        assert np.array_equal(nxc_X1.indices, nxc_X2.indices)
        assert np.array_equal(nxc_X1.data, nxc_X2.data)

        # The feature matrix should match Sklearn's; values are compared with a tolerance,
        # since the two parsers may round floating-point features slightly differently.
        assert np.array_equal(nxc_X1.indptr, sk_X.indptr)
        assert np.array_equal(nxc_X1.indices, sk_X.indices)
        assert np.allclose(nxc_X1.data, sk_X.data)

        # Labels: the csr_matrix must have one row per example, and the list format
        # must agree with Sklearn's labels element by element.
        assert nxc_X1.shape[0] == nxc_Y_csrm.shape[0]
        assert len(nxc_Y_list) == len(sk_Y)
        for nxc_y, sk_y in zip(nxc_Y_list, sk_Y):
            assert len(nxc_y) == len(sk_y)
            assert all(y1 == y2 for y1, y2 in zip(nxc_y, sk_y))
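# A hedged sketch of a stronger label check than the shape comparison above: verify that
# the list and csr_matrix formats encode the same labels. assert_label_formats_match is a
# hypothetical helper, and it assumes the csr_matrix column indices are the label ids.
def assert_label_formats_match(Y_list, Y_csr):
    assert len(Y_list) == Y_csr.shape[0]
    for row, y in enumerate(Y_list):
        csr_labels = Y_csr.indices[Y_csr.indptr[row]:Y_csr.indptr[row + 1]]
        assert sorted(int(lbl) for lbl in csr_labels) == sorted(int(lbl) for lbl in y)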
from napkinxc.datasets import download_dataset


def pytest_configure(config):
    # Download (or verify) the test data once, before the pytest session starts.
    # TEST_DATASET and TEST_DATA_PATH are assumed to be module-level constants.
    print("Downloading/checking test data...")
    download_dataset(TEST_DATASET, "train", root=TEST_DATA_PATH)
    download_dataset(TEST_DATASET, "test", root=TEST_DATA_PATH)