def load(dir_path, load_to_memory=False, fold=1):
    """
    Loads the LETOR 4.0 MQ2007 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    This dataset comes with 5 predefined folds, which can be specified with
    option ``fold`` (default = 1).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'length'``

    """
    input_size = 46
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    # Fix: the original raised the undefined name ``error`` (a NameError at
    # runtime); raise ValueError for an invalid ``fold`` option instead.
    if fold not in [1, 2, 3, 4, 5]:
        raise ValueError('There are 5 predefined folds. Option fold should be an integer between 1 and 5')

    def convert(feature, value):
        # The only extra (non-input) feature expected is the query id.
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    # Per-fold counts of queries and examples for [train, valid, test].
    n_queries = [[1017, 339, 336],
                 [1017, 336, 339],
                 [1014, 339, 339],
                 [1014, 339, 339],
                 [1014, 339, 339]]
    lengths = [[42158, 13813, 13652],
               [41958, 13652, 14013],
               [41320, 14013, 14290],
               [41478, 14290, 13855],
               [41955, 13855, 13813]]

    # Get data file paths
    train_file, valid_file, test_file = [os.path.join(dir_path, 'MQ2007/Fold' + str(fold) + '/' + ds + '.txt') for ds in ['train', 'vali', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line) for f in [train_file, valid_file, test_file]]
    if load_to_memory:
        train, valid, test = [mlio.MemoryDataset(d, [(input_size,), (1,), (1,)], [np.float64, int, int], l)
                              for d, l in zip([train, valid, test], lengths[fold - 1])]

    train_meta, valid_meta, test_meta = [{'input_size': input_size,
                                          'scores': range(3),
                                          'n_queries': nq,
                                          'length': l,
                                          'n_pairs': l} for nq, l in zip(n_queries[fold - 1], lengths[fold - 1])]

    return {'train': (train, train_meta), 'valid': (valid, valid_meta), 'test': (test, test_meta)}
def load(file_path, input_size=13649, load_to_memory=True):
    """
    Loads LIBSVM dataset in path ``file_path``.

    Returns a pair ``(data, metadata)`` where metadata defines
    ``'input_size'`` and ``'length'``.
    """
    dataFile = os.path.expanduser(file_path)

    def load_line(line):
        # Only the input part of each example is kept (hence the [0]).
        return mlio.libsvm_load_line(line, convert_target=str, sparse=True, input_size=input_size, input_type=np.int32)[0]

    # Get data
    data = mlio.load_from_file(dataFile, load_line)
    if load_to_memory:
        data = [x for x in data]
        length = len(data)
    else:
        # Count examples without materializing them.
        # Fix: use ``with`` so the file is closed even if iteration raises.
        length = 0
        with open(dataFile) as stream:
            for l in stream:
                length += 1

    # Get metadata
    data_meta = {'input_size': input_size, 'length': length}
    return (data, data_meta)
def load(dir_path, load_to_memory=False):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """
    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # The last token of each line is bogus and is dropped (don't ask...).
        fields = line.split()
        return np.array([int(f) for f in fields[:-1]])

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'nips-0-12_all_shuffled_bidon_target_' + n + '.amat') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [400, 100, 1240]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,)], [np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads a binarized version of MNIST.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """
    input_size = 784
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        return np.array([int(f) for f in line.split()])

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'binarized_mnist_' + n + '.amat') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [50000, 10000, 10000]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,)], [np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the Abalone dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """
    input_size = 8
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # Both inputs and the regression target are floats.
        return mlio.libsvm_load_line(line, float, float, sparse=False, input_size=input_size)

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'abalone_' + n + '.libsvm') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [3341, 418, 418]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the DNA dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    input_size = 180
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1, 2])
    target_mapping = {'1': 0, '2': 1, '3': 2}

    def convert_target(target):
        # Map the file's 1-based class labels to 0-based integers.
        return target_mapping[target]

    def parse_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target, sparse=False, input_size=input_size)

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'dna_scale_' + n + '.libsvm') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [1400, 600, 1186]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n, 'targets': targets} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the RCV1 dataset. This is actually a smaller version of it, with 150
    inputs and binary targets.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])

    # Fix: removed unused locals ``target_mapping`` and ``convert_target`` —
    # the line parser below reads the target directly and never used them.
    def load_line(line):
        tokens = line.split()
        return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'rcv1_all_subset.binary_' + ds + '_voc_150.amat')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line)
                          for f in [train_file, valid_file, test_file]]
    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
                              for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [{'input_size': input_size,
                                          'length': l,
                                          'targets': targets} for l in lengths]

    return {'train': (train, train_meta), 'valid': (valid, valid_meta), 'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the Rectangles images dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    input_size = 784
    dir_path = os.path.expanduser(dir_path)
    targets = set(range(2))

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), float(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'rectangles_images_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line)
                          for f in [train_file, valid_file, test_file]]
    lengths = [10000, 2000, 50000]
    if load_to_memory:
        # Fix: the original duplicated this statement, wrapping each
        # MemoryDataset inside a second MemoryDataset. Wrap only once.
        train, valid, test = [mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
                              for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [{'input_size': input_size,
                                          'length': l,
                                          'targets': targets} for l in lengths]

    return {'train': (train, train_meta), 'valid': (valid, valid_meta), 'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the Housing dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """
    input_size = 13
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # Both inputs and the regression target are floats.
        return mlio.libsvm_load_line(line, float, float, sparse=False, input_size=input_size)

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'housing_' + n + '.libsvm') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [404, 51, 51]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the OCR letters dataset (sequence version): each example is a
    sequence of character images with per-character letter targets.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    input_size = 128
    targets = set(range(26))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        tokens = line.split()
        # Fix: floor division so seq_size is an int under both Python 2 and
        # Python 3 (true division yields a float and breaks zeros/reshape).
        seq_size = len(tokens) // (input_size + 1)
        # Renamed local 'input' -> 'inputs' to avoid shadowing the builtin.
        inputs = np.zeros((seq_size, input_size))
        target = -1 * np.ones((seq_size), dtype=int)
        example = np.array([int(i) for i in tokens]).reshape((seq_size, -1))
        inputs[:] = example[:, :input_size]
        target[:] = example[:, input_size]
        return inputs, target

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'ocr_letters_' + ds + '.txt')
        for ds in ['train', 'valid', 'test']
    ]

    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line)
                          for f in [train_file, valid_file, test_file]]
    lengths = [5502, 688, 687]
    if load_to_memory:
        # Sequences have variable length, so they are kept as plain lists
        # rather than fixed-shape MemoryDatasets.
        train = [example for example in train]
        valid = [example for example in valid]
        test = [example for example in test]

    # Get metadata
    train_meta, valid_meta, test_meta = [{'input_size': input_size,
                                          'length': l,
                                          'targets': targets} for l in lengths]

    return {'train': (train, train_meta), 'valid': (valid, valid_meta), 'test': (test, test_meta)}
def load(dir_path, load_to_memory=True, sparse=False, binary_input=False):
    """
    Loads the 20 news groups dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been put in binary format, and the vocabulary has been
    restricted to 2000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    input_size = 2000
    targets = set(range(20))
    dir_path = os.path.expanduser(dir_path)

    def convert_target(target_str):
        # File labels are 1-based; make them 0-based.
        return int(target_str) - 1

    def parse_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target, sparse=sparse, input_size=input_size, input_type=np.int32)

    # Get data
    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, n + '.txt') for n in names]
    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    if load_to_memory:
        datasets = [[x for x in d] for d in datasets]
    lengths = [10284, 1000, 7502]

    # Get metadata
    metadata = [{'input_size': input_size, 'targets': targets, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the OCR letters dataset.

    The data is given by a dictionary mapping from strings 'train', 'valid'
    and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'targets'
    - 'length'

    References:
      Tractable Multivariate Binary Density Estimation and the Restricted
      Boltzmann Forest
      Larochelle, Bengio and Turian
      link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

      OCR dataset (web page)
      link: http://www.seas.upenn.edu/~taskar/ocr/
    """
    input_size = 128
    targets = set(range(26))
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        fields = line.split()
        return (np.array([float(f) for f in fields[:-1]]), int(fields[-1]))

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'ocr_letters_' + n + '.txt') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [32152, 10000, 10000]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (1,)], [dtype, int], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n, 'targets': targets} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False, fold=1):
    """
    Loads the LETOR 4.0 MQ2008 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    This dataset comes with 5 predefined folds, which can be specified with
    option ``fold`` (default = 1).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'length'``

    """
    input_size = 46
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    # Fix: the original raised the undefined name ``error`` (a NameError at
    # runtime); raise ValueError for an invalid ``fold`` option instead.
    if fold not in [1, 2, 3, 4, 5]:
        raise ValueError("There are 5 predefined folds. Option fold should be an integer between 1 and 5")

    def convert(feature, value):
        # The only extra (non-input) feature expected is the query id.
        if feature != "qid":
            raise ValueError("Unexpected feature")
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    # Per-fold counts of queries and examples for [train, valid, test].
    n_queries = [[471, 157, 156], [471, 156, 157], [470, 157, 157], [470, 157, 157], [470, 157, 157]]
    lengths = [[9630, 2707, 2874], [9404, 2874, 2933], [8643, 2933, 3635], [8514, 3635, 3062], [9442, 3062, 2707]]

    # Get data file paths
    train_file, valid_file, test_file = [
        os.path.join(dir_path, "MQ2008/Fold" + str(fold) + "/" + ds + ".txt") for ds in ["train", "vali", "test"]
    ]

    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line) for f in [train_file, valid_file, test_file]]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,), (1,)], [np.float64, int, int], l)
            for d, l in zip([train, valid, test], lengths[fold - 1])
        ]

    train_meta, valid_meta, test_meta = [
        {"input_size": input_size, "scores": range(3), "n_queries": nq, "length": l, "n_pairs": l}
        for nq, l in zip(n_queries[fold - 1], lengths[fold - 1])
    ]

    return {"train": (train, train_meta), "valid": (valid, valid_meta), "test": (test, test_meta)}
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings 'train', 'valid'
    and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'length'

    References:
      Tractable Multivariate Binary Density Estimation and the Restricted
      Boltzmann Forest
      Larochelle, Bengio and Turian
      link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

      LIBSVM Data: Classification, Regression, and Multi-label (web page)
      link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """
    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # The last token of each line is bogus and is dropped (don't ask...).
        fields = line.split()
        return np.array([int(f) for f in fields[:-1]])

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'nips-0-12_all_shuffled_bidon_target_' + n + '.amat') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [400, 100, 1240]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,)], [dtype], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been normalized between 0 and 1.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    input_size = 784
    targets = set(range(10))
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        fields = line.split()
        return (np.array([float(f) for f in fields[:-1]]), int(fields[-1]))

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'mnist_' + n + '.txt') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [50000, 10000, 10000]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n, 'targets': targets} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the occluded MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs and targets have been converted to a binary format.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    input_size = 784
    target_size = 784
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # Each line holds the occluded image followed by the full image.
        fields = line.split()
        inputs = np.array([int(f) for f in fields[:input_size]])
        outputs = np.array([int(f) for f in fields[input_size:]])
        return (inputs, outputs)

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'occluded_mnist_' + n + '.txt') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [50000, 10000, 10000]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (target_size,)], [np.float64, np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'target_size': target_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the 20-newsgroups dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been put in binary format, and the vocabulary has been
    restricted to 5000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    input_size = 5000
    targets = set(range(20))
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        fields = line.split()
        return (np.array([float(f) for f in fields[:-1]]), int(fields[-1]))

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, '20newsgroups_' + n + '_binary_5000_voc.txt') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [9578, 1691, 7505]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n, 'targets': targets} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads a binarized version of MNIST.

    The data is given by a dictionary mapping from strings 'train', 'valid'
    and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'length'

    Reference:
      On the Quantitative Analysis of Deep Belief Networks
      Salakhutdinov and Murray
      link: http://www.mit.edu/~rsalakhu/papers/dbn_ais.pdf

      The MNIST database of handwritten digits (web page)
      Yann LeCun and Corinna Cortes
      link: http://yann.lecun.com/exdb/mnist/
    """
    input_size = 784
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        return np.array([int(f) for f in line.split()])

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'binarized_mnist_' + n + '.amat') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [50000, 10000, 10000]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,)], [dtype], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    SARCOS inverse dynamics dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    input_size = 21
    target_size = 7
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # First 21 tokens are inputs, the remaining 7 are the joint torques.
        fields = line.split()
        inputs = np.array([float(f) for f in fields[:input_size]])
        outputs = np.array([float(f) for f in fields[input_size:]])
        return (inputs, outputs)

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'sarcos_' + n + '.txt') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [40036, 4448, 4449]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (target_size,)], [np.float64, np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'target_size': target_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the CAData (California housing prices) dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """
    input_size = 8
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # Both inputs and the regression target are floats.
        return mlio.libsvm_load_line(line, float, float, sparse=False, input_size=input_size)

    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, 'cadata_' + n + '.libsvm') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [16512, 2064, 2064]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load_data(dir_path, input_size=6, targets=set(['0','1','2','3','4']), train_filename=None, test_filename=None, background_filename=None, load_to_memory=True):
    """
    Loads a dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    # NOTE(review): ``targets`` uses a mutable default argument (a set). It is
    # never mutated here, so this works, but callers should not modify it.
    import mlpython.misc.io as mlio

    # Known metadata
    dir_path = os.path.expanduser(dir_path)

    # Look if the train/valid/test files already exist, if not, load the data and create the files
    train_file, valid_file, finaltrain_file, test_file = [os.path.join(dir_path, ds + '.txt') for ds in ['trainset','validset','finaltrainset','testset']]
    if os.path.exists(train_file):
        print "Train/valid/test files exist, loading data..."
    else:
        print "Train/valid/test file do not exist, creating them..."
        if train_filename is None or test_filename is None:
            # Cannot build the split files without the raw filenames.
            print 'ERROR, NO TRAIN/TEST FILENAMES GIVEN'
            sys.exit(1)
        else:
            # ``create_files`` is defined elsewhere in this module; it writes
            # the trainset/validset/finaltrainset/testset files to dir_path.
            create_files(dir_path,train_filename,test_filename,background_filename,input_size)

    # train/valid/test files should exist by now
    if load_to_memory:
        # Load everything at once; libsvm_load returns (data, ...) so keep [0].
        train_data, valid_data, finaltrain_data, test_data = [mlio.libsvm_load(filename=f, input_size=input_size)[0] for f in [train_file, valid_file, finaltrain_file, test_file]]
    else:
        # Lazily parse one example per line when streaming from disk.
        def load_line(line):
            return mlio.libsvm_load_line(line,input_size=input_size)
        train_data, valid_data, finaltrain_data, test_data = [mlio.load_from_file(filename=f,load_line=load_line) for f in [train_file, valid_file, finaltrain_file, test_file]]

    # Get metadata
    # metadata.txt layout (one value per line): the four split lengths, then
    # the background-set length, then one "label:weight" line per target.
    with open(os.path.join(dir_path,'metadata.txt'),'r') as f:
        train_meta,valid_meta,finaltrain_meta,test_meta = [{'input_size':input_size,'length':int(f.readline()[:-1]),'targets':targets} for i in range(4)]
        test_meta['len_bg'] = int(f.readline()[:-1])
        label_weights = {}
        for _ in range(len(targets)):
            label, weight = f.readline()[:-1].split(':')
            label_weights[label] = float(weight)
        finaltrain_meta['label_weights'] = label_weights

    return {'train':(train_data,train_meta),'valid':(valid_data,valid_meta), 'finaltrain':(finaltrain_data,finaltrain_meta),'test':(test_data,test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the NIPS 0-12 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``

    """
    input_size = 500
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # The last token of each line is bogus and is dropped (don't ask...).
        fields = line.split()
        return np.array([int(f) for f in fields[:-1]])

    prefix = 'nips-0-12_all_shuffled_bidon_target_'
    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, prefix + n + '.amat') for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [400, 100, 1240]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,)], [np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{'input_size': input_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the Corel5k dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    input_size = 499
    target_size = 374
    dir_path = os.path.expanduser(dir_path)

    def convert_target(target_str):
        # Multi-label target: a comma-separated list of active label ids.
        vec = np.zeros((target_size))
        for tok in target_str.split(","):
            vec[int(tok)] = 1
        return vec

    def parse_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target, sparse=False, input_size=input_size)

    names = ["train", "valid", "test"]
    files = [os.path.join(dir_path, "corel5k_" + n + ".libsvm") for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [3600, 900, 500]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (target_size,)], [np.float64, bool], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{"input_size": input_size, "target_size": target_size, "length": n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=True, sparse=False, binary_input=False):
    """
    Loads the RCV2 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been put in binary format, and the vocabulary has been
    restricted to 10000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    """
    input_size = 10000
    target_size = 103
    dir_path = os.path.expanduser(dir_path)

    def convert_target(target_str):
        # Multi-label target: a comma-separated list of active label ids.
        vec = np.zeros((target_size))
        for tok in target_str.split(','):
            vec[int(tok)] = 1
        return vec

    def parse_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target, sparse=sparse, input_size=input_size, input_type=np.int32)

    # Get data
    names = ['train', 'valid', 'test']
    files = [os.path.join(dir_path, n + '.txt') for n in names]
    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    if load_to_memory:
        datasets = [[x for x in d] for d in datasets]
    lengths = [392207, 10000, 402207]

    # Get metadata
    metadata = [{'input_size': input_size, 'targets': set(range(2)), 'target_size': target_size, 'length': n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False):
    """
    Loads the occluded MNIST dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs and targets have been converted to a binary format.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``

    """
    input_size = 784
    target_size = 784
    dir_path = os.path.expanduser(dir_path)

    def parse_line(line):
        # Each line holds the occluded image followed by the full image.
        fields = line.split()
        inputs = np.array([int(f) for f in fields[:input_size]])
        outputs = np.array([int(f) for f in fields[input_size:]])
        return (inputs, outputs)

    names = ["train", "valid", "test"]
    files = [os.path.join(dir_path, "occluded_mnist_" + n + ".txt") for n in names]

    datasets = [mlio.load_from_file(f, parse_line) for f in files]
    lengths = [50000, 10000, 10000]
    if load_to_memory:
        datasets = [mlio.MemoryDataset(d, [(input_size,), (target_size,)], [np.float64, np.float64], n)
                    for d, n in zip(datasets, lengths)]

    metadata = [{"input_size": input_size, "target_size": target_size, "length": n} for n in lengths]
    return dict(zip(names, zip(datasets, metadata)))
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the DNA dataset.

    The data is given by a dictionary mapping from strings 'train',
    'valid' and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'targets'
    - 'length'

    References:
      Tractable Multivariate Binary Density Estimation and the
      Restricted Boltzmann Forest
      Larochelle, Bengio and Turian
      link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

      LIBSVM Data: Classification, Regression, and Multi-label (web page)
      link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """
    input_size = 180
    dir_path = os.path.expanduser(dir_path)

    targets = set([0, 1, 2])
    # LIBSVM labels are '1'/'2'/'3'; remap them to 0-based class ids.
    target_mapping = {'1': 0, '2': 1, '3': 2}

    def convert_target(label):
        return target_mapping[label]

    def load_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target,
                                     sparse=False, input_size=input_size)

    paths = [os.path.join(dir_path, 'dna_scale_' + split + '.libsvm')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [1400, 600, 1186]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [dtype, int], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l, 'targets': targets}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the MajMin dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``
    """
    input_size = 389
    target_size = 96
    dir_path = os.path.expanduser(dir_path)

    def convert_target(target_str):
        # Comma-separated active label ids -> binary indicator vector;
        # an empty string means no active label.
        target_vec = np.zeros((target_size))
        if target_str != '':
            for token in target_str.split(','):
                target_vec[int(token)] = 1
        return target_vec

    def load_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target,
                                     sparse=False, input_size=input_size)

    paths = [os.path.join(dir_path, 'majmin_' + split + '.libsvm')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [1587, 471, 480]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (target_size,)],
                               [np.float64, bool], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'target_size': target_size, 'length': l}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the MNIST background-random dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``
    """
    input_size = 784
    dir_path = os.path.expanduser(dir_path)
    targets = set(range(10))

    # Bug fix: a dead ``convert_target`` helper that referenced an undefined
    # ``target_mapping`` (NameError if ever called) was removed; it was never
    # used since targets are parsed directly in ``load_line``.
    def load_line(line):
        tokens = line.split()
        # Labels are written as floats (e.g. "3.0000") in the .amat files.
        return (np.array([float(i) for i in tokens[:-1]]),
                int(float(tokens[-1])))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, "mnist_background_random_" + ds + ".amat")
        for ds in ["train", "valid", "test"]
    ]

    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line)
                          for f in [train_file, valid_file, test_file]]

    lengths = [10000, 2000, 50000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {"input_size": input_size, "length": l, "targets": targets}
        for l in lengths]

    return {"train": (train, train_meta), "valid": (valid, valid_meta),
            "test": (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the MTurk dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``
    """
    input_size = 389
    target_size = 95
    dir_path = os.path.expanduser(dir_path)

    def convert_target(target_str):
        # Comma-separated active label ids -> binary indicator vector;
        # an empty string means no active label.
        target_vec = np.zeros((target_size))
        if target_str != '':
            for token in target_str.split(','):
                target_vec[int(token)] = 1
        return target_vec

    def load_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target,
                                     sparse=False, input_size=input_size)

    paths = [os.path.join(dir_path, 'mturk_' + split + '.libsvm')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [597, 123, 195]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (target_size,)],
                               [np.float64, bool], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'target_size': target_size, 'length': l}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the RCV1 dataset.

    This is actually a smaller version of it, with 150 inputs and binary
    targets.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``
    """
    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])

    def load_line(line):
        tokens = line.split()
        return (np.array([int(t) for t in tokens[:-1]]), int(tokens[-1]))

    paths = [os.path.join(dir_path,
                          'rcv1_all_subset.binary_' + split + '_voc_150.amat')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l, 'targets': targets}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the Mushrooms dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``
    """
    input_size = 112
    dir_path = os.path.expanduser(dir_path)

    targets = set([0, 1])
    # LIBSVM labels are '1'/'2'; remap them to 0/1.
    target_mapping = {'1': 0, '2': 1}

    def convert_target(label):
        return target_mapping[label]

    def load_line(line):
        return mlio.libsvm_load_line(line, convert_target=convert_target,
                                     sparse=False, input_size=input_size)

    paths = [os.path.join(dir_path, 'mushrooms_' + split + '.libsvm')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [2000, 500, 5624]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l, 'targets': targets}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the 20-newsgroups dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been put in binary format, and the vocabulary has been
    restricted to 5000 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``
    """
    input_size = 5000
    targets = set(range(20))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        # Last token on the line is the class label.
        tokens = line.split()
        return (np.array([float(t) for t in tokens[:-1]]), int(tokens[-1]))

    paths = [os.path.join(dir_path,
                          '20newsgroups_' + split + '_binary_5000_voc.txt')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [9578, 1691, 7505]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l, 'targets': targets}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the MNIST dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been normalized between 0 and 1.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``
    """
    input_size = 784
    targets = set(range(10))
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        # Last token on the line is the class label.
        tokens = line.split()
        return (np.array([float(t) for t in tokens[:-1]]), int(tokens[-1]))

    paths = [os.path.join(dir_path, 'mnist_' + split + '.txt')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [50000, 10000, 10000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l, 'targets': targets}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Corrupted OCR letters dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs and targets are binary.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``
    """
    # 16 characters of (32 pixels + 3 extra values) each.
    input_size = 16 * (32 + 3)
    target_size = 16 * (32 + 3)
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        # First half of the line is the corrupted input, second half the target.
        values = [float(v) for v in line.split()]
        return (np.array(values[:input_size]), np.array(values[input_size:]))

    paths = [os.path.join(dir_path, 'corrupted_ocr_letters_' + split + '.txt')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [10000, 2000, 2000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (target_size,)],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'target_size': target_size, 'length': l}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Labeled Faces in the Wild, occluded faces dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs and targets have been converted to be in the [0,1] interval.

    **Defined metadata:**

    * ``'input_size'``
    * ``'target_size'``
    * ``'length'``
    """
    input_size = 1024
    target_size = 1024
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        # Pixels are stored in [0,255]; rescale them into [0,1].
        values = [float(v) / 255 for v in line.split()]
        return (np.array(values[:input_size]), np.array(values[input_size:]))

    paths = [os.path.join(dir_path, 'occluded_faces_lfw_' + split + '.txt')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [11089, 1149, 1117]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (target_size,)],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'target_size': target_size, 'length': l}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the Rectangles images dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``
    """
    input_size = 784
    dir_path = os.path.expanduser(dir_path)
    targets = set(range(2))

    def load_line(line):
        tokens = line.split()
        return (np.array([float(i) for i in tokens[:-1]]), float(tokens[-1]))

    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'rectangles_images_' + ds + '.amat')
        for ds in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line)
                          for f in [train_file, valid_file, test_file]]

    lengths = [10000, 2000, 50000]
    if load_to_memory:
        # Bug fix: the MemoryDataset wrapping was duplicated, wrapping the
        # already-wrapped datasets a second time; wrap exactly once.
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [np.float64, int], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l, 'targets': targets}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the CAData (California housing prices) dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``
    """
    input_size = 8
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        # Both the features and the (regression) target are parsed as floats.
        return mlio.libsvm_load_line(line, float, float, sparse=False,
                                     input_size=input_size)

    train_file, valid_file, test_file = [
        os.path.join(dir_path, "cadata_" + ds + ".libsvm")
        for ds in ["train", "valid", "test"]
    ]

    # Get data
    train, valid, test = [mlio.load_from_file(f, load_line)
                          for f in [train_file, valid_file, test_file]]

    lengths = [16512, 2064, 2064]
    if load_to_memory:
        # Bug fix: targets are continuous house prices, so the target dtype
        # must be np.float64 — the previous ``int`` truncated them.
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)],
                               [np.float64, np.float64], l)
            for d, l in zip([train, valid, test], lengths)
        ]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {"input_size": input_size, "length": l} for l in lengths]

    return {"train": (train, train_meta), "valid": (valid, valid_meta),
            "test": (test, test_meta)}
def load(dir_path, load_to_memory=True, sparse=False):
    """
    Loads the NIPS abstracts dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    The inputs have been put in binary format, and the vocabulary has been
    restricted to 13649 words.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``
    """
    input_size = 13649
    dir_path = os.path.expanduser(dir_path)

    def load_line(line):
        # Only the input part is kept; the target string is discarded.
        return mlio.libsvm_load_line(line, convert_target=str, sparse=sparse,
                                     input_size=input_size,
                                     input_type=np.int32)[0]

    # Get data
    paths = [os.path.join(dir_path, split + '.txt')
             for split in ['train', 'valid', 'test']]
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]
    if load_to_memory:
        train, valid, test = [list(d) for d in [train, valid, test]]

    lengths = [1640, 50, 50]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l} for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the 32 x 32 pixels version of the Street View House Numbers
    (SVHN) dataset.

    The original 32 x 32 pixels dataset is in color, but is converted to
    grayscale in [0,1] by this module. The original training set is split
    into a new training set and a validation set, and the extra labeled
    examples of the original dataset are added to the training set.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``
    """
    input_size = 1024
    dir_path = os.path.expanduser(dir_path)

    def to_grayscale_normalized(example):
        # Luminance-weighted RGB mix, rescaled from [0,255] into [0,1].
        pixels, label = example
        red = pixels[:input_size]
        green = pixels[input_size:(2 * input_size)]
        blue = pixels[(2 * input_size):(3 * input_size)]
        gray = (red * 0.3 + green * 0.59 + blue * 0.11) / 255.
        return (gray, label)

    class TransformedIterator:
        # Applies ``transform`` on the fly while iterating over ``iter``.
        def __init__(self, iter, transform):
            self.iter = iter
            self.transform = transform

        def __iter__(self):
            for ex in self.iter:
                yield self.transform(ex)

    field_ranges = ((0, input_size), (input_size, input_size + 1))

    if load_to_memory:
        def load_split(name):
            inputs = np.load(
                os.path.join(dir_path, name + '_inputs_32x32.npy'))
            labels = np.load(
                os.path.join(dir_path, name + '_targets_32x32.npy'))
            combined = np.hstack([inputs, labels.reshape(-1, 1)])
            return TransformedIterator(
                mlio.IteratorWithFields(combined, field_ranges),
                to_grayscale_normalized)

        train = load_split('train')
        valid = load_split('valid')
        test = load_split('test')
    else:
        def load_line(line):
            tokens = line.split()
            return (np.array([float(t) for t in tokens[:-1]]),
                    int(tokens[-1]))

        train, valid, test = [
            TransformedIterator(
                mlio.load_from_file(
                    os.path.join(dir_path, split + '_32x32.txt'), load_line),
                to_grayscale_normalized)
            for split in ['train', 'valid', 'test']]

    # Get metadata
    lengths = [594388, 10000, 26032]
    targets = set(range(1, 11))
    # Map original labels 1..10 onto contiguous class ids 0..9.
    class_to_id = {}
    for t in range(10):
        class_to_id[t + 1] = t
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l,
         'targets': targets, 'class_to_id': class_to_id}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the 32 x 32 pixels version of the Street View House Numbers
    (SVHN) dataset.

    The original 32 x 32 pixels dataset is in color, but is converted to
    grayscale in [0,1] by this module. The original training set is split
    into a new training set and a validation set, and the extra labeled
    examples of the original dataset are added to the training set.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``
    """
    input_size = 1024
    dir_path = os.path.expanduser(dir_path)

    def to_grayscale_normalized(example):
        # Weighted RGB -> luminance conversion, rescaled into [0,1].
        rgb, label = example
        gray = (rgb[:input_size] * 0.3
                + rgb[input_size:(2 * input_size)] * 0.59
                + rgb[(2 * input_size):(3 * input_size)] * 0.11) / 255.
        return (gray, label)

    class TransformedIterator:
        # Lazily applies ``transform`` to every example of ``iter``.
        def __init__(self, iter, transform):
            self.iter = iter
            self.transform = transform

        def __iter__(self):
            for example in self.iter:
                yield self.transform(example)

    if load_to_memory:
        arrays = {}
        for part in ('train', 'valid', 'test'):
            for field in ('inputs', 'targets'):
                filename = part + '_' + field + '_32x32.npy'
                arrays[(part, field)] = np.load(
                    os.path.join(dir_path, filename))

        fields = ((0, input_size), (input_size, input_size + 1))

        def make_split(part):
            stacked = np.hstack([arrays[(part, 'inputs')],
                                 arrays[(part, 'targets')].reshape(-1, 1)])
            return TransformedIterator(
                mlio.IteratorWithFields(stacked, fields),
                to_grayscale_normalized)

        train, valid, test = [make_split(p)
                              for p in ('train', 'valid', 'test')]
    else:
        def load_line(line):
            tokens = line.split()
            return (np.array([float(t) for t in tokens[:-1]]),
                    int(tokens[-1]))

        paths = [os.path.join(dir_path, part + '_32x32.txt')
                 for part in ('train', 'valid', 'test')]
        train, valid, test = [
            TransformedIterator(mlio.load_from_file(p, load_line),
                                to_grayscale_normalized)
            for p in paths]

    # Get metadata
    lengths = [594388, 10000, 26032]
    targets = set(range(1, 11))
    # Map original labels 1..10 onto contiguous class ids 0..9.
    class_to_id = dict((t + 1, t) for t in range(10))
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l,
         'targets': targets, 'class_to_id': class_to_id}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False, dtype=np.float64):
    """
    Loads the RCV1 dataset.

    This is actually a smaller version of it, with 150 inputs and binary
    targets.

    The data is given by a dictionary mapping from strings 'train',
    'valid' and 'test' to the associated pair of data and metadata.

    Defined metadata:
    - 'input_size'
    - 'targets'
    - 'length'

    References:
      Tractable Multivariate Binary Density Estimation and the
      Restricted Boltzmann Forest
      Larochelle, Bengio and Turian
      link: http://www.cs.toronto.edu/~larocheh/publications/NECO-10-09-1100R2-PDF.pdf

      LIBSVM Data: Classification, Regression, and Multi-label (web page)
      link: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
    """
    input_size = 150
    dir_path = os.path.expanduser(dir_path)
    targets = set([0, 1])

    def load_line(line):
        tokens = line.split()
        return (np.array([int(t) for t in tokens[:-1]]), int(tokens[-1]))

    paths = [os.path.join(dir_path,
                          'rcv1_all_subset.binary_' + split + '_voc_150.amat')
             for split in ['train', 'valid', 'test']]

    # Get data
    train, valid, test = [mlio.load_from_file(p, load_line) for p in paths]

    lengths = [40000, 10000, 150000]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size,), (1,)], [dtype, int], l)
            for d, l in zip([train, valid, test], lengths)]

    # Get metadata
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l, 'targets': targets}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the CIFAR-10 dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``
    """
    input_size = 3072
    dir_path = os.path.expanduser(dir_path)

    if load_to_memory:
        # Batches 1-4 form the training set, batch 5 the validation set.
        batches = [mlio.load(os.path.join(dir_path, 'data_batch_' + str(i)))
                   for i in range(1, 6)]
        testbatch = mlio.load(os.path.join(dir_path, 'test_batch'))

        fields = ((0, input_size), (input_size, input_size + 1))

        def as_dataset(data, labels):
            return mlio.IteratorWithFields(
                np.hstack([data, labels.reshape(-1, 1)]), fields)

        train = as_dataset(
            np.vstack([b['data'] for b in batches[:4]]),
            np.hstack([b['labels'] for b in batches[:4]]))
        valid = as_dataset(batches[4]['data'],
                           np.array(batches[4]['labels']))
        test = as_dataset(testbatch['data'],
                          np.array(testbatch['labels']))
    else:
        def load_line(line):
            tokens = line.split()
            return (np.array([int(t) for t in tokens[:-1]]),
                    int(tokens[-1]))

        paths = [os.path.join(dir_path, 'cifar-10-' + split + '.txt')
                 for split in ['train', 'valid', 'test']]

        # Get data
        train, valid, test = [mlio.load_from_file(p, load_line)
                              for p in paths]

    # Get metadata
    lengths = [40000, 10000, 10000]
    other_meta = mlio.load(os.path.join(dir_path, 'batches.meta'))
    label_names = other_meta['label_names']
    targets = set(label_names)
    class_to_id = dict((name, idx) for idx, name in enumerate(label_names))
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l,
         'targets': targets, 'class_to_id': class_to_id}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False):
    """
    Loads the CIFAR-10 dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``
    """
    input_size = 3072
    dir_path = os.path.expanduser(dir_path)

    if load_to_memory:
        # Batches 1-4 form the training set, batch 5 the validation set.
        data_batches = []
        for i in range(1, 6):
            data_batches.append(
                mlio.load(os.path.join(dir_path, 'data_batch_' + str(i))))
        testbatch = mlio.load(os.path.join(dir_path, 'test_batch'))

        field_spans = ((0, input_size), (input_size, input_size + 1))

        train_inputs = np.vstack([b['data'] for b in data_batches[:4]])
        train_labels = np.hstack([b['labels'] for b in data_batches[:4]])
        train = mlio.IteratorWithFields(
            np.hstack([train_inputs, train_labels.reshape(-1, 1)]),
            field_spans)

        valid_labels = np.array(data_batches[4]['labels'])
        valid = mlio.IteratorWithFields(
            np.hstack([data_batches[4]['data'],
                       valid_labels.reshape(-1, 1)]),
            field_spans)

        test_labels = np.array(testbatch['labels'])
        test = mlio.IteratorWithFields(
            np.hstack([testbatch['data'], test_labels.reshape(-1, 1)]),
            field_spans)
    else:
        def load_line(line):
            tokens = line.split()
            return (np.array([int(t) for t in tokens[:-1]]),
                    int(tokens[-1]))

        paths = [os.path.join(dir_path, 'cifar-10-' + split + '.txt')
                 for split in ['train', 'valid', 'test']]

        # Get data
        train, valid, test = [mlio.load_from_file(p, load_line)
                              for p in paths]

    # Get metadata
    lengths = [40000, 10000, 10000]
    other_meta = mlio.load(os.path.join(dir_path, 'batches.meta'))
    label_names = other_meta['label_names']
    targets = set(label_names)
    class_to_id = {}
    for idx, name in enumerate(label_names):
        class_to_id[name] = idx
    train_meta, valid_meta, test_meta = [
        {'input_size': input_size, 'length': l,
         'targets': targets, 'class_to_id': class_to_id}
        for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
def load(dir_path, load_to_memory=False, home_made_valid_split=False):
    """
    Loads the Yahoo! Learning to Rank Challenge, Set 2 data.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    Option ``home_made_valid_split`` determines whether the original
    training set should be further split into a "home made" train/valid
    split (default: False). If True, the dictionary mapping will contain
    4 keys instead of 3: ``'train'`` (home made training set), ``'valid'``
    (home made validation set), ``'test'`` (original validation set) and
    ``'test2'`` (original test set).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'n_pairs'``
    * ``'length'``
    """
    input_size = 700
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    def convert(feature, value):
        # Only the query id feature is expected besides the inputs.
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    def build_meta(n_queries, lengths):
        return [{'input_size': input_size, 'scores': range(5),
                 'n_queries': nq, 'length': l, 'n_pairs': l}
                for nq, l in zip(n_queries, lengths)]

    def wrap_in_memory(datasets, lengths):
        return [mlio.MemoryDataset(d, [(input_size,), (1,), (1,)],
                                   [np.float64, int, int], l)
                for d, l in zip(datasets, lengths)]

    if home_made_valid_split:
        n_queries = [1000, 266, 1266, 3798]
        lengths = [27244, 7571, 34881, 103174]
        splits = ['in_house_train', 'in_house_valid', 'valid', 'test']
        paths = [os.path.join(dir_path, 'set2.' + s + '.txt')
                 for s in splits]

        # Get data
        datasets = [mlio.load_from_file(p, load_line) for p in paths]
        if load_to_memory:
            datasets = wrap_in_memory(datasets, lengths)
        train, valid, test, test2 = datasets

        # Get metadata
        train_meta, valid_meta, test_meta, test2_meta = \
            build_meta(n_queries, lengths)

        return {'train': (train, train_meta),
                'valid': (valid, valid_meta),
                'test': (test, test_meta),
                'test2': (test2, test2_meta)}
    else:
        n_queries = [1266, 1266, 3798]
        lengths = [34815, 34881, 103174]

        # Get data file paths
        paths = [os.path.join(dir_path, 'set2.' + s + '.txt')
                 for s in ['train', 'valid', 'test']]

        # Get data
        datasets = [mlio.load_from_file(p, load_line) for p in paths]
        if load_to_memory:
            datasets = wrap_in_memory(datasets, lengths)
        train, valid, test = datasets

        train_meta, valid_meta, test_meta = build_meta(n_queries, lengths)

        return {'train': (train, train_meta),
                'valid': (valid, valid_meta),
                'test': (test, test_meta)}
def load(dir_path, load_to_memory=False, fold=1):
    """
    Loads the LETOR 4.0 MQ2008 dataset.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    This dataset comes with 5 predefined folds, which can be specified
    with option ``fold`` (default = 1).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'length'``

    :raises ValueError: if ``fold`` is not an integer between 1 and 5.
    """
    input_size = 46
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    if fold not in [1, 2, 3, 4, 5]:
        # Bug fix: previously raised ``error(...)``, an undefined name that
        # produced a NameError instead of the intended exception.
        raise ValueError(
            'There are 5 predefined folds. Option fold should be an integer between 1 and 5'
        )

    def convert(feature, value):
        # Only the query id feature is expected besides the inputs.
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    # Per-fold (train, valid, test) query counts and example counts.
    n_queries = [[471, 157, 156], [471, 156, 157], [470, 157, 157],
                 [470, 157, 157], [470, 157, 157]]

    lengths = [[9630, 2707, 2874], [9404, 2874, 2933], [8643, 2933, 3635],
               [8514, 3635, 3062], [9442, 3062, 2707]]

    # Get data file paths
    train_file, valid_file, test_file = [
        os.path.join(dir_path, 'MQ2008/Fold' + str(fold) + '/' + ds + '.txt')
        for ds in ['train', 'vali', 'test']
    ]

    # Get data
    train, valid, test = [
        mlio.load_from_file(f, load_line)
        for f in [train_file, valid_file, test_file]
    ]
    if load_to_memory:
        train, valid, test = [
            mlio.MemoryDataset(d, [(input_size, ), (1, ), (1, )],
                               [np.float64, int, int], l)
            for d, l in zip([train, valid, test], lengths[fold - 1])
        ]

    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'scores': range(3),
        'n_queries': nq,
        'length': l,
        'n_pairs': l
    } for nq, l in zip(n_queries[fold - 1], lengths[fold - 1])]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
def load_data(dir_path,
              input_size=6,
              targets=set(['0', '1', '2', '3', '4']),
              train_filename=None,
              test_filename=None,
              background_filename=None,
              load_to_memory=True):
    """
    Loads a dataset stored in LIBSVM format, creating the
    trainset/validset/finaltrainset/testset split files on first use.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'``, ``'finaltrain'`` and ``'test'`` to the associated pair
    of data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'targets'``
    * ``'length'``

    Additionally, ``test`` metadata gets ``'len_bg'`` and ``finaltrain``
    metadata gets ``'label_weights'``, both read from ``metadata.txt``.
    """
    import mlpython.misc.io as mlio

    # NOTE(review): the default `targets` is a shared mutable set stored
    # directly into the returned metadata dicts; callers should not mutate it.
    dir_path = os.path.expanduser(dir_path)

    # Look if the train/valid/test files already exist; if not, create them
    # from the raw train/test (and optional background) files.
    train_file, valid_file, finaltrain_file, test_file = [
        os.path.join(dir_path, ds + '.txt')
        for ds in ['trainset', 'validset', 'finaltrainset', 'testset']
    ]

    if os.path.exists(train_file):
        print("Train/valid/test files exist, loading data...")
    else:
        print("Train/valid/test file do not exist, creating them...")
        if train_filename is None or test_filename is None:
            # Fixed: raise instead of print + sys.exit(1) — a library
            # function should not terminate the interpreter on bad arguments.
            raise ValueError('train_filename and test_filename must be given '
                             'when the split files do not exist yet')
        create_files(dir_path, train_filename, test_filename,
                     background_filename, input_size)

    # The split files exist by now; load them eagerly or lazily.
    if load_to_memory:
        train_data, valid_data, finaltrain_data, test_data = [
            mlio.libsvm_load(filename=f, input_size=input_size)[0]
            for f in [train_file, valid_file, finaltrain_file, test_file]
        ]
    else:
        def load_line(line):
            return mlio.libsvm_load_line(line, input_size=input_size)

        train_data, valid_data, finaltrain_data, test_data = [
            mlio.load_from_file(filename=f, load_line=load_line)
            for f in [train_file, valid_file, finaltrain_file, test_file]
        ]

    # Get metadata: metadata.txt holds four split lengths (train, valid,
    # finaltrain, test), then the background-set length, then one
    # "label:weight" line per target.
    with open(os.path.join(dir_path, 'metadata.txt'), 'r') as f:
        # Fixed: use .strip() instead of [:-1] so the values parse
        # correctly even when a line has no trailing newline.
        train_meta, valid_meta, finaltrain_meta, test_meta = [{
            'input_size': input_size,
            'length': int(f.readline().strip()),
            'targets': targets
        } for i in range(4)]
        test_meta['len_bg'] = int(f.readline().strip())
        label_weights = {}
        for _ in range(len(targets)):
            label, weight = f.readline().strip().split(':')
            label_weights[label] = float(weight)
        finaltrain_meta['label_weights'] = label_weights

    return {
        'train': (train_data, train_meta),
        'valid': (valid_data, valid_meta),
        'finaltrain': (finaltrain_data, finaltrain_meta),
        'test': (test_data, test_meta)
    }
def load(dir_path, load_to_memory=False, home_made_valid_split=False):
    """
    Loads the Yahoo! Learning to Rank Challenge, Set 2 data.

    The data is given by a dictionary mapping from strings ``'train'``,
    ``'valid'`` and ``'test'`` to the associated pair of data and metadata.

    Option ``home_made_valid_split`` determines whether the original
    training set should be further split into a "home made" train/valid
    split (default: False).  If True, the dictionary mapping will contain
    4 keys instead of 3: ``'train'`` (home made training set), ``'valid'``
    (home made validation set), ``'test'`` (original validation set) and
    ``'test2'`` (original test set).

    **Defined metadata:**

    * ``'input_size'``
    * ``'scores'``
    * ``'n_queries'``
    * ``'n_pairs'``
    * ``'length'``
    """
    input_size = 700
    dir_path = os.path.expanduser(dir_path)
    sparse = False

    def convert(feature, value):
        # The only non-input feature expected in these files is the query id.
        if feature != 'qid':
            raise ValueError('Unexpected feature')
        return int(value)

    def load_line(line):
        return mlio.libsvm_load_line(line, convert, int, sparse, input_size)

    # The two modes differ only in which files are read, the returned keys
    # and the hard-coded per-set statistics; everything else is shared below
    # (the original duplicated the whole pipeline in both branches).
    if home_made_valid_split:
        set_keys = ['train', 'valid', 'test', 'test2']
        file_ids = ['in_house_train', 'in_house_valid', 'valid', 'test']
        n_queries = [1000, 266, 1266, 3798]
        lengths = [27244, 7571, 34881, 103174]
    else:
        set_keys = ['train', 'valid', 'test']
        file_ids = ['train', 'valid', 'test']
        n_queries = [1266, 1266, 3798]
        lengths = [34815, 34881, 103174]

    # Get data file paths
    paths = [os.path.join(dir_path, 'set2.' + ds + '.txt') for ds in file_ids]

    # Get data
    datasets = [mlio.load_from_file(f, load_line) for f in paths]

    if load_to_memory:
        datasets = [
            mlio.MemoryDataset(d, [(input_size,), (1,), (1,)],
                               [np.float64, int, int], l)
            for d, l in zip(datasets, lengths)
        ]

    # Get metadata
    metas = [{
        'input_size': input_size,
        'scores': range(5),
        'n_queries': nq,
        'length': l,
        'n_pairs': l
    } for nq, l in zip(n_queries, lengths)]

    return dict(zip(set_keys, zip(datasets, metas)))