import sys
import numpy as np
import mlpython.misc.io as mlio
import mlpython.learners.topic_modeling


def load(dataFile, input_size, load_to_memory=False):
    """
    Loads a set of documents stored as one example per line.
    """

    def load_line(line):
        # NOTE: the original fragment does not define load_line; a dense
        # integer-vector parser (one vector per line) is assumed here.
        return np.array([int(i) for i in line.split()])

    # Get data
    data = mlio.load_from_file(dataFile, load_line)
    if load_to_memory:
        data = [x for x in data]
        length = len(data)
    else:
        # Count the examples without loading them all in memory.
        length = 0
        stream = open(dataFile)
        for l in stream:
            length += 1
        stream.close()

    # Get metadata
    data_meta = {'input_size': input_size, 'length': length}
    return (data, data_meta)


best_model = mlio.load(sys.argv[1])  # first argument: pickled model file
print best_model.__class__
if best_model.__class__ == mlpython.learners.topic_modeling.DocNADE:
    inputSize = best_model.voc_size
else:
    print "Model object is not supported"
    sys.exit()

# Print the higher representation (hidden units) for each example.
data, metadata = load(sys.argv[2], inputSize)  # second argument: data file
for input in data:
    print " ".join(["%.8f" % i
                    for i in best_model.compute_document_representation(input)])
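# Usage sketch (an assumption, not part of the original fragment): saved as a
# script, the code above would be run with a pickled DocNADE model and a
# document file as its two arguments. The file names below are hypothetical:
#
#   python compute_document_representation.py trained_docnade.pkl documents.txt > representations.txt
#
# Each output line then contains the hidden-layer representation of one
# document, as space-separated floats with 8 decimal places.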
import os
import numpy as np
import mlpython.misc.io as mlio


def load(dir_path, load_to_memory=False):
    """
    Loads the CIFAR-10 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of
    data and metadata.

    **Defined metadata:**

    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``

    """

    input_size = 3072
    dir_path = os.path.expanduser(dir_path)

    if load_to_memory:
        # Load the pickled batches and keep everything as NumPy arrays.
        batch1 = mlio.load(os.path.join(dir_path, 'data_batch_1'))
        batch2 = mlio.load(os.path.join(dir_path, 'data_batch_2'))
        batch3 = mlio.load(os.path.join(dir_path, 'data_batch_3'))
        batch4 = mlio.load(os.path.join(dir_path, 'data_batch_4'))
        batch5 = mlio.load(os.path.join(dir_path, 'data_batch_5'))
        testbatch = mlio.load(os.path.join(dir_path, 'test_batch'))

        # Batches 1-4 form the training set, batch 5 the validation set.
        train_data = np.vstack([batch1['data'], batch2['data'],
                                batch3['data'], batch4['data']])
        train_labels = np.hstack([batch1['labels'], batch2['labels'],
                                  batch3['labels'], batch4['labels']])
        train = mlio.IteratorWithFields(
            np.hstack([train_data, train_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

        valid_data = batch5['data']
        valid_labels = np.array(batch5['labels'])
        valid = mlio.IteratorWithFields(
            np.hstack([valid_data, valid_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

        test_data = testbatch['data']
        test_labels = np.array(testbatch['labels'])
        test = mlio.IteratorWithFields(
            np.hstack([test_data, test_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

    else:
        # Stream the examples from the text files written by obtain().
        def load_line(line):
            tokens = line.split()
            return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

        train_file, valid_file, test_file = [
            os.path.join(dir_path, 'cifar-10-' + ds + '.txt')
            for ds in ['train', 'valid', 'test']
        ]

        # Get data
        train, valid, test = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file]
        ]

    # Get metadata
    lengths = [40000, 10000, 10000]
    other_meta = mlio.load(os.path.join(dir_path, 'batches.meta'))
    label_names = other_meta['label_names']
    targets = set(label_names)
    class_to_id = {}
    for i, c in enumerate(label_names):
        class_to_id[c] = i
    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets,
        'class_to_id': class_to_id
    } for l in lengths]

    return {'train': (train, train_meta),
            'valid': (valid, valid_meta),
            'test': (test, test_meta)}
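# Usage sketch for load() (a hedged example; the module path
# 'mlpython.datasets.cifar10' and the data directory are assumptions):
#
#   import mlpython.datasets.cifar10 as cifar10
#   datasets = cifar10.load('~/data/cifar10')   # streams from the text files
#   train, train_meta = datasets['train']
#   print train_meta['length']                  # 40000
#   for input, label in train:                  # input: 3072 raw pixel values
#       pass
#
# With load_to_memory=True, the same loop runs over NumPy rows held in memory
# instead of lines parsed from disk on every pass.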
def obtain(dir_path):
    """
    Downloads the dataset to ``dir_path``.
    """

    dir_path = os.path.expanduser(dir_path)
    print 'Downloading the dataset'
    import urllib
    urllib.urlretrieve(
        'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
        os.path.join(dir_path, 'cifar-10-python.tar.gz'))

    print 'Extracting the dataset (this could take a while)'
    import tarfile
    tf = tarfile.open(os.path.join(dir_path, 'cifar-10-python.tar.gz'))
    tf.extractall(dir_path)
    tf.close()

    # Flatten the extracted directory into dir_path.
    import shutil
    for f in ['data_batch_1', 'data_batch_2', 'data_batch_3',
              'data_batch_4', 'data_batch_5', 'test_batch',
              'readme.html', 'batches.meta']:
        shutil.move(os.path.join(dir_path, 'cifar-10-batches-py', f), dir_path)
    os.rmdir(os.path.join(dir_path, 'cifar-10-batches-py'))

    # Putting stuff in ASCII files to enable not loading in memory
    batch1 = mlio.load(os.path.join(dir_path, 'data_batch_1'))
    batch2 = mlio.load(os.path.join(dir_path, 'data_batch_2'))
    batch3 = mlio.load(os.path.join(dir_path, 'data_batch_3'))
    batch4 = mlio.load(os.path.join(dir_path, 'data_batch_4'))
    batch5 = mlio.load(os.path.join(dir_path, 'data_batch_5'))
    testbatch = mlio.load(os.path.join(dir_path, 'test_batch'))

    # Batches 1-4 become the training set, batch 5 the validation set.
    train_data = np.vstack([batch1['data'], batch2['data'],
                            batch3['data'], batch4['data']])
    train_labels = np.hstack([batch1['labels'], batch2['labels'],
                              batch3['labels'], batch4['labels']])
    valid_data = batch5['data']
    valid_labels = batch5['labels']
    test_data = testbatch['data']
    test_labels = testbatch['labels']

    def write_to_file(data, labels, filename):
        # One example per line: 3072 pixel values followed by the label.
        f = open(os.path.join(dir_path, filename), 'w')
        for input, label in zip(data, labels):
            f.write(' '.join([str(xi) for xi in input]) +
                    ' ' + str(label) + '\n')
        f.close()

    write_to_file(train_data, train_labels, 'cifar-10-train.txt')
    write_to_file(valid_data, valid_labels, 'cifar-10-valid.txt')
    write_to_file(test_data, test_labels, 'cifar-10-test.txt')
    print 'Done'
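# Usage sketch for obtain() (hedged; the module path and directory name are
# assumptions): it is meant to be called once, before the first call to
# load() on that directory.
#
#   import mlpython.datasets.cifar10 as cifar10
#   cifar10.obtain('~/data/cifar10')   # download, extract, write the .txt files
#
# The text files written above use one example per line: 3072 integer pixel
# values (32x32 pixels x 3 channels) followed by the integer class label,
# which is exactly the format that load_line() in load() parses back.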