Example #1
import os
import shutil
import tarfile

import numpy as np
import mlpython.misc.io as mlio

try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2


def obtain(dir_path):
    """
    Downloads the dataset to ``dir_path``.
    """

    dir_path = os.path.expanduser(dir_path)
    print('Downloading the dataset')
    urlretrieve('http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
                os.path.join(dir_path, 'cifar-10-python.tar.gz'))
    print('Extracting the dataset (this could take a while)')
    tf = tarfile.open(os.path.join(dir_path, 'cifar-10-python.tar.gz'))
    tf.extractall(dir_path)
    tf.close()
    # Flatten the extracted 'cifar-10-batches-py' directory into dir_path.
    for name in ['data_batch_1', 'data_batch_2', 'data_batch_3',
                 'data_batch_4', 'data_batch_5', 'test_batch',
                 'readme.html', 'batches.meta']:
        shutil.move(os.path.join(dir_path, 'cifar-10-batches-py', name),
                    dir_path)
    os.rmdir(os.path.join(dir_path, 'cifar-10-batches-py'))

    # Write the data to plain-text files so that ``load`` can later stream
    # examples from disk instead of holding everything in memory.
    batch1 = mlio.load(os.path.join(dir_path, 'data_batch_1'))
    batch2 = mlio.load(os.path.join(dir_path, 'data_batch_2'))
    batch3 = mlio.load(os.path.join(dir_path, 'data_batch_3'))
    batch4 = mlio.load(os.path.join(dir_path, 'data_batch_4'))
    batch5 = mlio.load(os.path.join(dir_path, 'data_batch_5'))
    testbatch = mlio.load(os.path.join(dir_path, 'test_batch'))

    # Batches 1-4 form the training set; batch 5 is kept for validation.
    train_data = np.vstack([batch1['data'], batch2['data'],
                            batch3['data'], batch4['data']])
    train_labels = np.hstack([batch1['labels'], batch2['labels'],
                              batch3['labels'], batch4['labels']])
    valid_data = batch5['data']
    valid_labels = batch5['labels']
    test_data = testbatch['data']
    test_labels = testbatch['labels']

    def write_to_file(data, labels, filename):
        # One example per line: the 3072 pixel values, then the class label.
        with open(os.path.join(dir_path, filename), 'w') as f:
            for example, label in zip(data, labels):
                f.write(' '.join([str(xi) for xi in example]) +
                        ' ' + str(label) + '\n')

    write_to_file(train_data, train_labels, 'cifar-10-train.txt')
    write_to_file(valid_data, valid_labels, 'cifar-10-valid.txt')
    write_to_file(test_data, test_labels, 'cifar-10-test.txt')

    print('Done')
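
A minimal usage sketch, assuming the ``obtain`` function above is importable; the target directory is a hypothetical placeholder:

import os

data_dir = os.path.expanduser('~/data/cifar10')  # hypothetical location
if not os.path.exists(data_dir):
    os.makedirs(data_dir)  # obtain() expects the directory to exist
obtain(data_dir)

# The call should leave, among other files:
#   cifar-10-train.txt, cifar-10-valid.txt, cifar-10-test.txt
print(sorted(os.listdir(data_dir)))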
Example #2
import sys

import numpy as np
import mlpython.misc.io as mlio
import mlpython.learners.topic_modeling

# The original snippet starts mid-function; the signature and the
# ``load_line`` parser below are reconstructed assumptions.
def load(dataFile, input_size, load_to_memory=True):
    def load_line(line):
        # Assumed format: one whitespace-separated integer vector per line.
        return np.array([int(i) for i in line.split()])

    # Get data
    data = mlio.load_from_file(dataFile, load_line)
    if load_to_memory:
        data = [x for x in data]
        length = len(data)
    else:
        # Count the examples without keeping them in memory.
        length = 0
        stream = open(dataFile)
        for l in stream:
            length += 1
        stream.close()

    # Get metadata
    data_meta = {'input_size': input_size, 'length': length}

    return (data, data_meta)


# The original read the model from sys.argv[0], which is the script's own
# path; the model file and data file are taken from argv[1] and argv[2].
best_model = mlio.load(sys.argv[1])
print(best_model.__class__)
if best_model.__class__ == mlpython.learners.topic_modeling.DocNADE:
    inputSize = best_model.voc_size
else:
    print("Model object is not supported")
    sys.exit()

# Print the higher representation (hidden units) for each example.
data, metadata = load(sys.argv[2], inputSize)
for doc in data:
    print(" ".join(["%.8f" % i
                    for i in best_model.compute_document_representation(doc)]))
Example #4
import os

import numpy as np
import mlpython.misc.io as mlio


def load(dir_path, load_to_memory=False):
    """
    Loads the CIFAR-10 dataset.

    The data is given by a dictionary mapping from strings
    ``'train'``, ``'valid'`` and ``'test'`` to the associated pair of data and metadata.
    
    **Defined metadata:**
    
    * ``'input_size'``
    * ``'length'``
    * ``'targets'``
    * ``'class_to_id'``

    """

    input_size = 3072
    dir_path = os.path.expanduser(dir_path)
    if load_to_memory:
        batch1 = mlio.load(os.path.join(dir_path, 'data_batch_1'))
        batch2 = mlio.load(os.path.join(dir_path, 'data_batch_2'))
        batch3 = mlio.load(os.path.join(dir_path, 'data_batch_3'))
        batch4 = mlio.load(os.path.join(dir_path, 'data_batch_4'))
        batch5 = mlio.load(os.path.join(dir_path, 'data_batch_5'))
        testbatch = mlio.load(os.path.join(dir_path, 'test_batch'))

        train_data = np.vstack(
            [batch1['data'], batch2['data'], batch3['data'], batch4['data']])
        train_labels = np.hstack([
            batch1['labels'], batch2['labels'], batch3['labels'],
            batch4['labels']
        ])
        train = mlio.IteratorWithFields(
            np.hstack([train_data, train_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

        valid_data = batch5['data']
        valid_labels = np.array(batch5['labels'])
        valid = mlio.IteratorWithFields(
            np.hstack([valid_data, valid_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

        test_data = testbatch['data']
        test_labels = np.array(testbatch['labels'])
        test = mlio.IteratorWithFields(
            np.hstack([test_data, test_labels.reshape(-1, 1)]),
            ((0, input_size), (input_size, input_size + 1)))

    else:

        def load_line(line):
            tokens = line.split()
            return (np.array([int(i) for i in tokens[:-1]]), int(tokens[-1]))

        train_file, valid_file, test_file = [
            os.path.join(dir_path, 'cifar-10-' + ds + '.txt')
            for ds in ['train', 'valid', 'test']
        ]
        # Get data
        train, valid, test = [
            mlio.load_from_file(f, load_line)
            for f in [train_file, valid_file, test_file]
        ]

    # Get metadata
    lengths = [40000, 10000, 10000]
    other_meta = mlio.load(os.path.join(dir_path, 'batches.meta'))
    label_names = other_meta['label_names']
    targets = set(label_names)
    class_to_id = {}
    for i, c in enumerate(label_names):
        class_to_id[c] = i

    train_meta, valid_meta, test_meta = [{
        'input_size': input_size,
        'length': l,
        'targets': targets,
        'class_to_id': class_to_id
    } for l in lengths]

    return {
        'train': (train, train_meta),
        'valid': (valid, valid_meta),
        'test': (test, test_meta)
    }
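
For reference, a minimal sketch of how ``obtain`` and ``load`` fit together; the directory is a hypothetical placeholder and ``obtain`` is assumed to have already run:

import os

data_dir = os.path.expanduser('~/data/cifar10')  # hypothetical location
datasets = load(data_dir, load_to_memory=True)

train, train_meta = datasets['train']
print(train_meta['input_size'], train_meta['length'])  # 3072 40000
print(sorted(train_meta['targets']))  # the ten CIFAR-10 class names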