Example #1
from pylearn2.datasets import preprocessing
from pylearn2.datasets.cifar10 import CIFAR10
from pylearn2.utils import serial, string_utils
import textwrap


def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10')

    print('Loading CIFAR-10 train dataset...')
    train = CIFAR10(which_set='train')

    print("Preparing output directory...")
    output_dir = data_dir + '/pylearn2_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(
        textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    train.pkl and test.pkl each contain
    a pylearn2 Dataset object defining a labeled
    dataset of a 32x32 approximately whitened version of the CIFAR-10
    dataset. train.pkl contains labeled train examples. test.pkl
    contains labeled test examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used
    to approximately whiten the images. You may want to use this
    object later to preprocess other images.

    They were created with the pylearn2 script make_cifar10_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Learning the preprocessor and preprocessing \
          the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the unsupervised data')
    train.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', train)

    print("Loading the test data")
    test = CIFAR10(which_set='test')

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
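
A minimal reload sketch, assuming the script above has been run and PYLEARN2_DATA_PATH is set:

from pylearn2.utils import serial, string_utils

# load the artifacts written by the script above back into memory
data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10')
train = serial.load(data_dir + '/pylearn2_whitened/train.pkl')
zca = serial.load(data_dir + '/pylearn2_whitened/preprocessor.pkl')
print(train.X.shape)  # (50000, 3072): 50000 examples of 32*32*3 pixels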
Example #2
def get_labels_and_fold_indices(cifar10, cifar100, stl10):
    assert stl10 or cifar10 or cifar100
    assert stl10+cifar10+cifar100 == 1

    if stl10:
        print 'loading entire stl-10 train set just to get the labels and folds'
        stl10 = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/train.pkl")
        train_y = stl10.y

        fold_indices = stl10.fold_indices
    elif cifar10 or cifar100:
        if cifar10:
            print 'loading entire cifar10 train set just to get the labels'
            cifar = CIFAR10(which_set = 'train')
        else:
            assert cifar100
            print 'loading entire cifar100 train set just to get the labels'
            cifar = CIFAR100(which_set = 'train')
            cifar.y = cifar.y_fine
        train_y = cifar.y
        assert train_y is not None

        fold_indices = np.zeros((5,40000),dtype='uint16')
        idx_list = np.cast['uint16'](np.arange(1,50001)) #mimic matlab format of stl10
        for i in xrange(5):
            mask = idx_list < i * 10000 + 1
            mask += idx_list >= (i+1) * 10000 + 1
            fold_indices[i,:] = idx_list[mask]
        assert fold_indices.min() == 1
        assert fold_indices.max() == 50000


    return train_y, fold_indices
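
A usage sketch for the helper above, assuming numpy-indexable labels; the returned indices are 1-based (mimicking the STL-10 matlab convention), so subtract 1 before indexing:

import numpy as np

train_y, fold_indices = get_labels_and_fold_indices(cifar10=True,
                                                    cifar100=False,
                                                    stl10=False)
fold = 0
train_idx = fold_indices[fold] - 1           # back to 0-based indexing
fold_train_labels = np.asarray(train_y)[train_idx]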
Example #3
def get_test_set(self):
    return CIFAR10(which_set='test',
                   center=self.center,
                   rescale=self.rescale,
                   gcn=self.gcn,
                   one_hot=self.one_hot,
                   toronto_prepro=self.toronto_prepro,
                   axes=self.axes)
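
A usage sketch, assuming this method lives on a CIFAR10-like dataset class (as the self.* attributes suggest):

train = CIFAR10(which_set='train', gcn=55.)
test = train.get_test_set()   # test set built with the same preprocessing flags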
Example #4
def get_valid(ds, limit_size=-1, fold=0):
    if ds == 'mnist':
        data = MNIST('train', start=50000, stop=60000)
        return data.X[:limit_size]
    elif ds == 'tfd':
        data = TFD('valid', fold=fold, scale=True)
        return data.X
    elif ds == 'cifar10':
        data = CIFAR10(which_set='train', start=4000, stop=50000, gcn=55.)
        return data.X[:limit_size]
    else:
        raise ValueError("Unknown dataset: {}".format(ds))
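
Hypothetical usage of the helper above:

valid_X = get_valid('cifar10', limit_size=1000)
print(valid_X.shape)   # (1000, 3072): flattened 32x32x3 images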
Example #5
def get_test_labels(cifar10, stl10):
    assert cifar10 or stl10
    assert not (cifar10 and stl10)

    if stl10:
        print 'loading entire stl-10 test set just to get the labels'
        stl10 = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/test.pkl")
        return stl10.y
    if cifar10:
        print 'loading entire cifar10 test set just to get the labels'
        cifar10 = CIFAR10(which_set = 'test')
        return np.asarray(cifar10.y)
Example #6
    def test_iterator(self):
        # Tests that batches returned by an iterator with topological
        # data_specs are the same as the ones returned by calling
        # get_topological_view on the dataset with the corresponding order
        batch_size = 100
        b01c_X = self.test.X[0:batch_size, :]
        b01c_topo = self.test.get_topological_view(b01c_X)
        b01c_b01c_it = self.test.iterator(
            mode='sequential',
            batch_size=batch_size,
            data_specs=(Conv2DSpace(shape=(32, 32),
                                    num_channels=3,
                                    axes=('b', 0, 1, 'c')),
                        'features'))
        b01c_b01c = b01c_b01c_it.next()
        assert np.all(b01c_topo == b01c_b01c)

        c01b_test = CIFAR10(which_set='test', axes=('c', 0, 1, 'b'))
        c01b_X = c01b_test.X[0:batch_size, :]
        c01b_topo = c01b_test.get_topological_view(c01b_X)
        c01b_c01b_it = c01b_test.iterator(
            mode='sequential',
            batch_size=batch_size,
            data_specs=(Conv2DSpace(shape=(32, 32),
                                    num_channels=3,
                                    axes=('c', 0, 1, 'b')),
                        'features'))
        c01b_c01b = c01b_c01b_it.next()
        assert np.all(c01b_topo == c01b_c01b)

        # Also check that samples from iterators with the same data_specs
        # with Conv2DSpace do not depend on the axes of the dataset
        b01c_c01b_it = self.test.iterator(
            mode='sequential',
            batch_size=batch_size,
            data_specs=(Conv2DSpace(shape=(32, 32),
                                    num_channels=3,
                                    axes=('c', 0, 1, 'b')),
                        'features'))
        b01c_c01b = b01c_c01b_it.next()
        assert np.all(b01c_c01b == c01b_c01b)

        c01b_b01c_it = c01b_test.iterator(
            mode='sequential',
            batch_size=batch_size,
            data_specs=(Conv2DSpace(shape=(32, 32),
                                    num_channels=3,
                                    axes=('b', 0, 1, 'c')),
                        'features'))
        c01b_b01c = c01b_b01c_it.next()
        assert np.all(c01b_b01c == b01c_b01c)
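
The same data_specs pattern outside a test harness, as a minimal sketch assuming pylearn2 and the CIFAR-10 data are installed:

from pylearn2.datasets.cifar10 import CIFAR10
from pylearn2.space import Conv2DSpace

test = CIFAR10(which_set='test')
it = test.iterator(mode='sequential', batch_size=100,
                   data_specs=(Conv2DSpace(shape=(32, 32), num_channels=3,
                                           axes=('c', 0, 1, 'b')),
                               'features'))
batch = it.next()   # topological batch of shape (3, 32, 32, 100)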
Example #7
def get_test_labels(cifar10, cifar100, stl10):
    assert cifar10 + cifar100 +  stl10 == 1

    if stl10:
        print 'loading entire stl-10 test set just to get the labels'
        stl10 = serial.load("${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/test.pkl")
        return stl10.y
    if cifar10:
        print 'loading entire cifar10 test set just to get the labels'
        cifar10 = CIFAR10(which_set = 'test')
        return np.asarray(cifar10.y)
    if cifar100:
        print 'loading entire cifar100 test set just to get the fine labels'
        cifar100 = CIFAR100(which_set = 'test')
        return np.asarray(cifar100.y_fine)
    assert False
Example #8
def test_topo_c01b(self):
    """
    Tests that a topological batch with axes ('c',0,1,'b')
    can be dimshuffled back to match the standard ('b',0,1,'c')
    format.
    """
    batch_size = 100
    c01b_test = CIFAR10(which_set='test', axes=('c', 0, 1, 'b'))
    c01b_X = c01b_test.X[0:batch_size, :]
    c01b = c01b_test.get_topological_view(c01b_X)
    assert c01b.shape == (3, 32, 32, batch_size)
    b01c = c01b.transpose(3, 1, 2, 0)   # ('c',0,1,'b') -> ('b',0,1,'c')
    b01c_X = self.test.X[0:batch_size, :]
    assert c01b_X.shape == b01c_X.shape
    assert np.all(c01b_X == b01c_X)
    b01c_direct = self.test.get_topological_view(b01c_X)
    assert b01c_direct.shape == b01c.shape
    assert np.all(b01c_direct == b01c)
Example #9
def _get_dataset(dataset, no_imgs):
    """ Dump machine learning dataset into C source file and create a header file with
    array declaration and few macro definition
    """
    dataset = dataset.lower()
    if (dataset not in dataset_factory['name']):
        print('Dataset is not in the factory')
        sys.exit()

    if (dataset == 'mnist'):
        print('Reading MNIST dataset from pylearn2 database')
        data = MNIST(which_set='test')
        if (no_imgs > data.X.shape[0]):
            cprint(
                'Only {:d} images are available in this dataset; dumping only those.'
                .format(data.X.shape[0]), 'red')
        data_x = data.X[0:min(no_imgs, data.X.shape[0])]
        data_y = data.y[0:min(no_imgs, data.y.shape[0])]
        data_x = np.uint8(data_x * 255)  # conversion from 0 -> 0.99 to 0->255

    elif (dataset == 'cifar10'):
        print('Reading CIFAR-10 dataset from pylearn2 database')
        data = CIFAR10(which_set='test')

        if (no_imgs > data.X.shape[0]):
            cprint(
                'Only {:d} images are available in this dataset; dumping only those.'
                .format(data.X.shape[0]), 'red')
        data_x = data.X[0:min(no_imgs, data.X.shape[0])]
        data_y = data.y[0:min(no_imgs, data.y.shape[0])]
        data_x = np.uint8(data_x)  # already in the range 0 -> 255

    elif (dataset == 'svhn'):
        cprint('Not supported', 'red')
        return

    return data_x, data_y
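
Hypothetical usage, assuming the module-level dataset_factory lists 'cifar10':

data_x, data_y = _get_dataset('cifar10', 10)
print(data_x.shape, data_x.dtype)   # (10, 3072) uint8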
Example #10
def setUp(self):
    skip_if_no_data()
    self.test = CIFAR10(which_set='test')
Example #11
def test_topo(self):
    """Tests that a topological batch has 4 dimensions"""
    train = CIFAR10(which_set='train')
    topo = train.get_batch_topo(1)
    assert topo.ndim == 4
Example #12
    model = serial.load(model_path)
    model.set_dtype('float32')

    # note: 'cifar100' also contains the substring 'cifar10', so a cifar100
    # match must override the cifar10 flag below
    stl10 = model.dataset_yaml_src.find('stl10') != -1
    cifar10 = model.dataset_yaml_src.find('cifar10') != -1
    cifar100 = model.dataset_yaml_src.find('cifar100') != -1
    if cifar100:
        cifar10 = False
    assert int(cifar10) + int(cifar100) + int(stl10) == 1

    print 'loading dataset'
    if cifar10:
        print 'CIFAR10 detected'
        dataset = CIFAR10(which_set="train")
    elif cifar100:
        print 'CIFAR100 detected'
        dataset = CIFAR100(which_set='train')
    elif stl10:
        print 'STL10 detected'
        dataset = serial.load(
            '${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/train.pkl')
    X = dataset.get_design_matrix()[batch_start:batch_start + batch_size, :]

    size = np.sqrt(model.nvis / 3)

    if cifar10 or cifar100:
        pv1 = make_viewer((X - 127.5) / 127.5, is_color=True, rescale=False)
    elif stl10:
        pv1 = make_viewer(X / 127.5, is_color=True, rescale=False)
Example #13
def main():
    # BN parameters
    batch_size = 200
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Training parameters
    num_epochs = 500
    print("num_epochs = " + str(num_epochs))

    # Decaying LR
    LR_start = 0.01
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    train_set_size = 45000
    print("train_set_size = " + str(train_set_size))
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    print('\nLoading CIFAR-10 dataset...')

    train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
    valid_set = CIFAR10(which_set="train", start=train_set_size, stop=50000)
    test_set = CIFAR10(which_set="test")

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., train_set.X), 1.), (-1, 3, 32, 32))
    valid_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., valid_set.X), 1.), (-1, 3, 32, 32))
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 3, 32, 32))

    # flatten targets
    train_set.y = np.hstack(train_set.y)
    valid_set.y = np.hstack(valid_set.y)
    test_set.y = np.hstack(test_set.y)

    if oneHot:
        #  Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.
    else:
        train_set.y = np.int32(train_set.y)
        valid_set.y = np.int32(valid_set.y)
        test_set.y = np.int32(test_set.y)

    #import pdb;pdb.set_trace()

    print('\nBuilding the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    if oneHot: target = T.matrix('targets')
    else: target = T.ivector('targets')

    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = buildCNN(dataType='cifar10',
                   networkType='cifar10',
                   oneHot=oneHot,
                   input=input,
                   epsilon=epsilon,
                   alpha=alpha,
                   activation=activation,
                   binary=binary,
                   stochastic=stochastic,
                   H=H,
                   W_LR_scale=W_LR_scale)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    if oneHot: loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
    else:
        loss = LO.categorical_crossentropy(train_output, target)
        loss = loss.mean()

    # W updates
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                   params=W,
                                   learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)

    # other parameters updates
    params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
    updates = OrderedDict(updates.items() + lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR).items())

    test_output = lasagne.layers.get_output(cnn, deterministic=True)

    if oneHot:
        test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
        test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                                T.argmax(target, axis=1)),
                          dtype=theano.config.floatX)
    else:
        test_loss = LO.categorical_crossentropy(test_output, target)
        test_loss = test_loss.mean()
        # target holds integer class labels in this branch, so compare directly
        test_err = T.mean(T.neq(T.argmax(test_output, axis=1), target),
                          dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    binary_net.train(train_fn,
                     val_fn,
                     cnn,
                     batch_size,
                     LR_start,
                     LR_decay,
                     num_epochs,
                     train_set.X,
                     train_set.y,
                     valid_set.X,
                     valid_set.y,
                     test_set.X,
                     test_set.y,
                     shuffle_parts=shuffle_parts)
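
A quick sanity check of the input scaling used above: 2/255 * x - 1 maps raw bytes in [0, 255] onto [-1, +1]:

import numpy as np

raw = np.array([0., 127.5, 255.])
print(2. / 255. * raw - 1.)   # [-1.  0.  1.]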
def load_dataset(dataset):
    if (dataset == "CIFAR-10"):

        print('Loading CIFAR-10 dataset...')

        train_set_size = 45000
        train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
        valid_set = CIFAR10(which_set="train", start=train_set_size, stop=50000)
        test_set = CIFAR10(which_set="test")

        train_set.X = np.transpose(np.reshape(np.subtract(np.multiply(2. / 255., train_set.X), 1.), (-1, 3, 32, 32)),(0,2,3,1))
        valid_set.X = np.transpose(np.reshape(np.subtract(np.multiply(2. / 255., valid_set.X), 1.), (-1, 3, 32, 32)),(0,2,3,1))
        test_set.X = np.transpose(np.reshape(np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 3, 32, 32)),(0,2,3,1))
        # flatten targets
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.
        # enlarge train data set by mirroring
        x_train_flip = train_set.X[:, :, ::-1, :]
        y_train_flip = train_set.y
        train_set.X = np.concatenate((train_set.X, x_train_flip), axis=0)
        train_set.y = np.concatenate((train_set.y, y_train_flip), axis=0)

    elif (dataset == "MNIST"):

        print('Loading MNIST dataset...')

        train_set_size = 50000
        train_set = MNIST(which_set="train", start=0, stop=train_set_size)
        valid_set = MNIST(which_set="train", start=train_set_size, stop=60000)
        test_set = MNIST(which_set="test")

        train_set.X = np.transpose(np.reshape(np.subtract(np.multiply(2. / 255., train_set.X), 1.), (-1, 1, 28, 28)),(0,2,3,1))
        valid_set.X = np.transpose(np.reshape(np.subtract(np.multiply(2. / 255., valid_set.X), 1.), (-1, 1,  28, 28)),(0,2,3,1))
        test_set.X = np.transpose(np.reshape(np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 1,  28, 28)),(0,2,3,1))
        # flatten targets
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.
        # enlarge train data set by mirroring
        x_train_flip = train_set.X[:, :, ::-1, :]
        y_train_flip = train_set.y
        train_set.X = np.concatenate((train_set.X, x_train_flip), axis=0)
        train_set.y = np.concatenate((train_set.y, y_train_flip), axis=0)

    else:
        print("wrong dataset given")

    return train_set, valid_set, test_set
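
Hypothetical usage of the loader above:

train_set, valid_set, test_set = load_dataset("CIFAR-10")
print(train_set.X.shape)   # (90000, 32, 32, 3): mirroring doubled the 45000 examples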
Example #15
ignore, model_path, data_override = sys.argv

model = serial.load(model_path)

# Get access to the intermediate layers of the augmented DBM
if hasattr(model, 'super_dbm'):
    model = model.super_dbm


if hasattr(model,'dataset_yaml_src'):
    dataset = yaml_parse.load(model.dataset_yaml_src)
else:
    from pylearn2.datasets.cifar10 import CIFAR10
    dataset = CIFAR10(which_set = 'test', gcn = 55.)

rng = np.random.RandomState([2012,10,24])
if data_override == 'binary_noise':
    dataset.X = rng.uniform(0., 1., dataset.X.shape) > 0.5
elif data_override == 'gaussian_noise':
    dataset.X = rng.randn(*dataset.X.shape).astype(dataset.X.dtype)


batch_size = 25
model.set_batch_size(batch_size)
perc = .99
num_examples = 50000
num_layers = len(model.hidden_layers)
num_filters = []
act_record = []
Example #16
"""
This script makes a dataset of 32x32 contrast normalized, approximately
whitened CIFAR-10 images.

"""

from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.utils import string_utils
from pylearn2.datasets.cifar10 import CIFAR10

data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10')

print 'Loading CIFAR-10 train dataset...'
train = CIFAR10(which_set='train', gcn=55.)

print "Preparing output directory..."
output_dir = data_dir + '/pylearn2_gcn_whitened'
serial.mkdir(output_dir)
README = open(output_dir + '/README', 'w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load.

train.pkl and test.pkl each contain
a pylearn2 Dataset object defining a labeled
dataset of a 32x32 contrast normalized, approximately whitened version of the CIFAR-10
dataset. train.pkl contains labeled train examples. test.pkl
contains labeled test examples.
Example #17
    num_epochs = 10
    print("num_epochs = " + str(num_epochs))
    # Decaying LR
    LR_start = 0.001
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    if args.dataset == 'cifar10':
        print('Loading CIFAR-10 dataset...')
        from pylearn2.datasets.cifar10 import CIFAR10
        train_set = CIFAR10(which_set="train", start=0, stop=45000)
        valid_set = CIFAR10(which_set="train", start=45000, stop=50000)
        test_set = CIFAR10(which_set="test")
        classes = 10
        save_path = "../weights/cifar10-w1a1.npz"
        print("save_path = " + str(save_path))
        train_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., train_set.X), 1.),
            (-1, 3, 32, 32))
        valid_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., valid_set.X), 1.),
            (-1, 3, 32, 32))
        test_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., test_set.X), 1.),
            (-1, 3, 32, 32))
Example #18
        return self.predict_fn_(X)

import sys

f = open(sys.argv[2])
l = f.readlines()
f.close()
s = '\n'.join(l)
args = eval(s)


clf = TheanoSGDClassifier(10, **args)
from pylearn2.datasets.cifar10 import CIFAR10
X = np.load(sys.argv[1])
X = X.reshape(X.shape[0], X.shape[1] * X.shape[2] * X.shape[3])
y = CIFAR10(which_set="train").y.astype(int)
print 'fit'
clf.fit(X, y)

del X
del y

y = np.asarray(CIFAR10(which_set="test").y).astype(int)
print 'loading test data'
X = np.load(sys.argv[3])
X = X.reshape(X.shape[0], X.shape[1] * X.shape[2] * X.shape[3])

print 'evaluating svm'
yhat = clf.predict(X)

print (yhat == y).mean()
Example #19
    def __init__(self,
                 which_set,
                 center=False,
                 rescale=False,
                 gcn=None,
                 one_hot=False,
                 start=None,
                 stop=None,
                 axes=('b', 0, 1, 'c'),
                 toronto_prepro=False,
                 preprocessor=None):

        # note: there is no such thing as the cifar10 validation set;
        # pylearn1 defined one but really it should be user-configurable
        # (as it is here)

        self.axes = axes

        # we define here:
        dtype = 'uint8'
        ntrain = 50000
        nvalid = 0  # artefact, we won't use it
        ntest = 10000

        # we also expose the following details:
        self.img_shape = (3, 32, 32)
        self.img_size = np.prod(self.img_shape)
        self.n_classes = 10
        self.label_names = [
            'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
            'horse', 'ship', 'truck'
        ]

        #         # prepare loading
        #         fnames = ['data_batch_%i' % i for i in range(1,6)]
        #         lenx = np.ceil((ntrain + nvalid) / 10000.)*10000
        #         x = np.zeros((lenx,self.img_size), dtype=dtype)
        #         y = np.zeros(lenx, dtype=dtype)
        #
        #         # load train data
        #         nloaded = 0
        #         for i, fname in enumerate(fnames):
        #             data = CIFAR10._unpickle(fname)
        #             x[i*10000:(i+1)*10000, :] = data['data']
        #             y[i*10000:(i+1)*10000] = data['labels']
        #             nloaded += 10000
        #             if nloaded >= ntrain + nvalid + ntest: break;
        #
        #         # load test data
        #         data = CIFAR10._unpickle('test_batch')
        #
        #         # process this data
        #         Xs = {
        #                 'train' : x[0:ntrain],
        #                 'test'  : data['data'][0:ntest]
        #             }
        #
        #         Ys = {
        #                 'train' : y[0:ntrain],
        #                 'test'  : data['labels'][0:ntest]
        #             }

        if which_set == 'train':

            #             pkl = self._unpickle(os.environ['PYLEARN2_DATA_PATH']+
            #                                  'cifar10/pylearn2_gcn_whitened/train.pkl')
            #pkl = self._unpickle(os.environ['PYLEARN2_DATA_PATH']+
            #         'cifar10/pylearn2_gcn_whitened/test.pkl')
            #X = pkl.X
            #y = pkl.y

            X = np.load(os.environ['PYLEARN2_DATA_PATH'] +
                        '/cifar10/train_X.npy')
            y = np.load(os.environ['PYLEARN2_DATA_PATH'] +
                        '/cifar10/train_y.npy')
            X = np.cast['float32'](X)
            y = np.cast['float32'](y)

        elif which_set == 'test':
            #             pkl = self._unpickle(os.environ['PYLEARN2_DATA_PATH']+
            #                                  'cifar10/pylearn2_gcn_whitened/test.pkl')
            #             X = pkl.X
            #             y = pkl.y
            X = np.load(os.environ['PYLEARN2_DATA_PATH'] +
                        '/cifar10/test_X.npy')
            y = np.load(os.environ['PYLEARN2_DATA_PATH'] +
                        '/cifar10/test_y.npy')
            X = np.cast['float32'](X)
            y = np.cast['float32'](y)


#         X = np.cast['float32'](Xs[which_set])
#         y = Ys[which_set]

        if which_set == 'test':
            assert X.shape[0] == 10000

        if isinstance(y, list):
            y = np.asarray(y)

        if center:
            X -= 127.5
        self.center = center

        if rescale:
            X /= 127.5
        self.rescale = rescale

        if toronto_prepro:
            assert not center
            assert not gcn
            X = X / 255.
            if which_set == 'test':
                other = CIFAR10(which_set='train')
                oX = other.X
                oX /= 255.
                X = X - oX.mean(axis=0)
            else:
                X = X - X.mean(axis=0)
        self.toronto_prepro = toronto_prepro

        self.gcn = gcn
        if gcn is not None:
            gcn = float(gcn)
            X = global_contrast_normalize(X, scale=gcn)

        if start is not None:
            # This needs to come after the prepro so that it doesn't change the pixel
            # means computed above for toronto_prepro
            assert start >= 0
            assert stop > start
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            y = y[start:stop]
            assert X.shape[0] == y.shape[0]

        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                  axes)

        if which_set == 'train':
            length = X.shape[0]

            def search_right_label(desired_label, i):
                for idx in xrange(i, length):
                    if y[idx] == desired_label:
                        return idx

            def swap_ele(index, i):
                x_tmp = X[i]
                X[i] = X[index]
                X[index] = x_tmp

                y_tmp = y[i]
                y[i] = y[index]
                y[index] = y_tmp

            desired_label = 0
            for i in xrange(length):
                desired_label = i % 10
                if y[i] != desired_label:
                    index = search_right_label(desired_label, i)
                    swap_ele(index, i)

            for i in xrange(length - 100, length):
                print y[i]

        self.one_hot = one_hot
        if one_hot:
            one_hot = np.zeros((y.shape[0], 10), dtype='float32')
            for i in xrange(y.shape[0]):
                one_hot[i, y[i]] = 1.
            y = one_hot

        super(My_CIFAR10, self).__init__(X=X,
                                         y=y,
                                         view_converter=view_converter)

        assert not np.any(np.isnan(self.X))

        if preprocessor:
            preprocessor.apply(self)
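
Hypothetical usage of the subclass above (the name My_CIFAR10 comes from the super() call; assumes the preprocessed train_X.npy/train_y.npy files exist under PYLEARN2_DATA_PATH):

dataset = My_CIFAR10(which_set='train', one_hot=True)
print(dataset.X.shape, dataset.y.shape)   # e.g. (50000, 3072) and (50000, 10)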
Example #20
from pylearn2.datasets.cifar10 import CIFAR10
import time
import warnings
from theano.printing import Print
import numpy as np
from galatea.dbm.inpaint.super_dbm import SuperDBM
from galatea.dbm.inpaint.super_dbm import GaussianConvolutionalVisLayer
from galatea.dbm.inpaint.super_dbm import ConvMaxPool
from galatea.dbm.inpaint.super_dbm import Softmax
from pylearn2.utils import serial
from theano.gof.op import get_debug_values
from theano.printing import min_informative_str

dataset = CIFAR10(which_set='train', one_hot=True, gcn=55.)

rng = np.random.RandomState([2012, 7, 24])
irange = .05
nvis = 784
nclass = 10
nhid = 500
mf_iter = 10
batch_size = 100
lr = .001
momentum = 1. / 20.

from pylearn2.utils import sharedX
import theano.tensor as T

X = dataset.X
y = dataset.y
Example #21
def loadDataset(type, oneHot=True):
    if type == 'TCDTIMIT':
        nbClasses = 39
        # get the database
        # If it's small (lipspeakers) -> generate X_train, y_train etc. here;
        # otherwise we need to load and generate each speaker separately in the
        # training loop
        dataset = "TCDTIMIT"
        root_dir = os.path.join(
            os.path.expanduser('~/TCDTIMIT/lipreading/' + dataset))
        results_dir = root_dir + "/results/CNN_binaryNet"
        if not os.path.exists(results_dir): os.makedirs(results_dir)

        database_binaryDir = root_dir + '/binary'
        datasetType = "lipspeakers"  # "lipspeakers" #"volunteers" #"volunteers" #    lipspeakers or volunteers"
        ##############################################

        if datasetType == "lipspeakers":
            loadPerSpeaker = False  # only lipspeakers small enough to fit in CPU RAM, generate X_train etc here
            storeProcessed = True
            processedDir = database_binaryDir + "_allLipspeakersProcessed"

            # TODO: prepLip_all can be used to generate a pkl containing all the lipspeaker data. Not sure if this still works, so use with care!
            pkl_path = processedDir + os.sep + datasetType + "_oneHot" + ".pkl"
            if not os.path.exists(pkl_path):
                print("dataset not yet processed. Processing...")
                preprocessLipreading.prepLip_all(data_path=database_binaryDir,
                                                 store_path=pkl_path,
                                                 trainFraction=0.7,
                                                 validFraction=0.1,
                                                 testFraction=0.2,
                                                 nbClasses=nbClasses,
                                                 onehot=oneHot,
                                                 type=datasetType,
                                                 verbose=True)
            X_train, y_train, X_val, y_val, X_test, y_test = general_tools.unpickle(
                pkl_path)
            dtypeX = 'float32'
            dtypeY = 'int32'
            X_train = X_train.astype(dtypeX)
            y_train = y_train.astype(dtypeY)
            X_val = X_val.astype(dtypeX)
            y_val = y_val.astype(dtypeY)
            X_test = X_test.astype(dtypeX)
            y_test = y_test.astype(dtypeY)

            #import pdb;pdb.set_trace()
            if oneHot:
                #  Onehot the targets
                y_train = np.float32(np.eye(nbClasses)[y_train])
                y_val = np.float32(np.eye(nbClasses)[y_val])
                y_test = np.float32(np.eye(nbClasses)[y_test])

                # for hinge loss
                y_train = 2 * y_train - 1.
                y_val = 2 * y_val - 1.
                y_test = 2 * y_test - 1.

            #import pdb;pdb.set_trace()

    else:  # cifar10
        nbClasses = 10
        train_set_size = 45000
        print("train_set_size = " + str(train_set_size))
        train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
        valid_set = CIFAR10(which_set="train",
                            start=train_set_size,
                            stop=50000)
        test_set = CIFAR10(which_set="test")

        # bc01 format
        # Inputs in the range [-1,+1]
        # print("Inputs in the range [-1,+1]")
        train_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., train_set.X), 1.),
            (-1, 3, 32, 32))
        valid_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., valid_set.X), 1.),
            (-1, 3, 32, 32))
        test_set.X = np.reshape(
            np.subtract(np.multiply(2. / 255., test_set.X), 1.),
            (-1, 3, 32, 32))

        # flatten targets
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.

        X_train = train_set.X
        y_train = train_set.y
        X_val = valid_set.X
        y_val = valid_set.y
        X_test = test_set.X
        y_test = test_set.y

    return X_train, y_train, X_val, y_val, X_test, y_test
def load_dataset(which_set, dataset_types):

    # we need to have at least 2 types otherwise this func is useless
    assert len(dataset_types) > 1
    print "loading.. ", which_set

    if which_set == 'test':
        start_set = 0
        stop_set = 10000
    elif which_set == 'valid':
        which_set = 'train'
        start_set = 40000
        stop_set = 50000
    else:
        #train
        start_set = 0
        stop_set = 40000

    n_classes = 10

    data = []
    for prepro in dataset_types:

        if prepro == 'gcn':
            print "LOADING GCN..."
            input_data = CIFAR10(which_set=which_set,
                                 start=start_set,
                                 stop=stop_set,
                                 gcn=55.,
                                 axes=['b', 0, 1, 'c'])
            # gcn_data = input_data.get_topological_view()
            data.append(input_data.get_topological_view())

        if prepro == 'toronto':
            print "LOADING TOR..."
            input_data = CIFAR10(which_set=which_set,
                                 start=start_set,
                                 stop=stop_set,
                                 axes=['b', 0, 1, 'c'],
                                 toronto_prepro=1)
            # tor_data = input_data.get_topological_view()
            data.append(input_data.get_topological_view())

        if prepro == 'zca':
            print "LOADING ZCA..."

            data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10')
            input_data = ZCA_Dataset(
                preprocessed_dataset=serial.load(data_dir +
                                                 "/pylearn2_gcn_whitened/" +
                                                 which_set + ".pkl"),
                preprocessor=serial.load(
                    data_dir + "/pylearn2_gcn_whitened/preprocessor.pkl"),
                start=start_set,
                stop=stop_set,
                axes=['b', 0, 1, 'c'])
            # zca_data = input_data.get_topological_view()
            data.append(input_data.get_topological_view())

    target_data = OneHotFormatter(n_classes).format(input_data.y,
                                                    mode="concatenate")
    data.append(target_data)

    data_source = []
    for i in range(len(dataset_types)):
        data_source.append('features' + str(i))
    data_source.append('targets')

    ################################## DEFINE SPACES ##################################
    spaces = []
    # add input spaces as b01c
    for i in range(0, len(dataset_types)):
        spaces.append(
            Conv2DSpace(shape=(32, 32), num_channels=3, axes=('b', 0, 1, 'c')))
    # add output space
    spaces.append(VectorSpace(n_classes))

    dataset = VectorSpacesDataset(tuple(data),
                                  (CompositeSpace(spaces), tuple(data_source)))

    return dataset
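
Hypothetical usage, building a two-view dataset (GCN and ZCA views) with one-hot targets:

train = load_dataset('train', ['gcn', 'zca'])
# data sources are then ('features0', 'features1', 'targets')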
    top_dir = os.environ['CRAFT_BNN_ROOT']
    params_dir = top_dir + '/params'

    # BinaryOut
    activation = hardware_net.SignTheano
    print("activation = sign(x)")

    no_bias = True
    print("no_bias = " + str(no_bias))

    # BinaryConnect
    H = 1.
    print('Loading CIFAR-10 dataset...')

    test_set = CIFAR10(which_set="test")
    print("Test set size = " + str(len(test_set.X)))
    test_instances = 10000
    print("Using instances 0 .. " + str(test_instances))

    # bc01 format
    # Inputs in the range [-1,+1]
    test_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 3, 32, 32))
    # flatten targets
    test_set.y = np.hstack(test_set.y)
    # Onehot the targets
    test_set.y = np.float32(np.eye(10)[test_set.y])
    # for hinge loss
    test_set.y = 2 * test_set.y - 1.
Example #24
from pylearn2.datasets.cifar10 import CIFAR10
from pylearn2.gui.patch_viewer import PatchViewer

dataset = CIFAR10(which_set='test')

pv = PatchViewer((10, 1), (32, 32), is_color=True)

T, y = dataset.get_batch_topo(10, include_labels=True)

for i in xrange(10):
    print dataset.label_names[y[i]]
    pv.add_patch(dataset.adjust_for_viewer(T[i, :, :, :]), rescale=False)

pv.show()
Example #25
def get_test_data():
    print 'loading entire cifar-10 dataset'
    cifar10 = CIFAR10(which_set='test')
    return cifar10
Example #26
def load_dataset(dataset, train_set_size, val_stop):
    if (dataset == "CIFAR-10"):
        '''
        (x_train, y_train), (x_test, y_test) = mnist.load_data()

        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        
        input_shape = (img_rows, img_cols, 1)

        x_train = x_train.astype('float32')
        x_test = x_test.astype('float32')
        x_train /= 255
        x_test /= 255
        
        y_train = keras.utils.np_utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.np_utils.to_categorical(y_test, num_classes)
        '''
        print('Loading CIFAR-10 dataset...')

        #train_set.X = np.transpose(np.reshape(np.multiply(255., train_set.X).astype('uint8'), (-1, 1,  28, 28)), (0,2,3,1))

        #train_set_size = 45000
        train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
        valid_set = CIFAR10(which_set="train",
                            start=train_set_size,
                            stop=val_stop)  #50000)
        test_set = CIFAR10(which_set="test")

        train_set.X = np.transpose(np.reshape(train_set.X, (-1, 3, 32, 32)),
                                   (0, 2, 3, 1))
        valid_set.X = np.transpose(np.reshape(valid_set.X, (-1, 3, 32, 32)),
                                   (0, 2, 3, 1))
        test_set.X = np.transpose(np.reshape(test_set.X, (-1, 3, 32, 32)),
                                  (0, 2, 3, 1))

        # flatten targets
        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.

        # enlarge train data set by mirroring
        x_train_flip = train_set.X[:, :, ::-1, :]
        y_train_flip = train_set.y
        train_set.X = np.concatenate((train_set.X, x_train_flip), axis=0)
        train_set.y = np.concatenate((train_set.y, y_train_flip), axis=0)

    elif (dataset == "MNIST"):
        print('Loading MNIST dataset...')

        #train_set_size = 50000
        # These images are between 0 and 1, stored as 1D arrays
        train_set = MNIST(which_set="train",
                          start=0,
                          stop=train_set_size,
                          shuffle=False)
        valid_set = MNIST(which_set="train",
                          start=train_set_size,
                          stop=val_stop,
                          shuffle=False)  #60000)
        test_set = MNIST(which_set="test", shuffle=False)

        #train_set.X = np.multiply(2. / 255., train_set.X)
        #print train_set.X[0]
        #raw_input()
        #train_set.X = np.subtract(train_set.X, 1.)
        #print train_set.X[0]
        #raw_input()
        #train_set.X = np.reshape(train_set.X, (-1, 1, 28, 28))
        #print train_set.X[0]
        #raw_input()
        #train_set.X = np.transpose(train_set.X,(0,2,3,1))

        # the -1 lets numpy infer how many images there are;
        # reshape to (num images, 1, 28, 28) -- one grey channel --
        # then transpose to (num images, 28, 28, 1)

        train_set.X = np.transpose(np.reshape(train_set.X, (-1, 1, 28, 28)),
                                   (0, 2, 3, 1))
        valid_set.X = np.transpose(np.reshape(valid_set.X, (-1, 1, 28, 28)),
                                   (0, 2, 3, 1))
        test_set.X = np.transpose(np.reshape(test_set.X, (-1, 1, 28, 28)),
                                  (0, 2, 3, 1))

        # flatten targets
        # From an array of single-element arrays to one flat label array: [1,2,2,8,8,3,...

        train_set.y = np.hstack(train_set.y)
        valid_set.y = np.hstack(valid_set.y)
        test_set.y = np.hstack(test_set.y)

        # Onehot the targets
        # Each element becomes an array with only one 1 and the rest 0
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])

        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.

        # enlarge train data set by mirroring
        x_train_flip = train_set.X[:, :, ::-1, :]
        y_train_flip = train_set.y
        train_set.X = np.concatenate((train_set.X, x_train_flip), axis=0)
        train_set.y = np.concatenate((train_set.y, y_train_flip), axis=0)

        #image=test_set.X[800]
        #print image.shape
        #image=test_set.X[800].reshape([28, 28])
        #print test_set.X.shape
        #print train_set.X.shape
        #print valid_set.X.shape
        #plt.figure()
        #plt.imshow(image,cmap='gray')
        #plt.show(block=False)
        #plt.pause(0.05)
        #raw_input()
    else:
        print("wrong dataset given")

    return train_set, valid_set, test_set
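
A small sanity check that the ::-1 slice above mirrors images horizontally for arrays in (batch, rows, cols, channels) order:

import numpy as np

img = np.arange(4).reshape(1, 2, 2, 1)    # one 2x2 single-channel image
flipped = img[:, :, ::-1, :]              # reverse the column (width) axis
print(img[0, :, :, 0])       # [[0 1]
                             #  [2 3]]
print(flipped[0, :, :, 0])   # [[1 0]
                             #  [3 2]]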
Example #27
File: vis.py Project: vd114/galatea
print "Loading dataset"
from pylearn2.datasets.cifar10 import CIFAR10
dataset = CIFAR10(which_set='train', axes=('c', 0, 1, 'b'))

print "Building graph"
rows = 10
cols = 10
m = rows * cols
from pylearn2.space import Conv2DSpace

space = Conv2DSpace([32, 32], num_channels=3, axes=('c', 0, 1, 'b'))

X = space.make_batch_theano()

from galatea.maxout import GCN_C01B2

gcn = GCN_C01B2()
gcn.set_input_space(space)

normed = gcn.fprop(X)

from galatea.maxout import OnlineWhitener

whitener = OnlineWhitener()

whitener.set_input_space(gcn.get_output_space())

white = whitener.fprop(normed)

assert white.ndim == 4
Example #28
"""
This script makes a dataset of 32x32 approximately whitened CIFAR-10 images.

"""

from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.utils import string_utils
import numpy as np
from pylearn2.datasets.cifar10 import CIFAR10

data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar10')

print 'Loading CIFAR-10 train dataset...'
train = CIFAR10(which_set='train')

print "Preparing output directory..."
output_dir = data_dir + '/pylearn2_whitened'
serial.mkdir(output_dir)
README = open(output_dir + '/README', 'w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load.

train.pkl and test.pkl each contain
a pylearn2 Dataset object defining a labeled
dataset of a 32x32 approximately whitened version of the CIFAR-10
dataset. train.pkl contains labeled train examples. test.pkl
contains labeled test examples.
Example #29
    print("N = " + str(N) + " Num_States = " + str(pow(2, N) + 1))
    th = 3.  #the nonlinearity parameter of state transfer probability
    print("tanh = " + str(th))

    # Decaying LR
    LR_start = 0.01  #0.01
    print("LR_start = " + str(LR_start))
    LR_fin = 0.00003  # 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))

    print('Loading CIFAR10 dataset...')

    train_set_size = 45000
    train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
    valid_set = CIFAR10(which_set="train", start=train_set_size, stop=50000)
    test_set = CIFAR10(which_set="test")

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    '''
    train_set.X = train_set.X.reshape(-1,3,32,32)
    valid_set.X = valid_set.X.reshape(-1,3,32,32)
    test_set.X = test_set.X.reshape(-1,3,32,32)
    '''
    train_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255, train_set.X), 1.), (-1, 3, 32, 32))
    valid_set.X = np.reshape(
        np.subtract(np.multiply(2. / 255, valid_set.X), 1.), (-1, 3, 32, 32))
Example #30
from pylearn2.utils import serial
from pylearn2.datasets.cifar10 import CIFAR10
import numpy as np
import sys
from pylearn2.models.svm import DenseMulticlassSVM

print 'Loading labels'
y = CIFAR10(which_set='train', one_hot=True).y

ignore, features_path = sys.argv

print 'Loading features'
X = serial.load(features_path)
X = X.astype('float64')  # avoid duplicating memory in sklearn


def train(X, y):
    print 'Training SVM...'
    return DenseMulticlassSVM(C=1.,
                              kernel='linear').fit(X, np.argmax(y, axis=1))


X_train = X[0:40000, :]
y_train = y[0:40000, :]

W = train(X_train, y_train)


def acc(W, X, y):
    y_hat = W.predict(X)        # predict() needs the design matrix
    y = np.argmax(y, axis=1)    # one-hot targets back to class indices
    return (y_hat == y).mean()
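
A hypothetical closing line (the listing is truncated here):

print 'train accuracy:', acc(W, X_train, y_train)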