def _transform_zca(self, x, fit=False):
    # Optionally ZCA-whiten the design matrix; the whitener is fit once
    # (fit=True) and then reused for later transforms.
    if self._to_zca:
        if fit:
            self._zca_class = preprocessing.ZCA(store_inverse=True)
            self._zca_class.fit(x)
        x = self._zca_class.transform(x)
    return x
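# For reference, a minimal NumPy sketch of what ZCA whitening computes; this
# is a generic illustration, not the implementation behind the ZCA class used
# above. The whitening matrix is W = U diag(1/sqrt(s + eps)) U^T, built from
# the eigendecomposition of the data covariance, and the transform is
# (X - mean) W, which decorrelates features while staying close to the
# original pixel space (unlike plain PCA whitening).
import numpy as np

def zca_whiten(X, eps=1e-2):
    """X is a design matrix of shape (n_examples, n_features)."""
    mean = X.mean(axis=0)
    Xc = X - mean
    cov = np.dot(Xc.T, Xc) / Xc.shape[0]
    eigvals, eigvecs = np.linalg.eigh(cov)
    W = eigvecs.dot(np.diag(1.0 / np.sqrt(eigvals + eps))).dot(eigvecs.T)
    return Xc.dot(W), mean, W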
def main():
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')

    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 8x8 approximately whitened, contrast-normalized
    patches drawn uniformly at random from a downsampled (to 32x32)
    version of the STL-10 train and unlabeled datasets.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_stl10_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(8, 8),
                          num_patches=2*1000*1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
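# The README above notes that preprocessor.pkl must be re-applied to any
# inputs used later for feature extraction. A minimal sketch of that reuse,
# with illustrative paths; can_fit=False keeps the GCN / ZCA statistics
# learned above (note the ExtractPatches step will again draw random patches
# from the new data).
from pylearn2.utils import serial

pipeline = serial.load('stl10_patches_8x8/preprocessor.pkl')
new_data = serial.load('stl10_32x32/test.pkl')  # any dataset with a design matrix
new_data.apply_preprocessor(preprocessor=pipeline, can_fit=False)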
Example 3
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100')

    print('Loading CIFAR-100 train dataset...')
    train = CIFAR100(which_set='train', gcn=55.)

    print("Preparing output directory...")
    output_dir = data_dir + '/pylearn2_gcn_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(
        textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    train.pkl and test.pkl each contain
    a pylearn2 Dataset object defining a labeled
    dataset of a 32x32 contrast normalized,
    approximately whitened version of the CIFAR-100 dataset.
    train.pkl contains labeled train examples.
    test.pkl contains labeled test examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used
    to approximately whiten the images. You may want to use this
    object later to preprocess other images.

    They were created with the pylearn2 script make_cifar100_gcn_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Learning the preprocessor \
           and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the training data')
    train.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', train)

    print("Loading the test data")
    test = CIFAR100(which_set='test', gcn=55.)

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
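# The README above says the saved ZCA object can be reused to preprocess
# other images. A sketch of that, assuming the new images are contrast
# normalized the same way and flattened into a design matrix; the .npy file
# name is hypothetical.
import numpy as np
from pylearn2.utils import serial
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix

zca = serial.load('pylearn2_gcn_whitened/preprocessor.pkl')
other = DenseDesignMatrix(X=np.load('my_images.npy').astype('float32'))
other.apply_preprocessor(preprocessor=zca, can_fit=False)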
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')

    README.write(
        textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 6x6 approximately whitened, contrast-normalized
    patches drawn uniformly at random from the CIFAR-100 train set.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_cifar100_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
Example 5
def get_preprocess_zca(self, preprocess_id):
    row = self.db.executeSQL("""
        SELECT n_components, n_drop_components, filter_bias
        FROM hps3.preprocess_zca
        WHERE preprocess_id = %s
        """, (preprocess_id,), self.db.FETCH_ONE)
    if not row:
        raise HPSData("No zca preprocess for preprocess_id="
                      + str(preprocess_id))
    (n_components, n_drop_components, filter_bias) = row
    return pp.ZCA(n_components=n_components,
                  filter_bias=filter_bias,
                  n_drop_components=n_drop_components)
def get_dataset_cifar10():
    """
    The original pipeline on cifar10 from pylearn2. Please refer to
    pylearn2/scripts/train_example/make_dataset.py for details.
    """

    train_path = 'cifar10_preprocessed_train.pkl'
    test_path = 'cifar10_preprocessed_test.pkl'

    if os.path.exists(train_path) and \
            os.path.exists(test_path):
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print 'loading raw data...'
        trainset = cifar10.CIFAR10(which_set="train")
        testset = cifar10.CIFAR10(which_set="test")

        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()

        pipeline.items.append(
            preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000))

        pipeline.items.append(preprocessing.GlobalContrastNormalization())

        pipeline.items.append(preprocessing.ZCA())

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        trainset.use_design_loc('train_design.npy')

        # can_fit=False: reuse the statistics fitted on the train set
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc('test_design.npy')

        print 'saving preprocessed data...'
        serial.save('cifar10_preprocessed_train.pkl', trainset)
        serial.save('cifar10_preprocessed_test.pkl', testset)

        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    # this path will be used for visualizing weights after training is done
    #global YAML
    return trainset, testset
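# Minimal usage sketch for the helper above: the preprocessed pickles are
# built on the first call and simply reloaded afterwards.
trainset, testset = get_dataset_cifar10()
print trainset.X.shape, testset.X.shape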
Example 7
def generate_patches():
    datasets = OrderedDict()
    datasets['train'] = GenderWrite.gwdata.GWData(which_set='train',
                                                  start=1,
                                                  stop=201)
    datasets['valid'] = GenderWrite.gwdata.GWData(which_set='train',
                                                  start=201,
                                                  stop=283)
    datasets['test'] = GenderWrite.gwdata.GWData(which_set='test')
    datasets['tottrain'] = GenderWrite.gwdata.GWData(which_set='train')

    # preprocess patches
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.GlobalContrastNormalization())
    pipeline.items.append(preprocessing.ZCA())
    for dstr, dset in datasets.iteritems():
        print dstr
        # only fit on train data
        trainbool = dstr == 'train' or dstr == 'tottrain'
        dset.apply_preprocessor(preprocessor=pipeline, can_fit=trainbool)
        # save
        dset.use_design_loc(DATA_DIR + dstr + '_design.npy')
        serial.save(DATA_DIR + 'gw_preprocessed_' + dstr + '.pkl', dset)
Example 10
def main():
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))

    print("Preparing output directory...")
    output_dir = data_dir + '/stl10_32x32_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(
        textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    unsupervised.pkl, unlabeled.pkl, train.pkl, and test.pkl each contain
    a pylearn2 Dataset object defining a 32x32
    approximately whitened version of the STL-10
    dataset. unlabeled.pkl contains unlabeled train examples. train.pkl
    contains labeled train examples. unsupervised.pkl contains the union
    of these (without any labels). test.pkl contains the labeled test
    examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used
    to approximately whiten the images. You may want to use this
    object later to preprocess other images.

    They were created with the pylearn2 script make_stl10_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Learning the preprocessor \
          and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    data.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the unsupervised data')
    data.use_design_loc(output_dir + '/unsupervised.npy')
    serial.save(output_dir + '/unsupervised.pkl', data)

    X = data.X
    unlabeled = X[0:100 * 1000, :]
    labeled = X[100 * 1000:, :]
    del X

    print("Saving the unlabeled data")
    data.X = unlabeled
    data.use_design_loc(output_dir + '/unlabeled.npy')
    serial.save(output_dir + '/unlabeled.pkl', data)
    del data
    del unlabeled

    print("Saving the labeled train data")
    supplement.X = labeled
    supplement.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', supplement)
    del supplement
    del labeled

    print("Loading the test data")
    test = serial.load(downsampled_dir + '/test.pkl')

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
Example 11
def get_dataset(tot=False, preprocessor='normal'):
    if not os.path.exists(DATA_DIR+'train.npy') or \
        not os.path.exists(DATA_DIR+'test.npy') or \
        not os.path.exists(DATA_DIR+'targets.npy'):
        initial_read()
    
    train_path = DATA_DIR+'train_'+preprocessor+'_preprocessed.pkl'
    valid_path = DATA_DIR+'valid_'+preprocessor+'_preprocessed.pkl'
    tottrain_path = DATA_DIR+'tottrain_'+preprocessor+'_preprocessed.pkl'
    test_path = DATA_DIR+'test_'+preprocessor+'_preprocessed.pkl'
    
    if os.path.exists(train_path) and os.path.exists(valid_path) and os.path.exists(test_path):
        
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:
        
        print 'loading raw data...'
        trainset = Digits(which_set='train', start=0, stop=34000)
        validset = Digits(which_set='train', start=34000, stop=42000)
        tottrainset = Digits(which_set='train')
        testset = Digits(which_set='test')
        
        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
        
        if preprocessor != 'nozca':
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original image better
            pipeline.items.append(preprocessing.ZCA())
        
        # note the can_fit=False's: no sharing between train and valid data
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        
        if preprocessor not in ('normal','nozca'):
            for data in (trainset, validset, tottrainset, testset):
                for ii in range(data.X.shape[0]):
                    # normalize to [0,1]
                    dmax = np.max(data.X[ii,:])
                    dmin = np.min(data.X[ii,:])
                    dnorm = (data.X[ii,:] - dmin) / (dmax - dmin)
                    # and convert to PIL image
                    img = Image.fromarray(dnorm.reshape(28, 28) * 255.).convert('L')
                    
                    # apply preprocessor
                    if preprocessor == 'rotate':
                        rot = rng.randint(-40, 41)
                        img = img.rotate(rot, Image.BILINEAR)
                    elif preprocessor == 'emboss':
                        img = emboss(img)
                    elif preprocessor == 'hshear':
                        # coef = 0 means unsheared
                        coef = -1 + np.random.rand()*2
                        # note: the image is shifted by (coef/2)*28 pixels to keep it centered after shearing
                        img = img.transform((28,28), Image.AFFINE, (1,coef,-(coef/2)*28,0,1,0), Image.BILINEAR)
                    elif preprocessor == 'vshear':
                        coef = -1 + np.random.rand()*2
                        img = img.transform((28,28), Image.AFFINE, (1,0,0,coef,1,-(coef/2)*28), Image.BILINEAR)
                    elif preprocessor == 'patch':
                        # negative offsets are not possible with PIL's EXTENT transform, so only zoom in (crop) here
                        x1 = np.random.randint(0, 5)
                        y1 = np.random.randint(0, 5)
                        x2 = np.random.randint(0, 5)
                        y2 = np.random.randint(0, 5)
                        img = img.transform((28,28), Image.EXTENT, (x1, y1, 28-x2, 28-y2), Image.BILINEAR)
                    
                    # convert back to numpy array
                    data.X[ii,:] = np.array(img.getdata()) / 255.
                    
                    if preprocessor == 'noisy':
                        # add noise
                        data.X[ii,:] += np.random.randn(28*28) * 0.1
                        # bound between [0,1]
                        data.X[ii,:] = np.minimum(np.ones(28*28), np.maximum(np.zeros(28*28), data.X[ii,:]))
        
        # this uses numpy format for storage instead of pickle, for memory reasons
        trainset.use_design_loc(DATA_DIR+'train_'+preprocessor+'_design.npy')
        validset.use_design_loc(DATA_DIR+'valid_'+preprocessor+'_design.npy')
        tottrainset.use_design_loc(DATA_DIR+'tottrain_'+preprocessor+'_design.npy')
        testset.use_design_loc(DATA_DIR+'test_'+preprocessor+'_design.npy')
        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path
        
        print 'saving preprocessed data...'
        serial.save(train_path, trainset)
        serial.save(valid_path, validset)
        serial.save(tottrain_path, tottrainset)
        serial.save(test_path, testset)
        
    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
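# Usage sketch for the helper above; the preprocessor argument selects one of
# the augmentation branches ('normal' and 'nozca' skip them entirely).
trainset, validset, testset = get_dataset(tot=False, preprocessor='normal')
rot_train, rot_valid, rot_test = get_dataset(tot=False, preprocessor='rotate')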
Example 12
def get_data(tot=True, flatgrey=False):
    tottrain_path = DATA_DIR+'gz_preprocessed_tottrain' + str(SUBMODEL) + '_64x.pkl'
    test_path = DATA_DIR+'gz_preprocessed_test' + str(SUBMODEL) + '_64x.pkl'

    if os.path.exists(test_path):

        print 'loading preprocessed data'
        datasets = OrderedDict()
        # datasets['train'] = serial.load(train_path)
        # datasets['valid'] = serial.load(valid_path)
        if tot:
            datasets['tottrain'] = serial.load(tottrain_path)
        datasets['test'] = serial.load(test_path)
        if tot:
            return datasets['tottrain'], datasets['test']
        else:
            return datasets['train'], datasets['test']
    else:
        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(preprocessing.GlobalContrastNormalization(use_std=True))
        pipeline.items.append(preprocessing.ZCA())

        # print 'traindata'
        # data = GalaxyZoo.gzdeepdata.GZData(which_set='training', start=0, stop=39999)
        # data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # # this path can be used for visualizing weights after training is done
        # data.yaml_src = '!pkl: "%s"' % data
        # # save
        # data.use_design_loc(DATA_DIR+'train_design' + str(SUBMODEL) + '.npy')
        # serial.save(DATA_DIR+'gz_preprocessed_train'+str(SUBMODEL) + '.pkl', data)

        # print 'validdata'
        # data = GalaxyZoo.gzdeepdata.GZData(which_set='training', start=40000, stop=61577)
        # data.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        # # this path can be used for visualizing weights after training is done
        # data.yaml_src = '!pkl: "%s"' % data
        # # save
        # data.use_design_loc(DATA_DIR+'valid_design' + str(SUBMODEL) + '.npy')
        # serial.save(DATA_DIR+'gz_preprocessed_valid'+str(SUBMODEL) + '.pkl', data)

        print 'tottraindata'
        data = GalaxyZoo.gzdeepdata.GZData(which_set='training', flatgrey=flatgrey)
        data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # this path can be used for visualizing weights after training is done
        data.yaml_src = '!pkl: "%s"' % data
        # save
        data.use_design_loc(DATA_DIR + 'tottrain_design' + str(SUBMODEL) + '_64x.npy')
        serial.save(DATA_DIR + 'gz_preprocessed_tottrain' + str(SUBMODEL) + '_64x.pkl', data)

        print 'testdata'
        data = GalaxyZoo.gzdeepdata.GZData(which_set='test', flatgrey=flatgrey)
        data.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        # this path can be used for visualizing weights after training is done
        data.yaml_src = '!pkl: "%s"' % data
        # save
        data.use_design_loc(DATA_DIR + 'test_design' + str(SUBMODEL) + '_64x.npy')
        serial.save(DATA_DIR + 'gz_preprocessed_test' + str(SUBMODEL) + '_64x.pkl', data)

        print 'Finished; now re-run to run the model on the GPU'
        return None, None
Example 13
def get_dataset(which_data, tot=False):
    train_path = DATA_DIR + 'train' + which_data + '_preprocessed.pkl'
    valid_path = DATA_DIR + 'valid' + which_data + '_preprocessed.pkl'
    tottrain_path = DATA_DIR + 'tottrain' + which_data + '_preprocessed.pkl'
    test_path = DATA_DIR + 'test' + which_data + '_preprocessed.pkl'

    if os.path.exists(train_path) and os.path.exists(
            valid_path) and os.path.exists(test_path):

        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:

        print 'loading raw data...'
        trainset = Whales(which_set="train",
                          which_data=which_data,
                          start=0,
                          stop=56671)
        validset = Whales(which_set="train",
                          which_data=which_data,
                          start=56671,
                          stop=66671)
        tottrainset = Whales(which_set="train", which_data=which_data)
        testset = Whales(which_set="test", which_data=which_data)

        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()

        if which_data == 'melspectrum':
            pipeline.items.append(
                preprocessing.Standardize(global_mean=True, global_std=True))
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original image better
            pipeline.items.append(preprocessing.ZCA())
        else:
            # global_mean/std=False for per-feature standardization
            pipeline.items.append(
                preprocessing.Standardize(global_mean=False, global_std=False))

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # this uses numpy format for storage instead of pickle, for memory reasons
        trainset.use_design_loc(DATA_DIR + 'train_' + which_data +
                                '_design.npy')
        # note the can_fit=False: no sharing between train and test data
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        validset.use_design_loc(DATA_DIR + 'valid_' + which_data +
                                '_design.npy')
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        tottrainset.use_design_loc(DATA_DIR + 'tottrain_' + which_data +
                                   '_design.npy')
        # note the can_fit=False: no sharing between train and test data
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc(DATA_DIR + 'test_' + which_data + '_design.npy')

        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path

        print 'saving preprocessed data...'
        serial.save(DATA_DIR + 'train' + which_data + '_preprocessed.pkl',
                    trainset)
        serial.save(DATA_DIR + 'valid' + which_data + '_preprocessed.pkl',
                    validset)
        serial.save(DATA_DIR + 'tottrain' + which_data + '_preprocessed.pkl',
                    tottrainset)
        serial.save(DATA_DIR + 'test' + which_data + '_preprocessed.pkl',
                    testset)

    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
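# Usage sketch: which_data='melspectrum' selects the GCN + ZCA branch above;
# any other value falls back to per-feature standardization.
trainset, validset, testset = get_dataset('melspectrum', tot=False)
tottrainset, _, testset = get_dataset('melspectrum', tot=True)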
Example 14
from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.datasets.tfd import TFD

train = TFD(which_set='train')

preprocessor = preprocessing.Pipeline()
preprocessor.items.append(preprocessing.GlobalContrastNormalization())
preprocessor.items.append(preprocessing.ZCA())

preprocessor.apply(train, can_fit=True)

serial.save('tfd_gcn_whitener.pkl', preprocessor)
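# A sketch of reusing the saved whitener on the TFD test split without
# refitting; can_fit=False keeps the GCN / ZCA statistics learned on train.
test = TFD(which_set='test')
whitener = serial.load('tfd_gcn_whitener.pkl')
whitener.apply(test, can_fit=False)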
def main():
    data_name, model, pretrain, test, filter_channel, \
    filter_size, activation, sparse_coeff, is_tied_weights, \
    is_linear, do_zca, do_scale, do_maxpoool, n_epochs, batch_size = ProcessCommandLine()
    print '... Loading data and parameters'
    print 'dataset: ', data_name
    print 'filter_channel=', filter_channel
    print 'filter_size=', filter_size
    print 'activation=', activation
    print 'sparsity coefficient=', sparse_coeff
    print 'Max pooling=', do_maxpoool
    print 'tied weight=', is_tied_weights
    print 'linear CAE=', is_linear
    print 'ZCA whitening=', do_zca
    print 'Scale image=', do_scale
    print 'Batch size=', batch_size
    print 'number of epoch=', n_epochs
    # batch_size = 100                      # number of images in each batch
    n_epochs = 100  # number of experiment epochs
    learning_rate = 0.1  # learning rate of SGD
    # filter_channel = 16                           # number of feature maps in ConvAE
    # dataset = 'data/mnist.pkl.gz'         # address of data
    # rng = np.random.RandomState(23455)  # random generator
    # filter_size = 11
    # n_images = 20
    # sparse_coeff = 1
    if sparse_coeff == 0:
        model_type = 'cae'
    else:
        model_type = 'spcae'

    model_save_name = model_type+'_'+data_name+'_'+ \
                      '[fn=%d,fs=%d,sp=%.3f,maxpool=%s,tied=%s,act=%s,linear=%s,ZCA=%s,scale=%s]' \
                      % (filter_channel, filter_size, sparse_coeff, do_maxpoool, is_tied_weights,
                         activation, is_linear, do_zca, do_scale)
    results_dir = model_save_name + '/'
    # results_dir = data_name+'_'+model_name+'/'
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)

    if data_name == 'smallNORB':
        ### load smallNORB train set ###
        # norb = SmallNORB('train', True)
        # data_x = norb.adjust_for_viewer(norb.X)
        # data_y = norb.y
        # pickle.dump((data_x, data_y),open('smallNORB.pkl','wb'), -1)
        # results_dir = 'smallNORB_scae/'
        # results_dir = 'smallNORB_'+model_name+'/'
        # if not os.path.isdir(results_dir):
        #     os.mkdir(results_dir)
        # f = open('smallNORB.pkl', 'r')
        # data, data_y = pickle.load(f)
        # _, feat = data.shape
        # f.close()
        # train = NORB(which_norb='small', which_set='train')
        # ipdb.set_trace()
        # window = preprocessing.CentralWindow(window_shape=(64,64))
        # train.apply_preprocessor(preprocessor=window)
        # train.X = train.X.astype('float32')
        # zca = preprocessing.ZCA()
        # train.apply_preprocessor(preprocessor=zca, can_fit=True)
        # _, feat = train.X.shape
        # data_x = train.X[:, :feat/2]
        norb = sio.loadmat('smallNORB_matlab/smallNORB_train_32x32.mat')
        train_x = norb['trainData'].transpose(1, 0)
        if do_zca:
            zca = zca_whitening.ZCA()
            zca.fit(train_x)
            train_x = zca.transform(train_x)
        if do_scale:
            min_max_scaler = MinMaxScaler()
            train_x_T = min_max_scaler.fit_transform(train_x.T)
            train_x = train_x_T.T
        data_x = train_x
        # random.shuffle works in place and returns None, so build the index
        # list first, shuffle it, then reorder the rows
        idx = list(range(data_x.shape[0]))
        random.shuffle(idx)
        data_x = data_x[idx, :]
        # ipdb.set_trace()
        im_channel = 1
        im_height = 32
        im_width = 32

    elif data_name == 'cifar':
        ### load cifar10 ###
        # results_dir = 'cifar'+model_name+'/'
        # data_x = pickle.load(open('cifar10.pkl', 'r'))
        train = CIFAR10('train', gcn=55.)
        im_channel = 3
        im_height = 32
        im_width = 32
        # min_max_scaler = MinMaxScaler()
        # data_x = min_max_scaler.fit_transform(data_x)
        # data_x = cifar10.X

    # elif data_name == 'cifarw':
    #     ### load cifar10 ###
    #     # results_dir = 'cifar'+model_name+'/'
    #     # data_x = pickle.load(open('cifar10_whitened.pkl', 'r'))
    #     train = CIFAR10('train', gcn=55.)
    #     # zca = preprocessing.ZCA()
    #     # cifar10.apply_preprocessor(preprocessor=zca, can_fit=True)
    #     im_channel = 3
    #     im_height = 32
    #     im_width = 32
    # min_max_scaler = MinMaxScaler()
    # data_x = min_max_scaler.fit_transform(cifar10.X)
    # data_x = cifar10.X
    # ipdb.set_trace()

    elif data_name == 'svhn':
        f = open('svhn.pkl', 'r')
        svhn = pickle.load(f)
        f.close()
        im_channel = 3
        im_height = 32
        im_width = 32
        train_x, train_y = svhn[0]
        test_x, test_y = svhn[1]
        extra_x, extra_y = svhn[2]
        train_x = train_x.transpose([3, 2, 0, 1])
        test_x = test_x.transpose([3, 2, 0, 1])
        extra_x = extra_x.transpose([3, 2, 0, 1])
        # Scale to [0,1]
        channel_scale_factor = train_x.max(axis=(2, 3)).astype('float32')
        train_x_scaled = train_x / channel_scale_factor.reshape(
            channel_scale_factor.shape[0], im_channel, 1, 1)
        data_x = train_x_scaled.reshape(train_x.shape[0],
                                        im_channel * im_height * im_width)

    elif data_name == 'mnist':
        # f = open('mnist.pkl', 'r')
        # mnist = pickle.load(f)
        # f.close()
        # train_x, train_y = mnist[0]
        # valid_x, valid_y = mnist[1]
        # test_x, test_y = mnist[2]
        # data_x = train_x
        train = MNIST('train')
        # zca = preprocessing.ZCA()
        # train.apply_preprocessor(preprocessor=zca, can_fit=True)
        # ipdb.set_trace()
        # min_max_scaler = MinMaxScaler()
        # data_x = min_max_scaler.fit_transform(train.X)
        # data_x = train.X
        # data_y = train.y
        im_channel = 1
        im_height = 28
        im_width = 28

    elif data_name == 'bmnist':
        train = BinarizedMNIST(which_set='train')
        # zca = preprocessing.ZCA()
        # train.apply_preprocessor(preprocessor=zca, can_fit=True)
        # ipdb.set_trace()
        # min_max_scaler = MinMaxScaler()
        # data_x = min_max_scaler.fit_transform(train.X)
        # data_x = train.X
        # data_y = train.y
        im_channel = 1
        im_height = 28
        im_width = 28

    if do_zca and data_name not in ['smallNORB', 'svhn']:
        zca = preprocessing.ZCA()
        train.apply_preprocessor(preprocessor=zca, can_fit=True)
        data_x = train.X
        pass
    if do_scale and data_name not in ['smallNORB', 'svhn']:
        min_max_scaler = MinMaxScaler()
        data_x_T = min_max_scaler.fit_transform(train.X.T)
        data_x = data_x_T.T
        pass
    if not do_zca and not do_scale and data_name not in ['smallNORB', 'svhn']:
        data_x = train.X
    # if data_name not in ['smallNORB']:
    #     data_x = train.X

    n_samples, n_feat = data_x.shape
    data_x = data_x.reshape((n_samples, im_channel, im_height, im_width))
    if data_name == 'mnist':
        data_x = data_x.transpose(0, 1, 3, 2)
    train_set_x = theano.shared(np.asarray(data_x, dtype=np.float32),
                                borrow=True)

    # image_shp = (batch_size, im_channel, data_x.shape[2], data_x.shape[3])
    image_shp = (batch_size, im_channel, im_height, im_width)
    filter_shp = (filter_channel, im_channel, filter_size, filter_size)

    print 'building model'
    cae1 = CAE(image_shape=image_shp,
               data=train_set_x,
               filter_shape=filter_shp,
               poolsize=(2, 2),
               sparse_coeff=sparse_coeff,
               activation=activation,
               do_max_pool=do_maxpoool,
               tied_weight=is_tied_weights,
               is_linear=is_linear)
    print 'model built'
    sys.stdout.flush()

    if model:
        cae1.load(model)
        pass

    if pretrain:
        do_pretraining_cae(data_name=data_name,
                           model=cae1,
                           save_name=model_save_name,
                           image_shape=image_shp,
                           result_dir=results_dir,
                           max_epoch=n_epochs)
    elif test:
        do_visualize(data_name=data_name, model=cae1, result_dir=results_dir)