Example 1
import os

from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing


def get_processed_dataset():

    train_path = 'pp_cifar10_train.pkl'
    test_path = 'pp_cifar10_test.pkl'

    if os.path.exists(train_path) and os.path.exists(
            test_path) and not new_params:
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)

    else:
        print('loading raw data...')

        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatchesWithPosition(
                patch_shape=patch_shape, patches_per_image=patches_per_image))
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                      use_std=True))
        pipeline.items.append(
            preprocessing.PCA(num_components=num_components,
                              keep_var_fraction=keep_var_fraction))
        pipeline.items.append(
            preprocessing.ExtractPatchPairs(
                patches_per_image=patches_per_image,
                num_images=train_size,
                input_width=input_width))

        trainset = cifar10.CIFAR10(which_set="train", start=start, stop=stop)
        testset = cifar10.CIFAR10(which_set="test")

        trainset.preprocessor = pipeline

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # Reuse the fitted pipeline on the test set (can_fit=False) so the
        # saved test pickle is preprocessed consistently with the train set.
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)

        # Note: pickling has been problematic here; the dataset may be too large.
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

        # this path will be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
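Example 1 reads several names that the snippet never defines (new_params, patch_shape, patches_per_image, num_components, keep_var_fraction, train_size, input_width, start, stop); they are presumably module-level configuration. A minimal sketch of what that configuration might look like, with illustrative placeholder values (all assumptions, not the original settings):

# Hypothetical module-level configuration for Example 1 (placeholder values).
new_params = False          # set True to force re-running the pipeline
patch_shape = (8, 8)
patches_per_image = 3
num_components = 512
keep_var_fraction = .99
train_size = 40000
input_width = 8
start, stop = 0, train_size

trainset, testset = get_processed_dataset()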
Example 2
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing


def main():
    train = cifar10.CIFAR10(which_set="train", center=True)

    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                                  sqrt_bias=0.0,
                                                  use_std=True))
    pipeline.items.append(preprocessing.PCA(num_components=512))

    # Load the test set with the same centering as the training set.
    test = cifar10.CIFAR10(which_set="test", center=True)

    train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

    serial.save('cifar10_preprocessed_train.pkl', train)
    serial.save('cifar10_preprocessed_test.pkl', test)
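Once main() has run, the saved pickles can be reloaded with pylearn2's serial.load. A brief usage sketch (file names match the example above; the shape comment assumes the 512-component PCA step):

from pylearn2.utils import serial

# Reload the preprocessed datasets written by main() above.
train = serial.load('cifar10_preprocessed_train.pkl')
test = serial.load('cifar10_preprocessed_test.pkl')
print(train.X.shape)  # each design-matrix row is 512-dimensional after PCA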
Example 3
    # First we want to pull out small patches of the images, since it's easier
    # to train an RBM on these
    pipeline.items.append(
        preprocessing.ExtractPatchesWithPosition(
            patch_shape=(patch_width, patch_width),
            patches_per_image=patches_per_image))

    # Next we contrast normalize the patches. The default arguments use the
    # same "regularization" parameters as those used in Adam Coates, Honglak
    # Lee, and Andrew Ng's paper "An Analysis of Single-Layer Networks in
    # Unsupervised Feature Learning"
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                  use_std=True))

    # Finally we reduce the dimensionality of the data with PCA,
    # parameterized by num_components and keep_var_fraction.
    pipeline.items.append(
        preprocessing.PCA(num_components=num_components,
                          keep_var_fraction=keep_var_fraction))

    pipeline.items.append(
        preprocessing.ExtractPatchPairs(patches_per_image=patches_per_image,
                                        num_images=num_images))

    # Here we apply the preprocessing pipeline to the dataset. The can_fit
    # argument indicates that data-driven preprocessing steps (such as the ZCA
    # step in this example) are allowed to fit themselves to this dataset.
    # Later we might want to run the same pipeline on the test set with the
    # can_fit flag set to False, in order to make sure that the same whitening
    # matrix was used on both datasets.
    train.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    # Finally we save the dataset to the filesystem. We instruct the dataset to
    # store its design matrix as a numpy file because this uses less memory
    # when re-loading (Pickle files, in general, use double their actual size
    # in the process of being re-loaded into a running process).
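As the comment above says, the fitted pipeline should later be reused on the test set with can_fit=False so the test data goes through the same, already-fitted transformations. A sketch of that step (assuming the same cifar10 import as the other examples):

# Apply the already-fitted pipeline to the test set. can_fit=False reuses the
# statistics estimated on the training set (e.g. the PCA basis) instead of
# re-fitting them on the test data.
test = cifar10.CIFAR10(which_set="test")
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)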
Example 4
    # First we want to pull out small patches of the images, since it's easier
    # to train an RBM on these
    pipeline.items.append(
        preprocessing.ExtractPatchesWithPosition(patch_shape=(8, 8),
                                                 patches_per_image=3))

    # Next we contrast normalize the patches. The default arguments use the
    # same "regularization" parameters as those used in Adam Coates, Honglak
    # Lee, and Andrew Ng's paper "An Analysis of Single-Layer Networks in
    # Unsupervised Feature Learning"
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))

    # Finally we reduce the dimensionality of the data with PCA, keeping
    # enough components to retain 99% of the variance.
    pipeline.items.append(preprocessing.PCA(keep_var_fraction=.99))

    # Here we apply the preprocessing pipeline to the dataset. The can_fit
    # argument indicates that data-driven preprocessing steps (such as the ZCA
    # step in this example) are allowed to fit themselves to this dataset.
    # Later we might want to run the same pipeline on the test set with the
    # can_fit flag set to False, in order to make sure that the same whitening
    # matrix was used on both datasets.
    train.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    # Finally we save the dataset to the filesystem. We instruct the dataset to
    # store its design matrix as a numpy file because this uses less memory
    # when re-loading (Pickle files, in general, use double their actual size
    # in the process of being re-loaded into a running process).
    # The dataset object itself is stored as a pickle file.
    train.use_design_loc('train_design.npy')
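After use_design_loc, the dataset object itself is typically written out with serial.save: the design matrix then lands in train_design.npy while the pickle holds the rest of the object. A sketch of that final step (the .pkl name is a placeholder, not from the original snippet):

from pylearn2.utils import serial

# The design matrix goes to train_design.npy (set above via use_design_loc);
# the pickle stores the dataset object plus a reference to that .npy file.
serial.save('train_preprocessed.pkl', train)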
Example 5
# Replicate the preprocessing described in Kai Yu's paper "Improving LCC with
# Local Tangents".
from pylearn2.utils import serial
from pylearn2.datasets import cifar10
from pylearn2.datasets import preprocessing

train = cifar10.CIFAR10(which_set="train", center=True)

pipeline = preprocessing.Pipeline()
pipeline.items.append(
    preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                              sqrt_bias=0.0,
                                              use_std=True))
pipeline.items.append(preprocessing.PCA(num_components=512))

# Load the test set with the same centering as the training set.
test = cifar10.CIFAR10(which_set="test", center=True)

train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

serial.save('cifar10_preprocessed_train.pkl', train)
serial.save('cifar10_preprocessed_test.pkl', test)
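For reference, with subtract_mean=False, sqrt_bias=0.0, and use_std=True, the GlobalContrastNormalization step above amounts to dividing each image vector by (roughly) its standard deviation. A NumPy sketch of the equivalent row-wise operation (an illustration, not pylearn2's exact implementation):

import numpy as np

def gcn_rows(X, subtract_mean=False, sqrt_bias=0.0, use_std=True, eps=1e-8):
    # Row-wise global contrast normalization mirroring the settings used in
    # Example 5 (illustrative only; pylearn2's own code differs in details).
    X = X.astype('float64')
    if subtract_mean:
        X = X - X.mean(axis=1, keepdims=True)
    if use_std:
        scale = np.sqrt(sqrt_bias + X.var(axis=1, keepdims=True))
    else:
        scale = np.sqrt(sqrt_bias + (X ** 2).sum(axis=1, keepdims=True))
    return X / np.maximum(scale, eps)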