def preprocess_uwash_depth_dataset(attribs):

    pipeline = preprocessing.Pipeline()

    # rgbd : (num_examples, 72, 72, 4)
    # labels: (num_examples, 1)
    pipeline.items.append(
        hdf5_data_preprocessors.ExtractRawUWashData(
            attribs["raw_data_folder"],
            data_labels=("rgbd_patches", "patch_labels")))

    pipeline.items.append(
        hdf5_data_preprocessors.PerChannelGlobalContrastNormalizePatches(
            data_to_normalize_key='rgbd_patches',
            normalized_data_key='normalized_rgbd_patches',
            batch_size=100))

    # This extracts the validation and test sets.
    pipeline.items.append(
        hdf5_data_preprocessors.SplitData(
            data_to_split_key=('rgbd_patches', 'patch_labels'),
            sets=attribs["sets"],
            patch_shape=attribs["patch_shape"],
            num_patches_per_set=attribs["num_patches_per_set"]))

    pipeline.items.append(hdf5_data_preprocessors.MakeC01B())

    # Now let's actually make a new dataset and run it through the pipeline.
    # (mode="a" matches the old h5py default: create the file if it does not exist.)
    hdf5_dataset = h5py.File(attribs["output_filepath"], mode="a")
    pipeline.apply(hdf5_dataset)
def preprocess_nyu_depth_dataset(attribs):

    pipeline = preprocessing.Pipeline()

    # rgbd : (1449, 640, 480, 4)
    # labels: (1449, 640, 480)
    pipeline.items.append(
        hdf5_data_preprocessors.ExtractRawNYUData(attribs["raw_filepath"],
                                                  data_labels=("rgbd",
                                                               "labels")))

    # Add the steps necessary to generate data for the
    # valid, test and training datasets.
    for i in range(len(attribs["sets"])):

        which_set = attribs["sets"][i]
        num_patches = attribs["num_patches_per_set"][i]

        #labels for the hdf5 file
        patch_label = which_set + "_patches"
        patch_labels = (patch_label, which_set + "_patch_labels")

        pipeline.items.append(
            hdf5_data_preprocessors.ExtractPatches(
                patch_shape=attribs["patch_shape"],
                patch_labels=patch_labels,
                patch_source_labels=("rgbd", "labels"),
                num_patches=num_patches))

    pipeline.items.append(hdf5_data_preprocessors.MakeC01B())

    # Now let's actually make a new dataset and run it through the pipeline.
    # (mode="a" matches the old h5py default: create the file if it does not exist.)
    hdf5_dataset = h5py.File(attribs["output_filepath"], mode="a")
    pipeline.apply(hdf5_dataset)
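Both preprocessing helpers above read their settings from an attribs dictionary. A hypothetical example for the NYU variant is sketched below; the keys mirror the lookups in the code, but every value is a placeholder rather than one taken from the original experiments:

# Hypothetical attribs for preprocess_nyu_depth_dataset (all values are placeholders).
nyu_attribs = {
    "raw_filepath": "/path/to/nyu_depth_data_labeled.mat",  # raw NYU depth file
    "output_filepath": "/path/to/nyu_depth_patches.h5",     # HDF5 file to create
    "sets": ("train", "valid", "test"),
    "num_patches_per_set": (100000, 10000, 10000),
    "patch_shape": (80, 80),                                 # (height, width) of the extracted patches
}

preprocess_nyu_depth_dataset(nyu_attribs)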
Example #3
def test1():
    path_org = '/Tmp/gulcehrc/imagenet_256x256_filtered.h5'
    path = '/Tmp/gulcehrc/imagenetTemp.h5'
    train = Imagenet(which_set='train',
                     path=path,
                     path_org=path_org,
                     size_of_receptive_field=(8, 8),
                     center=True,
                     scale=True,
                     start=0,
                     stop=1000,
                     imageShape=(256, 256),
                     mode='a',
                     axes=('b', 0, 1, 'c'),
                     preprocessor=None)

    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.GlobalContrastNormalizationPyTables())
    pipeline.items.append(
        preprocessing.LeCunLCN((256, 256), channels=[0], kernel_size=7))

    # apply preprocessing to train
    train.apply_preprocessor(pipeline, can_fit=True)
    train.view_shape()

    #testing
    batch_size = 10
    num_batches = 1
    mode = SequentialSubsetIterator
    targets1 = []
    targets2 = []
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')

    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 8x8 approximately whitened, contrast-normalized
    patches drawn uniformly at random from a downsampled (to 32x32)
    version of the STL-10 train and unlabeled datasets.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_stl10_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(8, 8),
                          num_patches=2*1000*1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
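The README above stresses that preprocessor.pkl must be re-applied, without re-fitting, to any data used for feature extraction or testing. A minimal sketch of that step, assuming a pylearn2 dataset pickled elsewhere; both paths below are illustrative placeholders:

from pylearn2.utils import serial

# Load the fitted pipeline and some new data (paths are placeholders).
preprocessor = serial.load('stl10_patches_8x8/preprocessor.pkl')
new_data = serial.load('some_new_design_matrix.pkl')

# can_fit=False: reuse the GCN statistics and ZCA matrix learned on the patch
# dataset instead of re-fitting them on the new data.
new_data.apply_preprocessor(preprocessor=preprocessor, can_fit=False)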
Example #5
    def randomize_datasets(self, datasets):
        center_shift = np.array(self._window_shape) / 2. - 0.5
        tform_center = skimage.transform.SimilarityTransform(
            translation=-center_shift)
        tform_uncenter = skimage.transform.SimilarityTransform(
            translation=center_shift)
        if self._preprocess is not None:
            pipeline = preprocessing.Pipeline()
            #window the rotations to get rid of the uniform background
            if self._central_window_shape is not None:
                print('adding window')
                pipeline.items.append(CentralWindow(
                    self._central_window_shape))

            for item in self._preprocess:
                pipeline.items.append(item)

        im_shape = (self._window_shape[0], self._window_shape[1], 1)

        for d_idx, dataset in enumerate(datasets):

            data = self._original[dataset]
            #randomly window data
            print(data.shape)
            arr = np.empty((data.shape[0], self._window_shape[0],
                            self._window_shape[1], data.shape[3]),
                           dtype=np.float32)
            for idx, example in enumerate(data):
                scale_x = np.random.uniform(1 - self._scale_diff,
                                            1 + self._scale_diff)
                scale_y = np.random.uniform(1 - self._scale_diff,
                                            1 + self._scale_diff)
                translation_x = np.random.uniform(1 - self._translation,
                                                  1 + self._translation)
                translation_y = np.random.uniform(1 - self._translation,
                                                  1 + self._translation)
                shear = np.random.uniform(0. - self._shear, 0. + self._shear)
                rotation = np.random.uniform(0, 360)
                tform = AffineTransform(scale=(scale_x, scale_y),
                                        rotation=np.deg2rad(rotation),
                                        translation=(translation_x,
                                                     translation_y),
                                        shear=shear)
                tform = tform_center + tform + tform_uncenter
                img = warp(example, tform, output_shape=self._window_shape)
                arr[idx] = img

            dataset.set_topological_view(arr, axes=dataset.view_converter.axes)
            # assumes self._randomize is in the order [train, valid/test]
            if self._preprocess is not None:
                can_fit = True
                if d_idx == 1:
                    can_fit = False

                dataset.apply_preprocessor(preprocessor=pipeline,
                                           can_fit=can_fit)
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

    print('Loading CIFAR-100 train dataset...')
    data = CIFAR100(which_set='train')

    print("Preparing output directory...")
    patch_dir = data_dir + '/cifar100/cifar100_patches'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')

    README.write(
        textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 6x6 approximately whitened, contrast-normalized
    patches drawn uniformly at random from the CIFAR-100 train set.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_cifar100_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.ExtractPatches(patch_shape=(6, 6),
                                     num_patches=2 * 1000 * 1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
Example #7
def get_pipeline(img_shape, patch_size, batch_size):
    pipeline = preprocessing.Pipeline()
    conf = get_config()
    if conf['preprocessing']['remove_mean']:
        pipeline.items.append(preprocessing.RemoveMean())
    if conf['preprocessing']['gcn']:
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(batch_size=batch_size)
        )
    if conf['preprocessing']['lcn']:
        # LCN requires an odd kernel size
        lcn_patch_size = patch_size + 1 - (patch_size % 2)
        pipeline.items.append(
            preprocessing.LeCunLCN(
                img_shape, kernel_size=lcn_patch_size)
        )
    return pipeline
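A brief usage sketch for get_pipeline; the dataset and numbers are illustrative. It also shows the odd-kernel adjustment above: an even patch_size of 8 yields an LCN kernel of 8 + 1 - (8 % 2) = 9, while an already-odd value such as 7 is left unchanged.

# Hypothetical call: 32x32 images, 8x8 patches, batch size 128.
# `dataset` stands in for any pylearn2 dataset that supports apply_preprocessor.
pipeline = get_pipeline(img_shape=(32, 32), patch_size=8, batch_size=128)
dataset.apply_preprocessor(preprocessor=pipeline, can_fit=True)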
Example #8
def get_processed_dataset():

    train_path = 'pp_cifar10_train.pkl'
    test_path = 'pp_cifar10_test.pkl'

    if os.path.exists(train_path) and os.path.exists(
            test_path) and not new_params:
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)

    else:
        print('loading raw data...')

        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.ExtractPatchesWithPosition(
                patch_shape=patch_shape, patches_per_image=patches_per_image))
        pipeline.items.append(
            preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                      use_std=True))
        pipeline.items.append(
            preprocessing.PCA(num_components=num_components,
                              keep_var_fraction=keep_var_fraction))
        pipeline.items.append(
            preprocessing.ExtractPatchPairs(
                patches_per_image=patches_per_image,
                num_images=train_size,
                input_width=input_width))

        trainset = cifar10.CIFAR10(which_set="train", start=start, stop=stop)
        testset = cifar10.CIFAR10(which_set="test")

        trainset.preprocessor = pipeline

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)

        # pickling occasionally fails here, possibly because the dataset is too large
        serial.save(train_path, trainset)
        serial.save(test_path, testset)

        # this path will be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    return trainset, testset
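get_processed_dataset refers to several module-level names (new_params, start, stop, train_size, patch_shape, patches_per_image, num_components, keep_var_fraction, input_width) that are not shown in this snippet. Purely to illustrate the kinds of values those names hold, a hypothetical configuration might look like this:

# Hypothetical module-level configuration assumed by get_processed_dataset;
# none of these numbers come from the original experiment.
new_params = False           # force re-preprocessing when True
start, stop = 0, 40000       # slice of the CIFAR-10 training set
train_size = stop - start    # number of training images
patch_shape = (8, 8)         # shape of the extracted patches
patches_per_image = 5        # patches drawn from each image
num_components = 512         # PCA components to keep
keep_var_fraction = 0.9      # fraction of variance retained by PCA
input_width = 64             # width expected by ExtractPatchPairs (placeholder)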
Example #9
def main():
    train = cifar10.CIFAR10(which_set="train", center=True)

    pipeline = preprocessing.Pipeline()
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(subtract_mean=False,
                                                  sqrt_bias=0.0,
                                                  use_std=True))
    pipeline.items.append(preprocessing.PCA(num_components=512))

    test = cifar10.CIFAR10(which_set="test")

    train.apply_preprocessor(preprocessor=pipeline, can_fit=True)
    test.apply_preprocessor(preprocessor=pipeline, can_fit=False)

    serial.save('cifar10_preprocessed_train.pkl', train)
    serial.save('cifar10_preprocessed_test.pkl', test)
def get_dataset_cifar10():
    """
    The original pipeline on cifar10 from pylearn2. Please refer to
    pylearn2/scripts/train_example/make_dataset.py for details.
    """

    train_path = 'cifar10_preprocessed_train.pkl'
    test_path = 'cifar10_preprocessed_test.pkl'

    if os.path.exists(train_path) and \
            os.path.exists(test_path):
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        testset = serial.load(test_path)
    else:
        print('loading raw data...')
        trainset = cifar10.CIFAR10(which_set="train")
        testset = cifar10.CIFAR10(which_set="test")

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()

        pipeline.items.append(
            preprocessing.ExtractPatches(patch_shape=(8, 8), num_patches=150000))

        pipeline.items.append(preprocessing.GlobalContrastNormalization())

        pipeline.items.append(preprocessing.ZCA())

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        trainset.use_design_loc('train_design.npy')

        # can_fit=False: reuse the GCN/ZCA statistics fitted on the training set
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc('test_design.npy')

        print('saving preprocessed data...')
        serial.save('cifar10_preprocessed_train.pkl', trainset)
        serial.save('cifar10_preprocessed_test.pkl', testset)

        trainset.yaml_src = '!pkl: "%s"' % train_path
        testset.yaml_src = '!pkl: "%s"' % test_path

    # this path will be used for visualizing weights after training is done
    #global YAML
    return trainset, testset
Example #11
def process_data():
    # pre-process unsupervised data
    if not os.path.exists(DATA_DIR+'preprocess.pkl') \
    or not os.path.exists(DATA_DIR+'unsup_prep_data.pkl') \
    or not os.path.exists(DATA_DIR+'sup_prep_data.pkl'):
        unsup_data = black_box_dataset.BlackBoxDataset('extra')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.Standardize(global_mean=False, global_std=False))
        #pipeline.items.append(preprocessing.ZCA(filter_bias=.1))
        unsup_data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        serial.save(DATA_DIR + 'preprocess.pkl', pipeline)

        # serial.save raises pickling errors for this dataset, so pickle it directly instead
        #serial.save(DATA_DIR+'unsup_prep_data.pkl', unsup_data)
        out = open(DATA_DIR + 'unsup_prep_data.pkl', 'wb')
        pickle.dump(unsup_data, out)
        out.close()

        # process supervised training data
        sup_data = []
        which_data = ['train'] * 3 + ['public_test']
        starts = [0, 800, None, None]
        stops = [800, 1000, None, None]
        fits = [False, False, False, False]
        for curstr, start, stop, fit in zip(which_data, starts, stops, fits):
            sup_data.append(
                black_box_dataset.BlackBoxDataset(which_set=curstr,
                                                  start=start,
                                                  stop=stop,
                                                  preprocessor=pipeline,
                                                  fit_preprocessor=fit))
        serial.save(DATA_DIR + 'sup_prep_data.pkl', sup_data)

    else:
        pipeline = serial.load(DATA_DIR + 'preprocess.pkl')
        #unsup_data = serial.load(DATA_DIR+'unsup_prep_data.pkl')
        unsup_data = pickle.load(open(DATA_DIR + 'unsup_prep_data.pkl', 'rb'))
        sup_data = serial.load(DATA_DIR + 'sup_prep_data.pkl')

    return unsup_data, sup_data
Example #12
def generate_patches():
    datasets = OrderedDict()
    datasets['train'] = GenderWrite.gwdata.GWData(which_set='train',
                                                  start=1,
                                                  stop=201)
    datasets['valid'] = GenderWrite.gwdata.GWData(which_set='train',
                                                  start=201,
                                                  stop=283)
    datasets['test'] = GenderWrite.gwdata.GWData(which_set='test')
    datasets['tottrain'] = GenderWrite.gwdata.GWData(which_set='train')

    # preprocess patches
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.GlobalContrastNormalization())
    pipeline.items.append(preprocessing.ZCA())
    for dstr, dset in datasets.items():
        print(dstr)
        # only fit on train data
        trainbool = dstr == 'train' or dstr == 'tottrain'
        dset.apply_preprocessor(preprocessor=pipeline, can_fit=trainbool)
        # save
        dset.use_design_loc(DATA_DIR + dstr + '_design.npy')
        serial.save(DATA_DIR + 'gw_preprocessed_' + dstr + '.pkl', dset)
Example #13
    def load_preprocessor(self, preprocess_array):
        if preprocess_array is None:
            self.preprocessor = None
            return None
        preprocess_list = []
        for preprocess_id in preprocess_array:
            row = self.db.executeSQL(
                """
            SELECT preprocess_class
            FROM hps3.preprocess
            WHERE preprocess_id = %s
            """, (preprocess_id, ), self.db.FETCH_ONE)
            if not row:
                raise HPSData("No preprocess for preprocess_id="\
                    +str(preprocess_id))
            preprocess_class = row[0]
            fn = getattr(self, 'get_preprocess_' + preprocess_class)
            preprocess_list.append(fn(preprocess_id))

        if len(preprocess_list) > 1:
            preprocessor = pp.Pipeline(preprocess_list)
        else:
            preprocessor = preprocess_list[0]
        self.preprocessor = preprocessor
Example #14
def get_dataset(which_data, tot=False):
    train_path = DATA_DIR + 'train' + which_data + '_preprocessed.pkl'
    valid_path = DATA_DIR + 'valid' + which_data + '_preprocessed.pkl'
    tottrain_path = DATA_DIR + 'tottrain' + which_data + '_preprocessed.pkl'
    test_path = DATA_DIR + 'test' + which_data + '_preprocessed.pkl'

    if os.path.exists(train_path) and os.path.exists(
            valid_path) and os.path.exists(test_path):

        print('loading preprocessed data')
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:

        print('loading raw data...')
        trainset = Whales(which_set="train",
                          which_data=which_data,
                          start=0,
                          stop=56671)
        validset = Whales(which_set="train",
                          which_data=which_data,
                          start=56671,
                          stop=66671)
        tottrainset = Whales(which_set="train", which_data=which_data)
        testset = Whales(which_set="test", which_data=which_data)

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()

        if which_data == 'melspectrum':
            pipeline.items.append(
                preprocessing.Standardize(global_mean=True, global_std=True))
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original image better
            pipeline.items.append(preprocessing.ZCA())
        else:
            # global_mean/std=False for per-feature standardization
            pipeline.items.append(
                preprocessing.Standardize(global_mean=False, global_std=False))

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # this uses numpy format for storage instead of pickle, for memory reasons
        trainset.use_design_loc(DATA_DIR + 'train_' + which_data +
                                '_design.npy')
        # note the can_fit=False: no sharing between train and test data
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        validset.use_design_loc(DATA_DIR + 'valid_' + which_data +
                                '_design.npy')
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        tottrainset.use_design_loc(DATA_DIR + 'tottrain_' + which_data +
                                   '_design.npy')
        # note the can_fit=False: no sharing between train and test data
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc(DATA_DIR + 'test_' + which_data + '_design.npy')

        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path

        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(valid_path, validset)
        serial.save(tottrain_path, tottrainset)
        serial.save(test_path, testset)

    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
Example #15
from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.datasets.tfd import TFD

train = TFD(which_set='train')

preprocessor = preprocessing.Pipeline()
preprocessor.items.append(preprocessing.GlobalContrastNormalization())
preprocessor.items.append(preprocessing.ZCA())

preprocessor.apply(train, can_fit=True)

serial.save('tfd_gcn_whitener.pkl', preprocessor)
Example #16
def generate(opc):
    """
    Generate a dataset with the chosen transformation.

    Parameters
    ----------
    opc: string
        Only two options: 'shifts' or 'rotations'.
    """
    dim = 19  # outer square
    # A bigger image is used to avoid empty pixels in the
    # borders.
    reg = 13  # inner square
    total = 20000  # Number of training examples

    im1 = numpy.zeros((total, reg, reg, 1), dtype='float32')
    im2 = numpy.zeros((total, reg, reg, 1), dtype='float32')
    Y = numpy.zeros((total, 1), dtype='uint8')
    rng = make_np_rng(9001, [1, 2, 3], which_method="uniform")
    transformation = opc

    if transformation == 'shifts':
        # Shifts
        # only shifts between [-3, +3] pixels
        shifts = list(itertools.product(range(-3, 4), range(-3, 4)))
        t = 0
        while t < total:
            x = rng.uniform(0, 1, (dim, dim))
            x = numpy.ceil(x * 255)
            im_x = x[3:16, 3:16][:, :, None]
            ind = rng.randint(0, len(shifts))
            Y[t] = ind
            txy = shifts[ind]
            tx, ty = txy
            im_y = x[(3 + tx):(16 + tx), (3 + ty):(16 + ty)][:, :, None]
            im1[t, :] = im_x
            im2[t, :] = im_y
            t += 1
    else:
        assert transformation == 'rotations'
        # Rotations
        from PIL import Image
        # import cv2
        angs = numpy.linspace(0, 359, 90)
        t = 0
        while t < total:
            x = rng.uniform(0, 1, (dim, dim))
            x = numpy.ceil(x * 255)
            im_x = x[3:16, 3:16][:, :, None]
            ind = rng.randint(0, len(angs))
            Y[t] = ind
            ang = angs[ind]
            y = numpy.asarray(Image.fromarray(x).rotate(ang))
            # scale = 1
            # M1 = cv2.getRotationMatrix2D((dim/2, dim/2), ang, scale)
            # y = cv2.warpAffine(x, M1, (dim, dim))
            im_y = y[3:16, 3:16][:, :, None]
            im1[t, :] = im_x
            im2[t, :] = im_y
            t += 1

    view_converter = dense_design_matrix.DefaultViewConverter((reg, reg, 1))

    design_X = view_converter.topo_view_to_design_mat(im1)
    design_Y = view_converter.topo_view_to_design_mat(im2)

    # Normalize data:
    pipeline = preprocessing.Pipeline()
    gcn = preprocessing.GlobalContrastNormalization(sqrt_bias=10.,
                                                    use_std=True)
    pipeline.items.append(gcn)
    XY = numpy.concatenate((design_X, design_Y), 0)
    XY_ImP = dense_design_matrix.DenseDesignMatrix(X=XY)
    XY_ImP.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    X1 = XY_ImP.X[0:design_X.shape[0], :]
    X2 = XY_ImP.X[design_X.shape[0]:, :]

    # As a Conv2DSpace
    topo_X1 = view_converter.design_mat_to_topo_view(X1)
    topo_X2 = view_converter.design_mat_to_topo_view(X2)
    axes = ('b', 0, 1, 'c')
    data_specs = (CompositeSpace([
        Conv2DSpace((reg, reg), num_channels=1, axes=axes),
        Conv2DSpace((reg, reg), num_channels=1, axes=axes),
        VectorSpace(1)
    ]), ('featuresX', 'featuresY', 'targets'))
    train = VectorSpacesDataset((topo_X1, topo_X2, Y), data_specs=data_specs)

    # As a VectorSpace
    # data_specs = (CompositeSpace(
    # [VectorSpace(reg * reg),
    # VectorSpace(reg * reg),
    #      VectorSpace(1)]),
    #               ('featuresX', 'featuresY', 'targets'))
    # train = VectorSpacesDataset(data=(X1, X2, Y), data_specs=data_specs)

    import os

    save_path = os.path.dirname(os.path.realpath(__file__))
    serial.save(os.path.join(save_path, 'train_preprocessed.pkl'), train)
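A one-line usage sketch for generate; only the two transformation names from the docstring are valid arguments.

# Build the shift-transformation dataset; pass 'rotations' for the other variant.
# The result is written to train_preprocessed.pkl next to this script and can be
# reloaded later with pylearn2.utils.serial.load.
generate('shifts')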
Example #17
def get_dataset(tot=False, preprocessor='normal'):
    if not os.path.exists(DATA_DIR+'train.npy') or \
        not os.path.exists(DATA_DIR+'test.npy') or \
        not os.path.exists(DATA_DIR+'targets.npy'):
        initial_read()
    
    train_path = DATA_DIR+'train_'+preprocessor+'_preprocessed.pkl'
    valid_path = DATA_DIR+'valid_'+preprocessor+'_preprocessed.pkl'
    tottrain_path = DATA_DIR+'tottrain_'+preprocessor+'_preprocessed.pkl'
    test_path = DATA_DIR+'test_'+preprocessor+'_preprocessed.pkl'
    
    if os.path.exists(train_path) and os.path.exists(valid_path) and os.path.exists(test_path):
        
        print('loading preprocessed data')
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:
        
        print('loading raw data...')
        trainset = Digits(which_set='train', start=0, stop=34000)
        validset = Digits(which_set='train', start=34000, stop=42000)
        tottrainset = Digits(which_set='train')
        testset = Digits(which_set='test')
        
        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
        
        if preprocessor != 'nozca':
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original image better
            pipeline.items.append(preprocessing.ZCA())
        
        # note the can_fit=False's: no sharing between train and valid data
        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        
        if preprocessor not in ('normal','nozca'):
            for data in (trainset, validset, tottrainset, testset):
                for ii in range(data.X.shape[0]):
                    # normalize to [0,1]
                    dmax = np.max(data.X[ii,:])
                    dmin = np.min(data.X[ii,:])
                    dnorm = (data.X[ii,:] - dmin) / (dmax - dmin)
                    # and convert to PIL image
                    img = Image.fromarray(dnorm.reshape(28, 28) * 255.).convert('L')
                    
                    # apply preprocessor
                    if preprocessor == 'rotate':
                        rot = rng.randint(-40, 41)
                        img = img.rotate(rot, Image.BILINEAR)
                    elif preprocessor == 'emboss':
                        img = emboss(img)
                    elif preprocessor == 'hshear':
                        # coef = 0 means unsheared
                        coef = -1 + np.random.rand()*2
                        # note: the image is shifted by (coef/2)*28 to keep it
                        # centred after shearing (see the worked check after this function)
                        img = img.transform((28,28), Image.AFFINE, (1,coef,-(coef/2)*28,0,1,0), Image.BILINEAR)
                    elif preprocessor == 'vshear':
                        coef = -1 + np.random.rand()*2
                        img = img.transform((28,28), Image.AFFINE, (1,0,0,coef,1,-(coef/2)*28), Image.BILINEAR)
                    elif preprocessor == 'patch':
                        # negative offsets are not possible with PIL here, so
                        # apply a zoom-only (crop and rescale) transform instead
                        x1 = np.random.randint(0, 5)
                        y1 = np.random.randint(0, 5)
                        x2 = np.random.randint(0, 5)
                        y2 = np.random.randint(0, 5)
                        img = img.transform((28,28), Image.EXTENT, (x1, y1, 28-x2, 28-y2), Image.BILINEAR)
                    
                    # convert back to numpy array
                    data.X[ii,:] = np.array(img.getdata()) / 255.
                    
                    if preprocessor == 'noisy':
                        # add noise
                        data.X[ii,:] += np.random.randn(28*28) * 0.1
                        # bound between [0,1]
                        data.X[ii,:] = np.minimum(np.ones(28*28), np.maximum(np.zeros(28*28), data.X[ii,:]))
        
        # this uses numpy format for storage instead of pickle, for memory reasons
        trainset.use_design_loc(DATA_DIR+'train_'+preprocessor+'_design.npy')
        validset.use_design_loc(DATA_DIR+'valid_'+preprocessor+'_design.npy')
        tottrainset.use_design_loc(DATA_DIR+'tottrain_'+preprocessor+'_design.npy')
        testset.use_design_loc(DATA_DIR+'test_'+preprocessor+'_design.npy')
        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path
        
        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(valid_path, validset)
        serial.save(tottrain_path, tottrainset)
        serial.save(test_path, testset)
        
    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
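The (coef/2)*28 offset in the shear branches above is what keeps the sheared digit centred. A small worked check, using the fact that a PIL AFFINE transform with data (a, b, c, d, e, f) samples input pixel (a*x + b*y + c, d*x + e*y + f) for each output pixel (x, y):

# For the horizontal shear (1, coef, -(coef/2)*28, 0, 1, 0), output pixel (x, y)
# samples input (x + coef*y - coef*14, y): the offset vanishes at the vertical
# centre y = 14, so the middle row stays put and the shear fans out around it.
coef = 0.5  # example shear coefficient
for y in (0, 14, 27):
    offset = coef * y - (coef / 2) * 28
    print('row %2d shifts by %+.1f pixels' % (y, offset))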
def test_works():
    load = True
    if not load:
        ddmTrain = FacialKeypoint(which_set='train', start=0, stop=6000)
        ddmValid = FacialKeypoint(which_set='train', start=6000, stop=7049)
        # valid can_fit = false
        pipeline = preprocessing.Pipeline()
        stndrdz = preprocessing.Standardize()
        stndrdz.apply(ddmTrain, can_fit=True)
        # can_fit=False: the validation set reuses the statistics fitted on the training set
        stndrdz.apply(ddmValid, can_fit=False)
        GCN = preprocessing.GlobalContrastNormalization()
        GCN.apply(ddmTrain, can_fit=True)
        GCN.apply(ddmValid, can_fit=False)

        pcklFile = open('kpd.pkl', 'wb')
        obj = (ddmTrain, ddmValid)
        pickle.dump(obj, pcklFile)
        pcklFile.close()
        return
    else:
        pcklFile = open('kpd.pkl', 'rb')
        (ddmTrain, ddmValid) = pickle.load(pcklFile)
        pcklFile.close()

    #creating layers
    #2 convolutional rectified layers, border mode valid
    layer1 = ConvRectifiedLinear(layer_name='convRect1',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    layer2 = ConvRectifiedLinear(layer_name='convRect2',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    # Rectified linear units
    layer3 = RectifiedLinear(dim=3000, sparse_init=15, layer_name='RectLin3')

    #multisoftmax
    n_groups = 30
    n_classes = 98
    irange = 0
    layer_name = 'multisoftmax'
    layerMS = MultiSoftmax(n_groups=n_groups,
                           irange=0.05,
                           n_classes=n_classes,
                           layer_name=layer_name)

    #setting up MLP
    MLPerc = MLP(batch_size=8,
                 input_space=Conv2DSpace(shape=[96, 96], num_channels=1),
                 layers=[layer1, layer2, layer3, layerMS])

    #mlp_cost
    missing_target_value = -1
    mlp_cost = MLPCost(cost_type='default',
                       missing_target_value=missing_target_value)

    #algorithm

    # learning rate, momentum, batch size, monitoring dataset, cost, termination criteria

    term_crit = MonitorBased(prop_decrease=0.00001,
                             N=30,
                             channel_name='validation_objective')
    kpSGD = KeypointSGD(learning_rate=0.001,
                        init_momentum=0.5,
                        monitoring_dataset={
                            'validation': ddmValid,
                            'training': ddmTrain
                        },
                        batch_size=8,
                        batches_per_iter=750,
                        termination_criterion=term_crit,
                        train_iteration_mode='random_uniform',
                        cost=mlp_cost)

    #train extension
    train_ext = ExponentialDecayOverEpoch(decay_factor=0.998,
                                          min_lr_scale=0.01)
    #train object
    train = Train(dataset=ddmTrain,
                  save_path='kpd_model2.pkl',
                  save_freq=1,
                  model=MLPerc,
                  algorithm=kpSGD,
                  extensions=[
                      train_ext,
                      MonitorBasedSaveBest(channel_name='validation_objective',
                                           save_path='kpd_best.pkl'),
                      MomentumAdjustor(start=1, saturate=20, final_momentum=.9)
                  ])
    train.main_loop()
    train.save()
to extract the patches and approximately whiten / contrast normalize
them. This object is necessary when extracting features for
supervised learning or test set classification, because the
extracted features must be computed using inputs that have been
whitened with the ZCA matrix learned and stored by this Pipeline.

They were created with the pylearn2 script make_cifar100_patches.py.

All other files in this directory, including this README, were
created by the same script and are necessary for the other files
to function correctly.
""")

README.close()

print("Preprocessing the data...")
pipeline = preprocessing.Pipeline()
pipeline.items.append(
    preprocessing.ExtractPatches(patch_shape=(8, 8),
                                 num_patches=2 * 1000 * 1000))
pipeline.items.append(
    preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
pipeline.items.append(preprocessing.ZCA())
data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

data.use_design_loc(patch_dir + '/data.npy')

serial.save(patch_dir + '/data.pkl', data)

serial.save(patch_dir + '/preprocessor.pkl', pipeline)
Example #20
def get_data(tot=True, flatgrey=False):
    tottrain_path = DATA_DIR+'gz_preprocessed_tottrain' + str(SUBMODEL) + '_64x.pkl'
    test_path = DATA_DIR+'gz_preprocessed_test' + str(SUBMODEL) + '_64x.pkl'

    if os.path.exists(test_path):

        print('loading preprocessed data')
        datasets = OrderedDict()
        # datasets['train'] = serial.load(train_path)
        # datasets['valid'] = serial.load(valid_path)
        if tot:
            datasets['tottrain'] = serial.load(tottrain_path)
        datasets['test'] = serial.load(test_path)
        if tot:
            return datasets['tottrain'], datasets['test']
        else:
            # note: the 'train' split is never loaded above (it is commented out),
            # so this branch only works if that loading code is restored
            return datasets['train'], datasets['test']
    else:
        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(preprocessing.GlobalContrastNormalization(use_std=True))
        pipeline.items.append(preprocessing.ZCA())

        # print 'traindata'
        # data = GalaxyZoo.gzdeepdata.GZData(which_set='training', start=0, stop=39999)
        # data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # # this path can be used for visualizing weights after training is done
        # data.yaml_src = '!pkl: "%s"' % data
        # # save
        # data.use_design_loc(DATA_DIR+'train_design' + str(SUBMODEL) + '.npy')
        # serial.save(DATA_DIR+'gz_preprocessed_train'+str(SUBMODEL) + '.pkl', data)

        # print 'validdata'
        # data = GalaxyZoo.gzdeepdata.GZData(which_set='training', start=40000, stop=61577)
        # data.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        # # this path can be used for visualizing weights after training is done
        # data.yaml_src = '!pkl: "%s"' % data
        # # save
        # data.use_design_loc(DATA_DIR+'valid_design' + str(SUBMODEL) + '.npy')
        # serial.save(DATA_DIR+'gz_preprocessed_valid'+str(SUBMODEL) + '.pkl', data)

        print('tottraindata')
        data = GalaxyZoo.gzdeepdata.GZData(which_set='training', flatgrey=flatgrey)
        data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # this path can be used for visualizing weights after training is done
        data.yaml_src = '!pkl: "%s"' % data
        # save
        data.use_design_loc(DATA_DIR + 'tottrain_design' + str(SUBMODEL) + '_64x.npy')
        serial.save(DATA_DIR + 'gz_preprocessed_tottrain' + str(SUBMODEL) + '_64x.pkl', data)

        print('testdata')
        data = GalaxyZoo.gzdeepdata.GZData(which_set='test', flatgrey=flatgrey)
        data.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        # this path can be used for visualizing weights after training is done
        data.yaml_src = '!pkl: "%s"' % data
        # save
        data.use_design_loc(DATA_DIR + 'test_design' + str(SUBMODEL) + '_64x.npy')
        serial.save(DATA_DIR + 'gz_preprocessed_test' + str(SUBMODEL) + '_64x.pkl', data)

        print('Finished; now re-run to train the model on the GPU')
        return None, None