Example #1
def get_preprocess_standardize(self, preprocess_id):
    # look up the stored standardization statistics for this preprocess_id
    row = self.db.executeSQL(
        """
    SELECT global_mean, global_std, std_eps
    FROM hps3.preprocess_standardize
    WHERE preprocess_id = %s
    """, (preprocess_id, ), self.db.FETCH_ONE)
    if not row:
        raise HPSData("No standardize preprocess for preprocess_id=" +
                      str(preprocess_id))
    (global_mean, global_std, std_eps) = row
    return pp.Standardize(global_mean=global_mean,
                          global_std=global_std,
                          std_eps=std_eps)
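A minimal usage sketch for the preprocessor built above (hps and dataset are hypothetical stand-ins for an instance of the surrounding class and a pylearn2 dataset; apply_preprocessor follows the same pattern as the later examples):

preprocessor = hps.get_preprocess_standardize(preprocess_id=1)
# the statistics were loaded from the database, so nothing needs to be fitted here
dataset.apply_preprocessor(preprocessor=preprocessor, can_fit=False)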
Example #2
def process_data():
    # pre-process unsupervised data
    if not os.path.exists(DATA_DIR+'preprocess.pkl') \
    or not os.path.exists(DATA_DIR+'unsup_prep_data.pkl') \
    or not os.path.exists(DATA_DIR+'sup_prep_data.pkl'):
        unsup_data = black_box_dataset.BlackBoxDataset('extra')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.Standardize(global_mean=False, global_std=False))
        #pipeline.items.append(preprocessing.ZCA(filter_bias=.1))
        unsup_data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        serial.save(DATA_DIR + 'preprocess.pkl', pipeline)

        # serial.save raises pickling errors here, so pickle the dataset manually
        #serial.save(DATA_DIR+'unsup_prep_data.pkl', unsup_data)
        out = open(DATA_DIR + 'unsup_prep_data.pkl', 'wb')
        pickle.dump(unsup_data, out)
        out.close()

        # process supervised training data
        sup_data = []
        which_data = ['train'] * 3 + ['public_test']
        starts = [0, 800, None, None]
        stops = [800, 1000, None, None]
        fits = [False, False, False, False]
        for curstr, start, stop, fit in zip(which_data, starts, stops, fits):
            sup_data.append(
                black_box_dataset.BlackBoxDataset(which_set=curstr,
                                                  start=start,
                                                  stop=stop,
                                                  preprocessor=pipeline,
                                                  fit_preprocessor=fit))
        serial.save(DATA_DIR + 'sup_prep_data.pkl', sup_data)

    else:
        pipeline = serial.load(DATA_DIR + 'preprocess.pkl')
        #unsup_data = serial.load(DATA_DIR+'unsup_prep_data.pkl')
        unsup_data = pickle.load(open(DATA_DIR + 'unsup_prep_data.pkl', 'rb'))
        sup_data = serial.load(DATA_DIR + 'sup_prep_data.pkl')

    return unsup_data, sup_data
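If the cached pipeline is later needed for a further split, the fitted statistics can be reused without refitting; a minimal sketch, assuming the same BlackBoxDataset interface as above:

extra_set = black_box_dataset.BlackBoxDataset(which_set='public_test',
                                              preprocessor=pipeline,
                                              fit_preprocessor=False)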
Example #3
from pylearn2.utils import serial
from pylearn2.datasets import mike_preprocessing
from pylearn2.datasets import preprocessing
from pylearn2.utils import string_utils
from pylearn2.datasets.mat_data import MATDATA
import datetime

data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/wbc')

output_dir = data_dir + '/augmented1906'
serial.mkdir(output_dir)

preprocessor1 = preprocessing.Standardize(global_std=True)
preprocessor2 = mike_preprocessing.AugmentAndBalance()
preprocessor3 = preprocessing.ShuffleAndSplit(
    seed=datetime.datetime.now().microsecond, start=0, stop=11000)

train = MATDATA(path=data_dir +
                '/classData19062014PythonMyeloidsOnlyTrain.mat',
                which_set='full',
                step=2,
                one_hot=1)
train.apply_preprocessor(preprocessor=preprocessor1, can_fit=True)
train.apply_preprocessor(preprocessor=preprocessor2)
train.apply_preprocessor(preprocessor=preprocessor3)
train.use_design_loc(output_dir + '/train.npy')
serial.save(output_dir + '/train.pkl', train)

test = MATDATA(path=data_dir + '/classData19062014PythonMyeloidsOnlyTest.mat',
               which_set='full',
               step=2,
               one_hot=1)  # assumed to mirror the train call; the source truncates this example here
Example #4
def test_works():
    load = True
    # run once with load = False to preprocess and pickle the data,
    # then rerun with load = True to train on the cached datasets
    if not load:
        ddmTrain = FacialKeypoint(which_set='train', start=0, stop=6000)
        ddmValid = FacialKeypoint(which_set='train', start=6000, stop=7049)
        # fit the standardization statistics on the training set ...
        stndrdz = preprocessing.Standardize()
        stndrdz.apply(ddmTrain, can_fit=True)
        # ... and reuse them on the validation set (no refitting)
        stndrdz.apply(ddmValid, can_fit=False)
        GCN = preprocessing.GlobalContrastNormalization()
        GCN.apply(ddmTrain, can_fit=True)
        GCN.apply(ddmValid, can_fit=False)

        pcklFile = open('kpd.pkl', 'wb')
        obj = (ddmTrain, ddmValid)
        pickle.dump(obj, pcklFile)
        pcklFile.close()
        return
    else:
        pcklFile = open('kpd.pkl', 'rb')
        (ddmTrain, ddmValid) = pickle.load(pcklFile)
        pcklFile.close()

    #creating layers
    #2 convolutional rectified layers, border mode valid
    layer1 = ConvRectifiedLinear(layer_name='convRect1',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    layer2 = ConvRectifiedLinear(layer_name='convRect2',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    # Rectified linear units
    layer3 = RectifiedLinear(dim=3000, sparse_init=15, layer_name='RectLin3')

    #multisoftmax
    n_groups = 30
    n_classes = 98
    irange = 0.05
    layer_name = 'multisoftmax'
    layerMS = MultiSoftmax(n_groups=n_groups,
                           irange=irange,
                           n_classes=n_classes,
                           layer_name=layer_name)

    #setting up MLP
    MLPerc = MLP(batch_size=8,
                 input_space=Conv2DSpace(shape=[96, 96], num_channels=1),
                 layers=[layer1, layer2, layer3, layerMS])

    #mlp_cost
    missing_target_value = -1
    mlp_cost = MLPCost(cost_type='default',
                       missing_target_value=missing_target_value)

    #algorithm

    # learning rate, momentum, batch size, monitoring dataset, cost, termination criteria

    term_crit = MonitorBased(prop_decrease=0.00001,
                             N=30,
                             channel_name='validation_objective')
    kpSGD = KeypointSGD(learning_rate=0.001,
                        init_momentum=0.5,
                        monitoring_dataset={
                            'validation': ddmValid,
                            'training': ddmTrain
                        },
                        batch_size=8,
                        batches_per_iter=750,
                        termination_criterion=term_crit,
                        train_iteration_mode='random_uniform',
                        cost=mlp_cost)

    #train extension
    train_ext = ExponentialDecayOverEpoch(decay_factor=0.998,
                                          min_lr_scale=0.01)
    #train object
    train = Train(dataset=ddmTrain,
                  save_path='kpd_model2.pkl',
                  save_freq=1,
                  model=MLPerc,
                  algorithm=kpSGD,
                  extensions=[
                      train_ext,
                      MonitorBasedSaveBest(channel_name='validation_objective',
                                           save_path='kpd_best.pkl'),
                      MomentumAdjustor(start=1, saturate=20, final_momentum=.9)
                  ])
    train.main_loop()
    train.save()
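Not part of the original example, but a common follow-up: a hedged sketch of reloading the best model written by MonitorBasedSaveBest and compiling a Theano prediction function (the file name matches the save_path used above).

import theano
from pylearn2.utils import serial

model = serial.load('kpd_best.pkl')
# build a symbolic batch matching the model's input space and compile fprop
X = model.get_input_space().make_theano_batch()
predict = theano.function([X], model.fprop(X))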
Example #5
def get_dataset(which_data, tot=False):
    train_path = DATA_DIR + 'train' + which_data + '_preprocessed.pkl'
    valid_path = DATA_DIR + 'valid' + which_data + '_preprocessed.pkl'
    tottrain_path = DATA_DIR + 'tottrain' + which_data + '_preprocessed.pkl'
    test_path = DATA_DIR + 'test' + which_data + '_preprocessed.pkl'

    # when tot is requested, the cached tottrain file must exist as well
    if os.path.exists(train_path) and os.path.exists(valid_path) \
            and os.path.exists(test_path) \
            and (not tot or os.path.exists(tottrain_path)):

        print('loading preprocessed data')
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:

        print('loading raw data...')
        trainset = Whales(which_set="train",
                          which_data=which_data,
                          start=0,
                          stop=56671)
        validset = Whales(which_set="train",
                          which_data=which_data,
                          start=56671,
                          stop=66671)
        tottrainset = Whales(which_set="train", which_data=which_data)
        testset = Whales(which_set="test", which_data=which_data)

        print('preprocessing data...')
        pipeline = preprocessing.Pipeline()

        if which_data == 'melspectrum':
            pipeline.items.append(
                preprocessing.Standardize(global_mean=True, global_std=True))
            # ZCA = zero-phase component analysis
            # very similar to PCA, but preserves the look of the original image better
            pipeline.items.append(preprocessing.ZCA())
        else:
            # global_mean/std=False for per-feature standardization
            pipeline.items.append(
                preprocessing.Standardize(global_mean=False, global_std=False))

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # this uses numpy format for storage instead of pickle, for memory reasons
        trainset.use_design_loc(DATA_DIR + 'train_' + which_data +
                                '_design.npy')
        # note the can_fit=False: no sharing between train and test data
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        validset.use_design_loc(DATA_DIR + 'valid_' + which_data +
                                '_design.npy')
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        tottrainset.use_design_loc(DATA_DIR + 'tottrain_' + which_data +
                                   '_design.npy')
        # note the can_fit=False: no sharing between train and test data
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc(DATA_DIR + 'test_' + which_data + '_design.npy')

        # this path can be used for visualizing weights after training is done
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path

        print('saving preprocessed data...')
        serial.save(train_path, trainset)
        serial.save(valid_path, validset)
        serial.save(tottrain_path, tottrainset)
        serial.save(test_path, testset)

    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
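The pattern shared by the examples above, condensed into a minimal sketch (trainset and testset stand for any pylearn2 DenseDesignMatrix-style datasets):

from pylearn2.datasets import preprocessing

pipeline = preprocessing.Pipeline()
pipeline.items.append(
    preprocessing.Standardize(global_mean=False, global_std=False))
# fit the standardization statistics on the training data only ...
trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
# ... and reuse them, unchanged, on held-out data
testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)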