def get_preprocess_standardize(self, preprocess_id):
    row = self.db.executeSQL(
        """
        SELECT global_mean, global_std, std_eps
        FROM hps3.preprocess_standardize
        WHERE preprocess_id = %s
        """,
        (preprocess_id,), self.db.FETCH_ONE)
    if not row:
        # "not row" already covers row is None, so no separate None check is needed.
        raise HPSData("No standardize preprocess for preprocess_id="
                      + str(preprocess_id))
    (global_mean, global_std, std_eps) = row
    return pp.Standardize(global_mean=global_mean,
                          global_std=global_std,
                          std_eps=std_eps)
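
# Sketch (not from the original code) of what the returned Standardize
# preprocessor does to a design matrix X, assuming pylearn2's usual
# behaviour: subtract the mean and divide by (std_eps + std), either
# globally or per feature; std_eps guards against division by zero.
import numpy as np

def standardize_sketch(X, global_mean=False, global_std=False, std_eps=1e-4):
    mean = X.mean() if global_mean else X.mean(axis=0)
    std = X.std() if global_std else X.std(axis=0)
    return (X - mean) / (std_eps + std)
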
def process_data():
    # Pre-process the unsupervised data, unless cached pickles already exist.
    if not os.path.exists(DATA_DIR + 'preprocess.pkl') \
            or not os.path.exists(DATA_DIR + 'unsup_prep_data.pkl') \
            or not os.path.exists(DATA_DIR + 'sup_prep_data.pkl'):
        unsup_data = black_box_dataset.BlackBoxDataset('extra')
        pipeline = preprocessing.Pipeline()
        pipeline.items.append(
            preprocessing.Standardize(global_mean=False, global_std=False))
        #pipeline.items.append(preprocessing.ZCA(filter_bias=.1))
        unsup_data.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        serial.save(DATA_DIR + 'preprocess.pkl', pipeline)
        # serial.save raises pickling errors on this dataset, so pickle it
        # directly instead (note the binary mode, required for pickle).
        out = open(DATA_DIR + 'unsup_prep_data.pkl', 'wb')
        pickle.dump(unsup_data, out)
        out.close()

        # Process the supervised data: train[0:800], train[800:1000],
        # the full training set, and the public test set.
        sup_data = []
        which_data = ['train'] * 3 + ['public_test']
        starts = [0, 800, None, None]
        stops = [800, 1000, None, None]
        fits = [False, False, False, False]
        for curstr, start, stop, fit in zip(which_data, starts, stops, fits):
            sup_data.append(
                black_box_dataset.BlackBoxDataset(which_set=curstr,
                                                  start=start,
                                                  stop=stop,
                                                  preprocessor=pipeline,
                                                  fit_preprocessor=fit))
        serial.save(DATA_DIR + 'sup_prep_data.pkl', sup_data)
    else:
        pipeline = serial.load(DATA_DIR + 'preprocess.pkl')
        unsup_data = pickle.load(open(DATA_DIR + 'unsup_prep_data.pkl', 'rb'))
        sup_data = serial.load(DATA_DIR + 'sup_prep_data.pkl')
    return unsup_data, sup_data
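
# Usage sketch (assumes the DATA_DIR and black_box_dataset definitions above):
# the first call fits the pipeline and caches everything to disk; later calls
# only reload the cached pickles. Per the loop above, sup_data holds, in
# order, train[0:800], train[800:1000], the full training set, and public_test.
unsup_data, sup_data = process_data()
trainset, validset, fulltrainset, testset = sup_data
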
from pylearn2.utils import serial
from pylearn2.datasets import mike_preprocessing
from pylearn2.datasets import preprocessing
from pylearn2.utils import string_utils
from pylearn2.datasets.mat_data import MATDATA
import datetime

data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/wbc')
output_dir = data_dir + '/augmented1906'
serial.mkdir(output_dir)

preprocessor1 = preprocessing.Standardize(global_std=True)
preprocessor2 = mike_preprocessing.AugmentAndBalance()
preprocessor3 = preprocessing.ShuffleAndSplit(
    seed=datetime.datetime.now().microsecond, start=0, stop=11000)

train = MATDATA(path=data_dir + '/classData19062014PythonMyeloidsOnlyTrain.mat',
                which_set='full', step=2, one_hot=1)
train.apply_preprocessor(preprocessor=preprocessor1, can_fit=True)
train.apply_preprocessor(preprocessor=preprocessor2)
train.apply_preprocessor(preprocessor=preprocessor3)
train.use_design_loc(output_dir + '/train.npy')
serial.save(output_dir + '/train.pkl', train)

test = MATDATA(path=data_dir + '/classData19062014PythonMyeloidsOnlyTest.mat',
                which_set='full', step=2, one_hot=1)
# Completion sketch (assumed; the snippet breaks off mid-call): standardize
# the test set with the statistics fitted on train, then save it.
test.apply_preprocessor(preprocessor=preprocessor1, can_fit=False)
test.use_design_loc(output_dir + '/test.npy')
serial.save(output_dir + '/test.pkl', test)
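
# Note (sketch, not in the original script): seeding ShuffleAndSplit with
# datetime.now().microsecond gives a different shuffle on every run. A fixed
# seed makes the split reproducible:
preprocessor3_reproducible = preprocessing.ShuffleAndSplit(seed=42, start=0,
                                                           stop=11000)
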
def test_works():
    load = True
    if not load:
        ddmTrain = FacialKeypoint(which_set='train', start=0, stop=6000)
        ddmValid = FacialKeypoint(which_set='train', start=6000, stop=7049)
        # Fit the preprocessing statistics on the training set only
        # (can_fit=True) and reuse them on the validation set (can_fit=False).
        stndrdz = preprocessing.Standardize()
        stndrdz.apply(ddmTrain, can_fit=True)
        stndrdz.apply(ddmValid, can_fit=False)
        GCN = preprocessing.GlobalContrastNormalization()
        GCN.apply(ddmTrain, can_fit=True)
        GCN.apply(ddmValid, can_fit=False)
        pcklFile = open('kpd.pkl', 'wb')
        obj = (ddmTrain, ddmValid)
        pickle.dump(obj, pcklFile)
        pcklFile.close()
        return
    else:
        pcklFile = open('kpd.pkl', 'rb')
        (ddmTrain, ddmValid) = pickle.load(pcklFile)
        pcklFile.close()

    # Create the layers:
    # two convolutional rectified linear layers, border mode valid.
    layer1 = ConvRectifiedLinear(layer_name='convRect1',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)
    layer2 = ConvRectifiedLinear(layer_name='convRect2',
                                 output_channels=64,
                                 irange=.05,
                                 kernel_shape=[5, 5],
                                 pool_shape=[3, 3],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    # Fully connected rectified linear layer.
    layer3 = RectifiedLinear(dim=3000, sparse_init=15, layer_name='RectLin3')

    # Multi-softmax output layer: 30 groups of 98 classes each.
    n_groups = 30
    n_classes = 98
    irange = 0.05
    layerMS = MultiSoftmax(n_groups=n_groups, irange=irange,
                           n_classes=n_classes, layer_name='multisoftmax')

    # Assemble the MLP on 96x96 single-channel inputs.
    MLPerc = MLP(batch_size=8,
                 input_space=Conv2DSpace(shape=[96, 96], num_channels=1),
                 layers=[layer1, layer2, layer3, layerMS])

    # Cost with support for missing targets, encoded as -1.
    missing_target_value = -1
    mlp_cost = MLPCost(cost_type='default',
                       missing_target_value=missing_target_value)

    # Algorithm: learning rate, momentum, batch size, monitoring datasets,
    # cost, and termination criterion.
    term_crit = MonitorBased(prop_decrease=0.00001, N=30,
                             channel_name='validation_objective')
    kpSGD = KeypointSGD(learning_rate=0.001,
                        init_momentum=0.5,
                        monitoring_dataset={'validation': ddmValid,
                                            'training': ddmTrain},
                        batch_size=8,
                        batches_per_iter=750,
                        termination_criterion=term_crit,
                        train_iteration_mode='random_uniform',
                        cost=mlp_cost)

    # Train extension: exponential learning rate decay per epoch.
    train_ext = ExponentialDecayOverEpoch(decay_factor=0.998,
                                          min_lr_scale=0.01)

    # Train object: save every epoch, keep the best model by validation
    # objective, and ramp momentum from 0.5 to 0.9 over the first 20 epochs.
    train = Train(dataset=ddmTrain,
                  save_path='kpd_model2.pkl',
                  save_freq=1,
                  model=MLPerc,
                  algorithm=kpSGD,
                  extensions=[train_ext,
                              MonitorBasedSaveBest(
                                  channel_name='validation_objective',
                                  save_path='kpd_best.pkl'),
                              MomentumAdjustor(start=1, saturate=20,
                                               final_momentum=.9)])
    train.main_loop()
    train.save()
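
# Usage sketch: after training, the best model kept by MonitorBasedSaveBest
# can be reloaded for evaluation with pylearn2's serial utilities.
from pylearn2.utils import serial
best_model = serial.load('kpd_best.pkl')
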
def get_dataset(which_data, tot=False):
    train_path = DATA_DIR + 'train' + which_data + '_preprocessed.pkl'
    valid_path = DATA_DIR + 'valid' + which_data + '_preprocessed.pkl'
    tottrain_path = DATA_DIR + 'tottrain' + which_data + '_preprocessed.pkl'
    test_path = DATA_DIR + 'test' + which_data + '_preprocessed.pkl'

    if os.path.exists(train_path) and os.path.exists(valid_path) \
            and os.path.exists(test_path):
        print 'loading preprocessed data'
        trainset = serial.load(train_path)
        validset = serial.load(valid_path)
        if tot:
            tottrainset = serial.load(tottrain_path)
        testset = serial.load(test_path)
    else:
        print 'loading raw data...'
        trainset = Whales(which_set="train", which_data=which_data,
                          start=0, stop=56671)
        validset = Whales(which_set="train", which_data=which_data,
                          start=56671, stop=66671)
        tottrainset = Whales(which_set="train", which_data=which_data)
        testset = Whales(which_set="test", which_data=which_data)

        print 'preprocessing data...'
        pipeline = preprocessing.Pipeline()
        if which_data == 'melspectrum':
            pipeline.items.append(
                preprocessing.Standardize(global_mean=True, global_std=True))
            # ZCA (zero-phase component analysis) is very similar to PCA,
            # but preserves the look of the original image better.
            pipeline.items.append(preprocessing.ZCA())
        else:
            # global_mean/global_std=False gives per-feature standardization.
            pipeline.items.append(
                preprocessing.Standardize(global_mean=False, global_std=False))

        trainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        # Store design matrices in numpy format instead of pickle,
        # for memory reasons.
        trainset.use_design_loc(DATA_DIR + 'train_' + which_data + '_design.npy')
        # can_fit=False: no statistics are shared between train and test data.
        validset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        validset.use_design_loc(DATA_DIR + 'valid_' + which_data + '_design.npy')
        tottrainset.apply_preprocessor(preprocessor=pipeline, can_fit=True)
        tottrainset.use_design_loc(DATA_DIR + 'tottrain_' + which_data + '_design.npy')
        testset.apply_preprocessor(preprocessor=pipeline, can_fit=False)
        testset.use_design_loc(DATA_DIR + 'test_' + which_data + '_design.npy')

        # These paths allow weights to be visualized after training is done.
        trainset.yaml_src = '!pkl: "%s"' % train_path
        validset.yaml_src = '!pkl: "%s"' % valid_path
        tottrainset.yaml_src = '!pkl: "%s"' % tottrain_path
        testset.yaml_src = '!pkl: "%s"' % test_path

        print 'saving preprocessed data...'
        serial.save(train_path, trainset)
        serial.save(valid_path, validset)
        serial.save(tottrain_path, tottrainset)
        serial.save(test_path, testset)

    if tot:
        return tottrainset, validset, testset
    else:
        return trainset, validset, testset
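
# Usage sketch: build (or reload) the preprocessed splits. With tot=True the
# full training set is returned in place of the 56671-example train split.
trainset, validset, testset = get_dataset('melspectrum')
fulltrainset, validset, testset = get_dataset('melspectrum', tot=True)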