def create_datasets(all_data):
    train_data, train_metadata = all_data['train']
    valid_data, valid_metadata = all_data['valid']
    finaltrain_data, finaltrain_metadata = all_data['finaltrain']
    test_data, test_metadata = all_data['test']

    # Ground-truth labels of the test set, for later evaluation.
    lbl = np.array([int(data[1]) for data in test_data])

    spatial_dimensions = 1

    def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
        # We need to change the input size from 6 to 3.
        mlproblem_metadata['input_size'] = 3
        return [mlproblem_data[0][:3], mlproblem_data[1]]

    if spatial_dimensions == 1:
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(train_data, train_metadata)
        validset = trainset.apply_on(valid_data, valid_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
    elif spatial_dimensions == 0:
        import mlpython.mlproblems.generic as mlpg
        trainset = mlpg.PreprocessedProblem(data=train_data,
                                            metadata=train_metadata,
                                            preprocess=reduce_dimensionality)
        validset = trainset.apply_on(valid_data, valid_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)

        # Wrap the reduced problems so targets are mapped to class ids.
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
        validset = trainset.apply_on(validset, validset.metadata)
        finaltrainset = trainset.apply_on(finaltrainset, finaltrainset.metadata)
        testset = trainset.apply_on(testset, testset.metadata)

    return {'trainset': trainset, 'validset': validset,
            'finaltrainset': finaltrainset, 'testset': testset,
            'ground_truth': lbl}
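# A minimal usage sketch (an assumption for illustration: `all_data` comes
# from data_utils.load_data with the 'train'/'valid'/'finaltrain'/'test'
# keys used above):
#
#   datasets = create_datasets(all_data)
#   trainset = datasets['trainset']
#   print 'test labels:', datasets['ground_truth'][:10]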
def load_data(dataset_directory, dataset_name):
    print "Loading datasets ..."
    import os
    repo = os.environ.get('MLPYTHON_DATASET_REPO')
    if repo is None:
        raise ValueError(
            'environment variable MLPYTHON_DATASET_REPO is not defined')
    dataset_dir = os.path.join(repo, dataset_directory, dataset_name)

    input_size = 6
    spatial_dimensions = 1
    all_data = data_utils.load_data(dir_path=dataset_dir,
                                    input_size=input_size,
                                    train_filename=None,
                                    test_filename=None,
                                    background_filename=None,
                                    load_to_memory=False)

    train_data, train_metadata = all_data['train']
    valid_data, valid_metadata = all_data['valid']
    finaltrain_data, finaltrain_metadata = all_data['finaltrain']
    test_data, test_metadata = all_data['test']

    # Ground-truth labels of the test set, for later evaluation.
    lbl = np.array([int(data[1]) for data in test_data])

    def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
        # We need to change the input size from 6 to 3.
        mlproblem_metadata['input_size'] = 3
        return [mlproblem_data[0][:3], mlproblem_data[1]]

    if spatial_dimensions == 1:
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(train_data, train_metadata)
        validset = trainset.apply_on(valid_data, valid_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
    elif spatial_dimensions == 0:
        import mlpython.mlproblems.generic as mlpg
        trainset = mlpg.PreprocessedProblem(data=train_data,
                                            metadata=train_metadata,
                                            preprocess=reduce_dimensionality)
        validset = trainset.apply_on(valid_data, valid_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)

        # Wrap the reduced problems so targets are mapped to class ids.
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
        validset = trainset.apply_on(validset, validset.metadata)
        finaltrainset = trainset.apply_on(finaltrainset, finaltrainset.metadata)
        testset = trainset.apply_on(testset, testset.metadata)

    return {'trainset': trainset, 'validset': validset,
            'finaltrainset': finaltrainset, 'testset': testset,
            'ground_truth': lbl}
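# A minimal end-to-end sketch, assuming MLPYTHON_DATASET_REPO is set and
# contains a dataset laid out as data_utils.load_data expects; the names
# 'my_datasets' and 'my_experiment' below are hypothetical placeholders,
# not names from this project:
#
#   import os
#   os.environ['MLPYTHON_DATASET_REPO'] = '/path/to/repo'
#   datasets = load_data('my_datasets', 'my_experiment')
#   for input, target in datasets['testset']:
#       pass  # iterate over preprocessed test examples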
for i, hidden_size in enumerate(sizes):

    # Defining function that maps dataset
    # into last trained representation
    def new_representation(example, metadata):
        ret = example[0]
        for W, b in itertools.izip(pretrained_Ws, pretrained_bs):
            ret = 1. / (1 + np.exp(-(b + np.dot(ret, W))))
        return ret

    # Create greedy module training set using PreprocessedProblem
    if i == 0:
        new_input_size = trainset.metadata['input_size']
    else:
        new_input_size = sizes[i - 1]
    pretraining_trainset = mlpb.PreprocessedProblem(
        trainset, preprocess=new_representation,
        metadata={'input_size': new_input_size})

    # Train greedy module
    print '... hidden layer ' + str(i + 1),
    new_layer = Autoencoder(n_epochs=pretrain_n_epochs,
                            hidden_size=hidden_size,
                            lr=pretrain_lr,
                            noise_prob=pretrain_noise_prob,
                            seed=seed)
    new_layer.train(pretraining_trainset)
    print ' DONE'
    pretrained_Ws += [new_layer.W]
    pretrained_bs += [new_layer.b]
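# Self-contained illustration of what new_representation computes: each
# pretrained layer applies a sigmoid affine map to the running
# representation. The weights below are made up for the sketch, not
# values produced by the Autoencoder above.
import itertools
import numpy as np

demo_Ws = [np.ones((3, 2)) * 0.1]     # one layer: 3 inputs -> 2 hidden units
demo_bs = [np.zeros((2,))]
ret = np.array([1.0, 2.0, 3.0])       # a fake input example
for W, b in itertools.izip(demo_Ws, demo_bs):
    ret = 1. / (1 + np.exp(-(b + np.dot(ret, W))))
print ret                             # hidden-layer sigmoid activations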
def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
    # We need to change the input size from 6 to 3.
    mlproblem_metadata['input_size'] = 3
    return [mlproblem_data[0][:3], mlproblem_data[1]]

if spatial_dimensions == 1:
    import mlpython.mlproblems.classification as mlpb
    trainset = mlpb.ClassificationProblem(train_data, train_metadata)
    validset = trainset.apply_on(valid_data, valid_metadata)
    finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
    testset = trainset.apply_on(test_data, test_metadata)
elif spatial_dimensions == 0:
    import mlpython.mlproblems.generic as mlpg
    trainset = mlpg.PreprocessedProblem(data=train_data,
                                        metadata=train_metadata,
                                        preprocess=reduce_dimensionality)
    validset = trainset.apply_on(valid_data, valid_metadata)
    testset = trainset.apply_on(test_data, test_metadata)
    finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)

    # Wrap the reduced problems so targets are mapped to class ids.
    import mlpython.mlproblems.classification as mlpb
    trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
    validset = trainset.apply_on(validset, validset.metadata)
    finaltrainset = trainset.apply_on(finaltrainset, finaltrainset.metadata)
    testset = trainset.apply_on(testset, testset.metadata)

# Commented-out debug loop that prints the first few test examples.
'''
for d, i in enumerate(testset):
    print i
    if d > 10:
        break
'''
def initialize(self, trainset):
    metadata = trainset.metadata
    self.n_classes = len(metadata['targets'])
    self.rng = np.random.mtrand.RandomState(self.seed)
    self.input_size = metadata['input_size']
    if self.n_k_means_inputs > self.input_size or self.n_k_means_inputs < 1:
        self.n_k_means_inputs = self.input_size

    self.Ws = []
    self.cs = []
    self.Vs = []
    self.dWs = []
    self.dcs = []
    self.dVs = []
    self.layers = []
    self.layer_acts = []
    self.dlayers = []
    self.dlayer_acts = []
    self.output_acts = []

    self.input = np.zeros((self.input_size,))
    self.d = np.zeros((self.n_classes,))
    self.dd = np.zeros((self.n_classes,))
    self.output = np.zeros((self.n_classes,))
    self.output_act = np.zeros((self.n_classes,))
    self.doutput_act = np.zeros((self.n_classes,))
    self.cluster_indices = np.zeros((self.n_k_means,), dtype='int')

    # One hidden-layer module per (k-means, cluster) pair.
    for k in range(self.n_k_means):
        for c in range(self.n_clusters):
            self.Ws += [(2 * self.rng.rand(self.hidden_size, self.input_size) - 1) / self.n_k_means_inputs]
            self.cs += [np.zeros((self.hidden_size))]
            self.Vs += [(2 * self.rng.rand(self.n_classes, self.hidden_size) - 1) / (self.hidden_size * self.n_k_means)]
            self.dWs += [np.zeros((self.hidden_size, self.input_size))]
            self.dcs += [np.zeros((self.hidden_size))]
            self.dVs += [np.zeros((self.n_classes, self.hidden_size))]
            self.layers += [np.zeros((self.hidden_size))]
            self.layer_acts += [np.zeros((self.hidden_size))]
            self.dlayers += [np.zeros((self.hidden_size))]
            self.dlayer_acts += [np.zeros((self.hidden_size))]
            self.output_acts += [np.zeros((self.n_classes,))]

    # Denoising autoencoder variables
    if self.autoencoder_regularization != 0:
        self.dae_dWs = []
        self.dae_dWsT = []
        self.input_idx = np.arange(self.input_size)
        self.dae_layers = []
        self.dae_layer_acts = []
        self.dae_dlayers = []
        self.dae_dlayer_acts = []
        self.dae_output_acts = []

        self.dae_input = np.zeros((self.input_size,))
        self.dae_d = np.zeros((self.input_size,))
        self.dae_dd = np.zeros((self.input_size,))
        self.dae_output = np.zeros((self.input_size,))
        self.dae_output_act = np.zeros((self.input_size,))
        self.dae_doutput_act = np.zeros((self.input_size,))

        for k in range(self.n_k_means):
            for c in range(self.n_clusters):
                self.dae_dWs += [np.zeros((self.hidden_size, self.input_size))]
                self.dae_dWsT += [np.zeros((self.input_size, self.hidden_size))]
                self.dae_layers += [np.zeros((self.hidden_size))]
                self.dae_layer_acts += [np.zeros((self.hidden_size))]
                self.dae_dlayers += [np.zeros((self.hidden_size))]
                self.dae_dlayer_acts += [np.zeros((self.hidden_size))]
                self.dae_output_acts += [np.zeros((self.input_size,))]

    # Running k-means
    self.clusterings = []
    self.k_means_subset_inputs = []
    for k in range(self.n_k_means):
        clustering = mlfeat.k_means(n_clusters=self.n_clusters,
                                    n_stages=self.n_k_means_stages)

        # Generate training set for k-means
        if self.n_k_means_inputs == self.input_size:
            self.k_means_subset_inputs += [None]

            def subset(ex, meta):
                meta['input_size'] = self.n_k_means_inputs
                return ex[0]
        else:
            # Pick a random subset of the input dimensions for this k-means.
            subset_indices = np.arange(self.input_size)
            self.rng.shuffle(subset_indices)
            subset_indices = subset_indices[:self.n_k_means_inputs]
            self.k_means_subset_inputs += [subset_indices]

            def subset(ex, meta):
                meta['input_size'] = self.n_k_means_inputs
                return ex[0][subset_indices]

        k_means_trainset = mlpbgen.PreprocessedProblem(trainset, preprocess=subset)
        clustering.train(k_means_trainset)
        self.clusterings += [clustering]

    self.n_updates = 0
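# Standalone sketch of the input-subsetting idea used by the subset()
# closures above: each k-means run only sees a random subset of the input
# dimensions. All values here are made up for illustration.
import numpy as np

rng = np.random.mtrand.RandomState(1234)
input_size, n_k_means_inputs = 6, 3
subset_indices = np.arange(input_size)
rng.shuffle(subset_indices)
subset_indices = subset_indices[:n_k_means_inputs]

example = np.arange(input_size) * 10.0
print subset_indices            # which dimensions this k-means sees
print example[subset_indices]   # the reduced input handed to k-means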
pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))
print 'pb2:'
for example in pb2:
    print example
print 'metadata:', pb2.metadata
print ''

pb3 = mlpbgen.MergedProblem([pb2, pb1])
print 'pb3:'
for example in pb3:
    print example
print 'metadata:', pb3.metadata
print ''

pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features)
print 'pb4:'
for example in pb4:
    print example
print 'metadata:', pb4.metadata
print ''

pb5 = mlpbclass.ClassificationProblem(pb4)
print 'pb5:'
for example in pb5:
    print example
print 'metadata:', pb5.metadata
print ''

pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
print 'pb6 (final):'
def test_mlproblem_combinations():
    """
    Test a combination of many different MLProblems.
    """
    raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B'])
    metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    def features(example, metadata):
        metadata['input_size'] = 2
        return ((example[0], example[0]), example[1])

    pb1 = mlpbgen.MLProblem(raw_data, metadata)
    print 'pb1', pb1.metadata
    pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))
    print 'pb2', pb2.metadata
    pb3 = mlpbgen.MergedProblem([pb2, pb1])
    print 'pb3', pb3.metadata
    pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features)
    print 'pb4', pb4.metadata
    pb5 = mlpbclass.ClassificationProblem(pb4)
    print 'pb5', pb5.metadata
    pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
    print 'pb6', pb6.metadata
    pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1])
    print 'pb7', pb7.metadata

    final_data = [[(1, 1), (1, 1), 0],
                  [(3, 3), (3, 3), 1],
                  [(0, 0), (0, 0), 0],
                  [(1, 1), (1, 1), 0],
                  [(3, 3), (3, 3), 1],
                  [(4, 4), (4, 4), 0]]
    final_metadata = {'input_size': 2,
                      'targets': set(['A', 'C']),
                      'class_to_id': {'A': 0, 'C': 1}}
    for ex1, ex2 in zip(pb7, final_data):
        assert cmp(ex1, ex2) == 0
    print pb7.metadata, final_metadata
    assert cmp(pb7.metadata, final_metadata) == 0

    raw_data2 = zip(range(6, 10), ['C', 'B', 'A', 'C'])
    metadata2 = {'length': 4, 'targets': ['A', 'B', 'C'], 'input_size': 1}
    pbtest = pb7.apply_on(raw_data2, metadata2)

    final_test_data = [[(6, 6), (6, 6), 1],
                       [(8, 8), (8, 8), 0],
                       [(9, 9), (9, 9), 1]]
    final_test_metadata = {'input_size': 2,
                           'targets': set(['A', 'C']),
                           'class_to_id': {'A': 0, 'C': 1}}
    for ex1, ex2 in zip(pbtest, final_test_data):
        assert cmp(ex1, ex2) == 0
    assert cmp(pbtest.metadata, final_test_metadata) == 0
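# Standalone sketch of the features() preprocessor tested above: it
# duplicates the scalar input into a 2d tuple and updates the metadata
# in place, which is why pb4.metadata reports input_size == 2.
def demo_features(example, metadata):
    metadata['input_size'] = 2
    return ((example[0], example[0]), example[1])

demo_meta = {'input_size': 1}
print demo_features((3, 'C'), demo_meta)   # -> ((3, 3), 'C')
print demo_meta                            # -> {'input_size': 2}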