Example #1
import numpy as np


def create_datasets(all_data):
    train_data, train_metadata = all_data['train']
    valid_data, valid_metadata = all_data['valid']
    finaltrain_data, finaltrain_metadata = all_data['finaltrain']
    test_data, test_metadata = all_data['test']
    lbl = np.array([int(data[1]) for data in test_data])
    spatial_dimensions = 1

    def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
        mlproblem_metadata['input_size'] = 3  # we need to change the input size from 6 to 3.
        return [mlproblem_data[0][:3], mlproblem_data[1]]

    if spatial_dimensions == 1:
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(train_data, train_metadata)
        validset = trainset.apply_on(valid_data, valid_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        testset = trainset.apply_on(test_data, test_metadata)

    elif spatial_dimensions == 0:
        import mlpython.mlproblems.generic as mlpg
        trainset = mlpg.PreprocessedProblem(data=train_data,
                                            metadata=train_metadata,
                                            preprocess=reduce_dimensionality)
        validset = trainset.apply_on(valid_data, valid_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
        validset = trainset.apply_on(validset, validset.metadata)
        finaltrainset = trainset.apply_on(finaltrainset, finaltrainset.metadata)
        testset = trainset.apply_on(testset, testset.metadata)

    return {'trainset': trainset, 'validset': validset,
            'finaltrainset': finaltrainset, 'testset': testset,
            'ground_truth': lbl}
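
A minimal usage sketch (hedged: `all_data` stands in for the dict returned by whatever loader you use, as in Example #2; none of these names are part of mlpython's documented API):

datasets = create_datasets(all_data)      # all_data from your own loader
trainset = datasets['trainset']
print trainset.metadata                   # e.g. 'input_size', 'targets'
for example in trainset:                  # iterates over (input, target) pairs
    pass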
Example #2
import numpy as np


def load_data(dataset_directory, dataset_name):
    print "Loading datasets ..."
    import os
    repo = os.environ.get('MLPYTHON_DATASET_REPO')
    if repo is None:
        raise ValueError(
            'environment variable MLPYTHON_DATASET_REPO is not defined')
    dataset_dir = os.path.join(repo, dataset_directory, dataset_name)

    input_size = 6
    spatial_dimensions = 1
    # `data_utils` is assumed to be imported by the surrounding module.
    all_data = data_utils.load_data(dir_path=dataset_dir,
                                    input_size=input_size,
                                    train_filename=None,
                                    test_filename=None,
                                    background_filename=None,
                                    load_to_memory=False)

    train_data, train_metadata = all_data['train']
    valid_data, valid_metadata = all_data['valid']
    finaltrain_data, finaltrain_metadata = all_data['finaltrain']
    test_data, test_metadata = all_data['test']
    lbl = np.array([int(data[1]) for data in test_data])

    def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
        mlproblem_metadata['input_size'] = 3  # we need to change the input size from 6 to 3.
        return [mlproblem_data[0][:3], mlproblem_data[1]]

    if spatial_dimensions == 1:
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(train_data, train_metadata)
        validset = trainset.apply_on(valid_data, valid_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        testset = trainset.apply_on(test_data, test_metadata)

    elif spatial_dimensions == 0:
        import mlpython.mlproblems.generic as mlpg
        trainset = mlpg.PreprocessedProblem(data=train_data,
                                            metadata=train_metadata,
                                            preprocess=reduce_dimensionality)
        validset = trainset.apply_on(valid_data, valid_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
        validset = trainset.apply_on(validset, validset.metadata)
        finaltrainset = trainset.apply_on(finaltrainset,
                                          finaltrainset.metadata)
        testset = trainset.apply_on(testset, testset.metadata)

    return {
        'finaltrainset': finaltrainset,
        'testset': testset,
        'ground_truth': lbl
    }
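
A quick usage sketch (the repository path and the directory/dataset names below are hypothetical):

import os
os.environ['MLPYTHON_DATASET_REPO'] = '/data/mlpython'   # hypothetical path
sets = load_data('my_experiment', 'fold0')               # hypothetical names
print sets['testset'].metadata
print sets['ground_truth'][:10]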
Example #3
# Greedy layer-wise pretraining loop. It assumes the surrounding script
# defines `sizes`, `trainset`, `Autoencoder` and the `pretrain_*`
# hyperparameters, starts `pretrained_Ws`/`pretrained_bs` as empty lists,
# and imports `mlpb` as an alias of mlpython.mlproblems.generic (which
# provides PreprocessedProblem).
import itertools

import numpy as np

for i, hidden_size in enumerate(sizes):

    # Defining function that maps dataset
    # into last trained representation
    def new_representation(example, metadata):
        ret = example[0]
        for W, b in itertools.izip(pretrained_Ws, pretrained_bs):
            ret = 1. / (1 + np.exp(-(b + np.dot(ret, W))))
        return ret

    # Create greedy module training set using PreprocessedProblem
    if i == 0:
        new_input_size = trainset.metadata['input_size']
    else:
        new_input_size = sizes[i-1]
    pretraining_trainset = mlpb.PreprocessedProblem(trainset,
                                                    preprocess=new_representation,
                                                    metadata={'input_size': new_input_size})

    # Train greedy module
    print '... hidden layer ' + str(i+1),
    new_layer = Autoencoder(n_epochs=pretrain_n_epochs,
                            hidden_size=hidden_size,
                            lr=pretrain_lr,
                            noise_prob=pretrain_noise_prob,
                            seed=seed)
    new_layer.train(pretraining_trainset)
    print ' DONE'

    pretrained_Ws += [new_layer.W]
    pretrained_bs += [new_layer.b]
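
A minimal setup sketch for the names the loop assumes; every value below is a placeholder, not taken from the original script, and `trainset` and `Autoencoder` must still come from the surrounding experiment:

import mlpython.mlproblems.generic as mlpb   # provides PreprocessedProblem

sizes = [200, 100]                 # hidden layer sizes (placeholder)
pretrain_n_epochs = 10             # placeholder
pretrain_lr = 0.01                 # placeholder
pretrain_noise_prob = 0.1          # placeholder
seed = 1234                        # placeholder
pretrained_Ws = []                 # filled in by the loop
pretrained_bs = []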
Example #4
def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
    # (header as in Examples #1 and #2; the page truncated it here)
    mlproblem_metadata['input_size'] = 3  # we need to change the input size from 6 to 3.
    return [mlproblem_data[0][:3], mlproblem_data[1]]


if spatial_dimensions == 1:
    import mlpython.mlproblems.classification as mlpb
    trainset = mlpb.ClassificationProblem(train_data, train_metadata)
    validset = trainset.apply_on(valid_data, valid_metadata)
    finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
    testset = trainset.apply_on(test_data, test_metadata)

elif spatial_dimensions == 0:
    import mlpython.mlproblems.generic as mlpg
    trainset = mlpg.PreprocessedProblem(data=train_data,
                                        metadata=train_metadata,
                                        preprocess=reduce_dimensionality)
    validset = trainset.apply_on(valid_data, valid_metadata)
    testset = trainset.apply_on(test_data, test_metadata)
    finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
    import mlpython.mlproblems.classification as mlpb
    trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
    validset = trainset.apply_on(validset, validset.metadata)
    finaltrainset = trainset.apply_on(finaltrainset, finaltrainset.metadata)
    testset = trainset.apply_on(testset, testset.metadata)
# Commented-out sanity check: print the first few test examples.
# for d, example in enumerate(testset):
#     print example
#     if d > 10:
#         break
Example #5
   def initialize(self, trainset):

       metadata = trainset.metadata
       self.n_classes = len(metadata['targets'])
       self.rng = np.random.mtrand.RandomState(self.seed)
       self.input_size = metadata['input_size']
       if self.n_k_means_inputs > self.input_size or self.n_k_means_inputs < 1:
           self.n_k_means_inputs = self.input_size
       
       # Parameter lists (W, c, V) and their gradients, filled in below
       # with one entry per (k-means, cluster) pair
       self.Ws = []
       self.cs = []
       self.Vs = []
       self.dWs = []
       self.dcs = []
       self.dVs = []
       self.layers = []
       self.layer_acts = []
       self.dlayers = []
       self.dlayer_acts = []
       self.output_acts = []
       
       self.input = np.zeros((self.input_size,))
       self.d = np.zeros((self.n_classes,))
       self.dd = np.zeros((self.n_classes,))
       self.output = np.zeros((self.n_classes,))
       self.output_act = np.zeros((self.n_classes,))
       self.doutput_act = np.zeros((self.n_classes,))
       
       self.cluster_indices = np.zeros((self.n_k_means,),dtype='int')
              
       # Randomly initialize weights for each (k-means, cluster) pair
       for k in range(self.n_k_means):
           for c in range(self.n_clusters):
               self.Ws += [(2*self.rng.rand(self.hidden_size,self.input_size)-1)/self.n_k_means_inputs]
               self.cs += [np.zeros((self.hidden_size))]
               self.Vs += [(2*self.rng.rand(self.n_classes,self.hidden_size)-1)/(self.hidden_size*self.n_k_means)]
       
               self.dWs += [np.zeros((self.hidden_size,self.input_size))]
               self.dcs += [np.zeros((self.hidden_size))]
               self.dVs += [np.zeros((self.n_classes,self.hidden_size))]
               
           self.layers += [np.zeros((self.hidden_size))]
           self.layer_acts += [np.zeros((self.hidden_size))]
       
           self.dlayers += [np.zeros((self.hidden_size))]
           self.dlayer_acts += [np.zeros((self.hidden_size))]
           self.output_acts += [np.zeros((self.n_classes,))]

       # Denoising autoencoder variables
       if self.autoencoder_regularization != 0:
           self.dae_dWs = []
           self.dae_dWsT = []

           self.input_idx = np.arange(self.input_size)
           self.dae_layers = []
           self.dae_layer_acts = []
           self.dae_dlayers = []
           self.dae_dlayer_acts = []
           self.dae_output_acts = []
           self.dae_input = np.zeros((self.input_size,))
           self.dae_d = np.zeros((self.input_size,))
           self.dae_dd = np.zeros((self.input_size,))
           self.dae_output = np.zeros((self.input_size,))
           self.dae_output_act = np.zeros((self.input_size,))
           self.dae_doutput_act = np.zeros((self.input_size,))

           for k in range(self.n_k_means):
               for c in range(self.n_clusters):
                   self.dae_dWs += [np.zeros((self.hidden_size,self.input_size))]
                   self.dae_dWsT += [np.zeros((self.input_size,self.hidden_size))]
                   
               self.dae_layers += [np.zeros((self.hidden_size))]
               self.dae_layer_acts += [np.zeros((self.hidden_size))]
               
               self.dae_dlayers += [np.zeros((self.hidden_size))]
               self.dae_dlayer_acts += [np.zeros((self.hidden_size))]
               self.dae_output_acts += [np.zeros((self.input_size,))]


       # Running k-means (`mlfeat` is assumed to be imported by the
       # surrounding module and to provide the k_means clustering class)
       self.clusterings = []
       self.k_means_subset_inputs = []
       for k in range(self.n_k_means):
           clustering = mlfeat.k_means(n_clusters=self.n_clusters,
                                       n_stages=self.n_k_means_stages)
           # Generate training set for k-means
           if self.n_k_means_inputs == self.input_size:
               self.k_means_subset_inputs += [None]
               def subset(ex,meta):
                   meta['input_size'] = self.n_k_means_inputs
                   return ex[0]
           else:
               subset_indices = np.arange(self.input_size)
               self.rng.shuffle(subset_indices)
               subset_indices = subset_indices[:self.n_k_means_inputs]
               self.k_means_subset_inputs += [subset_indices]
               def subset(ex,meta):
                   meta['input_size'] = self.n_k_means_inputs
                   return ex[0][subset_indices]
           k_means_trainset = mlpbgen.PreprocessedProblem(trainset,preprocess=subset)
           clustering.train(k_means_trainset)
       
           self.clusterings += [clustering]
       
       self.n_updates = 0
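
The `subset` closures above capture `subset_indices` by reference; that is safe here because each clustering is trained inside the same loop iteration, but a factory function makes the capture explicit if the preprocessor ever outlives its iteration. A standalone sketch of the same pattern (names and sizes below are illustrative, not from the original class):

import numpy as np
import mlpython.mlproblems.generic as mlpbgen

def make_subset_preprocessor(subset_indices, n_inputs):
    # Returns a preprocess function that keeps only `subset_indices`
    # of each input and rewrites 'input_size' to match.
    def subset(example, metadata):
        metadata['input_size'] = n_inputs
        return example[0][subset_indices]
    return subset

rng = np.random.RandomState(1234)        # illustrative seed
subset_indices = rng.permutation(6)[:3]  # keep 3 of 6 input dimensions
preprocess = make_subset_preprocessor(subset_indices, 3)
# k_means_trainset = mlpbgen.PreprocessedProblem(trainset, preprocess=preprocess)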
Example #6
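This fragment continues the pipeline exercised in Example #7 and assumes the following preamble (the import aliases are inferred from the other examples on this page):

import mlpython.mlproblems.generic as mlpbgen
import mlpython.mlproblems.classification as mlpbclass

raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B'])
metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1}

def features(example, metadata):
    metadata['input_size'] = 2
    return ((example[0], example[0]), example[1])

pb1 = mlpbgen.MLProblem(raw_data, metadata)
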
pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))
print 'pb2:'
for example in pb2:
    print example
print 'metadata:', pb2.metadata
print ''

pb3 = mlpbgen.MergedProblem([pb2, pb1])
print 'pb3:'
for example in pb3:
    print example
print 'metadata:', pb3.metadata
print ''

pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features)
print 'pb4:'
for example in pb4:
    print example
print 'metadata:', pb4.metadata
print ''

pb5 = mlpbclass.ClassificationProblem(pb4)
print 'pb5:'
for example in pb5:
    print example
print 'metadata:', pb5.metadata
print ''

pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
print 'pb6 (final):'
for example in pb6:
    print example
print 'metadata:', pb6.metadata
Example #7
# Import aliases inferred from the other examples on this page.
import mlpython.mlproblems.generic as mlpbgen
import mlpython.mlproblems.classification as mlpbclass


def test_mlproblem_combinations():
    """
    Test a combination of many different MLProblems.
    """

    raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B'])
    metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    def features(example, metadata):
        metadata['input_size'] = 2
        return ((example[0], example[0]), example[1])

    pb1 = mlpbgen.MLProblem(raw_data, metadata)
    print 'pb1', pb1.metadata
    pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))
    print 'pb2', pb2.metadata
    pb3 = mlpbgen.MergedProblem([pb2, pb1])
    print 'pb3', pb3.metadata
    pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features)
    print 'pb4', pb4.metadata
    pb5 = mlpbclass.ClassificationProblem(pb4)
    print 'pb5', pb5.metadata
    pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
    print 'pb6', pb6.metadata
    pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1])
    print 'pb7', pb7.metadata

    final_data = [[(1, 1), (1, 1), 0], [(3, 3), (3, 3), 1],
                  [(0, 0), (0, 0), 0], [(1, 1), (1, 1), 0],
                  [(3, 3), (3, 3), 1], [(4, 4), (4, 4), 0]]
    final_metadata = {'input_size': 2,
                      'targets': set(['A', 'C']),
                      'class_to_id': {'A': 0, 'C': 1}}

    for ex1, ex2 in zip(pb7, final_data):
        assert cmp(ex1, ex2) == 0
    print pb7.metadata, final_metadata
    assert cmp(pb7.metadata, final_metadata) == 0

    raw_data2 = zip(range(6, 10), ['C', 'B', 'A', 'C'])
    metadata2 = {'length': 4, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    pbtest = pb7.apply_on(raw_data2, metadata2)
    final_test_data = [[(6, 6), (6, 6), 1], [(8, 8), (8, 8), 0],
                       [(9, 9), (9, 9), 1]]
    final_test_metadata = {'input_size': 2,
                           'targets': set(['A', 'C']),
                           'class_to_id': {'A': 0, 'C': 1}}

    for ex1, ex2 in zip(pbtest, final_test_data):
        assert cmp(ex1, ex2) == 0
    assert cmp(pbtest.metadata, final_test_metadata) == 0
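
The behavior exercised at the end is that `apply_on` replays the entire preprocessing chain (subsetting, merging, feature mapping, class filtering) on fresh raw data. A hypothetical harness for running the test directly (not part of the original file):

if __name__ == '__main__':
    test_mlproblem_combinations()
    print 'test_mlproblem_combinations passed'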