Example #1
    def verify_sums_to_one(self):
        print 'WARNING: calling verify_sums_to_one reinitializes the learner'
        self.hidden_size = 6
        input_size = 10
        self.learning_rate = 1
        epsilon=1e-6
        word_ids = np.array([0,3,7,8,9])
        word_counts = np.array([3,2,1,7,20])
        trainset = mlpb.MLProblem([(word_counts,word_ids)],
                                  {'input_size':input_size})
        self.initialize(trainset)
                
        log_sum_p = -np.inf
        sum_p = 0
        for i in range(input_size):
            for j in range(input_size):
                for k in range(input_size):
                    self.words = np.array([i,j,k],dtype="int32")
                    self.act = np.zeros((len(self.words),self.hidden_size))
                    np.add.accumulate(self.W[self.words[:-1],:],axis=0,out=self.act[1:,:])
                    if self.hidden_bias_scaled_by_document_size:
                       self.act += self.c*len(self.words)
                    else:
                       self.act += self.c
                    self.h = np.zeros((len(self.words),self.hidden_size))
                    self.apply_activation(self.act,self.h)
                    self.fprop_word_probs()
                    log_p_i = np.log(self.probs).sum()
                    m = max(log_p_i,log_sum_p)
                    log_sum_p = m + np.log(np.exp(log_p_i-m)+np.exp(log_sum_p-m))
                    sum_p += np.exp(log_p_i)

        print "Sums to",np.exp(log_sum_p)#,sum_p
Example #2
    def verify_learning(self):
        print 'WARNING: calling verify_learning reinitializes the learner'
        self.hidden_size = 6
        input_size = 10
        self.learning_rate = 0.01
        words = np.zeros((10,))
        words[3] = 1
        words[7] = 1
        trainset = mlpb.MLProblem([words],
                                  {'input_size':input_size})
        self.initialize(trainset)
        self.b[:] = 0
        words_neg = np.zeros((10,))
        for t in range(1,10001):
            # Unnormalized model probability of the training document
            p = np.exp(np.dot(words,self.b)
                       + np.sum(np.log(1+np.exp(np.dot(self.W,words)+words.sum()*self.c))))
            # Normalizer, summing over all inputs containing two words
            s = 0
            for i in range(input_size):
                for j in range(input_size):
                    words_neg[:] = 0
                    words_neg[i] += 1
                    words_neg[j] += 1
                    s += np.exp(np.dot(words_neg,self.b)
                                + np.sum(np.log(1+np.exp(np.dot(self.W,words_neg)+words_neg.sum()*self.c))))
            # The model probability of the training document should increase with training
            print p/s
            self.n_stages = t
            self.train(trainset)
Example #3
    def get(dataset_name):
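        # Assumed module-level context (not shown in this snippet): os,
        # time as t, theano, mlpython.mlproblems.generic as mlpb,
        # dataset_store, get_done_text and the enclosing Dataset class.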
        # List of datasets that work with the current model
        datasets = ['adult',
                    'binarized_mnist',
                    'connect4',
                    'dna',
                    'mushrooms',
                    'nips',
                    'ocr_letters',
                    'rcv1',
                    'rcv2_russ',
                    'web']

        # Set up the dataset environment
        if dataset_name not in datasets:
            raise ValueError('Dataset unknown: ' + dataset_name)
        mldataset = __import__('mlpython.datasets.' + dataset_name, globals(), locals(), [dataset_name], -1)
        datadir = os.path.join(os.getenv("MLPYTHON_DATASET_REPO"), dataset_name)

        # Verify that the dataset exists and, if not, download it
        if not os.path.exists(datadir):
            dataset_store.download(dataset_name)

        print('### Loading dataset [{0}] ...'.format(dataset_name))
        start_time = t.time()

        all_data = mldataset.load(datadir, load_to_memory=True)
        train_data, train_metadata = all_data['train']

        if dataset_name == 'binarized_mnist' or dataset_name == 'nips':
            trainset = mlpb.MLProblem(train_data, train_metadata)
        else:
            trainset = mlpb.SubsetFieldsProblem(train_data, train_metadata)

        trainset.setup()

        valid_data, valid_metadata = all_data['valid']

        validset = trainset.apply_on(valid_data, valid_metadata)

        test_data, test_metadata = all_data['test']
        testset = trainset.apply_on(test_data, test_metadata)

        # Cleaning up, packaging and theanized
        full_dataset = {'input_size': trainset.metadata['input_size']}

        trainset_theano = theano.shared(value=Dataset._clean(trainset), borrow=True)
        validset_theano = theano.shared(value=Dataset._clean(validset), borrow=True)
        testset_theano = theano.shared(value=Dataset._clean(testset), borrow=True)

        full_dataset['train'] = {'data': trainset_theano, 'length': all_data['train'][1]['length']}
        full_dataset['valid'] = {'data': validset_theano, 'length': all_data['valid'][1]['length']}
        full_dataset['test'] = {'data': testset_theano, 'length': all_data['test'][1]['length']}

        print("(Dim:{0} Train:{1} Valid:{2} Test:{3})".format(trainset.metadata['input_size'], full_dataset['train']['length'], full_dataset['valid']['length'], full_dataset['test']['length']))
        print(get_done_text(start_time), "###")
        return full_dataset
Example #4
import importlib

def load_data(dataset_name):
    datadir = root + '/data/' + dataset_name + '/'  # `root` is assumed defined at module scope
    mldataset = importlib.import_module('mlpython.datasets.' + dataset_name)
    all_data = mldataset.load(datadir, load_to_memory=True)
    train_data, train_metadata = all_data['train']
    if dataset_name == 'binarized_mnist' or dataset_name == 'nips':
        trainset = mlpb.MLProblem(train_data, train_metadata)
    else:
        trainset = mlpb.SubsetFieldsProblem(train_data, train_metadata)
    trainset.setup()
    valid_data, valid_metadata = all_data['valid']
    validset = trainset.apply_on(valid_data, valid_metadata)
    test_data, test_metadata = all_data['test']
    testset = trainset.apply_on(test_data, test_metadata)

    train_X = trainset.data.mem_data[0]
    valid_X = validset.data.mem_data[0]
    test_X = testset.data.mem_data[0]
    return train_X, valid_X, test_X
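A hypothetical usage sketch, assuming `root` is defined and the named dataset is available under root + '/data/':

train_X, valid_X, test_X = load_data('adult')
print(len(train_X))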
Example #5
name = 'ocr_letters_sequential'
import os
repo = os.environ.get('MLPYTHON_DATASET_REPO')
if repo is None:
    raise ValueError(
        'environment variable MLPYTHON_DATASET_REPO is not defined')
dataset_dir = repo + '/' + name

# mldataset and load_to_memory are not defined in this snippet; a plausible
# setup, assuming a dataset module matching `name`:
import importlib
mldataset = importlib.import_module('mlpython.datasets.' + name)
load_to_memory = True

all_data = mldataset.load(dataset_dir, load_to_memory=load_to_memory)

train_data, train_metadata = all_data['train']
valid_data, valid_metadata = all_data['valid']
test_data, test_metadata = all_data['test']

import mlpython.mlproblems.generic as mlpb
trainset = mlpb.MLProblem(train_data, train_metadata)
validset = trainset.apply_on(valid_data, valid_metadata)
testset = trainset.apply_on(test_data, test_metadata)


def compute_error_mean_and_sterror(costs):
    classif_errors = np.hstack([c[0] for c in costs])
    classif_mean = classif_errors.mean()
    classif_sterror = classif_errors.std(ddof=1) / np.sqrt(
        classif_errors.shape[0])

    nll_errors = [c[1] for c in costs]
    nll_mean = np.mean(nll_errors)
    nll_sterror = np.std(nll_errors, ddof=1) / np.sqrt(len(nll_errors))

    return classif_mean, nll_mean, classif_sterror, nll_sterror
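A toy usage sketch, assuming each cost entry pairs a per-example classification-error array with a scalar NLL (the values below are made up):

import numpy as np
costs = [(np.array([0., 1., 0.]), 2.3),
         (np.array([1., 0.]), 2.1)]
classif_mean, nll_mean, classif_se, nll_se = compute_error_mean_and_sterror(costs)
print(classif_mean, nll_mean, classif_se, nll_se)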
Example #6
testSetPath = sys.argv[6]
inputSize = int(sys.argv[7])
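# sys, load, DocNADE, mlpb, hidden_size, learning_rate,
# activation_function and max_iter come from the elided part of this script.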

# Create DocNADE learner object
docNadeObject = DocNADE(n_stages=1,
                        hidden_size=hidden_size,
                        learning_rate=learning_rate,
                        activation_function=activation_function)

# Load the data
train_data, train_metadata = load(trainSetPath, inputSize)
valid_data, valid_metadata = load(validSetPath, inputSize)
test_data, test_metadata = load(testSetPath, inputSize)

# Create MLProblems (data structure used for data sets in MLPython)
trainset = mlpb.MLProblem(train_data, train_metadata)
validset = mlpb.MLProblem(valid_data, valid_metadata)
testset = mlpb.MLProblem(test_data, test_metadata)

# Training with early stopping
best_val_error = np.inf
best_it = 0
look_ahead = 10
n_incr_error = 0  # Nb. of consecutive increases in error
for stage in range(1, max_iter + 1, 1):
    if n_incr_error >= look_ahead:
        break
    docNadeObject.n_stages = stage  # Ask for one more training iteration
    docNadeObject.train(trainset)  # Train some more
    n_incr_error += 1
    # (the validation step that resets n_incr_error is elided in this snippet)
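The loop above is the look-ahead early-stopping pattern, with the validation step elided. A self-contained sketch of the full pattern, where the validation error is supplied as a callable (valid_error is a hypothetical helper passed in by the caller):

import numpy as np

def train_with_early_stopping(model, trainset, valid_error, max_iter=100, look_ahead=10):
    # Stop once the validation error has not improved for `look_ahead`
    # consecutive iterations; `model` follows the MLPython Learner
    # convention of n_stages + train() used above.
    best_val_error = np.inf
    best_it = 0
    n_incr_error = 0
    for stage in range(1, max_iter + 1):
        model.n_stages = stage   # ask for one more training iteration
        model.train(trainset)    # train some more
        error = valid_error(model)
        if error < best_val_error:
            best_val_error, best_it, n_incr_error = error, stage, 0
        else:
            n_incr_error += 1
        if n_incr_error >= look_ahead:
            break
    return best_it, best_val_error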
Example #7
    def verify_gradients(self):
        print 'WARNING: calling verify_gradients reinitializes the learner'

        self.activation_function = "softmax"
        self.hidden_size = 6
        input_size = 10
        self.learning_rate = 0.01
        epsilon=1e-6
        word_ids = np.array([0,3,7,8,9])
        word_counts = np.array([3,2,1,7,20])
        trainset = mlpb.MLProblem([(word_counts,word_ids)],
                                  {'input_size':input_size})
        self.initialize(trainset)
        self.learning_rate = 1
        self.c = self.rng.rand(self.hidden_size)

        # Compute all derivatives
        rng_test_time = np.random.mtrand.RandomState(1234)
        tmp_rng = self.rng
        self.rng = rng_test_time
        self.fprop(word_ids,word_counts)
        self.bprop()
        self.rng = tmp_rng

        # Estimate derivatives by finite differences
        W_copy = np.array(self.W)
        lim_dW = np.zeros(self.W.shape)
        for i in range(self.W.shape[0]):
            for j in range(self.W.shape[1]):
                self.W[i,j] += epsilon
                outputs,costs = self.test(trainset)
                a = costs[0][0]
                
                self.W[i,j] -= 2.*epsilon

                outputs,costs = self.test(trainset)
                b = costs[0][0]
                self.W[i,j] += epsilon

                lim_dW[i,j] = (a-b)/(2.*epsilon)
                
        print 'dW diff.:',np.sum(np.abs(self.dW.ravel()-lim_dW.ravel()))/self.W.ravel().shape[0]

        b_copy = np.array(self.b)
        lim_db = np.zeros(self.b.shape)
        for i in range(self.b.shape[0]):
            self.b[i] += epsilon
            outputs,costs = self.test(trainset)
            a = costs[0][0]
            
            self.b[i] -= 2.*epsilon

            outputs,costs = self.test(trainset)
            b = costs[0][0]
            self.b[i] += epsilon
            
            lim_db[i] = (a-b)/(2.*epsilon)
        
        print 'db diff.:',np.sum(np.abs(self.db.ravel()-lim_db.ravel()))/self.b.ravel().shape[0]
        
        V_copy = np.array(self.V)
        lim_dV = np.zeros(self.V.shape)
        for i in range(self.V.shape[0]):
            for j in range(self.V.shape[1]):
                self.V[i,j] += epsilon
                outputs,costs = self.test(trainset)
                a = costs[0][0]
                
                self.V[i,j] -= 2.*epsilon

                outputs,costs = self.test(trainset)
                b = costs[0][0]
                self.V[i,j] += epsilon

                lim_dV[i,j] = (a-b)/(2.*epsilon)
                
        print 'dV diff.:',np.sum(np.abs(self.dV.ravel()-lim_dV.ravel()))/self.V.ravel().shape[0]

        c_copy = np.array(self.c)
        lim_dc = np.zeros(self.c.shape)
        for i in range(self.c.shape[0]):
            self.c[i] += epsilon
            outputs,costs = self.test(trainset)
            a = costs[0][0]

            self.c[i] -= 2.*epsilon

            outputs,costs = self.test(trainset)
            b = costs[0][0]
            self.c[i] += epsilon

            lim_dc[i] = (a-b)/(2.*epsilon)

        print 'dc diff.:',np.sum(np.abs(self.dc.ravel()-lim_dc.ravel()))/self.c.ravel().shape[0]
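The same central-difference check, reduced to its reusable core; a minimal sketch over a plain function of one array (not part of the original class):

import numpy as np

def finite_diff_grad(f, x, epsilon=1e-6):
    # Central differences: df/dx_i ~ (f(x+eps*e_i) - f(x-eps*e_i)) / (2*eps)
    grad = np.zeros_like(x)
    for i in range(x.size):
        x.flat[i] += epsilon
        a = f(x)
        x.flat[i] -= 2. * epsilon
        b = f(x)
        x.flat[i] += epsilon  # restore the original value
        grad.flat[i] = (a - b) / (2. * epsilon)
    return grad

# Sanity check against the exact gradient of f(x) = sum(x**2), which is 2x:
x = np.random.rand(5)
print(np.abs(finite_diff_grad(lambda v: np.sum(v ** 2), x) - 2 * x).max())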
Example #8
    def update_learner(self, example):  # assumed signature; the snippet starts mid-method
        input = example[0]
        target = 2 * example[1] - 1  # Targets are 0/1
        output = np.dot(self.w, input) + self.b
        if np.sign(output) != target:
            self.w += self.lr * target * input
            self.b += self.lr * target

    def use_learner(self, example):
        return np.sign(np.dot(self.w, example[0]) + self.b)

    def cost(self, output, example):
        return int(output != 2 * example[1] - 1)


inputs = np.array([[0, 0, 1, 1], [1, 1, 0, 0]])
targets = np.array([0, 1])
metadata = {'input_size': 4, 'targets': set([0, 1])}
trainset = mlpb.MLProblem(zip(inputs, targets), metadata)

perceptron = Perceptron(2, lr=0.01)

perceptron.train(trainset)
print perceptron.use(trainset)
print perceptron.test(trainset)

inputs = np.array([[0, 0, 0, 1], [1, 0, 0, 0]])
targets = np.array([0, 1])
metadata = {'input_size': 4, 'targets': set([0, 1])}
testset = mlpb.MLProblem(zip(inputs, targets), metadata)
print perceptron.use(testset)
print perceptron.test(testset)
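The same update rule stripped of the MLPython Learner scaffolding, as a self-contained sketch:

import numpy as np

def perceptron_train(X, y, lr=0.01, n_epochs=10):
    # y holds 0/1 targets, mapped to -1/+1 exactly as in update_learner above
    w = np.zeros(X.shape[1])
    b = 0.
    for _ in range(n_epochs):
        for x, t in zip(X, y):
            target = 2 * t - 1
            if np.sign(np.dot(w, x) + b) != target:
                w += lr * target * x
                b += lr * target
    return w, b

X = np.array([[0., 0., 1., 1.], [1., 1., 0., 0.]])
w, b = perceptron_train(X, np.array([0, 1]))
print(np.sign(np.dot(X, w) + b))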
Example #9

import mlpython.mlproblems.generic as mlpbgen
import mlpython.mlproblems.classification as mlpbclass

raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B'])
metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1}


def features(example, metadata):
    metadata['input_size'] = 2
    return ((example[0], example[0]), example[1])


pb1 = mlpbgen.MLProblem(raw_data, metadata)
print 'pb1:'
for example in pb1:
    print example
print 'metadata:', pb1.metadata
print ''

pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))
print 'pb2:'
for example in pb2:
    print example
print 'metadata:', pb2.metadata
print ''

pb3 = mlpbgen.MergedProblem([pb2, pb1])
print 'pb3:'
Example #10
def test_mlproblem_combinations():
    """
    Test a combination of many different MLProblems.
    """

    raw_data = zip(range(6), ['A', 'A', 'B', 'C', 'A', 'B'])
    metadata = {'length': 6, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    def features(example, metadata):
        metadata['input_size'] = 2
        return ((example[0], example[0]), example[1])

    pb1 = mlpbgen.MLProblem(raw_data, metadata)
    print 'pb1', pb1.metadata
    pb2 = mlpbgen.SubsetProblem(pb1, subset=set([1, 3, 5]))
    print 'pb2', pb2.metadata
    pb3 = mlpbgen.MergedProblem([pb2, pb1])
    print 'pb3', pb3.metadata
    pb4 = mlpbgen.PreprocessedProblem(pb3, preprocess=features)
    print 'pb4', pb4.metadata
    pb5 = mlpbclass.ClassificationProblem(pb4)
    print 'pb5', pb5.metadata
    pb6 = mlpbclass.ClassSubsetProblem(pb5, subset=set(['A', 'C']))
    print 'pb6', pb6.metadata
    pb7 = mlpbgen.SubsetFieldsProblem(pb6, fields=[0, 0, 1])
    print 'pb7', pb7.metadata

    final_data = [[(1, 1), (1, 1), 0], [(3, 3), (3, 3), 1],
                  [(0, 0), (0, 0), 0], [(1, 1), (1, 1), 0],
                  [(3, 3), (3, 3), 1], [(4, 4), (4, 4), 0]]
    final_metadata = {
        'input_size': 2,
        'targets': set(['A', 'C']),
        'class_to_id': {
            'A': 0,
            'C': 1
        }
    }

    for ex1, ex2 in zip(pb7, final_data):
        assert cmp(ex1, ex2) == 0
    print pb7.metadata, final_metadata
    assert cmp(pb7.metadata, final_metadata) == 0

    raw_data2 = zip(range(6, 10), ['C', 'B', 'A', 'C'])
    metadata2 = {'length': 4, 'targets': ['A', 'B', 'C'], 'input_size': 1}

    pbtest = pb7.apply_on(raw_data2, metadata2)
    final_test_data = [[(6, 6), (6, 6), 1], [(8, 8), (8, 8), 0],
                       [(9, 9), (9, 9), 1]]
    final_test_metadata = {
        'input_size': 2,
        'targets': set(['A', 'C']),
        'class_to_id': {
            'A': 0,
            'C': 1
        }
    }

    for ex1, ex2 in zip(pbtest, final_test_data):
        assert cmp(ex1, ex2) == 0
    assert cmp(pbtest.metadata, final_test_metadata) == 0
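Note that cmp exists only in Python 2; under Python 3 the same assertions can be written with plain equality:

for ex1, ex2 in zip(pbtest, final_test_data):
    assert list(ex1) == list(ex2)
assert pbtest.metadata == final_test_metadata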