Example #1
	def __init__(self):
		self.name = self.__class__.__name__
		
		# Symbolic expressions for the prediction function (and its compiled
		# version), the loss, the regularization, and the loss to optimize
		# (loss + lmbda * regul). To be defined by the child classes:
		self.pred_func = None
		self.pred_func_compiled = None

		self.loss_func = None
		self.regul_func = None
		self.loss_to_opt = None
		
		#Symbolic variables for training values
		self.ys = TT.vector('ys')
		self.rows = TT.lvector('rows')
		self.cols = TT.lvector('cols')
		self.tubes = TT.lvector('tubes') 


		# Current sizes for which the loss is compiled
		#3 dimensions:
		self.n = 0 #Number of subject entities
		self.m = 0 #Number of relations
		self.l = 0 #Number of object entities
		#and rank:
		self.k = 0
		#and corresponding number of parameters (i.e. n*k + m*k + l*k for CP_Model)
		self.nb_params = 0
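# A minimal sketch of how a child class might define the symbolic functions
# declared above. The Model base-class name, the U/V/W shared embeddings and
# the lmbda attribute are illustrative assumptions, not part of the snippet.
class CP_Model(Model):
    def define_symbolic(self):
        # U, V, W: shared embedding matrices of shapes (n, k), (m, k), (l, k)
        self.pred_func = TT.sum(self.U[self.rows] * self.V[self.cols]
                                * self.W[self.tubes], axis=1)
        self.loss_func = TT.sqr(self.ys - self.pred_func).mean()
        self.regul_func = (TT.sqr(self.U).mean() + TT.sqr(self.V).mean()
                           + TT.sqr(self.W).mean())
        self.loss_to_opt = self.loss_func + self.lmbda * self.regul_func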
Example #2
    def recognize_dataset(self, dataset_test=None, seqlen=None, batch_size=100):
        # Here, we don't ignore the first 6 frames, because we want to test
        # recognition performance on every frame of the test dataset
        n_test_batches = (dataset_test.get_value(borrow=True).shape[0]-(self.delay*self.freq)*len(seqlen)) / batch_size
        n_dim =  dataset_test.get_value(borrow=True).shape[1]

        # allocate symbolic variables for the data
        index = T.lvector()    # index to a [mini]batch
        index_hist = T.lvector()  # index to history

        input_log = T.nnet.sigmoid(T.dot(self.x, self.crbm_layer.W)+ T.dot(self.x_history, self.crbm_layer.B) + self.crbm_layer.hbias)
        prob = self.logLayer.p_y_given_x
        prediction = self.logLayer.y_pred

        print_prediction = theano.function(
            [index, index_hist],
            [prediction, prob],
            givens={ self.x:dataset_test[index],
                     self.x_history:dataset_test[index_hist].reshape((batch_size, self.delay * n_dim))
                    },
            name='print_prediction'
            )
        # valid starting indices
        datasetindex = range(self.delay*self.freq, dataset_test.get_value(borrow=True).shape[0])
        permindex = np.array(datasetindex)

        for batch_index in xrange(n_test_batches):
            data_idx = permindex[batch_index * batch_size : (batch_index + 1) * batch_size]
            hist_idx = np.array([data_idx - n*self.freq for n in xrange(1, self.delay + 1)]).T
            # evaluate the compiled function once per minibatch, not once per frame
            predictions, probs = print_prediction(data_idx, hist_idx.ravel())
            for index_in_batch in range(batch_size):
                print "(frame %d):" % (batch_index*batch_size+index_in_batch+1)
                print "% of recognition for each pattern:"
                print probs[index_in_batch]
                print "So, the recognized pattern is:"
                print predictions[index_in_batch]
                print "-----------"
Example #3
  def __theano_init__(self):

    # Theano tensor for I/O 
    X = T.lmatrix('X')
    Y = T.lvector('Y')
    N = T.lvector('N')

    # network structure
    l_in = L.layers.InputLayer(shape=(self.batch_size, self.n_gram), input_var = X)
    l_we = L.layers.EmbeddingLayer(l_in, self.vocab_size, self.word_dim, W = self.D)
    l_f1 = L.layers.DenseLayer(l_we, self.hidden_dim1, W = self.C, b = self.Cb)
    l_f2 = L.layers.DenseLayer(l_f1, self.hidden_dim2, W = self.M, b = self.Mb)
    l_out = L.layers.DenseLayer(l_f2, self.vocab_size, W = self.E, b = self.Eb, nonlinearity=None)
    
    # lasagne.layers.get_output produces a variable for the output of the net
    O = L.layers.get_output(l_out) # (batch_size, vocab_size)

    lossfunc = NCE(self.batch_size, self.vocab_size, self.noise_dist, self.noise_sample_size)
    loss = lossfunc.evaluate(O, Y, N)
    # loss = T.nnet.categorical_crossentropy(O, Y).mean()

    # Retrieve all parameters from the network
    all_params = L.layers.get_all_params(l_out, trainable=True)

    # Compute Adadelta updates for training
    updates = L.updates.adadelta(loss, all_params)

    # Theano functions for training and computing cost
    self.train = theano.function([l_in.input_var, Y, N], loss, updates=updates, allow_input_downcast=True)
    self.compute_loss = theano.function([l_in.input_var, Y, N], loss, allow_input_downcast=True)
    self.weights = theano.function(inputs = [], outputs = [self.D, self.C, self.M, self.E, self.Cb, self.Mb, self.Eb])
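# The NCE loss above scores each target against ids drawn from a noise
# distribution. A hypothetical helper (not part of the original snippet) for
# drawing the N input could look like this:
import numpy as np

def sample_noise(noise_dist, noise_sample_size, rng=np.random):
    # noise_dist: 1-D array of probabilities over the vocabulary
    return rng.choice(len(noise_dist), size=noise_sample_size, p=noise_dist)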
Example #4
    def train_minibatch_fn(self, evaluate=False):
        """
        Initialize this Theano function once
        """
        X = T.lmatrix('X_train')
        L_x = T.lvector('L_X_train')

        Y = T.lmatrix('Y_train')
        L_y = T.lvector('L_y_train')

        learning_rate = T.dscalar('learning_rate')
        momentum = T.dscalar('momentum')
        weight_decay = T.dscalar('weight_decay')

        loss, accuracy = self.loss(X, L_x, Y, L_y, weight_decay)
        updates = self.get_sgd_updates(loss, learning_rate, momentum)

        outputs = [loss, accuracy]

        if evaluate:
            precision, recall = self.evaluate(X, L_x, Y, L_y)
            outputs = outputs + [precision, recall]

        return theano.function(
            inputs=[X, L_x, Y, L_y, learning_rate, momentum, weight_decay],
            outputs=outputs,
            updates=updates
        )
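# A usage sketch for the function returned above (all call-site names are
# assumptions): compile once, then call per minibatch with the
# hyperparameters passed as ordinary arguments.
# train_fn = model.train_minibatch_fn(evaluate=True)
# loss, accuracy, precision, recall = train_fn(
#     X_mb, L_x_mb, Y_mb, L_y_mb, 0.01, 0.9, 1e-4)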
Example #5
    def test_random_integers_vector(self):
        rng_R = random_state_type()
        low = tensor.lvector()
        high = tensor.lvector()
        post_r, out = random_integers(rng_R, low=low, high=high)
        assert out.ndim == 1
        f = compile.function([rng_R, low, high], [post_r, out],
                             accept_inplace=True)

        low_val = [100, 200, 300]
        high_val = [110, 220, 330]
        rng = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(utt.fetch_seed())

        # Arguments of size (3,)
        rng0, val0 = f(rng, low_val, high_val)
        numpy_val0 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
            for lv, hv in zip(low_val, high_val)])
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        rng1, val1 = f(rng0, low_val[:-1], high_val[:-1])
        numpy_val1 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
            for lv, hv in zip(low_val[:-1], high_val[:-1])])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = compile.function([rng_R, low, high],
                random_integers(rng_R, low=low, high=high, size=(3,)),
                accept_inplace=True)
        rng2, val2 = g(rng1, low_val, high_val)
        numpy_val2 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
            for lv, hv in zip(low_val, high_val)])
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, rng2, low_val[:-1], high_val[:-1])
Example #6
    def test_random_integers_vector(self):
        random = RandomStreams(utt.fetch_seed())
        low = tensor.lvector()
        high = tensor.lvector()
        out = random.random_integers(low=low, high=high)
        assert out.ndim == 1
        f = function([low, high], out)

        low_val = [100, 200, 300]
        high_val = [110, 220, 330]
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(low_val, high_val)
        numpy_val0 = numpy.asarray([numpy_rng.randint(low=lv, high=hv+1)
            for lv, hv in zip(low_val, high_val)])
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(low_val[:-1], high_val[:-1])
        numpy_val1 = numpy.asarray([numpy_rng.randint(low=lv, high=hv+1)
            for lv, hv in zip(low_val[:-1], high_val[:-1])])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = function([low, high], random.random_integers(low=low, high=high, size=(3,)))
        val2 = g(low_val, high_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy.asarray([numpy_rng.randint(low=lv, high=hv+1)
            for lv, hv in zip(low_val, high_val)])
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, low_val[:-1], high_val[:-1])
Example #7
    def predict_hidden(self, dataset=None, batch_size=100):
        # compute the number of minibatches for the dataset
        n_test_batches = dataset.get_value(borrow=True).shape[0] / batch_size
        n_dim =  dataset.get_value(borrow=True).shape[1]

        # allocate symbolic variables for the data
        index = T.lvector()    # index to a [mini]batch
        index_hist = T.lvector()  # index to history

        #print hidden layer
        [pre_sigmoid_h1, h1_mean, h1_sample] = self.sample_h_given_v(self.input, self.input_history)

        print_hidden = theano.function(
            [index, index_hist],
            [h1_sample],
            givens={ self.input:dataset[index],
                     self.input_history:dataset[index_hist].reshape((batch_size, self.delay * self.n_visible))
                    },
            name='print_hidden'
            )

        # valid starting indices
        datasetindex = range(self.delay, dataset.get_value(borrow=True).shape[0])
        permindex = np.array(datasetindex)

        # For each frame in the minibatch
        for batch_index in xrange(n_test_batches):
            data_idx = permindex[batch_index * batch_size : (batch_index + 1) * batch_size]
            hist_idx = np.array([data_idx - n for n in xrange(1, self.delay + 1)]).T
            # evaluate the compiled function once per minibatch, not once per frame
            hidden = print_hidden(data_idx, hist_idx.ravel())[0]
            for index_in_batch in range(batch_size):
                print "Hidden CRBM (frame %d):" % (batch_index*batch_size+index_in_batch+1)
                print hidden[index_in_batch]
                print "-----------"
Example #8
    def pretraining_functions(self, train_set_x, batch_size, k, layer=0, static=False, with_W=False, binary=False):
        """Creates functions for doing CD

        Generates a function for performing one step of
        gradient descent at a given layer. The function will require
        as input the minibatch index, and to train an RBM you just
        need to iterate, calling the corresponding function on all
        minibatch indexes.

        Args:
            train_set_x: Shared var. that contains all datapoints used
                                for training the RBM
            batch_size: int, the size of each minibatch
            k: number of Gibbs steps to do in CD-k / PCD-k
            layer: which layer of the dbn to generate functions for
            static: if True, ignore all temporal components
            with_W: Whether or not to include the W in update
            binary: if true, make visible layer binary
        Returns:
            CD function

        """
        # allocate symbolic variables for the data
        index = T.lvector()  # index to a [mini]batch
        index_hist = T.lvector()  # index to history
        lr = T.dscalar()

        rbm = self.rbm_layers[layer]
        rbm.binary = binary
        # get the cost and the updates corresponding to one step of CD-k
        cost, updates = rbm.get_cost_updates(k=k, static=static, with_W=with_W)

        #################################
        #     Training the RBM         #
        #################################
        if static:
            # updates only on non-temporal components
            fn = theano.function(
                [index, lr],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[index], self.lr: lr},
                name="train_tarbm_static",
            )
        else:
            # updates including temporal components
            fn = theano.function(
                [index, index_hist, lr],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[index],
                    self.x_hist: train_set_x[index_hist].reshape((batch_size, self.delay * np.prod(self.n_ins))),
                    self.lr: lr,
                },
                name="train_tarbm",
            )
        return fn
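# As the docstring says, training just iterates the returned function over
# minibatch index arrays. A sketch under assumed names (dbn, train_set_x),
# using static=True so no history indices are needed:
import numpy as np
train_fn = dbn.pretraining_functions(train_set_x, batch_size=100, k=1, static=True)
n_frames = train_set_x.get_value(borrow=True).shape[0]
for epoch in xrange(10):
    for start in xrange(0, n_frames - 100 + 1, 100):
        cost = train_fn(np.arange(start, start + 100), 0.01)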
Example #9
def test(model):
    dim = 128
    v_size = 7810
    margin = 1.0
    
    #load model
    f = open(model, 'rb')
    input_params = cPickle.load(f)
    emb, wx, wh, bh, wa = input_params
    f.close()
    
    embLayer = emb_layer(pre_train=emb, v = v_size, dim = dim) 
    rnnLayer = rnn_layer(input=None, wx=wx, wh=wh, bh=bh, emb_layer = embLayer, nh = dim) 
    att = attention_layer(input=None, rnn_layer=rnnLayer, margin = margin)

    q = T.lvector('q')
    a = T.lscalar('a')
    p = T.lvector('p')
    t = T.lscalar('t')
    inputs = [q,a,p,t]
    score = att.predict(inputs)
    pred = theano.function(inputs=inputs,outputs=score)

    pool = ThreadPool()

    f = open('./data/test-small.id','r')
    count = 1
    print 'time_b:%s' %time.clock()  
    to_pred = []
    for line in f:
        if count % 10000 == 0:
            print count / 10000
        count += 1
        line = line[:-1]
        tmp = line.split('\t')
        in_q = numpy.array(tmp[0].split(' ')).astype(numpy.int) - 1
        in_a = int(tmp[1].split(' ')[2]) - 1
        in_p = numpy.array(tmp[1].split(' ')).astype(numpy.int) - 1
        in_t = int(tmp[2]) - 1
        to_pred.append((in_q, in_a, in_p, in_t))
    f.close()

    # a compiled Theano function takes positional inputs, so unpack each
    # argument tuple when mapping over the thread pool
    results = pool.map(lambda args: pred(*args), to_pred)
    print 'time_e:%s' % time.clock()
    pool.close()
    pool.join()
Example #10
def test(model):
    dim = 128
    v_size = 7810
    margin = 1.0
    
    #load model
    f = open(model, 'rb')
    input_params = cPickle.load(f)
    emb, wx, wh, bh, wa = input_params
    f.close()
    
    embLayer = emb_layer(pre_train=emb, v = v_size, dim = dim) 
    rnnLayer = rnn_layer(input=None, wx=wx, wh=wh, bh=bh, emb_layer = embLayer, nh = dim) 
    att = attention_layer(input=None, rnn_layer=rnnLayer, margin = margin)

    q = T.lvector('q')
    a = T.lscalar('a')
    p = T.lvector('p')
    t = T.lscalar('t')
    inputs = [q,a,p,t]

    score = att.predict(inputs)
    pred = theano.function(inputs=inputs, outputs=score)

    wf = open('./data/res','w')
    f = open('./data/test.id','r')
    count = 1
    print 'time_b:%s' %time.clock()  
    for line in f:
        if count % 10000 == 0:
            print count / 10000
            print 'time_1w:%s' % time.clock()
        count += 1
        line = line[:-1]
        tmp = line.split('\t')
        in_q = numpy.array(tmp[0].split(' ')).astype(numpy.int) - 1
        in_a = int(tmp[1].split(' ')[2]) - 1
        in_p = numpy.array(tmp[1].split(' ')).astype(numpy.int) - 1
        in_t = int(tmp[2]) - 1
        s = pred(in_q, in_a, in_p, in_t)
        wf.write(str(s) + '\n')
    f.close()
    wf.close()
Example #11
def test_no_reuse():
    x = T.lvector()
    y = T.lvector()
    f = theano.function([x, y], x + y)

    #provide both inputs in the first call
    f(numpy.ones(10, dtype='int64'), numpy.ones(10, dtype='int64'))

    try:
        f(numpy.ones(10))
    except TypeError:
        return
    assert not 'should not get here'
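# The TypeError above is a dtype mismatch: T.lvector declares int64, and
# numpy.ones defaults to float64, which Theano will not silently downcast.
# numpy.ones(10).dtype                 -> float64, rejected by the lvector input
# numpy.ones(10, dtype='int64').dtype  -> int64, accepted, as in the first call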
Example #12
def run_mlp(train_data, valid_data, valid_score, test_data, test_score, We_init, options):

    tmp = np.diag(np.ones(options.dim, dtype='float32'))
    W_init = np.asarray(np.concatenate((tmp, tmp), axis=0))

    g1batchindices = T.lvector()
    g2batchindices = T.lvector()
    p1batchindices = T.lvector()
    p2batchindices = T.lvector()

    # Create an instance of the MLP class
    mlp = Layer(We_init, W_init, T.tanh,  options.lamda_w, options.lamda_ww)

    #compute phrase vectors
    bigram_output = theano.function([g1batchindices, g2batchindices], mlp.output(g1batchindices, g2batchindices))

    cost = squared_error(mlp, g1batchindices, g2batchindices, p1batchindices, p2batchindices)

    cost = cost + mlp.word_reg

    updates = adagrad(cost, mlp.params, learning_rate=0.005, epsilon=1e-6)

    train_model = theano.function([g1batchindices, g2batchindices, p1batchindices, p2batchindices], cost, updates=updates)

    # compute number of minibatches for training
    batch_size = int(options.batchsize)
    n_train_batches = int(len(train_data) * 1.0 // batch_size)

    iteration = 0

    max_iteration = options.epochs

    while iteration < max_iteration:
        iteration += 1

        seed = range(len(train_data))
        random.shuffle(seed)
        train_data = [train_data[i] for i in seed]

        score = valid_model(bigram_output, valid_data, valid_score)

        accuracy = test_model(bigram_output, test_data, test_score)

        print "iteration: {0}   valid_score: {1}   test_score: {2}".format(iteration, score[0], accuracy[0])

        for minibatch_index in range(n_train_batches):

            train_data_batch = train_data[minibatch_index * batch_size : (minibatch_index + 1) * batch_size]
            train_data_batch_x1 = [i[0][0] for i in train_data_batch]
            train_data_batch_x2 = [i[0][1] for i in train_data_batch]
            train_data_batch_y1 = [i[1][0] for i in train_data_batch]
            train_data_batch_y2 = [i[1][1] for i in train_data_batch]
            train_model(train_data_batch_x1, train_data_batch_x2, train_data_batch_y1, train_data_batch_y2)
Example #13
    def propup(self, data, layer=0, static=False):
        """
        propagate the activity through layer 0 to the hidden layer and return
        an array of shape [2, samples, dimensions], where the leading axis
        indexes [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)].
        So far this only works for the first rbm layer.
        """
        if not isinstance(data, theano.tensor.sharedvar.TensorSharedVariable):
            data = theano.shared(data)

        # allocate symbolic variables for the data
        index = T.lvector()  # index to a [mini]batch
        index_hist = T.lvector()  # index to history
        rbm = self.rbm_layers[layer]

        #################################
        #   Propagating through CRBM    #
        #################################
        # get the pre- and post-sigmoid activations for one upward pass
        [pre_sig, post_sig] = rbm.propup(static)

        if static:
            # compile a function that propagates the full dataset upward
            fn = theano.function([], outputs=[pre_sig, post_sig], givens={self.x: data}, name="propup_tarbm_static")
            return np.array(fn())

        else:
            # indexing is slightly complicated
            # build a linear index to the starting frames for this batch
            # (i.e. time t) gives a batch_size length array for data
            data_idx = np.arange(self.delay, data.get_value(borrow=True).shape[0])

            # now build a linear index to the frames at each delay tap
            # (i.e. time t-1 to t-delay)
            # gives a batch_size x delay array of indices for history
            hist_idx = np.array([data_idx - n for n in xrange(1, self.delay + 1)]).T

            # compile a function that propagates the indexed frames upward
            fn = theano.function(
                [index, index_hist],
                outputs=[pre_sig, post_sig],
                givens={
                    self.x: data[index],
                    self.x_hist: data[index_hist].reshape((len(data_idx), self.delay * np.prod(self.n_ins))),
                },
                name="train_tarbm",
            )

            return np.array(fn(data_idx, hist_idx.ravel()))
Example #14
    def _compile_bp(self):
        '''
        Compile the backpropagation function for each of the DQNs.
        '''
        self.bprop_by_goal = {}
        for (goal, dqn) in self.dqn_by_goal.items():
            states = dqn.states
            action_values = dqn.action_values
            params = dqn.params
            targets = T.vector('target')
            last_actions = T.lvector('action')

            # loss function.
            mse = layers.MSE(action_values[T.arange(action_values.shape[0]),
                                last_actions], targets)
            # l2 penalty.
            l2_penalty = 0.
            for param in params:
                l2_penalty += (param ** 2).sum()

            cost = mse + self.l2_reg * l2_penalty

            # back propagation.
            updates = optimizers.Adam(cost, params, alpha=self.lr)

            td_errors = T.sqrt(mse)
            self.bprop_by_goal[goal] = theano.function(inputs=[states, last_actions, targets],
                                        outputs=td_errors, updates=updates)
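# An illustration (toy values) of the fancy indexing inside the MSE above:
# action_values[T.arange(n), last_actions] selects one entry per row, i.e.
# the Q-value of the action that was actually taken.
import numpy as np
action_values = np.array([[1.0, 2.0],
                          [3.0, 4.0]])
last_actions = np.array([1, 0])
print action_values[np.arange(2), last_actions]  # -> [ 2.  3.]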
Example #15
    def test_softmax_optimizations_w_bias2(self):
        x = tensor.matrix('x')
        b = tensor.vector('b')
        c = tensor.vector('c')
        one_of_n = tensor.lvector('one_of_n')
        op = crossentropy_categorical_1hot

        env = gof.Env(
                [x, b, c, one_of_n],
                [op(softmax(T.add(x,b,c)), one_of_n)])
        assert env.outputs[0].owner.op == op

        print 'BEFORE'
        for node in env.toposort():
            print node.op
        print '----'

        theano.compile.mode.optdb.query(
                theano.compile.mode.OPT_FAST_RUN).optimize(env)

        print 'AFTER'
        for node in env.toposort():
            print node.op
        print '===='
        assert len(env.toposort()) == 3

        assert str(env.outputs[0].owner.op) == 'OutputGuard'
        assert env.outputs[0].owner.inputs[0].owner.op == crossentropy_softmax_argmax_1hot_with_bias
Example #16
    def test_binomial_vector(self):
        rng_R = random_state_type()
        n = tensor.lvector()
        prob = tensor.vector()
        post_r, out = binomial(rng_R, n=n, p=prob)
        assert out.ndim == 1
        f = compile.function([rng_R, n, prob], [post_r, out],
                             accept_inplace=True)

        n_val = [1, 2, 3]
        prob_val = numpy.asarray([.1, .2, .3], dtype=config.floatX)
        rng = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(utt.fetch_seed())

        # Arguments of size (3,)
        rng0, val0 = f(rng, n_val, prob_val)
        numpy_val0 = numpy_rng.binomial(n=n_val, p=prob_val)
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        rng1, val1 = f(rng0, n_val[:-1], prob_val[:-1])
        numpy_val1 = numpy_rng.binomial(n=n_val[:-1], p=prob_val[:-1])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = compile.function([rng_R, n, prob],
                binomial(rng_R, n=n, p=prob, size=(3,)),
                accept_inplace=True)
        rng2, val2 = g(rng1, n_val, prob_val)
        numpy_val2 = numpy_rng.binomial(n=n_val, p=prob_val, size=(3,))
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, rng2, n_val[:-1], prob_val[:-1])
Example #17
    def GRU_question(self, dimension_fact_embedding, num_hidden_units_questions, num_hidden_units_episodes, max_question_len, dimension_word_embeddings):

        self.question_idxs = T.lmatrix("question_indices") # as many columns as words in the context window and as many lines as words in the sentence
        self.question_mask = T.lvector("question_mask")
        q = self.emb[self.question_idxs].reshape((self.question_idxs.shape[0], dimension_word_embeddings)) # q holds the embeddings of the words in the current question, one row per word

        def slice_w(x, n):
            return x[n*num_hidden_units_questions:(n+1)*num_hidden_units_questions]

        def question_gru_recursion(x_cur, h_prev, q_mask):

            W_in_stacked = T.concatenate([self.W_question_reset_gate_x, self.W_question_update_gate_x, self.W_question_hidden_gate_x], axis=1)
            W_hid_stacked = T.concatenate([self.W_question_reset_gate_h, self.W_question_update_gate_h, self.W_question_hidden_gate_h], axis=1)

            input_n = T.dot(x_cur, W_in_stacked)
            hid_input = T.dot(h_prev, W_hid_stacked)
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = T.tanh(resetgate)
            updategate = T.tanh(updategate)

            hidden_update = slice_w(input_n, 2) + resetgate * slice_w(hid_input, 2)
            hidden_update = T.tanh(hidden_update)
            h_cur = (1 - updategate) * hidden_update + updategate * h_prev

            h_cur = q_mask * h_cur + (1 - q_mask) * h_prev
            # h_cur = T.tanh(T.dot(self.W_fact_to_hidden, x_cur) + T.dot(self.W_hidden_to_hidden, h_prev))
            return h_cur

        state = self.h0_questions
        for jdx in range(max_question_len):
            state = question_gru_recursion(q[jdx], state, self.question_mask[jdx])

        return T.tanh(T.dot(state, self.W_question_to_vector) + self.b_question_to_vector)
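# For reference, the recursion above implements the standard GRU update,
# with tanh in place of the usual sigmoid on the gates:
#   r_t = tanh(W_r x_t + U_r h_{t-1})           (reset gate)
#   z_t = tanh(W_z x_t + U_z h_{t-1})           (update gate)
#   c_t = tanh(W_h x_t + r_t * (U_h h_{t-1}))   (candidate state)
#   h_t = (1 - z_t) * c_t + z_t * h_{t-1}       (then masked by q_mask)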
Example #18
    def test_multinomial_vector(self):
        rng_R = random_state_type()
        n = tensor.lvector()
        pvals = tensor.matrix()
        post_r, out = multinomial(rng_R, n=n, pvals=pvals)
        assert out.ndim == 2
        f = compile.function([rng_R, n, pvals], [post_r, out], accept_inplace=True)

        n_val = [1, 2, 3]
        pvals_val = [[0.1, 0.9], [0.2, 0.8], [0.3, 0.7]]
        pvals_val = numpy.asarray(pvals_val, dtype=config.floatX)
        rng = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(utt.fetch_seed())

        # Arguments of size (3,)
        rng0, val0 = f(rng, n_val, pvals_val)
        numpy_val0 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv) for nv, pv in zip(n_val, pvals_val)])
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        rng1, val1 = f(rng0, n_val[:-1], pvals_val[:-1])
        numpy_val1 = numpy.asarray(
            [numpy_rng.multinomial(n=nv, pvals=pv) for nv, pv in zip(n_val[:-1], pvals_val[:-1])]
        )
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = compile.function([rng_R, n, pvals], multinomial(rng_R, n=n, pvals=pvals, size=(3,)), accept_inplace=True)
        rng2, val2 = g(rng1, n_val, pvals_val)
        numpy_val2 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv) for nv, pv in zip(n_val, pvals_val)])
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, rng2, n_val[:-1], pvals_val[:-1])
Example #19
    def test_optimize_xent_vector2(self):
        verbose = 0
        mode = theano.compile.mode.get_default_mode()
        if mode == theano.compile.mode.get_mode('FAST_COMPILE'):
            mode = 'FAST_RUN'
        rng = numpy.random.RandomState(utt.fetch_seed())
        x_val = rng.randn(5)
        b_val = rng.randn(5)
        y_val = numpy.asarray([2])

        x = T.dvector('x')
        b = T.dvector('b')
        y = T.lvector('y')

        def print_graph(func):
            for i, node in enumerate(func.maker.fgraph.toposort()):
                print i, node
            # Last node should be the output
            print i, printing.pprint(node.outputs[0])
            print

        ## Test that a biased softmax is optimized correctly
        bias_expressions = [
                T.sum(-T.log(softmax(x + b)[T.arange(y.shape[0]), y])),
                -T.sum(T.log(softmax(b + x)[T.arange(y.shape[0]), y])),
                -T.sum(T.log(softmax(x + b))[T.arange(y.shape[0]), y]),
                T.sum(-T.log(softmax(b + x))[T.arange(y.shape[0]), y])]

        for expr in bias_expressions:
            f = theano.function([x, b, y], expr, mode=mode)
            if verbose:
                print_graph(f)
            try:
                prev, last = f.maker.fgraph.toposort()[-2:]
                assert len(f.maker.fgraph.toposort()) == 3
                # [big_op, sum, dim_shuffle]
                f(x_val, b_val, y_val)
            except Exception:
                theano.printing.debugprint(f)
                raise

            backup = config.warn.sum_div_dimshuffle_bug
            config.warn.sum_div_dimshuffle_bug = False
            try:
                g = theano.function([x, b, y], T.grad(expr, x), mode=mode)
            finally:
                config.warn.sum_div_dimshuffle_bug = backup

            if verbose:
                print_graph(g)
            try:
                ops = [node.op for node in g.maker.fgraph.toposort()]
                assert len(ops) <= 6
                assert crossentropy_softmax_1hot_with_bias_dx in ops
                assert softmax_with_bias in ops
                assert softmax_grad not in ops
                g(x_val, b_val, y_val)
            except Exception:
                theano.printing.debugprint(g)
                raise
Example #20
    def __init__(self, model):
        """ Initialize the stochastic block model for the adjacency matrix
        """
        self.model = model
        self.prms = model['network']['graph']
        self.N = model['N']

        # SBM has R latent clusters
        self.R = self.prms['R']
        # An RxR matrix of connection probabilities per pair of clusters
        self.B = T.dmatrix('B')
        # SBM has a latent block or cluster assignment for each node
        self.Y = T.lvector('Y')
        # For indexing, we also need Y as a column vector and tiled matrix
        self.Yv = T.reshape(self.Y, [self.N, 1])
        self.Ym = T.tile(self.Yv, [1, self.N])
        self.pA = self.B[self.Ym, T.transpose(self.Ym)]

        # Probability of each cluster
        self.alpha = T.dvector('alpha')

        # Hyperparameters governing B and alpha
        self.b0 = self.prms['b0']
        self.b1 = self.prms['b1']
        self.alpha0 = self.prms['alpha0']

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # Define log probability
        log_p_B = T.sum((self.b0 - 1) * T.log(self.B) + (self.b1 - 1) * T.log(1 - self.B))
        log_p_alpha = T.sum((self.alpha0 - 1) * T.log(self.alpha))
        log_p_A = T.sum(self.A * T.log(self.pA) + (1 - self.A) * T.log(1 - self.pA))

        self.log_p = log_p_B + log_p_alpha + log_p_A
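# A hedged usage sketch (names assumed, inside the same class): the joint
# log probability defined above can be compiled into a callable for
# concrete values of A, B, Y and alpha.
# self.log_p_fn = theano.function([self.A, self.B, self.Y, self.alpha],
#                                 self.log_p)
# lp = self.log_p_fn(A_val, B_val, Y_val, alpha_val)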
Example #21
    def test_multinomial_vector(self):
        random = RandomStreams(utt.fetch_seed())
        n = tensor.lvector()
        pvals = tensor.matrix()
        out = random.multinomial(n=n, pvals=pvals)
        assert out.ndim == 2
        f = function([n, pvals], out)

        n_val = [1, 2, 3]
        pvals_val = [[.1, .9], [.2, .8], [.3, .7]]
        pvals_val = numpy.asarray(pvals_val, dtype=config.floatX)
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(n_val, pvals_val)
        numpy_val0 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val, pvals_val)])
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(n_val[:-1], pvals_val[:-1])
        numpy_val1 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val[:-1], pvals_val[:-1])])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = function([n, pvals], random.multinomial(n=n, pvals=pvals, size=(3,)))
        val2 = g(n_val, pvals_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy.asarray([numpy_rng.multinomial(n=nv, pvals=pv)
            for nv, pv in zip(n_val, pvals_val)])
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, n_val[:-1], pvals_val[:-1])
Example #22
    def test_illegal_things(self):
        i0 = TT.iscalar()
        i1 = TT.lvector()
        i2 = TT.bmatrix()
        self.failUnlessRaises(TypeError, FAS, [i1, slice(None, i2, -1), i0])
        self.failUnlessRaises(TypeError, FAS, [i1, slice(None, None, i2), i0])
        self.failUnlessRaises(TypeError, FAS, [i1, slice(i2, None, -1), i0])
Example #23
    def test_binomial_vector(self):
        random = RandomStreams(utt.fetch_seed())
        n = tensor.lvector()
        prob = tensor.vector()
        out = random.binomial(n=n, p=prob)
        assert out.ndim == 1
        f = function([n, prob], out)

        n_val = [1, 2, 3]
        prob_val = numpy.asarray([.1, .2, .3], dtype=config.floatX)
        seed_gen = numpy.random.RandomState(utt.fetch_seed())
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

        # Arguments of size (3,)
        val0 = f(n_val, prob_val)
        numpy_val0 = numpy_rng.binomial(n=n_val, p=prob_val)
        assert numpy.all(val0 == numpy_val0)

        # arguments of size (2,)
        val1 = f(n_val[:-1], prob_val[:-1])
        numpy_val1 = numpy_rng.binomial(n=n_val[:-1], p=prob_val[:-1])
        assert numpy.all(val1 == numpy_val1)

        # Specifying the size explicitly
        g = function([n, prob], random.binomial(n=n, p=prob, size=(3,)))
        val2 = g(n_val, prob_val)
        numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
        numpy_val2 = numpy_rng.binomial(n=n_val, p=prob_val, size=(3,))
        assert numpy.all(val2 == numpy_val2)
        self.assertRaises(ValueError, g, n_val[:-1], prob_val[:-1])
Example #24
def test_bug_2009_06_02_trac_387():
    y = tensor.lvector('y')
    f = theano.function([y],
            tensor.int_div(
                tensor.DimShuffle(y[0].broadcastable, ['x'])(y[0]), 2))
    sys.stdout.flush()
    print(f(numpy.ones(1, dtype='int64') * 3))
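# Tracing the call above: y[0] extracts the scalar 3, DimShuffle(..., ['x'])
# lifts it back to a one-element vector, and int_div floors it, so the
# function prints [1].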
Example #25
def train_rnn():
    rng = numpy.random.RandomState(1234)

    q = T.lvector("q")
    pos = T.lscalar("pos")
    neg = T.lscalar("neg")
    inputs = [q, pos, neg]

    embLayer = emb_layer(None, 100, 5)
    rnn = rnn_layer(input=inputs, emb_layer=embLayer, nh=5)

    cost = rnn.loss()
    gradient = T.grad(cost, rnn.params)
    lr = 0.001
    updates = OrderedDict((p, p - lr * g) for p, g in zip(rnn.params, gradient))
    train = theano.function(inputs=[q, pos, neg], outputs=cost, updates=updates)

    print rnn.emb.eval()[0]
    e0 = rnn.emb.eval()

    for i in range(0, 3):
        idq = rng.randint(size=10, low=0, high=100)
        idpos = rng.random_integers(100)
        idneg = rng.random_integers(100)

        train(idq, idpos, idneg)
        rnn.normalize()

        print rnn.emb.eval() - e0
Example #26
    def test_grad_types(self):
        # This function simply tests the behaviour of the AbstractConv
        # Ops, not their optimizations
        cpu_input = tensor.ftensor4()
        cpu_filters = tensor.ftensor4()
        cpu_topgrad = tensor.ftensor4()
        gpu_input = gpu_ftensor4()
        gpu_filters = gpu_ftensor4()
        gpu_topgrad = gpu_ftensor4()

        out_shape = tensor.lvector()

        # Check the gradient of the forward conv2d
        for input, filters in itertools.product((cpu_input, gpu_input), (cpu_filters, gpu_filters)):
            output = conv.conv2d(input, filters)
            grad_input, grad_filters = theano.grad(output.sum(), wrt=(input, filters))
            assert grad_input.type == input.type, (grad_input, grad_input.type, input, input.type)
            assert grad_filters.type == filters.type, (grad_filters, grad_filters.type, filters, filters.type)

        # Check the gradient of gradweight
        for input, topgrad in itertools.product((cpu_input, gpu_input), (cpu_topgrad, gpu_topgrad)):
            grad_filters = conv.AbstractConv2d_gradWeights()(input, topgrad, out_shape)
            grad_input, grad_topgrad = theano.grad(grad_filters.sum(), wrt=(input, topgrad))

            assert grad_input.type == input.type, (grad_input, grad_input.type, input, input.type)
            assert grad_topgrad.type == topgrad.type, (grad_topgrad, grad_topgrad.type, topgrad, topgrad.type)

        # Check the gradient of gradinputs
        for filters, topgrad in itertools.product((cpu_filters, gpu_filters), (cpu_topgrad, gpu_topgrad)):
            grad_input = conv.AbstractConv2d_gradInputs()(filters, topgrad, out_shape)
            grad_filters, grad_topgrad = theano.grad(grad_input.sum(), wrt=(filters, topgrad))

            assert grad_filters.type == filters.type, (grad_filters, grad_filters.type, filters, filters.type)
            assert grad_topgrad.type == topgrad.type, (grad_topgrad, grad_topgrad.type, topgrad, topgrad.type)
Example #27
    def __init__(self, config=None, defaults=defaults, inputs_hook=None, hiddens_hook=None, params_hook=None,
                 use_data_layer=None, rand_crop=None, batch_size=None):
        # combine everything by passing to Model's init
        super(AlexNet, self).__init__(**{arg: val for (arg, val) in locals().iteritems() if arg != 'self'})
        # configs can now be accessed through self dictionary

        if self.inputs_hook or self.hiddens_hook or self.params_hook:
            log.error("Inputs_hook, hiddens_hook, and params_hook not implemented yet for AlexNet!")

        self.flag_datalayer = self.use_data_layer

        ####################
        # Theano variables #
        ####################
        # allocate symbolic variables for the data
        # 'rand' is a random array used for random cropping/mirroring of data
        self.x = T.ftensor4('x')
        self.y = T.lvector('y')
        self.rand = T.fvector('rand')

        ##########
        # params #
        ##########
        self.params = []

        # make the network!
        self.build_computation_graph()
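# An illustration of the locals()-forwarding idiom in the constructor above
# (hypothetical function, Python 2 hence iteritems): every argument except
# self is forwarded upward as a keyword argument.
def show(a=1, b=2):
    print {arg: val for (arg, val) in locals().iteritems() if arg != 'self'}

show()  # -> {'a': 1, 'b': 2}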
Example #28
def train_dA(lr=0.1, training_epochs=15, params_dict=False, print_every=100,
             data=None):

    x = T.lvector('x')
    input_size = T.scalar(dtype='int64')
    dA = make_dA(params=params_dict, input_size=input_size, data=x)
    cost, updates, output = dA.get_cost_updates(lr=lr)

    model = theano.function(
        [x],
        [cost, output],
        updates=updates,
        givens={input_size: x.shape[0]}
    )

    start_time = time.clock()
    for epoch in xrange(training_epochs):
        cost_history = []
        for index in range(len(data)):
            cost, predict = model(data[index])
            cost_history.append(cost)
            if index % print_every == 0:
                print 'Iteration %d, cost %f' % (index, cost)
                print predict
        print 'Training epoch %d, cost ' % epoch, numpy.mean(cost_history)

    training_time = (time.clock() - start_time)

    print 'Finished training %d epochs, took %d seconds' % (training_epochs, training_time)

    return cost_history, dA.get_params(), model
Example #29
    def test_softmax_optimizations_w_bias_vector(self):
        x = tensor.vector('x')
        b = tensor.vector('b')
        one_of_n = tensor.lvector('one_of_n')
        op = crossentropy_categorical_1hot
        fgraph = gof.FunctionGraph(
                [x, b, one_of_n],
                [op(softmax(x + b), one_of_n)])
        assert fgraph.outputs[0].owner.op == op
        #print 'BEFORE'
        #for node in fgraph.toposort():
        #    print node.op
        #print printing.pprint(node.outputs[0])
        #print '----'

        theano.compile.mode.optdb.query(
                theano.compile.mode.OPT_FAST_RUN).optimize(fgraph)
        #print 'AFTER'
        #for node in fgraph.toposort():
        #    print node.op
        #print '===='
        assert len(fgraph.toposort()) == 3
        assert str(fgraph.outputs[0].owner.op) == 'OutputGuard'
        assert (fgraph.outputs[0].owner.inputs[0].owner.op ==
                crossentropy_softmax_argmax_1hot_with_bias)
Example #30
    def __init__(self, height, width, channels, timesteps, hidden_size, output_size, batch_size, num_convpools= 2, num_filters= 5):
        NeuralNet.__init__(self)
        self.batch_size = batch_size

        tensor5 = T.TensorType('float64', [False]*5)
        self.x1 = tensor5('inputs')
        self.y = T.lvector('targets')
        self.layers = []

        n_batch, n_steps, n_channels, width, height = (batch_size, timesteps, channels, width, height)
        n_out_filters = 7
        filter_shape = (3, 3)

        l_in = lasagne.layers.InputLayer(
             (None, n_steps, n_channels, width, height),
             input_var= self.x1)
        l_in_to_hid = lasagne.layers.Conv2DLayer(
             lasagne.layers.InputLayer((None, n_channels, width, height)),
             n_out_filters, filter_shape, pad='same')
        l_hid_to_hid = lasagne.layers.Conv2DLayer(
             lasagne.layers.InputLayer(l_in_to_hid.output_shape),
             n_out_filters, filter_shape, pad='same')
        l_rec = lasagne.layers.CustomRecurrentLayer(
             l_in, l_in_to_hid, l_hid_to_hid)

        l_reshape = lasagne.layers.ReshapeLayer(l_rec, (-1, np.prod(l_rec.output_shape[2:])))

        l_out = lasagne.layers.DenseLayer(
            l_reshape, num_units= output_size,
            nonlinearity=lasagne.nonlinearities.linear)

        self.layers = [l_in, l_rec, l_out]
        self.network = l_out

        self.initiliaze(mode='classify')
Example #31
def optimization_sgd(trainvec,
                     testvec,
                     n_epochs,
                     batch_size,
                     alpha=0.01,
                     beta=0.05):
    i = T.lvector('i')
    j = T.lvector('j')
    x = T.dvector('x')
    num_user = 6040
    num_item = 3952
    factors = 20
    init_mean = 0
    init_stdev = 0.02
    mfobj = MF_Batch(i, j, num_user, num_item, factors, init_mean, init_stdev)
    regcost, error = mfobj.errors(x, beta)
    gp, gq = T.grad(cost=regcost, wrt=[mfobj.P, mfobj.Q])
    updates = [(mfobj.P, T.inc_subtensor(mfobj.P[i, :], -gp[i, :] * alpha)),
               (mfobj.Q, T.inc_subtensor(mfobj.Q[j, :], -gq[j, :] * alpha))]
    train_model = theano.function(
        inputs=[i, j, x],
        #givens=[(mfobj.P[i, :]), mfobj.Q[:, j]],
        outputs=regcost,
        updates=updates)

    test_model = theano.function(
        inputs=[i, j, x],
        #givens=[(mfobj.P[i, :]), mfobj.Q[:, j]],
        outputs=error)

    mean_rating = np.mean(trainvec[:, 2])
    done_looping = False
    epoch = 0
    N = len(trainvec)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        totalErrors = 0
        testErrors = 0
        for k in range(int(math.floor(N / batch_size))):
            batch = np.arange(k * batch_size, min(N - 1, (k + 1) * batch_size))
            idi = trainvec[batch, 0] - 1
            idj = trainvec[batch, 1] - 1
            ratings = trainvec[batch, 2] - mean_rating
            minibatch_cost = train_model(idi, idj, ratings)
            totalErrors += minibatch_cost

        NN = len(testvec)
        batch_size = 1000
        for k in range(int(math.floor(NN / batch_size))):
            batch = np.arange(k * batch_size, min(NN - 1,
                                                  (k + 1) * batch_size))
            p_idx = testvec[batch, 0] - 1
            q_idx = testvec[batch, 1] - 1
            ratings = testvec[batch, 2] - mean_rating
            testErrors += test_model(p_idx, q_idx, ratings)
        print(
            "the training cost at epoch {} is {}, and the testing error is {}".
            format(epoch, np.sqrt(totalErrors / N), np.sqrt(testErrors / NN)))

        # test it on the test dataset
    NN = len(testvec)
    batch_size = 1000
    diff = 0
    for k in range(int(math.floor(NN / batch_size))):
        batch = np.arange(k * batch_size, min(NN - 1, (k + 1) * batch_size))
        p_idx = testvec[batch, 0] - 1
        q_idx = testvec[batch, 1] - 1
        ratings = testvec[batch, 2] - mean_rating
        diff += test_model(p_idx, q_idx, ratings)

    print("Total average test error for {} instances is {}".format(
        NN, np.sqrt(diff / NN)))
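# An illustration (toy shapes) of the T.inc_subtensor update used above:
# only the rows selected by the index vector are modified, which keeps each
# SGD step sparse.
import numpy as np
import theano
import theano.tensor as T

P = theano.shared(np.zeros((4, 2)))
i = T.lvector('i')
step = theano.function([i], [], updates=[(P, T.inc_subtensor(P[i, :], 1.0))])
step(np.array([0, 2], dtype='int64'))
print P.get_value()  # rows 0 and 2 incremented, rows 1 and 3 untouched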
Example #32
def train_model(new_training_job, config, save_path, params, fast_start,
                fuel_server, seed):
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, model = initialize_data_and_model(config, train_phase=True)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    keys = tensor.lmatrix('keys')
    n_identical_keys = tensor.lvector('n_identical_keys')
    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        #TODO
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    if use_keys(c) and use_n_identical_keys(c):
        costs = model.apply(words,
                            words_mask,
                            keys,
                            n_identical_keys,
                            train_phase=True)
    elif use_keys(c):
        costs = model.apply(words, words_mask, keys, train_phase=True)
    else:
        costs = model.apply(words, words_mask, train_phase=True)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    monitored_vars = [length, cost, perplexity]
    if c['proximity_coef']:
        proximity_term, = VariableFilter(name='proximity_term')(cg)
        monitored_vars.append(proximity_term)

    print "inputs of the model:", cg.inputs

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        if c['freeze_pretrained']:
            logger.debug(
                "Exclude pretrained encoder embeddings from the trained parameters"
            )
            to_freeze = 'main'
        elif c['provide_targets']:
            logger.debug(
                "Exclude pretrained targets from the trained parameters")
            to_freeze = 'target'
        trained_parameters = [
            p for p in trained_parameters
            if not p == model.get_def_embeddings_params(to_freeze)
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == model.get_def_embeddings_params(to_freeze)
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ],
                                                          width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream(
        'train',
        batch_size=c['batch_size'],
        max_length=c['max_length'],
        seed=stream_seed,
        remove_keys=not use_keys(c),
        remove_n_identical_keys=not use_n_identical_keys(c))
    print "trainin_stream will contains sources:", training_stream.sources

    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validate = c['mon_freq_valid'] > 0

    if validate:
        valid_stream = data.get_stream(
            'valid',
            batch_size=c['batch_size_valid'],
            max_length=c['max_length'],
            seed=stream_seed,
            remove_keys=not use_keys(c),
            remove_n_identical_keys=not use_n_identical_keys(c))
        validation = DataStreamMonitoring(
            monitored_vars, valid_stream,
            prefix="valid").set_conditions(before_first_epoch=not fast_start,
                                           on_resumption=True,
                                           every_n_batches=c['mon_freq_valid'])
        track_the_best = TrackTheBest(validation.record_name(cost),
                                      choose_best=min).set_conditions(
                                          on_resumption=True,
                                          after_epoch=True,
                                          every_n_batches=c['mon_freq_valid'])

    # don't save them the entire main loop to avoid pickling everything
    if c['fast_checkpoint']:
        cp_path = state_path
        load = (LoadNoUnpickling(cp_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

    else:
        cp_path = main_loop_path
        load = (Load(cp_path, load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

    checkpoint = Checkpoint(cp_path,
                            before_training=not fast_start,
                            every_n_batches=c['save_freq_batches'],
                            after_training=not fast_start,
                            **cp_args)

    if c['checkpoint_every_n_batches'] > 0 or c[
            'checkpoint_every_n_epochs'] > 0:
        intermediate_cp = IntermediateCheckpoint(
            cp_path,
            every_n_epochs=c['checkpoint_every_n_epochs'],
            every_n_batches=c['checkpoint_every_n_batches'],
            after_training=False,
            **cp_args)

    if validate:
        checkpoint = checkpoint.add_condition(
            ['after_batch', 'after_epoch'],
            OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]

    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
    ])
    if validate:
        extensions.extend([validation, track_the_best])

    extensions.append(checkpoint)
    if c['checkpoint_every_n_batches'] > 0 or c[
            'checkpoint_every_n_epochs'] > 0:
        extensions.append(intermediate_cp)
    extensions.extend(
        [Printing(on_resumption=True, every_n_batches=c['mon_freq_train'])])

    if validate and c['n_valid_early'] > 0:
        extensions.append(
            FinishIfNoImprovementAfter(track_the_best.notification_name,
                                       iterations=c['n_valid_early'] *
                                       c['mon_freq_valid'],
                                       every_n_batches=c['mon_freq_valid']))
    extensions.append(FinishAfter(after_n_epochs=c['n_epochs']))

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
Example #33
def _construct_mlp(datasets,
                   learning_rate=0.01,
                   L1_reg=0.00,
                   L2_reg=0.0001,
                   n_epochs=1000,
                   batch_size=20,
                   n_hidden=200):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    Note: Parameters need tuning.

    :type datasets: tuple
    :param datasets: (inputs, targets)

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type batch_size: int
    :param batch_size: number of examples in one batch

    :type n_hidden: int
    :param n_hidden: number of hidden units to be used in class HiddenLayer

     """
    inputs, targets = datasets
    temp_train_set_x = []
    temp_train_set_y = []
    train_set_x = []
    train_set_y = []
    valid_set_x = []
    valid_set_y = []
    test_set_x = []
    test_set_y = []

    # stratified k-fold to split test and temporary train, which contains
    # validation and train
    skf = StratifiedShuffleSplit(targets, 1, 0.2)
    for temp_train_index, test_index in skf:
        # print("TEMP_TRAIN:", temp_train_index, "TEST:", test_index)
        temp_train_set_x.append(inputs[temp_train_index])
        temp_train_set_y.append(targets[temp_train_index])
        test_set_x.append(inputs[test_index])
        test_set_y.append(targets[test_index])

    # convert from list-wrapping array to array
    test_set_x = test_set_x[0]
    test_set_y = test_set_y[0]
    temp_train_set_x = temp_train_set_x[0]
    temp_train_set_y = temp_train_set_y[0]

    # stratified k-fold to split valid and train
    skf = StratifiedShuffleSplit(temp_train_set_y, 1, 0.25)
    for train_index, valid_index in skf:
        # print("TRAIN: ", train_index, ", VALID: ", valid_index)
        train_set_x.append(temp_train_set_x[train_index])
        train_set_y.append(temp_train_set_y[train_index])
        valid_set_x.append(temp_train_set_x[valid_index])
        valid_set_y.append(temp_train_set_y[valid_index])

    # convert from list-wrapping array to array
    train_set_x = train_set_x[0]
    train_set_y = train_set_y[0]
    valid_set_x = valid_set_x[0]
    valid_set_y = valid_set_y[0]

    # check shape
    # print("train_set_x shape: " + str(train_set_x.shape))
    # print("train_set_y shape: " + str(train_set_y.shape))
    # print("valid_set_x shape: " + str(valid_set_x.shape))
    # print("valid_set_y shape: " + str(valid_set_y.shape))
    # print("test_set_x shape: " + str(test_set_x.shape))
    # print("test_set_y shape: " + str(test_set_y.shape))

    # convert to theano.shared variable
    train_set_x = theano.shared(value=train_set_x, name='train_set_x')
    train_set_y = theano.shared(value=train_set_y, name='train_set_y')
    valid_set_x = theano.shared(value=valid_set_x, name='valid_set_x')
    valid_set_y = theano.shared(value=valid_set_y, name='valid_set_y')
    test_set_x = theano.shared(value=test_set_x, name='test_set_x')
    test_set_y = theano.shared(value=test_set_y, name='test_set_y')

    # compute number of minibatches for training, validation and testing
    n_train_batches = int(train_set_x.get_value().shape[0] / batch_size)
    n_valid_batches = int(valid_set_x.get_value().shape[0] / batch_size)
    n_test_batches = int(test_set_x.get_value().shape[0] / batch_size)

    # check batch
    # print("n_train_batches:" + str(n_train_batches))
    # print("n_valid_batches:" + str(n_valid_batches))
    # print("n_test_batches:" + str(n_test_batches))

    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.lvector('y')  # the labels are presented as 1D vector of [int] labels

    # seed the random state from the current time
    # noinspection PyUnresolvedReferences
    rng = numpy.random.RandomState(int(time.time()))

    # construct the MLP class
    classifier = MLP(rng=rng,
                     input_=x,
                     n_in=_std_height * _std_width,
                     n_hidden=n_hidden,
                     n_out=len(_captcha_provider.chars))

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        },
        mode='FAST_RUN')

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        },
        mode='FAST_RUN')

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        mode='FAST_RUN')

    print('... training')

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant

    # note: the comparison must be done in plain Python, not with T.lt, which
    # would build a (always-truthy) symbolic expression instead of a bool
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many minibatches before checking the network
    # on the validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for minibatch_index in range(n_train_batches):
            # noinspection PyUnusedLocal
            minibatch_avg_cost = train_model(minibatch_index)
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch {0}, minibatch {1}/{2}, validation error {3}'.
                      format(epoch, minibatch_index + 1, n_train_batches,
                             this_validation_loss * 100.))
                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print(
                        '    epoch {0}, minibatch {1}/{2}, test error of best '
                        'model {3}'.format(epoch, minibatch_index + 1,
                                           n_train_batches, test_score * 100))

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.time()
    print('Optimization complete. Best validation score of {0} obtained at '
          'iteration {1}, with test performance {2}'.format(
              best_validation_loss * 100, best_iter + 1, test_score * 100))
    print('Time used for training the mlp is', end_time - start_time)
    return classifier
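
# A minimal sketch (plain Python, hypothetical numbers) of the patience-based
# early stopping used in _construct_mlp above: patience grows multiplicatively
# whenever the validation loss improves by more than improvement_threshold.
def early_stopping_demo(val_losses, patience=10, patience_increase=2,
                        improvement_threshold=0.995):
    """Return the iteration at which training would stop."""
    best = float('inf')
    for it, loss in enumerate(val_losses):
        if loss < best:
            if loss < best * improvement_threshold:
                patience = max(patience, it * patience_increase)
            best = loss
        if patience <= it:
            return it
    return len(val_losses)

# a loss curve that plateaus: training stops once no significant improvement
# has been seen for `patience` iterations
print(early_stopping_demo([1.0, 0.8, 0.7, 0.69] + [0.69] * 20))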
Exemple #34
import numpy
import theano.tensor as T
from theano import shared, function

x = T.matrix()
y = T.lvector()
w = shared(numpy.random.randn(100))
b = shared(numpy.zeros(()))
print "Initial model:"
print w.get_value(), b.get_value()
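
# A hedged continuation (not part of the original snippet), following the
# classic Theano logistic-regression tutorial this example opens: compile a
# sigmoid prediction from the variables declared above. The 100 input
# features match the shape of w.
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))  # probability that each row is class 1
prediction = p_1 > 0.5                   # hard 0/1 prediction
predict = function([x], prediction)
print predict(numpy.random.randn(4, 100))  # predictions for 4 random examples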
Exemple #35
    def __init__(self, pa):

        self.input_height = pa.network_input_height
        self.input_width = pa.network_input_width
        self.output_height = pa.network_output_dim

        self.num_frames = pa.num_frames

        self.update_counter = 0

        states = T.tensor4(
            'states'
        )  # states has shape [batch_size, channel_size, input_height, input_width]
        actions = T.lvector(
            'actions'
        )  # actions has shape [batch_size]: the action the network chose
        values = T.vector(
            'values'
        )  # values has shape [batch_size]: the return each action stands for

        print('network_input_height=', pa.network_input_height)
        print('network_input_width=', pa.network_input_width)
        print('network_output_dim=', pa.network_output_dim)

        # image represent
        self.l_out = build_pg_network(self.input_height, self.input_width,
                                      self.output_height)

        self.lr_rate = pa.lr_rate
        self.rms_rho = pa.rms_rho
        self.rms_eps = pa.rms_eps

        params = lasagne.layers.helper.get_all_params(self.l_out)

        print('params=', params, 'counts',
              lasagne.layers.count_params(self.l_out))

        self._get_param = theano.function([], params)

        # =====================================
        #    Training
        #======================================

        prob_act = lasagne.layers.get_output(
            self.l_out,
            states)  # shape [batch_size, output_height]: per-action probabilities

        self._get_act_prob = theano.function([states],
                                             prob_act,
                                             allow_input_downcast=True)

        #=======================================
        # policy gradients
        #=======================================

        N = states.shape[0]

        loss = T.log(prob_act[T.arange(N), actions]).dot(values) / N

        grads = T.grad(loss, params)

        updates = rmsprop_updates(grads, params, self.lr_rate, self.rms_rho,
                                  self.rms_eps)

        self._train_fn = theano.function([states, actions, values],
                                         loss,
                                         updates=updates,
                                         allow_input_downcast=True)

        self._get_loss = theano.function([states, actions, values],
                                         loss,
                                         allow_input_downcast=True)
        self._get_grad = theano.function([states, actions, values], grads)

        # -------supervised learning --------------------

        su_target = T.ivector('su_target')
        su_loss = lasagne.objectives.categorical_crossentropy(
            prob_act, su_target)
        su_loss = su_loss.mean()

        l2_penalty = lasagne.regularization.regularize_network_params(
            self.l_out, lasagne.regularization.l2)
        # l1_penalty = lasagne.regularization.regularize_network_params(self.l_out, lasagne.regularization.l1)

        su_loss += 1e-3 * l2_penalty
        print('lr_rate=', self.lr_rate)

        su_updates = lasagne.updates.rmsprop(su_loss, params, self.lr_rate,
                                             self.rms_rho, self.rms_eps)
        self._su_train_fn = theano.function([states, su_target],
                                            [su_loss, prob_act],
                                            updates=su_updates)

        self._su_loss = theano.function([states, su_target],
                                        [su_loss, prob_act])

        self._debug = theano.function([states], [states.flatten(2)])
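
# A numpy sketch (hypothetical shapes and values) of the policy-gradient
# objective built above: the log-probabilities of the chosen actions, weighted
# by their returns and averaged over the batch.
import numpy as np

np.random.seed(0)
N, n_actions = 4, 3                                          # batch size, action count
prob_act = np.random.dirichlet(np.ones(n_actions), size=N)   # rows sum to 1
actions = np.array([0, 2, 1, 0])                             # actions taken
values = np.array([1.0, -0.5, 2.0, 0.3])                     # returns for each action

loss = np.log(prob_act[np.arange(N), actions]).dot(values) / N
print('policy-gradient objective:', loss)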
Exemple #36
    def __theano_build__(self):
        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        x_a = T.ivector('x_a')
        x_b = T.ivector('x_b')
        y = T.lvector('y')

        def forward_step(x_t, s_t_prev):
            # Word embedding layer
            x_e = E[:, x_t]
            # GRU layer 1
            z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t_prev))
            r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t_prev))
            c_t = T.tanh(U[2].dot(x_e) + W[2].dot(s_t_prev * r_t))
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
            # directly return the hidden state as the intermediate output
            return [s_t]

        # sentence a vector (states)
        a_s, updates = theano.scan(forward_step,
                                   sequences=x_a,
                                   truncate_gradient=self.bptt_truncate,
                                   outputs_info=T.zeros(self.hidden_dim))

        # sentence b vector (states)
        b_s, updates = theano.scan(forward_step,
                                   sequences=x_b,
                                   truncate_gradient=self.bptt_truncate,
                                   outputs_info=T.zeros(self.hidden_dim))

        # semantic similarity
        # s_sim = manhattan_distance(a_s[-1],b_s[-1])

        # for classification using simple strategy
        sena = a_s[-1]
        senb = b_s[-1]

        combined_s = T.concatenate([sena, senb], axis=0)

        # softmax class
        o = T.nnet.softmax(V.dot(combined_s) + c)[0]

        # in case the o contains 0 which cause inf
        eps = np.asarray([1.0e-10] * self.label_dim,
                         dtype=theano.config.floatX)
        o = o + eps
        om = o.reshape((1, o.shape[0]))
        prediction = T.argmax(om, axis=1)
        o_error = T.nnet.categorical_crossentropy(om, y)

        # cost
        cost = T.sum(o_error)

        # updates
        updates = sgd_updates_adadelta(norm=0, params=self.params, cost=cost)

        # monitor parameter
        mV = V * T.ones_like(V)
        mc = c * T.ones_like(c)
        mU = U * T.ones_like(U)
        mW = W * T.ones_like(W)

        gV = T.grad(cost, V)
        gc = T.grad(cost, c)
        gU = T.grad(cost, U)
        gW = T.grad(cost, W)

        mgV = gV * T.ones_like(gV)
        mgc = gc * T.ones_like(gc)
        mgU = gU * T.ones_like(gU)
        mgW = gW * T.ones_like(gW)

        # Assign functions
        self.monitor = theano.function([x_a, x_b],
                                       [sena, senb, mV, mc, mU, mW])
        self.monitor_grad = theano.function([x_a, x_b, y],
                                            [mgV, mgc, mgU, mgW])
        self.predict = theano.function([x_a, x_b], om)
        self.predict_class = theano.function([x_a, x_b], prediction)
        self.ce_error = theano.function([x_a, x_b, y], cost)
        # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        # find the nan
        self.sgd_step = theano.function(
            [x_a, x_b, y], [],
            updates=updates
            # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        )
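
# A numpy sketch of one GRU step, mirroring forward_step above with
# hypothetical embedding/hidden sizes; Theano's hard_sigmoid is
# clip(0.2 * v + 0.5, 0, 1).
import numpy as np

def hard_sigmoid(v):
    return np.clip(0.2 * v + 0.5, 0.0, 1.0)

hdim, edim = 4, 5
rng = np.random.RandomState(0)
U = rng.randn(3, hdim, edim)   # input-to-hidden weights for gates z, r, c
W = rng.randn(3, hdim, hdim)   # hidden-to-hidden weights for gates z, r, c

def gru_step(x_e, s_prev):
    z = hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_prev))  # update gate
    r = hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_prev))  # reset gate
    c = np.tanh(U[2].dot(x_e) + W[2].dot(s_prev * r))   # candidate state
    return (1 - z) * c + z * s_prev                     # interpolate old/new state

s = np.zeros(hdim)
for _ in range(3):             # run a few steps on random embeddings
    s = gru_step(rng.randn(edim), s)
print(s)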
Exemple #37
def sgd_optimization_mnist(learning_rate=2e-2, loss_weight=1.8e+8,
                           curriculum_rate=0.1, n_curriculum_epochs=300,
                           epoch_iters=20, converge=1e-4, minibatch_size=50,
                           batch_size=4, k=4, func='concavefeature',
                           func_parameter=0.5, deep=True):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_curriculum_epochs: int
    :param n_curriculum_epochs: maximal number of curriculum epochs to run
                                the optimizer

    :type k: int
    :param k: number of clusters selected in each curriculum step

    """
    print('loading data...')
    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    labels_, cluster_centers_, center_nn = datasets[3]
    num_cluster = cluster_centers_.shape[0]
    isize = int(numpy.sqrt(train_set_x.get_value(borrow=True).shape[1]))


    # compute number of minibatches for training, validation and testing
    n_train = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches = n_train // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('building the model...')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    cindex = T.lvector()  # index to a [mini]batch


    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    if deep is False:

        # construct the logistic regression class
        # Each MNIST image has size 28*28
        classifier = LogisticRegression(input=x, n_in=isize**2, n_out=10)

        # the cost we minimize during training is the negative log likelihood of
        # the model in symbolic format
        cost = classifier.negative_log_likelihood(y)
        cost_vec = classifier.negative_log_likelihood_vec(y)

        # compute the gradient of cost with respect to theta = (W,b)
        g_W = T.grad(cost=cost, wrt=classifier.W)
        g_b = T.grad(cost=cost, wrt=classifier.b)

        # start-snippet-3
        # specify how to update the parameters of the model as a list of
        # (variable, update expression) pairs.
        updates = [(classifier.W, classifier.W - learning_rate * g_W),
                   (classifier.b, classifier.b - learning_rate * g_b)]
    else:

        nfea = 500
        nkerns=[20, 50]
        n_channels = 1
        rng = numpy.random.RandomState(23455)

        layer0_input = x.reshape((-1, 1, isize, isize))

        # Construct the first convolutional pooling layer:
        # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
        # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
        # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
        layer0 = LeNetConvPoolLayer(
            rng,
            input=layer0_input,
            image_shape=(None, 1, isize, isize),
            filter_shape=(nkerns[0], 1, 5, 5),
            poolsize=(2, 2)
        )

        isize1 = int((isize - 5 + 1)/2)

        # Construct the second convolutional pooling layer
        # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
        # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
        # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
        layer1 = LeNetConvPoolLayer(
            rng,
            input=layer0.output,
            image_shape=(None, nkerns[0], isize1, isize1),
            filter_shape=(nkerns[1], nkerns[0], 5, 5),
            poolsize=(2, 2)
        )

        # the HiddenLayer being fully-connected, it operates on 2D matrices of
        # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
        # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
        # or (500, 50 * 4 * 4) = (500, 800) with the default values.
        layer2_input = layer1.output.flatten(2)

        isize2 = int((isize1 - 5 + 1)/2)

        # construct a fully-connected sigmoidal layer
        layer2 = HiddenLayer(
            rng,
            input=layer2_input,
            n_in=nkerns[1] * isize2 * isize2,
            n_out=nfea,
            activation=T.tanh
        )

        # classify the values of the fully-connected sigmoidal layer
        classifier = LogisticRegression(input=layer2.output, n_in=nfea, n_out=10)

        # the cost we minimize during training is the NLL of the model
        cost = classifier.negative_log_likelihood(y)
        cost_vec = classifier.negative_log_likelihood_vec(y)

        # create a list of all model parameters to be fit by gradient descent
        params = classifier.params + layer2.params + layer1.params + layer0.params

        # create a list of gradients for all model parameters
        grads = T.grad(cost, params)

        # train_model is a function that updates the model parameters by
        # SGD Since this model has many parameters, it would be tedious to
        # manually create an update rule for each model parameter. We thus
        # create the updates list by automatically looping over all
        # (params[i], grads[i]) pairs.
        updates = [
            (param_i, param_i - learning_rate * grad_i)
            for param_i, grad_i in zip(params, grads)
        ]

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[cindex],
        outputs=classifier.errors(y),
        updates=updates,
        givens={
            x: train_set_x[cindex],
            y: train_set_y[cindex]
        }
    )

    loss_model = theano.function(
        inputs=[cindex],
        outputs=cost_vec,
        givens={
            x: train_set_x[cindex],
            y: train_set_y[cindex]
        }
    )
    error_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('training the model...')
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                  # considered significant
    #validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    #initialize
    minGain, sinGain, optSubmodular = initSubmodularFunc(cluster_centers_, k)
    real_iter = 0
    validation_frequency = 100
    old_epoch_all_loss = float('inf')
    loss_weight0 = loss_weight
    passed_index = numpy.array([])
    passed_index_epoch = numpy.array([]) 
    passes = 0
    output_seq = ()
    for curriculum_epoch in range(n_curriculum_epochs):

        print('Epoch', curriculum_epoch)
        old_all_loss = 0
        for iters in range(epoch_iters):

            if len(passed_index) <= n_train*0.45:
                # compute loss
                loss_vec = loss_model(center_nn) * loss_weight / len(center_nn)
                all_loss = sum(loss_vec)
                #loss_vec_center = numpy.asarray([sum(loss_vec[labels_ == i]) for i in range(num_cluster)])
                loss_vec_center = loss_vec
                topkLoss = sum(numpy.partition(loss_vec_center, -k)[-k:])
                optObj = optSubmodular + topkLoss
                print(optSubmodular, topkLoss)

                # update A (topkIndex)
                left_index = pruneGroundSet(minGain, sinGain, loss_vec_center, k)
                topkIndex = modularLowerBound(cluster_centers_[left_index,:], k, func, func_parameter, loss_vec_center[left_index], optObj)
                topkIndex = left_index[topkIndex]

                # update classifier (train_model)           
                train_index = numpy.array([])
                for i in range(len(topkIndex)):
                    train_index = numpy.append(train_index, numpy.where(labels_ == topkIndex[i])[0])
                train_index = numpy.random.permutation(train_index.astype(int))
                print('number of training samples =', len(train_index))
                passes += len(train_index)
                passed_index = numpy.unique(numpy.append(passed_index, train_index))
                passed_index_epoch = numpy.unique(numpy.append(passed_index_epoch, train_index))

            else:

                train_index = numpy.random.permutation(numpy.setxor1d(numpy.arange(n_train), passed_index_epoch).astype(int))
                #train_index = numpy.random.permutation(numpy.arange(n_train).astype(int))
                passes += len(train_index)
                passed_index_epoch = numpy.array([])
                #passed_index = numpy.arange(n_train)
                
            # training by mini-batch sgd
            start_index = 0
            train_loss = numpy.array([])
            while start_index < len(train_index):
                end_index = min([start_index + minibatch_size, len(train_index)])
                batch_index = train_index[start_index : end_index]
                start_index = end_index
                train_loss = numpy.append(train_loss, train_model(batch_index))
            this_train_loss = numpy.mean(train_loss)

            # stop the current epoch if converged
            diff_loss = old_all_loss - all_loss
            if 0 <= diff_loss <= all_loss * converge:
                break
            # show validation and test error periodically
            else:
                old_all_loss = all_loss
                if (iters + real_iter + 1) % validation_frequency == 0:
                    # compute zero-one loss on validation set
                    validation_losses = [validate_model(i)
                                         for i in range(n_valid_batches)]
                    this_validation_loss = numpy.mean(validation_losses)
                    test_losses = [test_model(i)
                                    for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    train_score = [error_model(i) 
                                    for i in range(n_train_batches)]
                    this_train_score = numpy.mean(train_score)

                    print(
                        'minibatch %i, %i trainings, %i passes, trainErr %f %%, validErr %f %%, testErr %f %%' %
                        (
                            iters + real_iter + 1,
                            len(passed_index),
                            passes,
                            this_train_score * 100.,
                            this_validation_loss * 100.,
                            test_score * 100.
                        )
                    )

                    output_seq = output_seq + (numpy.array(
                        [len(passed_index), passes, this_train_score * 100.,
                         this_validation_loss * 100., test_score * 100.]),)

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        #improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                            patience = max(patience, (iters + real_iter + 1) * patience_increase)

                        best_validation_loss = this_validation_loss

                        # save the best model
                        with open('best_model.pkl', 'wb') as f:
                            pickle.dump(classifier, f)

        #print('Up to now %i training samples are used'%(len(passed_index)))
        # record total number of iterations
        real_iter += iters
        # adjust learning rate
        if all_loss > 1.001 * old_epoch_all_loss:
            print('no improvement: reduce learning rate!')
            learning_rate *= 0.96
        old_epoch_all_loss = all_loss
        # increase curriculum rate
        loss_weight *= curriculum_rate + 1

        if patience <= iters + real_iter + 1:
            break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%, '
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    #print('The code run for %d epochs, with %f epochs/sec' % (
        #epoch, 1. * epoch / (end_time - start_time)))
    #print(('The code for file ' +
           #os.path.split(__file__)[1] +
           #' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr)
    output_seq = numpy.vstack(output_seq)
    return output_seq
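
# A tiny numpy sketch of the top-k loss term used in the curriculum loop
# above: numpy.partition places the k largest per-cluster losses at the end
# of the array without a full sort. Values here are hypothetical.
import numpy as np

loss_vec = np.array([0.2, 1.5, 0.1, 0.9, 2.3, 0.4])
k = 3
topk = np.partition(loss_vec, -k)[-k:]  # the k largest losses, unordered
print(topk, topk.sum())                 # sum(...) is the topkLoss term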
Exemple #38
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias

    We check that we loop when their is too much threads

    """

    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099

    x = T.fmatrix('x')
    y = T.lvector('y')

    b = T.fvector('b')
    #W = T.fmatrix('W')

    # we precompute the dot product with a big shape beforehand so that the
    # test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with the
    # error "the launch timed out and was terminated" on GPU cards that are
    # not powerful enough. We need the big shape to check for corner cases.
    dot_result = T.fmatrix('dot_result')

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
                       dtype=numpy.float32)
    yy = numpy.ones((batch_size, ), dtype='int32')
    b_values = numpy.zeros((n_out, ), dtype='float32')
    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')

    dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
    del W_values
    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)
    classify = theano.function(inputs=[y, b, dot_result],
                               outputs=[loss, y_pred, dW],
                               mode=mode_without_gpu)
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)
    #theano.printing.debugprint(classify)
    #theano.printing.debugprint(classify_gpu)

    assert any([
        isinstance(node.op, T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
        for node in classify.maker.fgraph.toposort()
    ])
    assert any([
        isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
        for node in classify_gpu.maker.fgraph.toposort()
    ])

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    assert numpy.allclose(out[0], gout[0])
    assert numpy.allclose(out[2], gout[2],
                          atol=3e-6), numpy.absolute(gout[2] - out[2]).max()
    assert numpy.allclose(out[1],
                          gout[1]), [(id, out[1][id], gout[1][id], val)
                                     for id, val in enumerate(out[1] - gout[1])
                                     if val != 0]
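
# A plain-numpy reference (hypothetical shapes) of what the fused
# CrossentropySoftmaxArgmax1HotWithBias op computes in the test above:
# softmax over dot_result + b, the mean negative log-likelihood, and argmax.
import numpy as np

def softmax_xent_argmax(dot_result, b, y):
    z = dot_result + b
    z = z - z.max(axis=1, keepdims=True)     # shift for numerical stability
    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    loss = -np.mean(np.log(p[np.arange(len(y)), y]))
    return loss, p.argmax(axis=1)

rng = np.random.RandomState(0)
loss, pred = softmax_xent_argmax(rng.randn(4, 5).astype('float32'),
                                 np.zeros(5, dtype='float32'),
                                 np.array([0, 1, 2, 3]))
print(loss, pred)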
Exemple #39
def test_RNN(nh=100, nl=3, n_in=32):

    WS_file_filter_regex = r'WS_P[0-9]*_S[0-9].mat'
    WS_file_filter_regex_P1 = r'WS_P1_S[0-9].mat'
    AllLifts_P1 = r'P1_AllLifts.mat'
    # nearest element in list min(myList, key=lambda x:abs(x-myNumber))
    # Wait until implemented in brain.data.util

    # the eeg and eeg_t data are needed as the training set;
    # for now the train and test sets contain empty lists
    train_set, valid_set, test_set = load_data(participant=1)

    train_set_x, train_set_y = train_set
    valid_set_x, valid_set_y = valid_set
    test_set_x, test_set_y = test_set

    data = getTables(WS_file_filter_regex_P1)
    datasetSize = len(data)

    print(data)
    print(datasetSize)

    # HandStart(33), LiftOff(18)
    # returns list targets
    event_data = getRaw(AllLifts_P1)[0]['P']['AllLifts']

    handStart = event_data[:, 33]
    liftOff = event_data[:, 18]

    # get nearest index in eeg data for given event and get length of longest/shortest eeg window
    handStartIndex = []
    liftOffIndex = []
    maxLengthEEG = 0
    minLengthEEG = numpy.inf
    for i in range(len(data)):
        handStartIndex.append(
            numpy.where(data[i]['eeg_t'] == min(
                data[i]['eeg_t'], key=lambda x: abs(handStart[i] - x)))[0][0])
        liftOffIndex.append(
            numpy.where(data[i]['eeg_t'] == min(
                data[i]['eeg_t'], key=lambda x: abs(liftOff[i] - x)))[0][0])
        if len(data[i]['eeg']) > maxLengthEEG:
            maxLengthEEG = len(data[i]['eeg'])
        if len(data[i]['eeg']) < minLengthEEG:
            minLengthEEG = len(data[i]['eeg'])

    sequenceLength = maxLengthEEG

    # Construct target vectors (0 = 'none' event)
    targets = numpy.zeros((datasetSize, sequenceLength), dtype='int64')
    for i in range(datasetSize):
        targets[i][handStartIndex[i] - 100:handStartIndex[i] + 100] = 1
        targets[i][liftOffIndex[i] - 100:liftOffIndex[i] + 100] = 2
        #print(str(handStartIndex[i]) + " " + str(liftOffIndex[i]))

    # Construct data array with 0 padding at the end for shorter sequences
    eeg_data = numpy.zeros((datasetSize, sequenceLength, 32))
    for i in range(datasetSize):
        eeg_data[i, 0:data[i]['eeg'].shape[0]] = data[i]['eeg']
        #eeg_data[i, :] = data[i]['eeg'][0: sequenceLength]

    tmpl = [(n_in, nh), (nh, nh), (nh, nl), nh, nl, nh]
    wrt, (Wx, Wh, W, bh, b, h0) = climin.util.empty_with_views(tmpl)
    params = [Wx, Wh, W, bh, b, h0]

    x = T.dmatrix('x')
    y = T.lvector('y')

    classifier = RNN(x, y, nh, nl, n_in)

    # copy preinitialized weight matrices
    Wx[...] = classifier.Wx.get_value(borrow=True)[...]
    Wh[...] = classifier.Wh.get_value(borrow=True)[...]
    W[...] = classifier.W.get_value(borrow=True)[...]

    def set_pars():
        for p, p_class in zip(params, classifier.params):
            p_class.set_value(p, borrow=True)

    def loss(parameters, inpt, targets):
        set_pars()
        return classifier.cost.eval({x: inpt[0], y: targets[0]})

    def d_loss_wrt_pars(parameters, inpt, targets):
        set_pars()
        grads = []
        print(loss(parameters, inpt, targets))
        for d in classifier.gradients:
            grads.append(d.eval({x: inpt[0], y: targets[0]}))
        return numpy.concatenate([
            grads[0].flatten(), grads[1].flatten(), grads[2].flatten(),
            grads[3], grads[4], grads[5]
        ])

    args = ((i, {}) for i in climin.util.iter_minibatches(
        [eeg_data[0:1], targets[0:1]], 1, [0, 0]))

    opt = climin.adadelta.Adadelta(wrt,
                                   d_loss_wrt_pars,
                                   step_rate=1,
                                   decay=0.9,
                                   momentum=0,
                                   offset=0.0001,
                                   args=args)

    def plot():
        figure, (axes) = plt.subplots(4, 1)

        x_axis = numpy.arange(sequenceLength)

        result = classifier.result_sequence.eval({x: eeg_data[0]})

        axes[0].set_title("labels")
        axes[0].plot(x_axis, targets[0], label="targets")
        axes[1].set_title("none_prob")
        axes[1].plot(x_axis, result[:, 0], label="none")
        axes[2].set_title("handStart_prob")
        axes[2].plot(x_axis, result[:, 1], label="handStart")
        axes[3].set_title("liftOff_prob")
        axes[3].plot(x_axis, result[:, 2], label="liftOff")

        figure.subplots_adjust(hspace=0.5)

        figure.savefig('test.png')

        plt.close(figure)

    for info in opt:
        iteration = info['n_iter']
        if iteration % 10 == 0:
            plot()
        if iteration > 500:
            break

    plot()
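
# A small numpy sketch (hypothetical sizes) of the padding and window-labeling
# scheme used above: shorter EEG sequences are zero-padded at the end, and a
# window of frames around each event index gets a nonzero class label.
import numpy as np

seqs = [np.ones((5, 2)), np.ones((3, 2))]  # two trials of unequal length
max_len = max(len(s) for s in seqs)
padded = np.zeros((len(seqs), max_len, 2))
for i, s in enumerate(seqs):
    padded[i, :len(s)] = s                 # zero padding at the end

targets = np.zeros((len(seqs), max_len), dtype='int64')
targets[0, 2:4] = 1                        # e.g. a HandStart window
print(padded.shape, targets[0])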
Exemple #40
    def __init__(self, config):
        self.config = config
        batch_size = config['batch_size']
        flag_datalayer = config['use_data_layer']

        # ##################### BUILD NETWORK ##########################
        # allocate symbolic variables for the data
        # 'rand' is a random array used for random cropping/mirroring of data

        # [2015-11-11] Jinlong note: for the CPU version, a bug exists for
        # group=2, since memory is not contiguous; will fix it depending on
        # requirements

        x = T.ftensor4('x')
        y = T.lvector('y')
        rand = T.fvector('rand')

        print '... building the model'
        self.layers = []
        params = []
        weight_types = []

        if flag_datalayer:
            data_layer = DataLayer(input=x,
                                   image_shape=(batch_size, 3, 256, 256),
                                   cropsize=227,
                                   rand=rand,
                                   mirror=True,
                                   flag_rand=config['rand_crop'])
            layer1_input = data_layer.output
        else:
            layer1_input = x

        convpool_layer1 = ConvPoolLayer(
            input=layer1_input,
            image_shape=(batch_size, 3, 227, 227),
            filter_shape=(96, 3, 11, 11),
            convstride=4,
            padsize=0,
            group=1,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            lrn=True,
        )
        self.layers.append(convpool_layer1)

        params += convpool_layer1.params
        weight_types += convpool_layer1.weight_type

        convpool_layer2 = ConvPoolLayer(
            input=convpool_layer1.output,
            image_shape=(batch_size, 96, 27, 27),
            filter_shape=(256, 96, 5, 5),
            convstride=1,
            padsize=2,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.1,
            lrn=True,
        )
        self.layers.append(convpool_layer2)
        params += convpool_layer2.params
        weight_types += convpool_layer2.weight_type

        convpool_layer3 = ConvPoolLayer(
            input=convpool_layer2.output,
            image_shape=(batch_size, 256, 13, 13),
            filter_shape=(384, 256, 3, 3),
            convstride=1,
            padsize=1,
            group=1,
            poolsize=1,
            poolstride=0,
            bias_init=0.0,
            lrn=False,
        )
        self.layers.append(convpool_layer3)
        params += convpool_layer3.params
        weight_types += convpool_layer3.weight_type

        convpool_layer4 = ConvPoolLayer(
            input=convpool_layer3.output,
            image_shape=(batch_size, 384, 13, 13),
            filter_shape=(384, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=1,
            poolstride=0,
            bias_init=0.1,
            lrn=False,
        )
        self.layers.append(convpool_layer4)
        params += convpool_layer4.params
        weight_types += convpool_layer4.weight_type

        convpool_layer5 = ConvPoolLayer(
            input=convpool_layer4.output,
            image_shape=(batch_size, 384, 13, 13),
            filter_shape=(256, 384, 3, 3),
            convstride=1,
            padsize=1,
            group=2,
            poolsize=3,
            poolstride=2,
            bias_init=0.1,
            lrn=False,
        )
        self.layers.append(convpool_layer5)
        params += convpool_layer5.params
        weight_types += convpool_layer5.weight_type

        fc_layer6_input = T.flatten(convpool_layer5.output, 2)
        fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096)
        self.layers.append(fc_layer6)
        params += fc_layer6.params
        weight_types += fc_layer6.weight_type

        dropout_layer6 = DropoutLayer(fc_layer6.output, n_in=4096, n_out=4096)

        fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096)
        self.layers.append(fc_layer7)
        params += fc_layer7.params
        weight_types += fc_layer7.weight_type

        dropout_layer7 = DropoutLayer(fc_layer7.output, n_in=4096, n_out=4096)

        softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output,
                                      n_in=4096,
                                      n_out=1000)
        self.layers.append(softmax_layer8)
        params += softmax_layer8.params
        weight_types += softmax_layer8.weight_type

        # #################### NETWORK BUILT #######################
        self.cost = softmax_layer8.negative_log_likelihood(y)
        self.errors = softmax_layer8.errors(y)
        self.errors_top_5 = softmax_layer8.errors_top_x(y, 5)
        self.params = params
        self.x = x
        self.y = y
        self.rand = rand
        self.weight_types = weight_types
        self.batch_size = batch_size
    def build_finetune_functions(self, datasets, batch_size, learning_rate,
                                 L1_param, L2_param, mom):
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]
        index = T.lvector('index')

        gparams = T.grad(
            self.dropout_negative_log_likelihood + L1_param * self.L1 +
            L2_param * self.L2, self.params)

        self.gparams_mom = []
        for param in self.params:
            gparam_mom = theano.shared(
                numpy.zeros(param.get_value(borrow=True).shape,
                            dtype=theano.config.floatX))
            self.gparams_mom.append(gparam_mom)

        updates1 = OrderedDict()
        for param, gparam, gparam_mom in zip(self.params, gparams,
                                             self.gparams_mom):
            updates1[gparam_mom] = mom * gparam_mom - learning_rate * gparam
            updates1[param] = param + updates1[gparam_mom]

        train_model = theano.function(
            inputs=[index],
            outputs=self.dropout_negative_log_likelihood,
            updates=updates1,
            givens={
                self.x: train_set_x[index],
                self.y: train_set_y[index]
            })
        # error check
        train_error_fn = theano.function(inputs=[index],
                                         outputs=self.error,
                                         givens={
                                             self.x: train_set_x[index],
                                             self.y: train_set_y[index]
                                         })
        valid_error_fn = theano.function(inputs=[index],
                                         outputs=self.error,
                                         givens={
                                             self.x: valid_set_x[index],
                                             self.y: valid_set_y[index]
                                         })
        # performance check : error rate, sensitivity, specificity, auc
        test_error_fn = theano.function(inputs=[index],
                                        outputs=self.error,
                                        givens={
                                            self.x: test_set_x[index],
                                            self.y: test_set_y[index]
                                        })
        test_sensitivity_fn = theano.function(inputs=[index],
                                              outputs=self.sensitivity,
                                              givens={
                                                  self.x: test_set_x[index],
                                                  self.y: test_set_y[index]
                                              })
        test_specificity_fn = theano.function(inputs=[index],
                                              outputs=self.specificity,
                                              givens={
                                                  self.x: test_set_x[index],
                                                  self.y: test_set_y[index]
                                              })
        test_class1_pred_fn = theano.function(inputs=[index],
                                              outputs=self.class1_pred,
                                              givens={
                                                  self.x: test_set_x[index],
                                                  self.y: test_set_y[index]
                                              })
        test_y_fn = theano.function(inputs=[index],
                                    outputs=self.y,
                                    givens={self.y: test_set_y[index]})

        n_train_exp = train_set_x.get_value(borrow=True).shape[0]
        n_valid_exp = valid_set_x.get_value(borrow=True).shape[0]
        n_test_exp = test_set_x.get_value(borrow=True).shape[0]

        def getSums(fn, n_exp, batch_size):
            val_sum = 0.
            tot_len = 0.
            n_batches = n_exp // batch_size
            resid = n_exp - (n_batches * batch_size)
            IDX = range(n_exp)
            for i in range(n_batches):
                sum_val, len_val = fn(IDX[i * batch_size:(i + 1) * batch_size])
                val_sum += sum_val
                tot_len += len_val
            if resid != 0:
                sum_val, len_val = fn(
                    IDX[n_batches * batch_size:(n_batches * batch_size) +
                        resid])
                val_sum += sum_val
                tot_len += len_val
            return val_sum / tot_len

        def getVals(fn, n_exp, batch_size):
            vals = list()
            n_batches = n_exp // batch_size
            resid = n_exp - (n_batches * batch_size)
            IDX = range(n_exp)
            for i in range(n_batches):
                vals += fn(IDX[i * batch_size:(i + 1) * batch_size]).tolist()
            if resid != 0:
                vals += fn(
                    IDX[n_batches * batch_size:(n_batches * batch_size) +
                        resid]).tolist()
            return vals

        def errorcheck():
            train_error = getSums(train_error_fn, n_train_exp, batch_size)
            valid_error = getSums(valid_error_fn, n_valid_exp, batch_size)
            return train_error, valid_error

        def performance():
            test_error = getSums(test_error_fn, n_test_exp, batch_size)
            test_sensitivity = getSums(test_sensitivity_fn, n_test_exp,
                                       batch_size)
            test_specificity = getSums(test_specificity_fn, n_test_exp,
                                       batch_size)
            test_y = getVals(test_y_fn, n_test_exp, batch_size)
            test_class1_pred = getVals(test_class1_pred_fn, n_test_exp,
                                       batch_size)
            test_roc = ROCData(zip(test_y, test_class1_pred))
            return test_error, test_sensitivity, test_specificity, test_roc

        return train_model, errorcheck, performance
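
# A minimal sketch of the residual-batch averaging pattern used by getSums
# above: iterate in fixed-size batches, let the last batch be short, and
# accumulate sums and lengths separately so every example is counted once.
import numpy as np

def batched_mean(values, batch_size):
    total, count = 0.0, 0
    for start in range(0, len(values), batch_size):
        chunk = values[start:start + batch_size]  # last chunk may be short
        total += chunk.sum()
        count += len(chunk)
    return total / count

print(batched_mean(np.arange(10.0), batch_size=3))  # 4.5, all 10 values used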
def build_mlp(args, netid, input_var=None, mask_inputs=False):
    """Build MLP model"""
    # pylint: disable=bad-continuation
    # This creates an MLP of two hidden layers of 800 units each, followed by
    # a softmax output layer of 10 units. It applies 20% dropout to the input
    # data and 50% dropout to the hidden layers.

    # Input layer, specifying the expected input shape of the network
    # (unspecified batchsize, 1 channel, 28 rows and 28 columns) and
    # linking it to the given Theano variable `input_var`, if any:

    l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),
                                     input_var=input_var,
                                     name="%d_%s" % (netid, "l_in"))

    mask_in = None
    if mask_inputs:
        mask_in = T.ltensor3()
    # Apply 20% dropout to the input data:
    l_in_drop = dropout.DropoutLayer(l_in,
                                     mask=mask_in,
                                     p=args.input_dropout_rate,
                                     name="%d_%s" % (netid, "l_in_drop"))

    # Add a fully-connected layer of 800 units, using the linear rectifier, and
    # initializing weights with Glorot's scheme (which is the default anyway):
    l_hid1 = lasagne.layers.DenseLayer(
        l_in_drop,
        num_units=200,
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.GlorotUniform(),
        name="%d_%s" % (netid, "l_hid1"))

    # We'll now add dropout of 50%:
    mask_hid1 = None
    if mask_inputs:
        mask_hid1 = T.lvector()
    l_hid1_drop = dropout.DropoutLayer(l_hid1,
                                       mask=mask_hid1,
                                       p=args.dropout_rate,
                                       name="%d_%s" % (netid, "l_hid1_drop"))

    # Another 800-unit layer:
    l_hid2 = lasagne.layers.DenseLayer(
        l_hid1_drop,
        num_units=200,
        nonlinearity=lasagne.nonlinearities.rectify,
        name="%d_%s" % (netid, "l_hid2"))

    # 50% dropout again:
    mask_hid2 = None
    if mask_inputs:
        mask_hid2 = T.lvector()
    l_hid2_drop = dropout.DropoutLayer(l_hid2,
                                       mask=mask_hid2,
                                       p=args.dropout_rate,
                                       name="%d_%s" % (netid, "l_hid2_drop"))

    # Finally, we'll add the fully-connected output layer, of 10 softmax units:
    l_out = lasagne.layers.DenseLayer(
        l_hid2_drop,
        num_units=10,
        nonlinearity=lasagne.nonlinearities.softmax,
        name="%d_%s" % (netid, "l_out"))

    masks = [mask_in, mask_hid1, mask_hid2]
    # Each layer is linked to its incoming layer(s), so we only need to pass
    # the output layer to give access to a network in Lasagne:
    return l_out, masks
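
# The mask_* variables above let a caller feed a fixed dropout mask instead of
# sampling a fresh one (the dropout module itself is project-specific). A
# numpy sketch of applying such a mask with inverted-dropout scaling:
import numpy as np

rng = np.random.RandomState(0)
activations = rng.randn(4, 200)            # hypothetical hidden-layer batch
p = 0.5                                    # dropout rate
mask = rng.binomial(1, 1 - p, size=activations.shape)
dropped = activations * mask / (1 - p)     # scale so expectations match test time
print(dropped.mean(), (mask == 0).mean())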
Exemple #43
    def __init__(self, config):

        self.config = config

        batch_size = config.batch_size
        lib_conv = config.lib_conv
        group = (2 if config.grouping else 1)
        LRN = bool(config.LRN)
        print 'LRN, group', LRN, group

        # ##################### BUILD NETWORK ##########################
        # allocate symbolic variables for the data
        x = T.ftensor4('x')
        y = T.lvector('y')


        print '... building the model with ConvLib %s, LRN %s, grouping %i ' \
              % (lib_conv, LRN, group)
        self.layers = []
        params = []
        weight_types = []

        layer1_input = x

        convpool_layer1 = ConvPoolLayer(
            input=layer1_input,
            image_shape=((3, 224, 224,
                          batch_size) if lib_conv == 'cudaconvnet' else
                         (batch_size, 3, 227, 227)),
            filter_shape=((3, 11, 11, 96) if lib_conv == 'cudaconvnet' else
                          (96, 3, 11, 11)),
            convstride=4,
            padsize=(0 if lib_conv == 'cudaconvnet' else 3),
            group=1,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            lrn=LRN,
            lib_conv=lib_conv)
        self.layers.append(convpool_layer1)
        params += convpool_layer1.params
        weight_types += convpool_layer1.weight_type

        convpool_layer2 = ConvPoolLayer(
            input=convpool_layer1.output,
            image_shape=((96, 27, 27,
                          batch_size) if lib_conv == 'cudaconvnet' else
                         (batch_size, 96, 27, 27)),
            filter_shape=((96, 5, 5, 256) if lib_conv == 'cudaconvnet' else
                          (256, 96, 5, 5)),
            convstride=1,
            padsize=2,
            group=group,
            poolsize=3,
            poolstride=2,
            bias_init=0.1,
            lrn=LRN,
            lib_conv=lib_conv,
        )
        self.layers.append(convpool_layer2)
        params += convpool_layer2.params
        weight_types += convpool_layer2.weight_type

        convpool_layer3 = ConvPoolLayer(
            input=convpool_layer2.output,
            image_shape=((256, 13, 13,
                          batch_size) if lib_conv == 'cudaconvnet' else
                         (batch_size, 256, 13, 13)),
            filter_shape=((256, 3, 3, 384) if lib_conv == 'cudaconvnet' else
                          (384, 256, 3, 3)),
            convstride=1,
            padsize=1,
            group=1,
            poolsize=1,
            poolstride=0,
            bias_init=0.0,
            lrn=False,
            lib_conv=lib_conv,
        )
        self.layers.append(convpool_layer3)
        params += convpool_layer3.params
        weight_types += convpool_layer3.weight_type

        convpool_layer4 = ConvPoolLayer(
            input=convpool_layer3.output,
            image_shape=((384, 13, 13,
                          batch_size) if lib_conv == 'cudaconvnet' else
                         (batch_size, 384, 13, 13)),
            filter_shape=((384, 3, 3, 384) if lib_conv == 'cudaconvnet' else
                          (384, 384, 3, 3)),
            convstride=1,
            padsize=1,
            group=group,
            poolsize=1,
            poolstride=0,
            bias_init=0.1,
            lrn=False,
            lib_conv=lib_conv,
        )
        self.layers.append(convpool_layer4)
        params += convpool_layer4.params
        weight_types += convpool_layer4.weight_type

        convpool_layer5 = ConvPoolLayer(
            input=convpool_layer4.output,
            image_shape=((384, 13, 13,
                          batch_size) if lib_conv == 'cudaconvnet' else
                         (batch_size, 384, 13, 13)),
            filter_shape=((384, 3, 3, 256) if lib_conv == 'cudaconvnet' else
                          (256, 384, 3, 3)),
            convstride=1,
            padsize=1,
            group=group,
            poolsize=3,
            poolstride=2,
            bias_init=0.0,
            lrn=False,
            lib_conv=lib_conv,
        )
        self.layers.append(convpool_layer5)
        params += convpool_layer5.params
        weight_types += convpool_layer5.weight_type

        if lib_conv == 'cudaconvnet':
            fc_layer6_input = T.flatten(
                convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2)
        else:
            fc_layer6_input = convpool_layer5.output.flatten(2)

        fc_layer6 = FCLayer(input=fc_layer6_input, n_in=9216, n_out=4096)
        self.layers.append(fc_layer6)
        params += fc_layer6.params
        weight_types += fc_layer6.weight_type

        dropout_layer6 = DropoutLayer(fc_layer6.output)

        fc_layer7 = FCLayer(input=dropout_layer6.output, n_in=4096, n_out=4096)
        self.layers.append(fc_layer7)
        params += fc_layer7.params
        weight_types += fc_layer7.weight_type

        dropout_layer7 = DropoutLayer(fc_layer7.output)

        softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output,
                                      n_in=4096,
                                      n_out=1000)
        self.layers.append(softmax_layer8)
        params += softmax_layer8.params
        weight_types += softmax_layer8.weight_type

        # #################### NETWORK BUILT #######################

        self.cost = softmax_layer8.negative_log_likelihood(y)
        self.errors = softmax_layer8.errors(y)
        self.errors_top_5 = softmax_layer8.errors_top_x(y, 5)
        self.params = params
        self.x = x
        self.y = y
        # self.rand = rand
        self.weight_types = weight_types
        self.batch_size = batch_size
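
# The image_shape/filter_shape conditionals above switch between
# cuda-convnet's c01b layout (channels, rows, cols, batch) and Theano's bc01.
# A numpy sketch of the reordering done by dimshuffle(3, 0, 1, 2) before the
# first FC layer (sizes hypothetical, matching n_in=9216 = 256*6*6):
import numpy as np

c01b = np.zeros((256, 6, 6, 8))          # (channels, rows, cols, batch)
bc01 = c01b.transpose(3, 0, 1, 2)        # -> (batch, channels, rows, cols)
flat = bc01.reshape(bc01.shape[0], -1)   # flatten(2) for the FC layer
print(bc01.shape, flat.shape)            # (8, 256, 6, 6) (8, 9216)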
Exemple #44
    def __init__(self, config):
        ModelBase.__init__(self)

        self.config = config
        self.verbose = self.config['verbose']
        self.name = 'alexnet'
        batch_size = config['batch_size']
        flag_datalayer = config['use_data_layer']
        lib_conv = config['lib_conv']
        n_softmax_out = config['n_softmax_out']
        # ##################### BUILD NETWORK ##########################
        # allocate symbolic variables for the data
        # 'rand' is a random array used for random cropping/mirroring of data
        x = T.ftensor4('x')
        y = T.lvector('y')
        rand = T.fvector('rand')
        lr = T.scalar('lr')

        if self.verbose: print 'AlexNet 2/16'
        self.layers = []
        params = []
        weight_types = []

        if flag_datalayer:
            data_layer = DataLayer(input=x,
                                   image_shape=(3, 256, 256, batch_size),
                                   cropsize=227,
                                   rand=rand,
                                   mirror=True,
                                   flag_rand=config['rand_crop'])

            layer1_input = data_layer.output
        else:
            layer1_input = x

        convpool_layer1 = ConvPoolLayer(input=layer1_input,
                                        image_shape=(3, 227, 227, batch_size),
                                        filter_shape=(3, 11, 11, 96),
                                        convstride=4,
                                        padsize=0,
                                        group=1,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        lrn=True,
                                        lib_conv=lib_conv,
                                        verbose=self.verbose)
        self.layers.append(convpool_layer1)
        params += convpool_layer1.params
        weight_types += convpool_layer1.weight_type

        convpool_layer2 = ConvPoolLayer(input=convpool_layer1.output,
                                        image_shape=(96, 27, 27, batch_size),
                                        filter_shape=(96, 5, 5, 256),
                                        convstride=1,
                                        padsize=2,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.1,
                                        lrn=True,
                                        lib_conv=lib_conv,
                                        verbose=self.verbose)
        self.layers.append(convpool_layer2)
        params += convpool_layer2.params
        weight_types += convpool_layer2.weight_type

        convpool_layer3 = ConvPoolLayer(input=convpool_layer2.output,
                                        image_shape=(256, 13, 13, batch_size),
                                        filter_shape=(256, 3, 3, 384),
                                        convstride=1,
                                        padsize=1,
                                        group=1,
                                        poolsize=1,
                                        poolstride=0,
                                        bias_init=0.0,
                                        lrn=False,
                                        lib_conv=lib_conv,
                                        verbose=self.verbose)
        self.layers.append(convpool_layer3)
        params += convpool_layer3.params
        weight_types += convpool_layer3.weight_type

        convpool_layer4 = ConvPoolLayer(input=convpool_layer3.output,
                                        image_shape=(384, 13, 13, batch_size),
                                        filter_shape=(384, 3, 3, 384),
                                        convstride=1,
                                        padsize=1,
                                        group=2,
                                        poolsize=1,
                                        poolstride=0,
                                        bias_init=0.1,
                                        lrn=False,
                                        lib_conv=lib_conv,
                                        verbose=self.verbose)
        self.layers.append(convpool_layer4)
        params += convpool_layer4.params
        weight_types += convpool_layer4.weight_type

        convpool_layer5 = ConvPoolLayer(input=convpool_layer4.output,
                                        image_shape=(384, 13, 13, batch_size),
                                        filter_shape=(384, 3, 3, 256),
                                        convstride=1,
                                        padsize=1,
                                        group=2,
                                        poolsize=3,
                                        poolstride=2,
                                        bias_init=0.0,
                                        lrn=False,
                                        lib_conv=lib_conv,
                                        verbose=self.verbose)
        self.layers.append(convpool_layer5)
        params += convpool_layer5.params
        weight_types += convpool_layer5.weight_type

        fc_layer6_input = T.flatten(
            convpool_layer5.output.dimshuffle(3, 0, 1, 2), 2)
        fc_layer6 = FCLayer(input=fc_layer6_input,
                            n_in=9216,
                            n_out=4096,
                            verbose=self.verbose)
        self.layers.append(fc_layer6)
        params += fc_layer6.params
        weight_types += fc_layer6.weight_type

        dropout_layer6 = DropoutLayer(fc_layer6.output,
                                      n_in=4096,
                                      n_out=4096,
                                      verbose=self.verbose)

        fc_layer7 = FCLayer(input=dropout_layer6.output,
                            n_in=4096,
                            n_out=4096,
                            verbose=self.verbose)
        self.layers.append(fc_layer7)
        params += fc_layer7.params
        weight_types += fc_layer7.weight_type

        dropout_layer7 = DropoutLayer(fc_layer7.output,
                                      n_in=4096,
                                      n_out=4096,
                                      verbose=self.verbose)

        softmax_layer8 = SoftmaxLayer(input=dropout_layer7.output,
                                      n_in=4096,
                                      n_out=n_softmax_out,
                                      verbose=self.verbose)
        self.layers.append(softmax_layer8)
        params += softmax_layer8.params
        weight_types += softmax_layer8.weight_type

        # #################### NETWORK BUILT #######################
        self.p_y_given_x = softmax_layer8.p_y_given_x
        self.y_pred = softmax_layer8.y_pred

        self.cost = softmax_layer8.negative_log_likelihood(y)
        self.errors = softmax_layer8.errors(y)
        if n_softmax_out < 5:
            self.errors_top_5 = softmax_layer8.errors_top_x(y, n_softmax_out)
        else:
            self.errors_top_5 = softmax_layer8.errors_top_x(y, 5)
        self.params = params

        # inputs
        self.x = x
        self.y = y
        self.rand = rand
        self.lr = lr
        self.shared_x = theano.shared(
            np.zeros(
                (3, config['input_width'], config['input_height'],
                 config['file_batch_size']),  # for loading large batch
                dtype=theano.config.floatX),
            borrow=True)

        self.shared_y = theano.shared(np.zeros((config['file_batch_size'], ),
                                               dtype=int),
                                      borrow=True)
        self.shared_lr = theano.shared(np.float32(config['learning_rate']))

        # training related
        self.base_lr = np.float32(config['learning_rate'])
        self.step_idx = 0
        self.mu = config['momentum']  # def: 0.9 # momentum
        self.eta = config['weight_decay']  #0.0002 # weight decay
        self.weight_types = weight_types
        self.batch_size = batch_size

        self.grads = T.grad(self.cost, self.params)

        # shared variables holding the locally computed momentum (delta w) before it is exchanged between workers
        self.vels = [
            theano.shared(param_i.get_value() * 0.) for param_i in self.params
        ]

        # shared variables receiving momentum (delta w) from other workers during the exchange
        self.vels2 = [
            theano.shared(param_i.get_value() * 0.) for param_i in self.params
        ]

        self.train = None
        self.get_vel = None
        self.descent_vel = None
        self.val = None
        self.inference = None
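The compiled functions (`self.train`, `self.get_vel`, ...) are initialized to `None` and filled in elsewhere in the framework. A hedged sketch of how `self.train` could be compiled with the classic momentum rule, reusing the `vels` buffers, `mu`, `eta` and `shared_lr` allocated above (an illustration, not the project's actual code):

import theano

def compile_train(model):
    updates = []
    for param, grad, vel in zip(model.params, model.grads, model.vels):
        # v <- mu * v - lr * (grad + eta * w);  w <- w + v
        vel_new = model.mu * vel - model.shared_lr * (grad + model.eta * param)
        updates += [(vel, vel_new), (param, param + vel_new)]
    return theano.function([model.x, model.y], model.cost, updates=updates)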
Exemple #45
0
    def build_model(self):
        
        if self.verbose: print(self.name)

        # start graph construction from scratch
        import theano.tensor as T
        if seed_weight_on_pid:
            import theanompi.models.layers2 as layers
            import os
            layers.rng = np.random.RandomState(os.getpid())
        from theanompi.models.layers2 import (ConvPoolLRN, Dropout, FC,
                                              Dimshuffle, Crop, Subtract,
                                              Softmax, Flatten, LRN,
                                              Constant, Normal)
        
        
        self.x = T.ftensor4('x')
        self.y = T.lvector('y')
        self.lr = T.scalar('lr')
        
        # subtract_layer = Subtract(input=self.x,
        #                           input_shape=(self.channels,
        #                                        self.data.width,
        #                                        self.data.height,
        #                                        self.batch_size),
        #                           subtract_arr = self.data.rawdata[4],
        #                           printinfo = self.verbose
        #                           )
        #
        # crop_layer = Crop(input=subtract_layer,
        #                   output_shape=(self.channels,
        #                                 self.input_width,
        #                                 self.input_height,
        #                                 self.batch_size),
        #                   flag_batch=batch_crop_mirror,
        #                   printinfo = self.verbose
        #                   )
                         
        convpool_layer1 = ConvPoolLRN(input=self.x,  #crop_layer,
                                      input_shape=(self.channels,
                                                   self.input_width,
                                                   self.input_height,
                                                   self.batch_size),
                                                     
                                        filter_shape=(3, 11, 11, 96),
                                        convstride=4, padsize=0, group=1,
                                        poolsize=3, poolstride=2,
                                        b=0.0, lrn=True,
                                        lib_conv=lib_conv,
                                        printinfo = self.verbose
                                        #output_shape = (96, 27, 27, batch_size)
                                        )

        convpool_layer2 = ConvPoolLRN(input=convpool_layer1,
                                        #input_shape=(96, 27, 27, batch_size),
                                        filter_shape=(96, 5, 5, 256),
                                        convstride=1, padsize=2, group=2,
                                        poolsize=3, poolstride=2,
                                        b=0.1, lrn=True,
                                        lib_conv=lib_conv,
                                        printinfo = self.verbose
                                        #output_shape=(256, 13, 13, batch_size),
                                        )


        convpool_layer3 = ConvPoolLRN(input=convpool_layer2,
                                        #input_shape=(256, 13, 13, batch_size),
                                        filter_shape=(256, 3, 3, 384),
                                        convstride=1, padsize=1, group=1,
                                        poolsize=1, poolstride=0,
                                        b=0.0, lrn=False,
                                        lib_conv=lib_conv,
                                        printinfo = self.verbose
                                        #output_shape=(384, 13, 13, batch_size),
                                        )

        convpool_layer4 = ConvPoolLRN(input=convpool_layer3,
                                        #input_shape=(384, 13, 13, batch_size),
                                        filter_shape=(384, 3, 3, 384),
                                        convstride=1, padsize=1, group=2,
                                        poolsize=1, poolstride=0,
                                        b=0.1, lrn=False,
                                        lib_conv=lib_conv,
                                        printinfo = self.verbose
                                        #output_shape=(384, 13, 13, batch_size),
                                        )

        convpool_layer5 = ConvPoolLRN(input=convpool_layer4,
                                        #input_shape=(384, 13, 13, batch_size),
                                        filter_shape=(384, 3, 3, 256),
                                        convstride=1, padsize=1, group=2,
                                        poolsize=3, poolstride=2,
                                        b=0.0, lrn=False,
                                        lib_conv=lib_conv,
                                        printinfo = self.verbose
                                        #output_shape=(256, 6, 6, batch_size),
                                        )
        shuffle = Dimshuffle(input=convpool_layer5,
                             new_axis_order=(3,0,1,2),
                             printinfo=self.verbose
                             )
        
        fc_layer6_input = Flatten(input=shuffle,
                                  #input_shape=(batch_size, 256, 6, 6),
                                  axis = 2,
                                  printinfo=self.verbose
                                  )
            
        fc_layer6      = FC(input=fc_layer6_input, 
                            # n_in=9216,
                            n_out=4096,
                            W=Normal((fc_layer6_input.output_shape[1], 4096), std=0.005),
                            b=Constant((4096,), val=0.1),
                            printinfo = self.verbose
                            )

        dropout_layer6 = Dropout(input=fc_layer6, 
                                  # n_in=4096,
                                  n_out=fc_layer6.output_shape[1], 
                                  prob_drop=0.5,
                                  printinfo = self.verbose)

        fc_layer7      = FC(input=dropout_layer6, 
                            # n_in=4096,
                            n_out=4096,
                            W = Normal((dropout_layer6.output_shape[1], 4096), std=0.005),
                            b = Constant((4096,), val=0.1),
                            printinfo = self.verbose
                            )

        dropout_layer7 = Dropout(input=fc_layer7, 
                                  #n_in=4096, 
                                  n_out=fc_layer7.output_shape[1],
                                  prob_drop=0.5,
                                  printinfo = self.verbose)

        softmax_layer8 = Softmax(input=dropout_layer7, 
                                      #n_in=4096, 
                                      n_out=self.n_softmax_out,
                                      W = Normal((dropout_layer7.output_shape[1], 
                                                  self.n_softmax_out), mean=0, std=0.01),
                                      b = Constant((self.n_softmax_out,),val=0),
                                      printinfo = self.verbose)
                                      
        self.output_layer = softmax_layer8
        
        self.cost = softmax_layer8.negative_log_likelihood(self.y)     
        self.error = softmax_layer8.errors(self.y)
        self.error_top_5 = softmax_layer8.errors_top_x(self.y)
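`errors_top_x` is imported from `theanompi.models.layers2` and not shown in this listing. A top-k error is straightforward to express in Theano; the helper below is an illustrative sketch of the idea, not the library's actual implementation:

import theano.tensor as T

def top_k_error(p_y_given_x, y, k=5):
    # a sample counts as correct if the true label is among the
    # k highest-scoring classes
    top_k = T.argsort(p_y_given_x, axis=1)[:, -k:]           # (batch, k)
    hit = T.any(T.eq(top_k, y.dimshuffle(0, 'x')), axis=1)   # (batch,)
    return 1.0 - T.mean(hit)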
Exemple #46
0
    def __theano_build__(self):
        E, V, U, W, b, c, W_att, b_att = self.E, self.V, self.U, self.W, self.b, self.c, self.W_att, self.b_att

        x_a = T.ivector('x_a')
        x_b = T.ivector('x_b')
        y = T.lvector('y')

        def forward_direction_step(x_t, s_t_prev):
            # Word embedding layer
            x_e = E[:, x_t]
            # GRU layer 1
            z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) +
                                      W[0].dot(s_t_prev) + b[0])
            r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) +
                                      W[1].dot(s_t_prev) + b[1])
            c_t = T.tanh(U[2].dot(x_e) + W[2].dot(s_t_prev * r_t) + b[2])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
            # directly return the hidden state as intermediate output
            return [s_t]

        def backward_direction_step(x_t, s_t_prev):
            # Word embedding layer
            x_e = E[:, x_t]
            # GRU layer 2
            z_t = T.nnet.hard_sigmoid(U[3].dot(x_e) +
                                      W[3].dot(s_t_prev) + b[3])
            r_t = T.nnet.hard_sigmoid(U[4].dot(x_e) +
                                      W[4].dot(s_t_prev) + b[4])
            c_t = T.tanh(U[5].dot(x_e) + W[5].dot(s_t_prev * r_t) + b[5])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
            # directly return the hidden state as intermediate output
            return [s_t]

        # sentence a vector (states) forward direction
        a_s_f, updates = theano.scan(forward_direction_step,
                                     sequences=x_a,
                                     truncate_gradient=self.bptt_truncate,
                                     outputs_info=T.zeros(self.hidden_dim))

        # sentence a vector (states) backward direction
        a_s_b, updates = theano.scan(backward_direction_step,
                                     sequences=x_a[::-1],
                                     truncate_gradient=self.bptt_truncate,
                                     outputs_info=T.zeros(self.hidden_dim))

        # sentence b vector (states) forward direction
        b_s_f, updates = theano.scan(forward_direction_step,
                                     sequences=x_b,
                                     truncate_gradient=self.bptt_truncate,
                                     outputs_info=T.zeros(self.hidden_dim))

        # sentence b vector (states) backward direction
        b_s_b, updates = theano.scan(backward_direction_step,
                                     sequences=x_b[::-1],
                                     truncate_gradient=self.bptt_truncate,
                                     outputs_info=T.zeros(self.hidden_dim))

        # concatenate forward and backward states for each sentence
        a_s = T.concatenate([a_s_f, a_s_b[::-1]], axis=1)
        b_s = T.concatenate([b_s_f, b_s_b[::-1]], axis=1)

        def soft_attention(h_i):
            return T.tanh(W_att.dot(h_i) + b_att)

        def weight_attention(h_i, a_j):
            return h_i * a_j

        a_att, updates = theano.scan(soft_attention, sequences=a_s)
        b_att, updates = theano.scan(soft_attention, sequences=b_s)

        # attention softmax; a_att and b_att have shapes like (len_a, 1)
        # and (len_b, 1) before flattening
        a_att = T.exp(a_att)
        a_att = a_att.flatten()
        a_att = a_att / a_att.sum()

        b_att = T.exp(b_att)
        b_att = b_att.flatten()
        b_att = b_att / b_att.sum()

        a_s_att, updates = theano.scan(weight_attention,
                                       sequences=[a_s, a_att])
        b_s_att, updates = theano.scan(weight_attention,
                                       sequences=[b_s, b_att])
        # eps = np.asarray([1.0e-10]*self.label_dim,dtype=theano.config.floatX)

        # semantic similarity
        # s_sim = manhattan_distance(a_s[-1],b_s[-1])

        # classification strategy: rather than using only the last hidden
        # state as the sentence vector, score every word with the attention
        # layer, a(w_i) = tanh(W_att.dot(w_i) + b_att), softmax the scores,
        # and sum the attention-weighted states into a sentence vector
        sena = a_s_att.sum(axis=0)
        senb = b_s_att.sum(axis=0)

        combined_s = T.concatenate([sena, senb], axis=0)

        # softmax class
        o = T.nnet.softmax(V.dot(combined_s) + c)[0]

        # in case o contains a zero, which would make the cross-entropy inf/nan
        eps = np.asarray([1.0e-10] * self.label_dim,
                         dtype=theano.config.floatX)
        o = o + eps
        om = o.reshape((1, o.shape[0]))
        prediction = T.argmax(om, axis=1)
        o_error = T.nnet.categorical_crossentropy(om, y)

        # cost
        cost = T.sum(o_error)

        # updates
        updates = sgd_updates_adadelta(norm=0, params=self.params, cost=cost)

        # monitor parameter
        mV = V * T.ones_like(V)
        mc = c * T.ones_like(c)
        mU = U * T.ones_like(U)
        mW = W * T.ones_like(W)

        gV = T.grad(cost, V)
        gc = T.grad(cost, c)
        gU = T.grad(cost, U)
        gW = T.grad(cost, W)

        mgV = gV * T.ones_like(gV)
        mgc = gc * T.ones_like(gc)
        mgU = gU * T.ones_like(gU)
        mgW = gW * T.ones_like(gW)

        # Assign functions
        self.comsen = theano.function([x_a, x_b], [a_att, b_att])
        self.monitor = theano.function([x_a, x_b],
                                       [sena, senb, mV, mc, mU, mW])
        self.monitor_grad = theano.function([x_a, x_b, y],
                                            [mgV, mgc, mgU, mgW])
        self.predict = theano.function([x_a, x_b], om)
        self.predict_class = theano.function([x_a, x_b], prediction)
        self.ce_error = theano.function([x_a, x_b, y], cost)
        # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # one adadelta training step; enable NanGuardMode below to hunt NaNs
        self.sgd_step = theano.function(
            [x_a, x_b, y], [],
            updates=updates
            # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        )
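Once built, the compiled functions take plain index vectors. A hypothetical usage sketch (the `model` instance and the word ids are made-up assumptions):

import numpy as np

x_a = np.array([12, 7, 431, 2], dtype=np.int32)  # word ids of sentence a
x_b = np.array([5, 98, 3], dtype=np.int32)       # word ids of sentence b
y = np.array([1], dtype=np.int64)                # gold label

model.sgd_step(x_a, x_b, y)           # one adadelta update
pred = model.predict_class(x_a, x_b)  # predicted label
loss = model.ce_error(x_a, x_b, y)    # cross-entropy cost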
Exemple #47
0
def optimization_adadelta(trainvec,
                          testvec,
                          n_epochs,
                          batch_size,
                          alpha=0.001,
                          beta=0.1):
    i = T.lvector('i')
    j = T.lvector('j')
    x = T.dvector('x')
    num_user = 6040
    num_item = 3952
    factors = 20
    init_mean = 0
    init_stdev = 0.02
    mfobj = MF_Batch(i, j, num_user, num_item, factors, init_mean, init_stdev)
    regcost, error = mfobj.errors(x, beta)
    grads = T.grad(cost=regcost, wrt=[mfobj.P, mfobj.Q])
    #f_grad = theano.function([i, j, x], grads, name='f_grad')
    lr = T.scalar(name='lr')
    f_grad_shared, f_update = adadelta(lr, mfobj.params2, grads, i, j, x,
                                       regcost)

    test_model = theano.function(
        inputs=[i, j, x],
        #givens=[(mfobj.P[i, :]), mfobj.Q[:, j]],
        outputs=error)

    mean_rating = np.mean(trainvec[:, 2])
    done_looping = False
    epoch = 0
    N = len(trainvec)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        totalErrors = 0
        testErrors = 0
        for k in range(int(math.floor(N / batch_size))):
            batch = np.arange(k * batch_size, min(N - 1, (k + 1) * batch_size))
            idi = trainvec[batch, 0] - 1
            idj = trainvec[batch, 1] - 1
            ratings = trainvec[batch, 2] - mean_rating
            batch_cost = f_grad_shared(idi, idj, ratings)
            f_update(alpha)
            totalErrors += batch_cost

        NN = len(testvec)
        batch_size = 1000
        for k in range(int(math.floor(NN / batch_size))):
            batch = np.arange(k * batch_size, min(NN - 1,
                                                  (k + 1) * batch_size))
            p_idx = testvec[batch, 0] - 1
            q_idx = testvec[batch, 1] - 1
            ratings = testvec[batch, 2] - mean_rating
            testErrors += test_model(p_idx, q_idx, ratings)
        print(
            "the training cost at epoch {} is {}, and the testing error is {}".
            format(epoch, np.sqrt(totalErrors / N), np.sqrt(testErrors / NN)))

    # evaluate on the held-out test set after training
    NN = len(testvec)
    batch_size = 1000
    diff = 0
    for k in range(int(math.floor(NN / batch_size))):
        batch = np.arange(k * batch_size, min(NN - 1, (k + 1) * batch_size))
        p_idx = testvec[batch, 0] - 1
        q_idx = testvec[batch, 1] - 1
        ratings = testvec[batch, 2] - mean_rating
        diff += test_model(p_idx, q_idx, ratings)

    print("Total average test error for {} instances is {}".format(
        NN, np.sqrt(diff / NN)))
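The `adadelta` helper used above (the `f_grad_shared`/`f_update` pair) is imported from elsewhere and appears to follow the two-function pattern of the Theano LSTM tutorial. For reference, a compact single-function sketch of the underlying ADADELTA rule (Zeiler, 2012), shown only as an assumption about its internals:

import theano
import theano.tensor as T

def adadelta_updates(params, grads, rho=0.95, eps=1e-6):
    updates = []
    for p, g in zip(params, grads):
        acc_g = theano.shared(p.get_value() * 0.)   # running E[g^2]
        acc_dx = theano.shared(p.get_value() * 0.)  # running E[dx^2]
        acc_g_new = rho * acc_g + (1. - rho) * g ** 2
        dx = -T.sqrt(acc_dx + eps) / T.sqrt(acc_g_new + eps) * g
        updates += [(acc_g, acc_g_new),
                    (acc_dx, rho * acc_dx + (1. - rho) * dx ** 2),
                    (p, p + dx)]
    return updates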
Exemple #48
0
    def __theano_build__(self):
        E = self.E
        W = self.W
        U = self.U
        V = self.V
        b = self.b
        c = self.c

        x = T.lvector('x') #
        y = T.lvector('y') #

        def forward_prop_step(x_t, h_t_prev, c_t_prev):

            # Word embedding layer
            x_e = E[:, x_t]

            i_t = T.nnet.sigmoid(W[0].dot(x_e) + U[0].dot(h_t_prev) + b[0])
            f_t = T.nnet.sigmoid(W[1].dot(x_e) + U[1].dot(h_t_prev) + b[1])
            o_t = T.nnet.sigmoid(W[2].dot(x_e) + U[2].dot(h_t_prev) + b[2])
            u_t = T.tanh(W[3].dot(x_e) + U[3].dot(h_t_prev) + b[3])

            c_t = i_t*u_t + f_t * c_t_prev
            h_t = o_t * T.tanh(c_t)

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            # o = T.nnet.softmax(V.dot(h_t) + c)[0]
            # o = T.nnet.softmax(V[0].dot(h_t) + c)
            return [h_t, c_t]

        [h_t, c_t], updates = theano.scan(fn=forward_prop_step,
                                             sequences=x,
                                             truncate_gradient=self.bptt_truncate,
                                             outputs_info=[
                                                           dict(initial=T.zeros(self.hidden_dim)),
                                                           dict(initial=T.zeros(self.hidden_dim))
                                                           ])
        # h_t collects the hidden state at every time step;
        # only the final time step's state is used below

        def forward_prop_step_b(x_t, h_t_prev_b, c_t_prev_b):
            # the backward

            # Word embedding layer
            x_e_b = E[:, x_t]

            i_t_b = T.nnet.sigmoid(W[4].dot(x_e_b) + U[4].dot(h_t_prev_b) + b[4])
            f_t_b = T.nnet.sigmoid(W[5].dot(x_e_b) + U[5].dot(h_t_prev_b) + b[5])
            o_t_b = T.nnet.sigmoid(W[6].dot(x_e_b) + U[6].dot(h_t_prev_b) + b[6])
            u_t_b = T.tanh(W[7].dot(x_e_b) + U[7].dot(h_t_prev_b) + b[7])

            c_t_b = i_t_b * u_t_b + f_t_b * c_t_prev_b
            h_t_b = o_t_b * T.tanh(c_t_b)

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            # o = T.nnet.softmax(V.dot(h_t) + c)[0]
            # o_b = T.nnet.softmax(V[1].dot(h_t) + c)
            return [h_t_b, c_t_b]

        [h_t_b, c_t_b], updates = theano.scan(fn=forward_prop_step_b,
                                                   sequences=x[::-1],
                                                   truncate_gradient=self.bptt_truncate,
                                                   outputs_info=[dict(initial=T.zeros(self.hidden_dim)),
                                                                 dict(initial=T.zeros(self.hidden_dim))])


        final_h = h_t[-1]
        final_h_b = h_t_b[-1]
        final_h_concat = T.concatenate([final_h,final_h_b], axis=0)
        final_o = T.nnet.softmax(V[0].dot(final_h_concat) + c) # a array with one row


        prediction = T.argmax(final_o[0], axis=0)
        print('final_o', final_o.ndim)
        print('y ', y.ndim)
        final_o_error = T.sum(T.nnet.categorical_crossentropy(final_o, y))

        cost = final_o_error

        # gradient
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # function
        self.predict = theano.function([x], final_o)
        self.predict_class = theano.function([x], prediction)
        self.ce_error = theano.function([x,y], cost)

        # SGD parameters
        learning_rate = T.scalar('learning_rate')

        self.sgd_step = theano.function([x,y,learning_rate],[],
                                        updates=[(self.U, self.U - learning_rate * dU),
                                                 (self.V, self.V - learning_rate * dV),
                                                 (self.W, self.W - learning_rate * dW),
                                                 (self.E, self.E - learning_rate * dE),
                                                 (self.b, self.b - learning_rate * db),
                                                 (self.c, self.c - learning_rate * dc)])
Exemple #49
0
                             broadcastable=param.broadcastable)
        accu_new = accu + grad**2
        updates[accu] = accu_new
        updates[param] = param - (learning_rate * grad /
                                  T.sqrt(accu_new + epsilon))
    return updates


### momentum ###
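# The momentum helper was truncated in this listing; a conventional
# momentum update builder, mirroring the adagrad-style fragment above,
# would look like the following sketch (an assumption, not the original):
def momentum_updates(cost, params, learning_rate, momentum=0.9):
    updates = {}
    for param in params:
        grad = T.grad(cost, param)
        vel = theano.shared(param.get_value() * 0.,
                            broadcastable=param.broadcastable)
        updates[vel] = momentum * vel - learning_rate * grad
        updates[param] = param + updates[vel]
    return updates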

######## main #########################

if __name__ == '__main__':
    # parameters setup
    X = T.matrix('X')
    y = T.lvector('y')

    nn_input_dim = 39  # 39 features as input dim
    nn_output_dim = 48  # 48 categorical phones
    hidden_shape = [128, 128]  # shape of hidden layer
    nn_shape = [nn_input_dim] + hidden_shape + [nn_output_dim]  # [39, 128, 128, 48]

    ### initialize weight/bias ###
    W = {}
    b = {}  # weight and bias
    layers = range(len(nn_shape) - 1)  # [0,1,2]
    for layer in layers:
        shape = nn_shape[layer:layer + 2]  # shape of this layer
        dim = nn_shape[layer + 1]  # dim of bias
        W[layer] = init_weight(shape, index=layer)
Exemple #50
0
    def __init__(self, batch_size, train_set,initial_weights=None,weigths_service = None): # lowAndHigh_c1Values= [-0.3,0.3],lowAndHigh_c3Values = [-0.1,0.1],lowAndHigh_fc5Values = [-0.01,0.01],lowAndHigh_fc6Values = [-0.001,0.001]):
        x = T.tensor4('x')  # the data is presented as rasterized images
        y = T.lvector('y')

        # batch_size = 50000
        # img_input =  x #T.reshape(x,(batch_size, 1, 28, 28))
        self.cnn = arqui.OCRLenetArquitecture(
            img_input=x,
            batch_size=batch_size,
            initWeights=initial_weights,
            weigths_service = weigths_service
            )


        # the cost we minimize during training is the NLL of the model
        cost = self.cnn.LR6.negative_log_likelihood(y)
        errors = self.cnn.LR6.errors(y)

        self.Weights = [self.cnn.LR6.Filter, self.cnn.LR6.Bias, self.cnn.FC5.Filter, self.cnn.FC5.Bias, self.cnn.C3.Filter, self.cnn.C1.Filter]

        grads = T.grad(cost, self.Weights, disconnected_inputs="raise")


        # train_model is a function that updates the model parameters by
        # SGD Since this model has many parameters, it would be tedious to
        # manually create an update rule for each model parameter. We thus
        # create the updates list by automatically looping over all
        # (params[i], grads[i]) pairs.

        learningRate = T.dscalar()

        updates = [
            (param_i, param_i - learningRate * grad_i)  # gradient descent on the NLL cost
            for param_i, grad_i in zip(self.Weights, grads)
            ]

        trainset_x = theano.shared(train_set[0])
        trainset_y = theano.shared(train_set[1])

        index = T.lscalar()

        #bs = T.lscalar()

        self.train_model = theano.function(
            [index, learningRate],
            cost,  # self.classifier.FC.p_y_given_x,#dropout.output
            updates=updates,
            givens={
                x: trainset_x[index * batch_size: (index + 1) * batch_size],
                y: trainset_y[index * batch_size: (index + 1) * batch_size]
            }
        )

        self.evaluation_model_with_cost = theano.function(
            [index],
            cost,  # self.classifier.FC.p_y_given_x,#dropout.output
            givens={
                x: trainset_x[index * batch_size: (index + 1) * batch_size],
                y: trainset_y[index * batch_size: (index + 1) * batch_size]
            }
        )

        self.evaluation_model_with_errors = theano.function(
            [index],
            errors,
            givens={
                x: trainset_x[index * batch_size: (index + 1) * batch_size],
                y: trainset_y[index * batch_size: (index + 1) * batch_size]
            }
        )
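A hypothetical driver loop for the functions compiled above (the `net` instance name, epoch count and `n_train_batches` are assumptions):

# assuming `net` is an instance of the class above and the training set
# holds n_train_batches * batch_size examples
n_train_batches = train_set[0].shape[0] // batch_size
for epoch in range(10):
    for idx in range(n_train_batches):
        batch_cost = net.train_model(idx, 0.01)   # one SGD step, lr = 0.01
    epoch_error = net.evaluation_model_with_errors(0)  # error on first batch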
Exemple #51
0
    def __init__(self):
        super(M1, self).__init__()
        self.a = T.dscalar()
        self.b = T.lscalar()
        self.c = T.lvector()
Exemple #52
0
def create_train_rbm(learning_rate=1e-3,
                     training_epochs=200,
                     dataset=None,
                     seqlen=None,
                     batch_size=10,
                     n_hidden=30):
    """
    Demonstrate how to train a RBM
    :param learning_rate: learning rate used for training the RBM
    :param training_epochs: number of epochs used for training
    :param dataset: path the the pickled dataset
    :param batch_size: size of a batch used to train the RBM
    :param n_chains: number of parallel Gibbs chains to be used for sampling
    :param n_samples: number of samples to plot for each chain
    """
    # compute the number of training minibatches and the frame dimensionality
    n_train_batches = int(
        dataset.get_value(borrow=True).shape[0] /
        batch_size)  # number of minibatches
    n_dim = dataset.get_value(
        borrow=True).shape[1]  # number of values in each frame

    # allocate symbolic variables for the data
    index = T.lvector()  # indices of the frames in the (shuffled) minibatch
    x = T.matrix('x')  # the data, one frame per row so minibatches can be indexed

    # initialize storage for the persistent chain (state = hidden
    # layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),
                                                 dtype=theano.config.floatX),
                                     borrow=True)

    # construct the RBM class
    rbm = RBM(input=x, n_visible=n_dim, n_hidden=n_hidden)

    # get the cost and the gradient corresponding to one step of CD-1 (k=1)
    cost, updates = rbm.get_cost_updates(lr=learning_rate,
                                         persistent=persistent_chain,
                                         k=1)

    #################################
    #     Training the RBM          #
    #################################

    # it is ok for a theano function to have no output
    # the purpose of train_rbm is solely to update the RBM parameters
    train_rbm = theano.function(
        [index],  #minibatch index
        cost,
        updates=updates,
        givens={x: dataset[index]},  #for the [index]minibatch
        name='train_rbm')

    plotting_time = 0.
    start_time = timeit.default_timer()
    # shuffle the data: learning one gesture after another could introduce a
    # bias, so we shuffle the frames to learn all gestures at the same time
    datasetindex = []
    last = 0
    for s in seqlen:
        datasetindex += range(last, last + s)
        last += s
    permindex = numpy.array(datasetindex)
    rbm.numpy_rng.shuffle(permindex)

    # to visualize how the cost evolves during the training phase
    cost_y = []
    # go through training epochs
    for epoch in range(training_epochs):
        # go through the training set
        mean_cost = []
        for batch_index in range(int(n_train_batches)):  #for each minibatch
            data_idx = permindex[
                batch_index * batch_size:(batch_index + 1) *
                batch_size]  #get a list of index in the shuffle index-list
            mean_cost += [train_rbm(data_idx)]
        print('Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost))
        cost_y.append(numpy.mean(mean_cost))

    end_time = timeit.default_timer()
    pretraining_time = (end_time - start_time) - plotting_time
    print('RBM : Training took %f minutes' % (pretraining_time / 60.))
    return rbm, cost_y
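A hedged usage sketch (the dataset here is random noise standing in for real frames; shapes and names are assumptions):

import numpy
import theano

frames = numpy.random.rand(5000, 50).astype(theano.config.floatX)
dataset = theano.shared(frames, borrow=True)
seqlen = [1000] * 5  # five recorded sequences of 1000 frames each

rbm, cost_y = create_train_rbm(learning_rate=1e-3, training_epochs=20,
                               dataset=dataset, seqlen=seqlen,
                               batch_size=10, n_hidden=30)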
Exemple #53
0
    def init_function(self):

        self.seq_loc = T.lvector()
        self.seq_idx = T.lvector()
        self.target = T.lvector()
        self.target_content_index = T.lscalar()
        self.seq_len = T.lscalar()
        self.solution = T.matrix()
        self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0)

        self.all_tar_vector = T.take(self.Vw, self.target, axis=0)
        self.tar_vector = T.mean(self.all_tar_vector, axis=0)
        self.target_vector_dim = self.tar_vector.dimshuffle('x', 0)
        self.seq_matrix = T.concatenate([self.seq_matrix[0:self.target_content_index], self.target_vector_dim,
                                         self.seq_matrix[self.target_content_index + 1:]], axis=0)
        h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(self.bc,
                                                                               dtype=theano.config.floatX)

        def rnn(X, aspect):
            def encode_forward(x_t, h_fore, c_fore):
                v = T.concatenate([h_fore, x_t])
                f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
                i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
                o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
                c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
                h_next = o_t * T.tanh(c_next)
                return h_next, c_next

            def encode_backward(x_t, h_fore, c_fore):
                v = T.concatenate([h_fore, x_t])
                f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
                i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
                o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
                c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
                h_next = o_t * T.tanh(c_next)
                return h_next, c_next

            loc_for = T.zeros_like(self.seq_loc) + self.target_content_index
            al_for = self.a_for_left * T.exp(
                -self.b_for_left * T.abs_(
                    self.seq_loc[0:self.target_content_index] - loc_for[0:self.target_content_index]))
            am_for = self.a_for_middle * [1]
            a_for = T.concatenate([al_for, am_for])
            locate_for = T.zeros_like(self.seq_matrix[0:self.target_content_index + 1],
                                      dtype=T.config.floatX) + T.reshape(a_for, [-1, 1])
            loc_back = T.zeros_like(self.seq_loc) + self.target_content_index
            ar_back = self.a_back_right * T.exp(
                -self.b_back_right * T.abs_(
                    self.seq_loc[self.target_content_index + 1:] - loc_back[self.target_content_index + 1:]))
            ar_back = ar_back[::-1]
            a_back = T.concatenate([am_for, ar_back])
            locate_back = T.zeros_like(self.seq_matrix[self.target_content_index:], dtype=T.config.floatX) + T.reshape(
                a_back, [-1, 1])

            scan_result_forward, _forward = theano.scan(fn=encode_forward,
                                                        sequences=locate_for * X[0:self.target_content_index + 1],
                                                        outputs_info=[h, c])
            scan_result_backward, _backward = theano.scan(fn=encode_backward,
                                                          sequences=locate_back * X[self.target_content_index:][::-1],
                                                          outputs_info=[h, c])
            embedding_l = scan_result_forward[0]
            embedding_r = scan_result_backward[0]
            h_target_for = embedding_l[-1]
            h_target_back = embedding_r[-1]

            attention_h_target_l = embedding_l
            cont_l = T.concatenate([h_target_for, h_target_back])
            yuyi_l = T.transpose(cont_l)
            alpha_h_l = T.dot(T.dot(attention_h_target_l, self.alpha_h_W_L), yuyi_l)
            alpha_tmp_l = T.nnet.softmax(alpha_h_l)
            r_l = T.dot(alpha_tmp_l, embedding_l)
            h_star_L = T.tanh(T.dot(r_l, self.Wp_L))

            attention_h_target_r = embedding_r
            cont_r = T.concatenate([h_target_for, h_target_back])
            yuyi_r = T.transpose(cont_r)

            alpha_h_r = T.dot(T.dot(attention_h_target_r, self.alpha_h_W_R), yuyi_r)
            alpha_tmp_r = T.nnet.softmax(alpha_h_r)
            r_r = T.dot(alpha_tmp_r, embedding_r)
            h_star_R = T.tanh(T.dot(r_r, self.Wp_R))
            embedding = T.concatenate([h_star_L, h_star_R],
                                      axis=1)
            return embedding

        embedding = rnn(self.seq_matrix, self.tar_vector)
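        # dropout: apply a random 0/1 mask at training time and scale by the
        # keep probability (0.5) at test time so expected activations match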
        embedding_for_train = embedding * self.srng.binomial(embedding.shape, p=0.5, n=1, dtype=embedding.dtype)
        embedding_for_test = embedding * 0.5

        self.pred_for_train = T.nnet.softmax(T.dot(embedding_for_train, self.Ws) + self.bs)
        self.pred_for_test = T.nnet.softmax(T.dot(embedding_for_test, self.Ws) + self.bs)

        self.l2 = sum([T.sum(param ** 2) for param in self.params]) - T.sum(self.Vw ** 2)
        self.loss_sen = -T.tensordot(self.solution, T.log(self.pred_for_train), axes=2)
        self.loss_l2 = 0.5 * self.l2 * self.regular
        self.loss = self.loss_sen + self.loss_l2

        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                                         dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad

        self.func_train = theano.function(
            inputs=[self.seq_idx, self.target, self.solution,
                    self.target_content_index, self.seq_loc, self.seq_len,
                    theano.In(h, value=self.h0),
                    theano.In(c, value=self.c0)],
            outputs=[self.loss, self.loss_sen, self.loss_l2],
            updates=self.updates,
            on_unused_input='warn')

        self.func_test = theano.function(
            inputs=[self.seq_idx, self.target, self.target_content_index, self.seq_loc, self.seq_len,
                    theano.In(h, value=self.h0),
                    theano.In(c, value=self.c0)],
            outputs=self.pred_for_test,
            on_unused_input='warn')
Exemple #54
0
def main():
    ##########
    # LAYERS #
    ##########
    HOME_DIR = "semeval_parsed"
    timestamp = str(long(time.time() * 1000))
    input_fname = '200M'
    embedding = 'custom'

    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)
    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_winner.p', 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['n_in']
    st = parameter_map['st']

    fname_wordembeddings = os.path.join(
        data_dir, 'emb_smiley_tweets_embedding_topic.npy')
    print "Loading word embeddings from", fname_wordembeddings
    vocab_emb_overlap = numpy.load(fname_wordembeddings)
    ndim = vocab_emb_overlap.shape[1]

    ndim = 5  # override the loaded dimensionality; small random topic embeddings are used instead
    fname_vocab = os.path.join(data_dir, 'vocab_{}.pickle'.format('topic'))
    alphabet = cPickle.load(open(fname_vocab))
    dummy_word_id = alphabet.fid
    vocab_emb_overlap = (numpy_rng.randn(dummy_word_id + 1, ndim) *
                         0.25).astype(numpy.float32)

    def relu(x):
        return x * (x > 0)

    activation = relu

    tweets = T.imatrix('tweets_train')
    topics = T.imatrix('topics')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_topics = T.imatrix('batch_top')
    batch_y = T.lvector('batch_y')

    lookup_table_words = nn_layers.LookupTableFastStatic(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1)

    lookup_table_topic = nn_layers.LookupTableFast(W=vocab_emb_overlap,
                                                   pad=filter_width - 1)

    lookup_table = nn_layers.ParallelLookupTable(
        layers=[lookup_table_words, lookup_table_topic])

    filter_shape = parameter_map['FilterShape' + str(filter_width)]
    filter_shape = (filter_shape[0], filter_shape[1], filter_shape[2],
                    filter_shape[3] + ndim)

    input_shape = (input_shape[0], input_shape[1], input_shape[2],
                   input_shape[3] + ndim)

    conv_layers = []

    fan_in = numpy.prod(filter_shape[1:])
    fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])
    W_bound = numpy.sqrt(1. / fan_in)
    W_data = numpy.asarray(numpy_rng.uniform(low=-W_bound,
                                             high=W_bound,
                                             size=(filter_shape[0],
                                                   filter_shape[1],
                                                   filter_shape[2], ndim)),
                           dtype=theano.config.floatX)

    W_map = parameter_map['Conv2dLayerW' + str(filter_width)].get_value()

    print W_map.shape
    print W_data.shape
    W_data = numpy.concatenate((W_map, W_data), axis=3)

    conv = nn_layers.Conv2dLayer(W=theano.shared(W_data,
                                                 name="W_conv1d",
                                                 borrow=True),
                                 rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)

    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB' + str(filter_width)],
        b_size=filter_shape[0],
        activation=activation)
    shape1 = parameter_map['PoolingShape1']
    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,
                                            ignore_border=True,
                                            st=st)

    input_shape2 = parameter_map['input_shape2' + str(filter_width)]
    filter_shape2 = parameter_map['FilterShape2' + str(filter_width)]

    con2 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW2' +
                                                 str(filter_width)],
                                 rng=numpy_rng,
                                 input_shape=input_shape2,
                                 filter_shape=filter_shape2)

    non_linearity2 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB2' + str(filter_width)],
        b_size=filter_shape2[0],
        activation=activation)

    shape2 = parameter_map['PoolingShape2']
    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2, ignore_border=True)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(
        layers=[conv, non_linearity, pooling, con2, non_linearity2, pooling2])

    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'],
                                         b=parameter_map['LinearLayerB'],
                                         rng=numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    n_outs = 2
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table, join_layer, flatten_layer, hidden_layer, classifier
    ])

    inputs_train = [batch_tweets, batch_topics, batch_y]
    givens_train = {tweets: batch_tweets, topics: batch_topics, y: batch_y}

    inputs_pred = [batch_tweets, batch_topics]
    givens_pred = {tweets: batch_tweets, topics: batch_topics}

    nnet_tweets.set_input((tweets, topics))
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=0,
                                               word_vec_name='None')

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        preds = numpy.hstack([
            pred_fn(batch_x_q, batch_topics)
            for (batch_x_q, batch_topics) in batch_iterator
        ])
        return preds[:batch_iterator.n_samples]

    #######################
    # Supervised Learning #
    #######################
    batch_size = 1000

    training_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.tids.npy'))
    training_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.tweets.npy'))
    training_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.sentiments.npy'))
    training_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-train-2016.topics.npy'))

    dev_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.tids.npy'))
    dev_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.tweets.npy'))
    dev_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.sentiments.npy'))
    dev_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-dev-2016.topics.npy'))

    devtest_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.tids.npy'))
    devtest_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.tweets.npy'))
    devtest_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.sentiments.npy'))
    devtest_2016_topics = numpy.load(
        os.path.join(data_dir, 'task-BD-devtest-2016.topics.npy'))

    test_2016_tids = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tids.npy'))
    test_2016_tweets = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.tweets.npy'))
    test_2016_topics = numpy.load(
        os.path.join(data_dir, 'SemEval2016-task4-test.subtask-BD.topics.npy'))

    training_full_tweets = numpy.concatenate(
        (training_2016_tweets, dev_2016_tweets), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_2016_sentiments, dev_2016_sentiments), axis=0)
    training_full_topics = numpy.concatenate(
        (training_2016_topics, dev_2016_topics), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng,
        [training_full_tweets, training_full_topics, training_full_sentiments],
        batch_size=batch_size,
        randomize=True)

    devtest2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devtest_2016_tweets, devtest_2016_topics],
        batch_size=batch_size,
        randomize=False)

    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets, test_2016_topics],
        batch_size=batch_size,
        randomize=False)

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function([],
                                        updates=[(W,
                                                  T.set_subtensor(W[-1:], 0.))
                                                 for W in W_emb_list])

    epoch = 0
    n_epochs = 100
    early_stop = 20
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet, topic,
                y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, topic, y_label)

            if i % check_freq == 0 or i == num_train_batches:
                y_pred_devtest_2016 = predict_batch(devtest2016_iterator)
                dev_acc_2016_devtest = semeval_f1_taskB(
                    devtest_2016_sentiments, y_pred_devtest_2016)

                if dev_acc_2016_devtest > best_dev_acc:
                    print(
                        'devtest 2016 epoch: {} chunk: {} best_chunk_auc: {:.4f}; best_dev_acc: {:.4f}'
                        .format(epoch, i, dev_acc_2016_devtest, best_dev_acc))

                    best_dev_acc = dev_acc_2016_devtest
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0

                    #cPickle.dump(parameter_map, open(data_dir+'/parameters_{}.p'.format('supervised_posneg'), 'wb'))
                    y_pred_test_2016 = predict_batch(test2016_iterator)
                    numpy.save(data_dir + '/predictions_test_2016',
                               y_pred_test_2016)
                    numpy.save(data_dir + '/predictions_devtest2016',
                               y_pred_devtest_2016)

        zerout_dummy_word()

        print('epoch {} took {:.4f} seconds'.format(epoch,
                                                    time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after of no update of the best score on dev set", no_best_dev_update
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    ########################
    # Get Sentence Vectors #
    ########################

    batch_size = input_shape[0]

    inputs_senvec = [batch_tweets, batch_topics]
    givents_senvec = {tweets: batch_tweets, topics: batch_topics}

    output = nnet_tweets.layers[-2].output

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    sets = [(dev_2016_tids, dev_2016_topics, dev_2016_tweets,
             'task-BD-dev-2016'),
            (training_2016_tids, training_2016_topics, training_2016_tweets,
             'task-BD-train-2016'),
            (devtest_2016_tids, devtest_2016_topics, devtest_2016_tweets,
             'task-BD-devtest-2016'),
            (test_2016_tids, test_2016_topics, test_2016_tweets,
             'SemEval2016-task4-test.subtask-BD')]
    for (fids, ftop, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False)

        counter = 0
        fname = open(
            os.path.join(data_dir, 'sentence_vecs_topic/{}.txt'.format(name)),
            'w+')
        for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet, topic)
            for vec in o:
                fname.write(fids[counter])
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break

    #################################
    # Get Prediction Probabilities  #
    #################################

    batch_size = input_shape[0]

    output = nnet_tweets.layers[-1].p_y_given_x

    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givents_senvec)

    for (fids, ftop, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset, ftop], batch_size=batch_size, randomize=False)

        counter = 0
        fname = open(
            os.path.join(data_dir,
                         'prob_predictions_topic/{}.txt'.format(name)), 'w+')
        for i, (tweet, topic) in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet, topic)
            for vec in o:
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
Exemple #55
0
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.001)

# ==========================================================================================
#                                          THE MODEL
# ==========================================================================================

print('Building model ...')

bricks = []
dropout_locs = []

#       THEANO INPUT VARIABLES
eeg = tensor.tensor3('eeg')  # batch x time x feature
acc = tensor.tensor3('acc')  # batch x time x feature
label = tensor.lvector('label')  # batch

eeg_len = 150 * 25
acc_len = 150
acc_chan = 3


def normalize(var, axis):
    var = var - var.mean(axis=axis, keepdims=True)
    var = var / tensor.sqrt((var**2).mean(axis=axis, keepdims=True))
    return var
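# A minimal sanity-check sketch of normalize (hypothetical shapes): after
# standardizing along an axis, the mean is ~0 and the mean square ~1 along it.
#   v = tensor.tensor3('v')                        # batch x time x feature
#   f = theano.function([v], normalize(v, axis=1))
#   out = f(numpy.random.randn(4, 150, 25).astype('float32'))
#   assert numpy.allclose(out.mean(axis=1), 0.0, atol=1e-4)
#   assert numpy.allclose((out ** 2).mean(axis=1), 1.0, atol=1e-3)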


eeg1 = normalize(eeg, axis=0)   # standardize each (time, feature) position across the batch
eeg2 = normalize(eeg, axis=1)   # standardize each feature over time, per example
eeg3 = normalize(eeg1, axis=1)  # both: batch-normalized, then time-normalized
Exemple #56
0
def jobman(state, channel):
    # load dataset
    rng = numpy.random.RandomState(state['seed'])

    # declare the dimensionalities of the input and output
    if state['chunks'] == 'words':
        state['n_in'] = 10000
        state['n_out'] = 10000
    else:
        state['n_in'] = 50
        state['n_out'] = 50
    train_data, valid_data, test_data = get_text_data(state)

    ## BEGIN Tutorial
    ### Define Theano Input Variables
    x = TT.lvector('x')
    y = TT.lvector('y')
    h0 = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1], ), dtype='float32'))

    ### Neural Implementation of the Operators: \oplus
    #### Word Embedding
    emb_words = MultiLayer(rng,
                           n_in=state['n_in'],
                           n_hids=eval(state['inp_nhids']),
                           activation=eval(state['inp_activ']),
                           init_fn='sample_weights_classic',
                           weight_noise=state['weight_noise'],
                           rank_n_approx=state['rank_n_approx'],
                           scale=state['inp_scale'],
                           sparsity=state['inp_sparse'],
                           learn_bias=True,
                           bias_scale=eval(state['inp_bias']),
                           name='emb_words')

    #### Deep Transition Recurrent Layer
    rec = eval(state['rec_layer'])(
        rng,
        eval(state['nhids']),
        activation=eval(state['rec_activ']),
        #activation = 'TT.nnet.sigmoid',
        bias_scale=eval(state['rec_bias']),
        scale=eval(state['rec_scale']),
        sparsity=eval(state['rec_sparse']),
        init_fn=eval(state['rec_init']),
        weight_noise=state['weight_noise'],
        name='rec')

    #### Stitching them together
    ##### (1) Get the embedding of a word
    x_emb = emb_words(x, no_noise_bias=state['no_noise_bias'])
    ##### (2) Embedding + Hidden State via DT Recurrent Layer
    reset = TT.scalar('reset')
    rec_layer = rec(x_emb,
                    n_steps=x.shape[0],
                    init_state=h0 * reset,
                    no_noise_bias=state['no_noise_bias'],
                    truncate_gradient=state['truncate_gradient'],
                    batch_size=1)

    ## BEGIN Exercise: DOT-RNN
    ### Neural Implementation of the Operators: \lhd

    #### Exercise (1)
    #### Hidden state -> Intermediate Layer
    emb_state = MultiLayer(rng,
                           n_in=eval(state['nhids'])[-1],
                           n_hids=eval(state['dout_nhid']),
                           activation=linear,
                           init_fn=eval(state['dout_init']),
                           weight_noise=state['weight_noise'],
                           scale=state['dout_scale'],
                           sparsity=state['dout_sparse'],
                           learn_bias=True,
                           bias_scale=eval(state['dout_bias']),
                           name='emb_state')

    #### Exercise (1)
    #### Input -> Intermediate Layer
    emb_words_out = MultiLayer(rng,
                               n_in=state['n_in'],
                               n_hids=eval(state['dout_nhid']),
                               activation=linear,
                               init_fn='sample_weights_classic',
                               weight_noise=state['weight_noise'],
                               scale=state['dout_scale'],
                               sparsity=state['dout_sparse'],
                               rank_n_approx=state['dout_rank_n_approx'],
                               learn_bias=False,
                               bias_scale=eval(state['dout_bias']),
                               name='emb_words_out')

    #### Hidden State: Combine emb_state and emb_words_out
    #### Exercise (1)
    outhid_activ = UnaryOp(activation=eval(state['dout_activ']))
    #### Exercise (2)
    outhid_dropout = DropOp(dropout=state['dropout'], rng=rng)

    #### Softmax Layer
    output_layer = SoftmaxLayer(rng,
                                eval(state['dout_nhid']),
                                state['n_out'],
                                scale=state['out_scale'],
                                bias_scale=state['out_bias_scale'],
                                init_fn="sample_weights_classic",
                                weight_noise=state['weight_noise'],
                                sparsity=state['out_sparse'],
                                sum_over_time=True,
                                name='out')

    ### A Few Optional Things
    #### Direct shortcut from x to y
    if state['shortcut_inpout']:
        shortcut = MultiLayer(rng,
                              n_in=state['n_in'],
                              n_hids=eval(state['inpout_nhids']),
                              activations=eval(state['inpout_activ']),
                              init_fn='sample_weights_classic',
                              weight_noise=state['weight_noise'],
                              scale=eval(state['inpout_scale']),
                              sparsity=eval(state['inpout_sparse']),
                              learn_bias=eval(state['inpout_learn_bias']),
                              bias_scale=eval(state['inpout_bias']),
                              name='shortcut')

    #### Learning rate scheduling (1/(1+n/beta))
    state['clr'] = state['lr']

    def update_lr(obj, cost):
        stp = obj.step
        if isinstance(obj.state['lr_start'],
                      int) and stp > obj.state['lr_start']:
            time = float(stp - obj.state['lr_start'])
            new_lr = obj.state['clr'] / (1 + time / obj.state['lr_beta'])
            obj.lr = new_lr

    if state['lr_adapt']:
        rec.add_schedule(update_lr)
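    # Worked example of the 1/(1 + t/beta) schedule (hypothetical numbers):
    # with state['lr'] = 0.1 and state['lr_beta'] = 500, the rate decays as
    # lr(0) = 0.1, lr(500) = 0.05, lr(1500) = 0.025 -- it halves by step beta
    # and then shrinks hyperbolically, much more slowly than exponential decay.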

    ### Neural Implementations of the Language Model
    #### Training
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x)]
    else:
        additional_inputs = [rec_layer]

    ##### Exercise (1): Compute the output intermediate layer
    outhid = outhid_activ(emb_state(rec_layer) + emb_words_out(x))
    ##### Exercise (2): Apply Dropout
    outhid = outhid_dropout(outhid)

    train_model = output_layer(outhid,
                               no_noise_bias=state['no_noise_bias'],
                               additional_inputs=additional_inputs).train(
                                   target=y,
                                   scale=numpy.float32(1. / state['seqlen']))

    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]
    if state['carry_h0']:
        train_model.updates += [(h0, nw_h0)]

    #### Validation
    h0val = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1], ), dtype='float32'))
    rec_layer = rec(emb_words(x, use_noise=False),
                    n_steps=x.shape[0],
                    batch_size=1,
                    init_state=h0val * reset,
                    use_noise=False)
    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]

    ##### Exercise (1): Compute the output intermediate layer
    outhid = outhid_activ(emb_state(rec_layer) + emb_words_out(x))
    ##### Exercise (2): Apply Dropout
    outhid = outhid_dropout(outhid, use_noise=False)

    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x, use_noise=False)]
    else:
        additional_inputs = [rec_layer]
    valid_model = output_layer(outhid,
                               additional_inputs=additional_inputs,
                               use_noise=False).validate(target=y,
                                                         sum_over_time=True)

    valid_updates = []
    if state['carry_h0']:
        valid_updates = [(h0val, nw_h0)]

    valid_fn = theano.function([x, y, reset],
                               valid_model.out,
                               name='valid_fn',
                               updates=valid_updates)

    #### Sampling
    ##### single-step sampling
    def sample_fn(word_tm1, h_tm1):
        x_emb = emb_words(word_tm1, use_noise=False, one_step=True)
        h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1]
        outhid = outhid_dropout(outhid_activ(
            emb_state(h0, use_noise=False, one_step=True) +
            emb_words_out(word_tm1, use_noise=False, one_step=True),
            one_step=True),
                                use_noise=False,
                                one_step=True)
        word = output_layer.get_sample(state_below=outhid,
                                       additional_inputs=[h0],
                                       temp=1.)
        return word, h0

    ##### scan for iterating the single-step sampling multiple times
    [samples, summaries], updates = scan(sample_fn,
                                         states=[
                                             TT.alloc(numpy.int64(0),
                                                      state['sample_steps']),
                                             TT.alloc(numpy.float32(0), 1,
                                                      eval(state['nhids'])[-1])
                                         ],
                                         n_steps=state['sample_steps'],
                                         name='sampler_scan')
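    # An interpretation of the two `states` entries above: they preallocate
    # scan's outputs, an int64 vector for `sample_steps` sampled word ids and
    # a 1 x dim float32 matrix for the recurrent state; at each step scan
    # feeds the previous (word, h0) back into sample_fn as (word_tm1, h_tm1).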

    ##### build a Theano function for sampling
    sample_fn = theano.function([], [samples],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    ##### Load a dictionary
    dictionary = numpy.load(state['dictionary'])
    if state['chunks'] == 'chars':
        dictionary = dictionary['unique_chars']
    else:
        dictionary = dictionary['unique_words']

    def hook_fn():
        sample = sample_fn()[0]
        print 'Sample:',
        if state['chunks'] == 'chars':
            print "".join(dictionary[sample])
        else:
            for si in sample:
                print dictionary[si],
            print

    ### Build and Train a Model
    #### Define a model
    model = LM_Model(cost_layer=train_model,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     clean_before_noise_fn=False,
                     noise_fn=None,
                     rng=rng)

    if state['reload']:
        model.load(state['prefix'] + 'model.npz')

    #### Define a trainer
    ##### Training algorithm (SGD)
    if state['moment'] < 0:
        algo = SGD(model, state, train_data)
    else:
        algo = SGD_m(model, state, train_data)
    ##### Main loop of the trainer
    main = MainLoop(train_data,
                    valid_data,
                    test_data,
                    model,
                    algo,
                    state,
                    channel,
                    train_cost=False,
                    hooks=hook_fn,
                    validate_postprocess=eval(state['validate_postprocess']))
    ## Run!
    main.main()
Exemple #57
0
    def init_function(self):
        self.seq_idx = T.lvector()
        self.tar_scalar = T.lscalar()
        self.solution = T.matrix()
        self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0)
        self.tar_vector = T.take(self.Va, self.tar_scalar, axis=0)

        h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(
            self.bc, dtype=theano.config.floatX)

        def encode(x_t, h_fore, c_fore, tar_vec):
            v = T.concatenate([h_fore, x_t, tar_vec])
            f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
            i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
            o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
            c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
            h_next = o_t * T.tanh(c_next)
            return h_next, c_next
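        # encode is a single LSTM step in which the target/aspect vector is
        # concatenated to [h_{t-1}, x_t], so every gate is conditioned on the
        # target (an AE-LSTM-style variant; inferred from the code, not
        # stated in the original).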

        scan_result, _ = theano.scan(fn=encode,
                                     sequences=[self.seq_matrix],
                                     outputs_info=[h, c],
                                     non_sequences=[self.tar_vector])
        embedding = scan_result[0]  # here embedding is a matrix of hidden states [h_1, ..., h_n]

        # attention
        matrix_aspect = T.zeros_like(
            embedding,
            dtype=theano.config.floatX)[:, :self.dim_aspect] + self.tar_vector
        hhhh = T.concatenate(
            [T.dot(embedding, self.Wh),
             T.dot(matrix_aspect, self.Wv)], axis=1)
        M_tmp = T.tanh(hhhh)
        alpha_tmp = T.nnet.softmax(T.dot(M_tmp, self.w))
        r = T.dot(alpha_tmp, embedding)
        h_star = T.tanh(T.dot(r, self.Wp) + T.dot(embedding[-1], self.Wx))
        embedding = h_star  # here embedding is a single vector, representing h_n_star
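        # In equation form (a reading of the attention code above): with
        # H = [h_1, ..., h_n],
        #   M     = tanh([H.Wh ; V_aspect.Wv])
        #   alpha = softmax(M.w)
        #   r     = alpha.H
        #   h*    = tanh(r.Wp + h_n.Wx)
        # so h_star mixes an attention-weighted summary of all hidden states
        # with the final hidden state h_n.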

        # dropout
        embedding_for_train = embedding * self.srng.binomial(
            embedding.shape, p=0.5, n=1, dtype=embedding.dtype)
        embedding_for_test = embedding * 0.5
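        # Standard (non-inverted) dropout: units are kept with probability
        # p = 0.5 at training time, so test-time activations are scaled by
        # the same 0.5 to match the expectation of the masked activations.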

        self.pred_for_train = T.nnet.softmax(
            T.dot(embedding_for_train, self.Ws) + self.bs)
        self.pred_for_test = T.nnet.softmax(
            T.dot(embedding_for_test, self.Ws) + self.bs)

        self.l2 = sum([T.sum(param**2)
                       for param in self.params]) - T.sum(self.Vw**2)
        self.loss_sen = -T.tensordot(
            self.solution, T.log(self.pred_for_train), axes=2)
        self.loss_l2 = 0.7 * self.l2 * self.regular
        self.loss = self.loss_sen + self.loss_l2

        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                                         dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad
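        # Note: these updates only accumulate the summed gradients into the
        # shared buffers in self.grad; the actual parameter step (e.g. an
        # Adagrad-style update consuming those buffers) is presumably applied
        # elsewhere, outside init_function.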

        self.func_train = theano.function(
            inputs=[
                self.seq_idx, self.tar_scalar, self.solution,
                theano.In(h, value=self.h0),
                theano.In(c, value=self.c0)
            ],
            outputs=[self.loss, self.loss_sen, self.loss_l2],
            updates=self.updates,
            on_unused_input='warn')

        self.func_test = theano.function(inputs=[
            self.seq_idx, self.tar_scalar,
            theano.In(h, value=self.h0),
            theano.In(c, value=self.c0)
        ],
                                         outputs=self.pred_for_test,
                                         on_unused_input='warn')
Exemple #58
0
                                                feedback_dim=alphabet_size,
                                                name="feedback"),
                  name="readout")

seq_gen = SequenceGenerator(readout=readout,
                            transition=rnn,
                            weights_init=IsotropicGaussian(0.01),
                            biases_init=Constant(0),
                            name="generator")

seq_gen.push_initialization_config()
rnn.weights_init = Orthogonal()
seq_gen.initialize()

# from markov_tutorial
x = tensor.lvector('features')
x = x.reshape((x.shape[0], 1))
cost = aggregation.mean(seq_gen.cost_matrix(x[:, :]).sum(), x.shape[1])
cost.name = "sequence_log_likelihood"
cost_cg = ComputationGraph(cost)

# theano.printing.pydotprint(cost, outfile="./pics/symbolic_graph_unopt.png", var_with_name_simple=True)

algorithm = GradientDescent(cost=cost,
                            parameters=list(
                                Selector(seq_gen).get_parameters().values()),
                            step_rule=Scale(0.001))

# AUDIOSCOPE OBSERVABLES (some)
observables = []
observables += cost_cg.outputs
Exemple #59
0
    def _build_functions(self):
        """ Create Theano functions that underly higher level functionality.

            None of the created functions should be used directly by the user.
        """
        if self.cost == log_likelihood:
            target = T.lvector()
        else:
            target = tensor(1 + len(self.layers[-1].layer_shape))
        if self.cost == log_likelihood:
            cost = -T.sum(
                T.log(self.symb_output)[T.arange(target.shape[0]), target])
        elif self.cost == mse:
            cost = T.sum((self.symb_output - target)**2)
        else:
            raise "unsupported cost function"
        # Feedforward an input.
        self._feedforward_fs = []
        for y in self.layer_ys:
            self._feedforward_fs += [function([self.symb_input], y)]
        self._feedforward = function([self.symb_input], self.symb_output)

        #Introspection!
        self.layer_infos = []
        to_compute = []

        def add_compute(p):
            to_compute.append(p)
            n = add_compute.n
            add_compute.n += 1
            return n

        add_compute.n = 0
        for layer, y, n in zip(self.layers, self.layer_ys, range(100)):
            param_name_count = 0

            def get_param_name(param):
                if param == layer.W: return "W"
                if param == layer.b: return "b"
                name = "param_" + str(param_name_count)
                return name

            info = {
                "name": str(n) + "_" + layer.__class__.__name__,
                "compute_names": ["y", "y_grad"],
                "compute_ns": [add_compute(y),
                               add_compute(T.grad(cost, y))]
            }
            if layer.activation and layer.activation == sigmoid:
                info["activation"] = "sigmoid"
            elif layer.activation and layer.activation == ReLU:
                info["activation"] = "ReLU"
            elif layer.activation and layer.activation == linear:
                info["activation"] = "linear"
            else:
                info["activation"] = None

            for p in layer.params or []:
                p_name = get_param_name(p)
                info["compute_names"].append(p_name)
                info["compute_ns"].append(add_compute(p))
                info["compute_names"].append(p_name + "_grad")
                info["compute_ns"].append(add_compute(T.grad(cost, p)))
            self.layer_infos.append(info)
        self._complete_introspect = function([self.symb_input, target],
                                             to_compute)

        #Test performance on some input vs target answer
        self._test_cost = function([self.symb_input, target], cost)
        aug_params = [x for l in self.layers for x in l.aug_params]

        # Scale parameter momentum
        scale_constant = T.scalar()
        self._scale_param_momentum = function([scale_constant], [],
                                              updates=[
                                                  (x.momentum,
                                                   scale_constant * x.momentum)
                                                  for x in aug_params
                                              ])

        # Weight decay
        decay_constant = T.scalar()
        self._scale_weights = function([decay_constant], [],
                                       updates=[(x.var, decay_constant * x.var)
                                                for x in aug_params])
        # Update momentum based on cost gradient for given learning rate, input and target.
        learning_rate = T.scalar()
        self._momentum_deriv_add = function(
            [self.symb_input, target, learning_rate], [],
            updates=[(x.momentum,
                      x.momentum - learning_rate * T.grad(cost, x.var))
                     for x in aug_params])
        learning_rates = [
            T.scalar() if l.params else None for l in self.layers
        ]
        used_learning_rates = [lr for lr in learning_rates if lr is not None]
        self._momentum_deriv_add_perlayer = function(
            [self.symb_input, target] + used_learning_rates, [],
            updates=[(x.momentum,
                      x.momentum - learning_rate_ * T.grad(cost, x.var))
                     for learning_rate_, l in zip(learning_rates, self.layers)
                     for x in l.aug_params])
        # NORMALIZED SGD: Update momentum based on cost gradient for given learning rate, input and target.
        self._momentum_deriv_add_normalized = function(
            [self.symb_input, target, learning_rate], [],
            updates=[(x.momentum, x.momentum - learning_rate *
                      T.sqrt(float(x.size)) * norm_L2(T.grad(cost, x.var)))
                     for x in aug_params])
        #print [product(x.shape) for x in aug_params]
        self._momentum_deriv_add_perlayer_normalized = function(
            [self.symb_input, target] + used_learning_rates, [],
            updates=[(x.momentum, x.momentum - learning_rate *
                      T.sqrt(float(x.size)) * norm_L2(T.grad(cost, x.var)))
                     for learning_rate, l in zip(learning_rates, self.layers)
                     for x in l.aug_params])
        # Update parameters based on parameter momentum
        self._learn = function([], [],
                               updates=[(x.var, x.var + x.momentum)
                                        for x in aug_params])
        # Nesterov method
        self._nesterov_reset_base = function([], [],
                                             updates=[(x.base, x.var)
                                                      for x in aug_params])
        self._nesterov_set_params = function([], [],
                                             updates=[(x.var,
                                                       x.base + x.momentum)
                                                      for x in aug_params])
        self._nesterov_learn = function([], [],
                                        updates=[(x.base, x.base + x.momentum)
                                                 for x in aug_params])
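        # A sketch of the intended Nesterov call sequence (an assumption from
        # the function names, not spelled out in the original):
        #   _nesterov_reset_base()           # base <- current params
        #   _nesterov_set_params()           # params <- base + momentum (look-ahead)
        #   _momentum_deriv_add(x, t, lr)    # momentum <- momentum - lr * grad at look-ahead
        #   _nesterov_learn()                # base <- base + momentum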

        # accuracy!
        guesses = T.argmax(self.symb_output, axis=1)
        if self.cost == log_likelihood:
            ans = target
            self._av_correct_confidence = function(
                [self.symb_input, target],
                T.mean(self.symb_output[T.arange(target.shape[0]), target]))
        else:
            ans = T.argmax(target, axis=1)
        self._corrects = theano.function([self.symb_input, target],
                                         T.sum(T.eq(guesses, ans)))

        self._inspect_grad = function(
            [self.symb_input, target],
            [T.grad(cost, x.var) for x in aug_params])
Exemple #60
0
    def __init__(
            self,
            input_dims,
            input_num_chars,
            eos_label,
            num_phonemes,
            dim_dec,
            dims_bidir,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            criterion,
            bottom,
            lm=None,
            character_map=None,
            bidir=True,
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dim_output_embedding=None,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=True,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            # for speech this is the approximate phoneme duration in frames
            max_decoded_length_scale=1,
            **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        self.post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(input_dims=input_dims,
                              input_num_chars=input_num_chars,
                              name='bottom',
                              **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition,
                          dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample,
                          bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition")
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level))
                for trans_level in xrange(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded,
                match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded,
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(
                num_phonemes + 1, dim_dec
                if dim_output_embedding is None else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
            if lm:
                # When an LM is used, it is the Readout that is
                # responsible for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(criterion['name'],
                                              eos_label,
                                              num_phonemes,
                                              criterion.get(
                                                  'min_reward', -1.0),
                                              name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout_config = dict(readout_dim=num_phonemes,
                              source_names=(transition.apply.states if
                                            use_states_for_readout else []) +
                              [attention.take_glimpses.outputs[0]],
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was not designed to support Maxout as an
                        # activation (because Maxout in a way is not one).
                        # However, a single-layer Maxout network works with
                        # the trick below. For a deeper Maxout network one
                        # has to use the Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      language_model=language_model,
                                      name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')