Example #1
0
def getProb(bestModel, dataset, probFilename, P):

    print "...getting probability"
    setX, setY, setName = dataset
    sharedSetX, sharedSetY, castSharedSetY = dnnUtils.sharedDataXY(setX, setY)

    idx = T.ivector('i')
    sX = T.matrix(dtype=theano.config.floatX)
    sY = T.ivector()

    # build best DNN model
    predicter = DNN( input = dnnUtils.splicedX(sX, idx, P.spliceWidth), P = P, params = bestModel )

    # Validation model
    Model = theano.function( inputs = [idx], outputs = predicter.p_y_given_x, 
                             givens={sX:sharedSetX, sY:castSharedSetY}, on_unused_input='ignore')
    
    # Center Index
    centerIdx = dnnUtils.findCenterIdxList(setY)

    # Total Center Index
    totalCenterIdxSize = len(centerIdx)
    
    # Make mini-Batch
    batchIdx = dnnUtils.makeBatch(totalCenterIdxSize, 16384)
    
    # Writing Probability
    dnnUtils.writeProb(Model, batchIdx, centerIdx, setName, probFilename)

    dnnUtils.clearSharedDataXY(sharedSetX, sharedSetY)
Example #2
0
 def __init__(self, vocab_size, dim, lr=0.5):
     W = np.asarray(np.random.rand(vocab_size, dim),
                    dtype=theano.config.floatX) / float(dim)
     W1 = np.asarray((np.random.rand(vocab_size, dim)),
                     dtype=theano.config.floatX) / float(dim)
     self.W = theano.shared(W, name='W', borrow=True)
     self.W1 = theano.shared(W1, name='W1', borrow=True)
     gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
     gW1 = np.asarray(
         np.ones((vocab_size, dim)), dtype=theano.config.floatX)
     self.gW = theano.shared(gW, name='gW', borrow=True)
     self.gW1 = theano.shared(gW1, name='gW1', borrow=True)
     X = T.vector()
     fX = T.vector()
     ind_W = T.ivector()
     ind_W1 = T.ivector()
     w = self.W[ind_W, :]
     w1 = self.W1[ind_W1, :]
     cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2))
      grad = [T.clip(g, -5.0, 5.0) for g in T.grad(cost, [w, w1])]
     updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :],
                                           grad[0] ** 2))]
     updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :],
                                            grad[1] ** 2))]
     updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :],
                                          - (lr / T.sqrt(self.gW[ind_W, :])) *
                                          grad[0]))]
     updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :],
                                           - (lr / T.sqrt(self.gW1[ind_W1, :])) *
                                           grad[1]))]
     updates = updates1 + updates2 + updates3 + updates4
     self.cost_fn = theano.function(
         inputs=[ind_W, ind_W1, X, fX], outputs=cost, updates=updates)
Example #3
0
    def test_CSMGrad(self):
        imshp = (3, 3)
        nkern = 1  # per output pixel
        kshp = (2, 2)
        # ssizes = ((1,1),(2,2))
        ssizes = ((1, 1),)
        # convmodes = ('full','valid',)
        convmodes = ("full",)

        kerns = tensor.dvector()
        indices = tensor.ivector()
        indptr = tensor.ivector()
        spmat_shape = tensor.ivector()

        for mode in ["FAST_COMPILE", "FAST_RUN"]:
            for conv_mode in convmodes:
                for ss in ssizes:
                    indvals, indptrvals, spshapevals, sptype, outshp, kmap = sp.convolution_indices.sparse_eval(
                        imshp, kshp, nkern, ss, conv_mode
                    )
                    kvals = numpy.random.random(nkern * numpy.prod(kshp) * numpy.prod(outshp)).flatten()

                    def d(kerns):
                        return theano.sparse.dense_from_sparse(
                            theano.sparse.CSM(sptype, kmap)(kerns, indvals, indptrvals, spshapevals)
                        )

                    # symbolic stuff
                    utt.verify_grad(d, [kvals])
Example #4
0
 def predict_next_batch(self, session_ids, input_item_ids, predict_for_item_ids=None, batch=100):
     '''
      Gives prediction scores for a selected set of items. Can be used in batch mode to predict for multiple independent events (i.e. events of different sessions) at once and thus speed up evaluation.
     
     If the session ID at a given coordinate of the session_ids parameter remains the same during subsequent calls of the function, the corresponding hidden state of the network will be kept intact (i.e. that's how one can predict an item to a session).
     If it changes, the hidden state of the network is reset to zeros.
             
     Parameters
     --------
     session_ids : 1D array
          Contains the session IDs of the events of the batch. Its length must be equal to the prediction batch size (batch param).
      input_item_ids : 1D array
          Contains the item IDs of the events of the batch. Every item ID must be in the training data of the network. Its length must be equal to the prediction batch size (batch param).
     predict_for_item_ids : 1D array (optional)
         IDs of items for which the network should give prediction scores. Every ID must be in the training set. The default value is None, which means that the network gives prediction on its every output (i.e. for all items in the training set).
     batch : int
         Prediction batch size.
         
     Returns
     --------
     out : pandas.DataFrame
         Prediction scores for selected items for every event of the batch. 
         Columns: events of the batch; rows: items. Rows are indexed by the item IDs.
     
     '''
     if self.error_during_train: raise Exception
     if self.predict is None or self.predict_batch!=batch:
         X = T.ivector()
         Y = T.ivector()
         for i in range(len(self.layers)):
             self.H[i].set_value(np.zeros((batch,self.layers[i]), dtype=theano.config.floatX), borrow=True)
         if predict_for_item_ids is not None:
             H_new, yhat, _ = self.model(X, self.H, Y, 0)
         else:
             H_new, yhat = self.model_test(X, self.H)
         updatesH = OrderedDict()
         for i in range(len(self.H)):
             updatesH[self.H[i]] = H_new[i]
         if predict_for_item_ids is not None:
             self.predict = function(inputs=[X, Y], outputs=yhat, updates=updatesH, allow_input_downcast=True)
         else:
             self.predict = function(inputs=[X], outputs=yhat, updates=updatesH, allow_input_downcast=True)
         self.current_session = np.ones(batch) * -1
         self.predict_batch = batch
     session_change = np.arange(batch)[session_ids != self.current_session]
     if len(session_change) > 0:
         for i in range(len(self.H)):
             tmp = self.H[i].get_value(borrow=True)
             tmp[session_change] = 0
             self.H[i].set_value(tmp, borrow=True)
         self.current_session=session_ids.copy()
     in_idxs = self.itemidmap[input_item_ids]
     if predict_for_item_ids is not None:
         iIdxs = self.itemidmap[predict_for_item_ids]
         preds = np.asarray(self.predict(in_idxs, iIdxs)).T
         return pd.DataFrame(data=preds, index=predict_for_item_ids)
     else:
         in_idxs.values[np.isnan(in_idxs.values)] = 0 
         preds = np.asarray(self.predict(in_idxs)).T
         return pd.DataFrame(data=preds, index=self.itemidmap.index)
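The docstring above spells out the batch-prediction contract; the following is a minimal, hypothetical usage sketch (the trained model object `gru` and the ID arrays are assumptions for illustration, not part of the original code):

import numpy as np

# four parallel events coming from two hypothetical sessions
session_ids = np.array([1, 1, 2, 2])
input_item_ids = np.array([10, 11, 20, 21])

# returns a pandas.DataFrame: rows are item IDs, columns are the four events
scores = gru.predict_next_batch(session_ids, input_item_ids, batch=4)

# top-5 scored items for the first event of the batch
print(scores[0].nlargest(5))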
Example #5
0
def bsgd1(nn, data, name='sgd', lr=0.022, alpha=0.3, batch_size=500, epochs = 10):
	train_set_x, train_set_y = data[0]
	valid_set_x, valid_set_y = data[1]
	test_set_x, test_set_y = data[2]

	# valid_y_numpy = y_numpy[0]
	# test_y_numpy = y_numpy[1]
	# test_y_numpy = map_48_to_39(test_y_numpy)    # disabled: test_y_numpy / valid_y_numpy
	# valid_y_numpy = map_48_to_39(valid_y_numpy)  # are never defined in this snippet
	# print test_y_numpy

	num_samples = train_set_x.get_value(borrow=True).shape[0] 
	num_batches = num_samples / batch_size 

	layers = nn.layers
	x = T.matrix('x')
	y = T.ivector('y')
	y_eval = T.ivector('y_eval')

	cost = nn.cost(x, y)
	accuracy = nn.calcAccuracy(x, y)
	params = nn.params
	delta_params = nn.delta_params

	print theano.pp(cost)
	# theano.pp(accuracy)

	p_grads = [T.grad(cost=cost, wrt = p) for p in params]  
	# implementing gradient descent with momentum 
	print p_grads
	updates = OrderedDict()
	for dp, gp in zip(delta_params, p_grads):
		updates[dp] = dp*alpha - gp*lr
	for p, dp in zip(params, delta_params):
		updates[p] = p + updates[dp]

	# updates = [(p, p - lr*gp) for p, gp in zip(params, p_grads)]
	index = T.ivector('index')
	batch_sgd_train = theano.function(inputs=[index], outputs=[cost, accuracy], updates=updates, givens={x: train_set_x[index], y:train_set_y[index]})

	batch_sgd_valid = theano.function(inputs=[], outputs=[nn.calcAccuracy(x, y), nn.calcAccuracyTimit(x,y)], givens={x: valid_set_x, y:valid_set_y})

	batch_sgd_test = theano.function(inputs=[], outputs=nn.calcAccuracy(x, y), givens={x: test_set_x, y:test_set_y})

	indices = np.arange(num_samples,  dtype=np.dtype('int32'))
	np.random.shuffle(indices)

	for n in xrange(epochs):
		np.random.shuffle(indices)
		for i in xrange(num_batches):
			batch = indices[i*batch_size: (i+1)*batch_size]
			batch_sgd_train(batch)

		# y_np = y.get_value()
		# print y.eval()

		print "epoch:", n,  "	validation accuracy:",  batch_sgd_valid()


	print batch_sgd_test()
Example #6
0
def create_TrainFunc_tranPES(simfn, embeddings,  marge=0.5, alpha=1., beta=1.):

    # parse the embedding data
    embedding = embeddings[0] # D x N matrix
    lembedding = embeddings[1]

    # declare the symbolic variables for training triples
    hp = S.csr_matrix('head positive') # N x batchsize matrix
    rp = S.csr_matrix('relation')
    tp = S.csr_matrix('tail positive')

    hn = S.csr_matrix('head negative')
    tn = S.csr_matrix('tail negative')

    lemb = T.scalar('embedding learning rate')
    lremb = T.scalar('relation learning rate')

    subtensorE = T.ivector('batch entities set')
    subtensorR = T.ivector('batch link set')

    # Generate the training positive and negative triples
    hpmat = S.dot(embedding.E, hp).T #  batchsize x D dense matrix
    rpmat = S.dot(lembedding.E, rp).T
    tpmat = S.dot(embedding.E, tp).T

    hnmat = S.dot(embedding.E, hn).T
    tnmat = S.dot(embedding.E, tn).T

    # calculate the score
    pos = tranPES3(simfn, T.concatenate([hpmat, tpmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tpmat)


    negh = tranPES3(simfn, T.concatenate([hnmat, tpmat], axis=1).reshape((hnmat.shape[0], 2, hnmat.shape[1])).dimshuffle(0, 2, 1), hnmat, rpmat, tpmat)
    negt = tranPES3(simfn, T.concatenate([hpmat, tnmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tnmat)

    costh, outh = margeCost(pos, negh, marge)
    costt, outt = margeCost(pos, negt, marge)

    embreg = regEmb(embedding, subtensorE, alpha)
    lembreg = regLink(lembedding, subtensorR, beta)
    

    cost = costh + costt + embreg[0] + lembreg
    out = T.concatenate([outh, outt])
    outc = embreg[1]

    # list of inputs to the function
    list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR]

    # updating the embeddings using gradient descent
    emb_grad = T.grad(cost, embedding.E)
    New_embedding = embedding.E - lemb*emb_grad

    remb_grad = T.grad(cost, lembedding.E)
    New_rembedding = lembedding.E - lremb * remb_grad

    updates = OrderedDict({embedding.E: New_embedding, lembedding.E: New_rembedding})

    return theano.function(list_in, [cost, T.mean(out), T.mean(outc), embreg[0], lembreg],
                          updates=updates, on_unused_input='ignore')
Example #7
0
def test_multMatVect():
    A1 = tensor.lmatrix('A1')
    s1 = tensor.ivector('s1')
    m1 = tensor.iscalar('m1')
    A2 = tensor.lmatrix('A2')
    s2 = tensor.ivector('s2')
    m2 = tensor.iscalar('m2')

    g0 = rng_mrg.DotModulo()(A1, s1, m1, A2, s2, m2)
    f0 = theano.function([A1, s1, m1, A2, s2, m2], g0)

    i32max = numpy.iinfo(numpy.int32).max

    A1 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s1 = numpy.random.randint(0, i32max, 3).astype('int32')
    m1 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")
    A2 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s2 = numpy.random.randint(0, i32max, 3).astype('int32')
    m2 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")

    f0.input_storage[0].storage[0] = A1
    f0.input_storage[1].storage[0] = s1
    f0.input_storage[2].storage[0] = m1
    f0.input_storage[3].storage[0] = A2
    f0.input_storage[4].storage[0] = s2
    f0.input_storage[5].storage[0] = m2

    r_a1 = rng_mrg.matVecModM(A1, s1, m1)
    r_a2 = rng_mrg.matVecModM(A2, s2, m2)
    f0.fn()
    r_b = f0.output_storage[0].value

    assert numpy.allclose(r_a1, r_b[:3])
    assert numpy.allclose(r_a2, r_b[3:])
Example #8
0
def directRNN():
    ####################### NumPy
    x0=0.5
    s=0.5
    times=[1,10,20,30,40,50]
    yhat=direct(x0, s, times)
    
    
    ############################### Symbolic
    x0_ = T.scalar("x0")
    c_= T.log((1-x0_)/x0_)
    times_ = T.ivector("times")
    S__=theano.shared(np.asarray(s, dtype = theano.config.floatX), 'S')
    yhat_= T.nnet.sigmoid(S__*times_/2-c_)
    Predict_ = theano.function(inputs=[x0_,times_], outputs=yhat_)
    
    
    ############################### Symbolic Recursive
    x0_ = T.scalar("x0")
    times_ = T.ivector("times")
    S__=theano.shared(np.asarray(s, dtype = theano.config.floatX), 'S')
#     predall_, updatesRecurrence_ = theano.scan(lambda x_prev, s: (s*x_prev*x_prev+s*x_prev +2*x_prev)/(2*s*x_prev+2), outputs_info=x0_,non_sequences=S__,n_steps=times_[-1])
    predall_, updatesRecurrence_ = theano.scan(lambda x_prev, s: x_prev+(s*x_prev*(1-x_prev))/(2*s*x_prev+2), outputs_info=x0_,non_sequences=S__,n_steps=times_[-1])
    pred_=predall_[times_-1] #we only have target at some generations e.g. 10,20,...
    Feedforward_ = theano.function(inputs=[x0_,times_], outputs=pred_, updates=updatesRecurrence_)

    ############################# Comparison
    x_0=0.5
    x_1=x_0+(s*x_0*(1-x_0))/(2*s*x_0+2)
    
    print '{:20s}{}'.format('NumPy', yhat)
    print '{:20s}{}'.format('Symbolic Direct', Predict_(x0,list(times)))
    print '{:20s}{}'.format('Symbolic Recursive', Feedforward_(x0,list(times)))
    print '{:20s}[ {} ]'.format('x_1', x_1)
Example #9
0
def multMatVect(v, A, m1, B, m2):
    """
    multiply the first half of v by A with a modulo of m1
    and the second half by B with a modulo of m2

    Note: The parameters of dot_modulo are passed implicitly because passing
    them explicitly takes more time than running the function's C-code.
    """
    if multMatVect.dot_modulo is None:
        A_sym = tensor.lmatrix("A")
        s_sym = tensor.ivector("s")
        m_sym = tensor.iscalar("m")
        A2_sym = tensor.lmatrix("A2")
        s2_sym = tensor.ivector("s2")
        m2_sym = tensor.iscalar("m2")
        o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym)
        multMatVect.dot_modulo = function([A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o)

    # This way of calling the Theano fct is done to bypass Theano overhead.
    f = multMatVect.dot_modulo
    f.input_storage[0].storage[0] = A
    f.input_storage[1].storage[0] = v[:3]
    f.input_storage[2].storage[0] = m1
    f.input_storage[3].storage[0] = B
    f.input_storage[4].storage[0] = v[3:]
    f.input_storage[5].storage[0] = m2
    f.fn()
    r = f.output_storage[0].storage[0]

    return r
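For reference, the modular products computed above can be sketched in plain NumPy. This is only an illustration of the arithmetic (the hypothetical helper below ignores the integer-overflow handling that the optimized DotModulo C code provides); the 3-element split mirrors the state layout used above.

import numpy

def mult_mat_vect_numpy(v, A, m1, B, m2):
    # first half of v times A modulo m1, second half times B modulo m2
    r1 = A.dot(v[:3]) % m1
    r2 = B.dot(v[3:]) % m2
    return numpy.concatenate([r1, r2])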
Example #10
0
 def __init__(self, numpy_rng, theano_rng=None, y=None, 
              alpha=0.9, sample_rate=0.1, n_ins=784,
              hidden_layers_sizes=[500, 500], n_outs=10,
              corruption_levels=[0.1, 0.1],
              allX=None,allY=None,srng=None):
     self.sigmoid_layers = []
     self.sugar_layers = []
     self.params = []
     self.n_layers = len(hidden_layers_sizes)
     self.allXs = []
     if y is None:
         self.y = tensor.ivector(name='y')
     else:
         self.y = y
     assert self.n_layers > 0
     if not theano_rng:
         theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
      self.x = tensor.matrix('x')
      self.y = tensor.ivector('y')
     for i in xrange(self.n_layers):
         if i == 0:
             input_size = n_ins
         else:
             input_size = hidden_layers_sizes[i - 1]
         if i == 0:
             layer_input = self.x
         else:
             layer_input = self.sigmoid_layers[-1].output
         if i == 0:
             self.allXs.append(allX)
         else:
             self.allXs.append(tensor.dot(self.allXs[i-1], self.sigmoid_layers[-1].W) + self.sigmoid_layers[-1].b)
         sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                     input=layer_input,
                                     n_in=input_size,
                                     n_out=hidden_layers_sizes[i],
                                     activation=tensor.nnet.sigmoid)
         self.sigmoid_layers.append(sigmoid_layer)
         self.params.extend(sigmoid_layer.params)
         sugar_layer = sugar(numpy_rng=numpy_rng,
                             alpha=alpha,
                             sample_rate=sample_rate,
                             x=layer_input,
                             y=self.y,
                             n_visible=input_size,
                             n_hidden=hidden_layers_sizes[i],
                             W=sigmoid_layer.W,
                             bhid=sigmoid_layer.b,
                             allX=self.allXs[i],
                             allY=allY,
                             srng=srng)
         self.sugar_layers.append(sugar_layer)
     self.logLayer = LogisticRegression(
                      input=self.sigmoid_layers[-1].output,
                      n_in=hidden_layers_sizes[-1], n_out=n_outs)
     self.params.extend(self.logLayer.params)
     self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
     self.errors = self.logLayer.errors(self.y)
Example #11
0
    def __init__(self, dnodex,inputdim,dim):
        X=T.ivector()
        Y=T.ivector()
        Z=T.lscalar()
        eta = T.scalar()
        temperature=T.scalar()
        self.dnodex=dnodex
        num_input = inputdim
        dnodex.umatrix=theano.shared(floatX(np.random.randn(*(self.dnodex.nuser,inputdim, inputdim))))
        dnodex.pmatrix=theano.shared(floatX(np.random.randn(*(self.dnodex.npoi,inputdim))))
        dnodex.p_l2_norm=(dnodex.pmatrix**2).sum()
        dnodex.u_l2_norm=(dnodex.umatrix**2).sum()
        num_hidden = dim
        num_output = inputdim
        inputs = InputPLayer(dnodex.pmatrix[X,:], dnodex.umatrix[Z,:,:], name="inputs")
        lstm1 = LSTMLayer(num_input, num_hidden, input_layer=inputs, name="lstm1")
        lstm2 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm1, name="lstm2")
        lstm3 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm2, name="lstm3")
        softmax = SoftmaxPLayer(num_hidden, num_output, dnodex.umatrix[Z,:,:], input_layer=lstm3, name="yhat", temperature=temperature)

        Y_hat = softmax.output()

        self.layers = inputs, lstm1,lstm2,lstm3,softmax
        params = get_params(self.layers)
        #caches = make_caches(params)

        cost = T.mean(T.nnet.categorical_crossentropy(Y_hat, T.dot(dnodex.pmatrix[Y,:],dnodex.umatrix[Z,:,:])))+eta*dnodex.p_l2_norm+eta*dnodex.u_l2_norm
        updates = PerSGD(cost,params,eta,X,Z,dnodex)#momentum(cost, params, caches, eta)

        self.train = theano.function([X,Y,Z, eta, temperature], cost, updates=updates, allow_input_downcast=True)

        predict_updates = one_step_updates(self.layers)
        self.predict_char = theano.function([X, Z, temperature], Y_hat, updates=predict_updates, allow_input_downcast=True)
Example #12
0
    def __theano_build__(self):
        params = self.params
        param_names = self.param_names
        hidden_dim = self.hidden_dim

        x1  = T.imatrix('x1')    # first sentence
        x2  = T.imatrix('x2')    # second sentence
        x1_mask = T.fmatrix('x1_mask')    #mask
        x2_mask = T.fmatrix('x2_mask')
        y   = T.ivector('y')     # label
        y_c = T.ivector('y_c')   # class weights 
        
        # Embdding words
        _E1 = params["E"].dot(params["W"][0]) + params["B"][0]
        _E2 = params["E"].dot(params["W"][1]) + params["B"][1]
        statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim])
        statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim])
        
        def rnn_cell(x, mx, ph, Wh):
            h = T.tanh(ph.dot(Wh) + x)
            h = mx[:, None] * h + (1-mx[:, None]) * ph
            return [h] 
            
        [h1], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex1, x1_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))],
            non_sequences=params["W"][2])
        
        [h2], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex2, x2_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=h1[-1])],
            non_sequences=params["W"][3])
       
        #predict
        _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) + h2[-1].dot(params["lrW"][1]) + params["lrb"])
        _p = T.argmax(_s, axis=1)
        _c = T.nnet.categorical_crossentropy(_s, y)
        _c = T.sum(_c * y_c)
        _l = T.sum(params["lrW"]**2)
        _cost = _c + 0.01 * _l
        
        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')
        
        # Gradients and updates
        _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay)
        
        # Assign functions
        self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads)
        self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c)
        self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s)
        self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p)
        self.sgd_step = theano.function(
            [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay],
            updates=_updates)
Example #13
0
def main(num_epochs=NUM_EPOCHS):
    print("Building network ...")
    # First, we build the network, starting with an input layer
    # Recurrent layers expect input of shape
    # (batch size, SEQ_LENGTH, num_inputs)
    
    #The network model
    
    l_in            = lasagne.layers.InputLayer(shape=(BATCH_SIZE, SEQ_LENGTH, num_inputs))
    l_forward_1     = lasagne.layers.LSTMLayer(l_in, N_HIDDEN, grad_clipping=GRAD_CLIP,nonlinearity=lasagne.nonlinearities.tanh)
    l_forward_2     = lasagne.layers.LSTMLayer(l_forward_1, N_HIDDEN, grad_clipping=GRAD_CLIP,nonlinearity=lasagne.nonlinearities.tanh)
    l_shp           = lasagne.layers.ReshapeLayer(l_forward_2, (-1, N_HIDDEN))
    l_dense         = lasagne.layers.DenseLayer(l_shp, num_units=num_inputs, nonlinearity=lasagne.nonlinearities.linear)
    l_out           = lasagne.layers.ReshapeLayer(l_dense, (-1, SEQ_LENGTH, num_inputs))
    
    # symbolic variables for the targets, declared before they are used in the cost
    input_values    = T.ivector('input_values')
    target_values   = T.ivector('target_values')

    # create output out of input in order to save memory?
    network_output  = lasagne.layers.get_output(l_out)
    cost            = lasagne.objectives.squared_error(network_output,target_values).mean()
    all_params      = lasagne.layers.get_all_params(l_out,trainable=True)
    updates         = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train           = theano.function([l_in.input_var, target_values], cost, updates=updates, allow_input_downcast=True)
    compute_cost    = theano.function([l_in.input_var, target_values], cost, allow_input_downcast=True)
Example #14
0
 def __init__(self, input_params=None, sentenceLayerNodesNum=[150, 120], sentenceLayerNodesSize=[(2, 200), (3, 1)], negativeLambda=1, poolingSize=[(2, 1)], mode="max"):
     """
     mode is in {'max', 'average_inc_pad', 'average_exc_pad', 'sum'}
     """
     rng = numpy.random.RandomState(23455)
     self._corpusWithEmbeddings = T.matrix("wordIndeices")
     self._dialogSentenceCount = T.ivector("dialogSentenceCount")
     self._sentenceWordCount = T.ivector("sentenceWordCount")
     
     # for list-type data
     self._layer0 = layer0 = SentenceEmbeddingMultiNN(self._corpusWithEmbeddings, self._dialogSentenceCount, self._sentenceWordCount, rng, wordEmbeddingDim=200, \
                                                      sentenceLayerNodesNum=sentenceLayerNodesNum, \
                                                      sentenceLayerNodesSize=sentenceLayerNodesSize,
                                                      poolingSize=poolingSize,
                                                      mode=mode)
     
     layer1 = HiddenLayer(
         rng,
         input=layer0.output,
         n_in=layer0.outputDimension,
         n_out=layer0.outputDimension,
         activation=T.tanh
     )
     self._nextSentence = layer1.output
     self._params = layer1.params + layer0.params
     self._setParameters(input_params)
     self.negativeLambda = negativeLambda
     
     zero_count = 1
     for sentence, pooling in zip(sentenceLayerNodesSize[-1::-1], [(1, 1)] + poolingSize[-1::-1]): 
         zero_count *= pooling[0]
         zero_count += sentence[0] - 1 
     self.zero_count = zero_count - 1
Example #15
0
 def __theano_build__(self):
   U, V, W = self.U, self.V, self.W
   x = T.ivector('x')
   y = T.ivector('y')
   def forward_prop_step(x_t, s_t_prev, U, V, W):
     s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev))
     o_t = T.nnet.softmax(V.dot(s_t))
     return [o_t[0], s_t]
   [o,s], updates = theano.scan(
     forward_prop_step,
     sequences=x,
     outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
     non_sequences=[U, V, W],
     truncate_gradient=self.bptt_truncate,
     strict=True)
   prediction = T.argmax(o, axis=1)
   o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
   # Gradients
   dU = T.grad(o_error, U)
   dV = T.grad(o_error, V)
   dW = T.grad(o_error, W)      
   # Assign functions
   self.forward_propagation = theano.function([x], o)
   self.predict = theano.function([x], prediction)
   self.ce_error = theano.function([x, y], o_error)
   self.bptt = theano.function([x, y], [dU, dV, dW])
   # SGD
   learning_rate = T.scalar('learning_rate')
   self.sgd_step = theano.function([x,y,learning_rate], [], 
                   updates=[(self.U, self.U - learning_rate * dU),
                            (self.V, self.V - learning_rate * dV),
                            (self.W, self.W - learning_rate * dW)])
Example #16
0
 def build_finetune_functions(self, learning_rate):
     
     is_train = T.iscalar('is_train')
     X = T.matrix('X')
     AtRisk = T.ivector('AtRisk')
     Observed = T.ivector('Observed')
     #call the optimization function
     opt = Opt()
     forward = theano.function(
         on_unused_input='ignore',
         inputs=[X, Observed, AtRisk, is_train],
         outputs=[self.riskLayer.cost(self.o, self.AtRisk), self.riskLayer.output, self.riskLayer.input],
         givens={
             self.x: X,
             self.o: Observed,
             self.AtRisk: AtRisk,
             self.is_train:is_train
         },
         name='forward'
     )
     backward = theano.function(
         on_unused_input='ignore',
         inputs=[X, Observed, AtRisk, is_train],
         updates=opt.SGD(self.riskLayer.cost(self.o, self.AtRisk), self.params, learning_rate),
         outputs=T.grad(self.riskLayer.cost(self.o, self.AtRisk), self.params),
         givens={
             self.x: X,
             self.o: Observed,
             self.AtRisk: AtRisk,
             self.is_train:is_train
         },
          name='backward'
     )
     return forward, backward
Example #17
0
def test_unwrapper():
    emb_size = 5
    y_time = tt.ivector()
    y_seq_id = tt.ivector()
    x = tt.tensor3()

    emb = IdentityInput(x, size=5)

    sequn = SeqUnwrapper(20)
    sequn.connect(emb, y_time, y_seq_id)

    rng = np.random.RandomState(23455)
    conv = LeNetConvPoolLayer()
    conv.connect(sequn, rng, (3, 1, 5, emb_size), (1, 1, ))
    #prev_layer = conv

    f = theano.function([x, y_time, y_seq_id], conv.output())

    xx = np.random.randn(20, 4, emb_size)
    y_time = [3, 7, 10, 12]
    y_seq_id = [0, 0, 0, 0]
    res = f(xx, y_time, y_seq_id)
    print res.shape
    print res
    import ipdb; ipdb.set_trace()
Example #18
0
	def get_training_functions(self, x_lab_np=None, y_lab_np=None, x_unlab_np=None):
		# assert xlab.shape[0] == len(y_lab) 
		assert self.x_lab_np.shape[0] == len(self.y_lab_np)
		self.x_lab = self._shared_dataset(self.x_lab_np)
		self.y_lab = self._shared_dataset(self.y_lab_np)
		self.x_unlab = self._shared_dataset(self.x_unlab_np)
		self.alpha = float(self.x_lab_np.shape[0]) / self.x_unlab_np.shape[0]
		index_unlab = T.ivector('index_unlab')
		index_lab = T.ivector('index_lab')
		momentum = T.scalar('momentum')
		learning_rate = T.scalar('learning_rate')
		# cost, updates = self.get_cost_updates(self.x_lab, self.x_unlab, self.y_lab)

		self.batch_size_lab = self.batch_size * self.alpha
		self.batch_size_unlab = self.batch_size * (1-self.alpha)
		x_lab = T.matrix('x_lab')
		x_unlab = T.matrix('x_unlab')
		y_lab = T.ivector('y_lab')

		self.num_labels = self.x_lab_np.shape[0]
		self.num_unlabels = self.x_unlab_np.shape[0]
		self.num_samples = self.num_labels + self.num_unlabels

		num_batches = self.num_samples / float(self.batch_size)
		pretraining_fns = []
		for i in xrange(len(self.layers)):
			ssda = self.layers[i]
			cost, updates = ssda.get_cost_updates(self.x_lab, self.x_unlab, self.y_lab)
			train_fn = theano.function(inputs=[index_lab, index_unlab], updates=updates, outputs=[cost], givens={self.x_lab:self.x_lab[index_lab], self.x_unlab:self.x_unlab[index_unlab], self.y_lab:self.y_lab[index_lab]})
			pretraining_fns.append(train_fn)

		return  pretraining_fns
Example #19
0
def EnergyVecFn(fnsim, embeddings, leftop, rightop):
    embedding, relationl, relationr = parse_embeddings(embeddings)
    idxl, idxo, idxr = T.ivector('idxl'), T.ivector('idxo'), T.ivector('idxr')
    lhs, rhs = embedding.E[:, idxl].T, embedding.E[:, idxr].T
    rell, relr = relationl.E[:, idxo].T, relationr.E[:, idxo].T
    energy = - fnsim(leftop(lhs, rell), rightop(rhs, relr))
    return theano.function([idxl, idxr, idxo], [energy], on_unused_input='ignore')
Example #20
0
File: GCWE.py Project: zerkh/BWE
	def train(self, word_emb):
		X_local = T.ivector(name="X_local")
		X = T.iscalar(name="X")
		X_neg = T.ivector(name="X_neg")
		X_g = T.dvector(name="X_g")
		
		o_error, updates = theano.scan(self.target_function, sequences=X_neg,\
										non_sequences=[word_emb, X_local, X, X_g])
		
		error_sum = T.sum(o_error)
		self.c_error = theano.function([X_local, X, X_neg, X_g], error_sum)
		
		d_word_emb = T.grad(error_sum, word_emb)
		d_W1 = T.grad(error_sum, self.W1)
		d_b1 = T.grad(error_sum, self.b1)
		d_W2 = T.grad(error_sum, self.W2)
		d_b2 = T.grad(error_sum, self.b2)
		d_Wg1 = T.grad(error_sum, self.Wg1)
		d_bg1 = T.grad(error_sum, self.bg1)
		d_Wg2 = T.grad(error_sum, self.Wg2)
		d_bg2 = T.grad(error_sum, self.bg2)
		
		self.train_step = theano.function([X_local, X, X_neg, X_g], [], \
										updates=[(word_emb, word_emb-d_word_emb),
												(self.W1, self.W1-d_W1),
												(self.b1, self.b1-d_b1),
												(self.W2, self.W2-d_W2),
												(self.b2, self.b2-d_b2),
												(self.Wg1, self.Wg1-d_Wg1),
												(self.bg1, self.bg1-d_bg1),
												(self.Wg2, self.Wg2-d_Wg2),
												(self.bg2, self.bg2-d_bg2)])
Example #21
0
def multMatVect(v, A, m1, B, m2):
    # TODO : need description for parameter and return
    """
    Multiply the first half of v by A with a modulo of m1 and the second half
    by B with a modulo of m2.

    Notes
    -----
    The parameters of dot_modulo are passed implicitly because passing them
    explicitly takes more time than running the function's C-code.

    """
    if multMatVect.dot_modulo is None:
        A_sym = tensor.lmatrix('A')
        s_sym = tensor.ivector('s')
        m_sym = tensor.iscalar('m')
        A2_sym = tensor.lmatrix('A2')
        s2_sym = tensor.ivector('s2')
        m2_sym = tensor.iscalar('m2')
        o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym)
        multMatVect.dot_modulo = function(
            [A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o, profile=False)

    # This way of calling the Theano fct is done to bypass Theano overhead.
    f = multMatVect.dot_modulo
    f.input_storage[0].storage[0] = A
    f.input_storage[1].storage[0] = v[:3]
    f.input_storage[2].storage[0] = m1
    f.input_storage[3].storage[0] = B
    f.input_storage[4].storage[0] = v[3:]
    f.input_storage[5].storage[0] = m2
    f.fn()
    r = f.output_storage[0].storage[0]

    return r
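A hypothetical call sketch (the matrices, state vector, and moduli below are illustrative values, and it assumes the module-level initialization multMatVect.dot_modulo = None has already run, as in Theano's rng_mrg): the 6-element state vector is split into two 3-vectors, each transformed by its own matrix modulo its own modulus.

import numpy

A = numpy.random.randint(0, 2 ** 20, (3, 3)).astype('int64')
B = numpy.random.randint(0, 2 ** 20, (3, 3)).astype('int64')
v = numpy.arange(1, 7).astype('int32')   # first half pairs with (A, m1), second with (B, m2)
m1 = numpy.int32(2 ** 31 - 1)            # example moduli
m2 = numpy.int32(2 ** 31 - 21069)
r = multMatVect(v, A, m1, B, m2)         # 6-element result: [A.v[:3] % m1, B.v[3:] % m2]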
Example #22
0
    def compile(self):
        '''
        Forward pass and Gradients
        '''
        # Get nicer names for parameters
        W1, W2, W3 = [self.W1] + self.params

        # FORWARD PASS
        # Embedding layer subspace
        self.z0    = T.ivector()                    # tweet in one hot

        # Use an intermediate sigmoid
        z1         = W1[:, self.z0]                 # embedding
        z2         = T.nnet.sigmoid(T.dot(W2, z1))  # subspace
        # Hidden layer
        z3         = T.dot(W3, z2)
        z4         = T.sum(z3, 1)                   # Bag of words
        self.hat_y = T.nnet.softmax(z4.T).T
        self.fwd   = theano.function([self.z0], self.hat_y)
        
        # TRAINING COST AND GRADIENTS
        # Train cost minus log probability
        self.y = T.ivector()                          # reference out
        self.F = -T.mean(T.log(self.hat_y)[self.y])   # For softmax out 
        # Update only last three parameters
        self.nablas = [] # Symbolic gradients
        self.grads  = [] # gradients
        for W in self.params:
            self.nablas.append(T.grad(self.F, W))
            self.grads.append(theano.function([self.z0, self.y], T.grad(self.F, W)))
        self.cost = theano.function([self.z0, self.y], self.F)
Example #23
0
    def initialize(self):
        users = T.ivector()
        items = T.ivector()
        ratings = T.vector()

        self.U = theano.shared(
            np.array(
                np.random.normal(scale=0.001, size=(self.n_users, self.n_factors)),
                dtype=theano.config.floatX
            )
        )
        self.I = theano.shared(
            np.array(
                np.random.normal(scale=0.001, size=(self.n_items, self.n_factors)),
                dtype=theano.config.floatX
            )
        )

        predictions = (self.U[users] * self.I[items]).sum(axis=1)

        train_error = (
            ((predictions - ratings) ** 2).mean() +
            self.regularization * (
                T.sum(self.U ** 2) +
                T.sum(self.I ** 2)
            )
        )
        test_error = ((predictions - ratings) ** 2).mean()

        params = [self.U, self.I]
        learning_rate = theano.shared(np.array(self.learning_rate, dtype=theano.config.floatX))
        updates = self.optimizer(train_error, params, learning_rate=learning_rate)
        self.train_theano = theano.function([users, items, ratings], train_error, updates=updates)
        self.test_theano = theano.function([users, items, ratings], test_error)
        self.predict_theano = theano.function([users, items], predictions)
Example #24
0
  def __init__(self, vocabulary_size, hidden_size, output_size):
    X = tensor.ivector()
    Y = tensor.ivector()
    keep_prob = tensor.fscalar()
    learning_rate = tensor.fscalar()

    emb_layer = Embedding(vocabulary_size, hidden_size)
    lstm_layer = BiLSTM(hidden_size, hidden_size)
    dropout_layer = Dropout(keep_prob)
    fc_layer = FullConnect(2*hidden_size, output_size)
    crf = CRF(output_size)
    # graph defination
    X_emb = emb_layer(X)
    scores = fc_layer(tensor.tanh(lstm_layer(dropout_layer(X_emb))))
    
    loss, predict = crf(scores, Y, isTraining=True)
    # loss, predict and accuracy
    accuracy = tensor.sum(tensor.eq(predict, Y)) * 1.0 / Y.shape[0]

    params = emb_layer.params + lstm_layer.params + fc_layer.params + crf.params
    updates = MomentumSGD(loss, params, lr=learning_rate)

    print("Compiling train function: ")
    train = theano.function(inputs=[X, Y, keep_prob, learning_rate], outputs=[predict, accuracy, loss], 
      updates=updates, allow_input_downcast=True)

    print("Compiling evaluate function: ")
    evaluate = theano.function(inputs=[X_emb, Y, keep_prob], outputs=[predict, accuracy, loss], 
      allow_input_downcast=True)

    self.embedding_tensor = emb_layer.params[0]
    self.train = train
    self.evaluate = evaluate
    self.params = params
Example #25
0
def set_model(args, init_w_emb, w_emb_dim, vocab_word, vocab_char, vocab_tag):
    print '\nBuilding a neural model: %s\n' % args.model

    """ neural architecture parameters """
    c_emb_dim = args.c_emb_dim
    w_hidden_dim = args.w_hidden_dim
    c_hidden_dim = args.c_hidden_dim
    output_dim = vocab_tag.size()
    window = args.window
    opt = args.opt

    """ symbol definition """
    x = T.ivector()
    c = T.ivector()
    b = T.ivector()
    y = T.ivector()
    lr = T.fscalar('lr')

    if args.model == 'char':
        return nn_char.Model(name=args.model, w=x, c=c, b=b, y=y, lr=lr,
                             init_w_emb=init_w_emb, vocab_w_size=vocab_word.size(), vocab_c_size=vocab_char.size(),
                             w_emb_dim=w_emb_dim, c_emb_dim=c_emb_dim, w_hidden_dim=w_hidden_dim,
                             c_hidden_dim=c_hidden_dim, output_dim=output_dim,
                             window=window, opt=opt)
    else:
        return nn_word.Model(name=args.model, x=x, y=y, lr=lr,
                             init_emb=init_w_emb, vocab_size=vocab_word.size(),
                             emb_dim=w_emb_dim, hidden_dim=w_hidden_dim, output_dim=output_dim,
                             window=window, opt=opt)
Example #26
0
def create_iter_funcs_valid(l_out, bs=None, N=50, mc_dropout=False):
    X = T.tensor4('X')
    y = T.ivector('y')
    X_batch = T.tensor4('X_batch')
    y_batch = T.ivector('y_batch')

    if not mc_dropout:
        y_hat = layers.get_output(l_out, X, deterministic=True)
    else:
        if bs is None:
            raise ValueError('a fixed batch size is required for mc dropout')
        X_repeat = T.extra_ops.repeat(X, N, axis=0)
        y_sample = layers.get_output(
            l_out, X_repeat, deterministic=False)

        sizes = [X_repeat.shape[0] / X.shape[0]] * bs
        y_sample_split = T.as_tensor_variable(
            T.split(y_sample, sizes, bs, axis=0))
        y_hat = T.mean(y_sample_split, axis=1)

    valid_loss = T.mean(
        T.nnet.categorical_crossentropy(y_hat, y))
    valid_acc = T.mean(
        T.eq(y_hat.argmax(axis=1), y))

    valid_iter = theano.function(
        inputs=[theano.Param(X_batch), theano.Param(y_batch)],
        outputs=[valid_loss, valid_acc],
        givens={
            X: X_batch,
            y: y_batch,
        },
    )

    return valid_iter
Example #27
0
def create_iter_funcs_train(l_out, lr, mntm, wd):
    X = T.tensor4('X')
    y = T.ivector('y')
    X_batch = T.tensor4('X_batch')
    y_batch = T.ivector('y_batch')

    y_hat = layers.get_output(l_out, X, deterministic=False)

    # softmax loss
    train_loss = T.mean(
        T.nnet.categorical_crossentropy(y_hat, y))

    # L2 regularization
    train_loss += wd * regularize_network_params(l_out, l2)

    train_acc = T.mean(
        T.eq(y_hat.argmax(axis=1), y))

    all_params = layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        train_loss, all_params, lr, mntm)

    train_iter = theano.function(
        inputs=[theano.Param(X_batch), theano.Param(y_batch)],
        outputs=[train_loss, train_acc],
        updates=updates,
        givens={
            X: X_batch,
            y: y_batch,
        },
    )

    return train_iter
Example #28
0
	def evaluate_ready(self, ispro = True):
		var_x = T.ivector()
		var_y = T.ivector()

		print "adopt   mention  level  evaluate ????????????????????       "+str(self.ismention)
		if self.model_type == "softmax" or self.model_type == "softmax_reg":

			if self.istransition:
				output = self.structure1(var_x, ispro = False)
				self.evafunc = theano.function([var_x], output)

			else:
				output = self.structure1(var_x, ispro)
				self.evafunc = theano.function([var_x], output)

		
		elif self.model_type == "maxneg":
			out1, out2 = self.structure2(var_x,ispro)
			self.evafunc = theano.function([var_x], [out1,out2])

		elif self.model_type == "maxout":

			out1, out2 = self.structure2(var_x,False)
			self.evafunc = theano.function([var_x], [out1,out2])
		else: raise Exception
Example #29
0
 def __init__(self, input_params=None):
     rng = numpy.random.RandomState(23455)
     self._corpusWithEmbeddings = T.matrix("wordIndeices")
     self._dialogSentenceCount = T.ivector("dialogSentenceCount")
     self._sentenceWordCount = T.ivector("sentenceWordCount")
     
     # for list-type data
     self._layer0 = SentenceEmbeddingNN(self._corpusWithEmbeddings, self._dialogSentenceCount, self._sentenceWordCount, rng, wordEmbeddingDim=200, \
                                                      sentenceLayerNodesNum=1000, \
                                                      sentenceLayerNodesSize=[5, 200])
     
     self._average_layer  = sentenceEmbeddingAverage(self._corpusWithEmbeddings, self._dialogSentenceCount, self._sentenceWordCount, rng, wordEmbeddingDim=200)
     
     # Get sentence layer W
     semanicTransformW = theano.shared(
         numpy.asarray(
             rng.uniform(low=-0.2, high=0.2, size=(self._layer0.outputDimension, 200)),
             dtype=config.globalFloatType()
         ),
         borrow=True
     )
     self._nextSentence = T.dot(self._layer0.output, semanicTransformW)
         
     # construct the parameter array.
     self._params = [semanicTransformW] + self._layer0.params
     self._setParameters(input_params)
Example #30
0
    def build(self):
        x=T.ivector('x')
        y=T.ivector('y')
        lr=T.scalar('learning_rate')

        def _recurrence(x_t,s_tm1):
            s_t=T.tanh(self.U[:,x_t]+T.dot(s_tm1,self.W))
            o_t=T.nnet.softmax(T.dot(s_t,self.V))
            return [o_t[0],s_t]

        [o,s],updates=theano.scan(fn=_recurrence,
                                  sequences=x,
                                  outputs_info=[None,dict(initial=T.zeros(self.hidden_dim))],
                                  truncate_gradient=self.bptt_truncate,
                                  strict=True)
        prediction=T.argmax(o,axis=1)
        o_error=T.sum(T.nnet.categorical_crossentropy(o,y))

        # Gradients
        gparams=T.grad(o_error,self.params)
        updates=[(param,param-lr*gparam) for param,gparam in zip(self.params,gparams)]


        # Assign functions
        self.forward_propagation=theano.function([x],o)
        self.predict=theano.function([x],prediction)
        self.train=theano.function(inputs=[x,y,lr],
                                   outputs=o_error,
                                   updates=updates)
Example #31
0
def main(model='mlp', num_epochs=500):
    # Load the dataset
    print("Loading data...")
    X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    if model == 'mlp':
        network = build_mlp(input_var)
    elif model.startswith('custom_mlp:'):
        depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',')
        network = build_custom_mlp(input_var, int(depth), int(width),
                                   float(drop_in), float(drop_hid))
    elif model == 'cnn':
        network = build_cnn(input_var)
    else:
        print("Unrecognized model type %r." % model)
        return

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss,
                                                params,
                                                learning_rate=0.01,
                                                momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc /
                                                          val_batches * 100))

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
Example #32
0
    def __init__(self, numpy_rng, theano_rng=None, n_ins=None,
                 hidden_layers_sizes=[50], iBNhl = -1, n_outs=None):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers sizes, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well) During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        #activation=T.nnet.sigmoid)
                                        activation=T.tanh)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        self.y_pred = self.logLayer.y_pred
        self.p_y = self.logLayer.p_y_given_x
        #print(len(self.sigmoid_layers))
        #for l in self.sigmoid_layers:
        #    print('0 ',l.output.shape)
        self.BN_f = self.sigmoid_layers[iBNhl].output
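The docstring and comments above describe the stacked sigmoid/RBM layout; a minimal, hypothetical instantiation could look like this (the class name DBN and the MNIST-style dimensions are assumptions for illustration):

import numpy

numpy_rng = numpy.random.RandomState(123)
# hypothetical setup: 784 inputs, two tanh hidden layers, 10 output classes;
# iBNhl selects which hidden layer's output is exposed as BN_f
dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28,
          hidden_layers_sizes=[500, 500], iBNhl=-1, n_outs=10)
# dbn.finetune_cost and dbn.errors can then be compiled into training/validation functions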
Example #33
0
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='../data/mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                    (epoch, minibatch_index + 1, n_train_batches,
                    this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of best'
                           ' model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
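
The patience-based early stopping interleaved with the training loop above follows one rule: keep training until `patience` iterations have passed, and widen `patience` whenever the validation loss improves by a significant margin. Below is a minimal plain-Python sketch of that rule in isolation; the class name PatienceTracker and the toy losses are illustrative and not part of the tutorial code.

class PatienceTracker(object):
    # Isolated restatement of the patience rule used in the loop above.
    def __init__(self, patience=5000, patience_increase=2,
                 improvement_threshold=0.995):
        self.patience = patience
        self.patience_increase = patience_increase
        self.improvement_threshold = improvement_threshold
        self.best_loss = float('inf')

    def report(self, iteration, validation_loss):
        """Record a validation result; return True while training should continue."""
        if validation_loss < self.best_loss:
            # a "significant" improvement widens the patience window
            if validation_loss < self.best_loss * self.improvement_threshold:
                self.patience = max(self.patience,
                                    iteration * self.patience_increase)
            self.best_loss = validation_loss
        return iteration < self.patience

# toy usage: stop once `patience` iterations pass without a significant improvement
tracker = PatienceTracker()
for it, loss in enumerate([0.9, 0.5, 0.49, 0.488]):
    keep_training = tracker.report(it, loss)
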
Example #34
0
    def train_cnn_model(self, X_train, y_train):
        print 'Training CNN model....'
        print X_train.shape
        num_classes = len(set(y_train))

        #########  Build CNN  #########################################
        #self.data_width = int(X_train.shape[2]/10)
        #X_train = X_train.reshape(X_train.shape[0], 3, -1)
        print X_train.shape

        l_in = lasagne.layers.InputLayer(shape=(None, X_train.shape[1], X_train.shape[2]))

        conv_network = l_in

        # Build convolution layers
        for l in range(self.params['conv_layers']):
            conv_network = lasagne.layers.Conv1DLayer(conv_network, num_filters=self.params['conv_filter_num'][l], filter_size=self.params['conv_filter_dims'], nonlinearity=lasagne.nonlinearities.rectify)

            print 'l_conv%d output: ' % l + str(lasagne.layers.get_output_shape(conv_network))

            conv_network = lasagne.layers.MaxPool1DLayer(conv_network, pool_size=self.params['pool_size'])

            print 'l_pool%d output: ' % l + str(lasagne.layers.get_output_shape(conv_network))

        conv_output = lasagne.layers.get_output(conv_network)

        network = conv_network
        # Build fully connected hidden layers
        for i in range(self.params['hid_layers']):
            units = self.params['hid_units'][i]
            network = lasagne.layers.DenseLayer(network, num_units=units, nonlinearity=lasagne.nonlinearities.tanh)
            network = lasagne.layers.DropoutLayer(network, p=0.5)

        # Build output layer
        network = lasagne.layers.DenseLayer(network, num_units=num_classes, nonlinearity=lasagne.nonlinearities.softmax)

        input_var = T.tensor4('inputs')
        target_var = T.ivector('targets')

        predictions = lasagne.layers.get_output(network)
        conv_weights = lasagne.layers.get_output(conv_network)

        self.classifier = theano.function([l_in.input_var], predictions)

        self.cnn_weights = theano.function([l_in.input_var], conv_output)

        loss = lasagne.objectives.categorical_crossentropy(predictions, target_var)
        loss = loss.mean()

        params = lasagne.layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)

        test_prediction = lasagne.layers.get_output(network, deterministic=True)
        test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                                target_var)
        test_loss = test_loss.mean()
        # As a bonus, also create an expression for the classification accuracy:
        test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([l_in.input_var, target_var], loss, updates=updates)

        # Compile a second function computing the validation loss and accuracy:
        self.val_fn = theano.function([l_in.input_var, target_var], [test_loss, test_acc])

        num_epochs = self.params['epochs']
        for epoch in range(num_epochs):
            start_time = time.time()
            train_err = 0
            train_batches = 0
            for batch in self.iterate_batches(X_train, y_train):
                inputs, targets = batch
                train_err += train_fn(inputs, targets.astype(np.int32))
                train_batches += 1

            #val_err = 0
            #val_acc = 0
            #val_batches = 0
            #for batch in self.iterate_batches(X_val, y_val):
                #inputs, targets = batch
            #err, acc = val_fn(X_val, y_val)
            #val_err += err
            #val_acc += acc
            #val_batches += 1

            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
            #print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
            #print("  validation accuracy:\t\t{:.2f} %".format(
                #val_acc / val_batches * 100))

        cnn_X_train = self.cnn_weights(X_train)
        cnn_X_train = cnn_X_train.reshape([cnn_X_train.shape[0], -1])
        self.svm = SVC()
        self.svm = self.svm.fit(cnn_X_train, y_train)
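
Example #34 relies on a self.iterate_batches helper that is not shown in the snippet. A minimal NumPy generator with the behaviour the training loop expects might look like the sketch below; the name, signature, and batch size of 128 are assumptions, not taken from the original class.

import numpy as np

def iterate_batches(X, y, batch_size=128, shuffle=True):
    # Hypothetical stand-in for self.iterate_batches in Example #34:
    # yields (inputs, targets) minibatches; the last partial batch is dropped.
    indices = np.arange(len(X))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(X) - batch_size + 1, batch_size):
        batch_idx = indices[start:start + batch_size]
        yield X[batch_idx], y[batch_idx]
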
Example #35
0
def mlp_run(train_set,
            valid_set,
            test_set,
            learning_rate=0.01,
            L1_reg=0.00,
            L2_reg=0.0001,
            n_epochs=1000,
            batch_size=20,
            n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model
    """
    print 'loading ', train_set, ' for train'
    train_set_x, train_set_y = load_data(train_set)
    print 'loading ', valid_set, ' for valid'
    valid_set_x, valid_set_y = load_data(valid_set)
    print 'loading ', test_set, ' for test'
    if test_set != valid_set:
        test_set_x, test_set_y = load_data(test_set)
    else:
        test_set_x, test_set_y = valid_set_x, valid_set_y

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print "train_set_x size:", train_set_x.get_value(borrow=True).shape[0]
    print "batch_size:", batch_size
    print "n_train_batches:", n_train_batches
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    total_dim = train_set_x.get_value(borrow=True).shape[1]

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng,
                     input=x,
                     n_in=total_dim,
                     n_hidden=n_hidden,
                     n_out=2)

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)
    # end-snippet-4

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_auc = theano.function(inputs=[],
                                   outputs=classifier.auc(y),
                                   givens={
                                       x: valid_set_x,
                                       y: valid_set_y
                                   })

    test_auc = theano.function(inputs=[],
                               outputs=classifier.auc(y),
                               givens={
                                   x: test_set_x,
                                   y: test_set_y
                               })

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = FLAGS.iter  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    print "n_train_batches:", n_train_batches
    print "validation_frequency:", validation_frequency

    best_validation_loss = numpy.inf
    best_auc = 0
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        if epoch == 10:
            learning_rate *= 0.8
        if epoch == 20:
            learning_rate *= 0.5
        if epoch == 30:
            learning_rate = 0.01

        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                auc_values = [validate_auc()]
                auc = numpy.mean(auc_values)
                print "current valid auc: ", auc, " best auc: ", best_auc, " imporve: ", auc - best_auc, " significant?: ", auc - best_auc > FLAGS.min_improvement
                #print validate_auc(0)

                if auc > best_auc:
                    if auc - best_auc > FLAGS.min_improvement:
                        print 'before patience:', patience, ' iter:', iter
                        patience = max(patience, iter * patience_increase)
                        print 'after patience:', patience
                    best_auc = auc
                    auc_values = [test_auc()]
                    testauc = numpy.mean(auc_values)
                    print "test auc: ", testauc
                    #cPickle.dump(classifier, open('best_model.pkl', 'wb'))

            if patience <= iter:
                done_looping = True
                print "patience:", patience, "iter:", iter, "done_looping:", done_looping
                break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print 'best valid auc is ', best_auc
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
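
The hard-coded learning-rate drops at epochs 10, 20 and 30 in mlp_run above amount to a staircase decay. The small helper below restates that schedule as a pure function of the epoch; it is just one way to express the same drops, not code from the original.

def decayed_learning_rate(base_lr, epoch):
    # Mirrors the in-loop drops hard-coded in mlp_run above.
    lr = base_lr
    if epoch >= 10:
        lr *= 0.8
    if epoch >= 20:
        lr *= 0.5
    if epoch >= 30:
        lr = 0.01   # the original resets the rate outright at epoch 30
    return lr

assert abs(decayed_learning_rate(0.01, 5) - 0.01) < 1e-12
assert abs(decayed_learning_rate(0.01, 25) - 0.01 * 0.8 * 0.5) < 1e-12
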
Example #36
0
    def __init__(
            self,
            numpy_rng,
            theano_rng=None,
            cfg=None,  # the network configuration
            dnn_shared=None,
            shared_layers=[],
            input=None):

        self.layers = []
        self.params = []
        self.delta_params = []

        self.rnn_layerX = 2
        print "Use DRN"

        self.cfg = cfg
        self.n_ins = cfg.n_ins
        self.n_outs = cfg.n_outs
        self.hidden_layers_sizes = cfg.hidden_layers_sizes
        self.hidden_layers_number = len(self.hidden_layers_sizes)
        self.activation = cfg.activation

        self.do_maxout = cfg.do_maxout
        self.pool_size = cfg.pool_size

        self.max_col_norm = cfg.max_col_norm
        self.l1_reg = cfg.l1_reg
        self.l2_reg = cfg.l2_reg

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        # allocate symbolic variables for the data
        if input is None:
            self.x = T.matrix('x')
        else:
            self.x = input
        self.y = T.ivector('y')

        for i in xrange(self.hidden_layers_number):
            # construct the hidden layer
            if i == 0:
                input_size = self.n_ins
                layer_input = self.x
            else:
                input_size = self.hidden_layers_sizes[i - 1]
                layer_input = self.layers[-1].output

            W = None
            b = None
            if (i in shared_layers):
                W = dnn_shared.layers[i].W
                b = dnn_shared.layers[i].b
            if i == self.rnn_layerX:
                hidden_layer = RnnLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i],
                                        W=W,
                                        b=b,
                                        activation=self.activation)
            else:
                if self.do_maxout == True:
                    hidden_layer = HiddenLayer(
                        rng=numpy_rng,
                        input=layer_input,
                        n_in=input_size,
                        n_out=self.hidden_layers_sizes[i] * self.pool_size,
                        W=W,
                        b=b,
                        activation=(lambda x: 1.0 * x),
                        do_maxout=True,
                        pool_size=self.pool_size)
                else:
                    hidden_layer = HiddenLayer(
                        rng=numpy_rng,
                        input=layer_input,
                        n_in=input_size,
                        n_out=self.hidden_layers_sizes[i],
                        W=W,
                        b=b,
                        activation=self.activation)
            # add the layer to our list of layers
            self.layers.append(hidden_layer)
            self.params.extend(hidden_layer.params)
            self.delta_params.extend(hidden_layer.delta_params)
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.layers[-1].output,
                                           n_in=self.hidden_layers_sizes[-1],
                                           n_out=self.n_outs)

        if self.n_outs > 0:
            self.layers.append(self.logLayer)
            self.params.extend(self.logLayer.params)
            self.delta_params.extend(self.logLayer.delta_params)

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

        if self.l1_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l1_reg * (abs(W).sum())

        if self.l2_reg is not None:
            for i in xrange(self.hidden_layers_number):
                W = self.layers[i].W
                self.finetune_cost += self.l2_reg * T.sqr(W).sum()
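
The two loops at the end of Example #36 add an L1 and an L2 penalty over every hidden-layer weight matrix. A standalone sketch of the same construction follows; the weight shapes and regularisation strengths are arbitrary examples, not values from the original configuration.

import numpy
import theano
import theano.tensor as T

rng = numpy.random.RandomState(0)
weights = [theano.shared(rng.randn(4, 3).astype(theano.config.floatX))
           for _ in range(2)]
l1_reg, l2_reg = 1e-5, 1e-4

cost = T.constant(0.0)
for W in weights:
    cost = cost + l1_reg * abs(W).sum()     # L1: sum of absolute values
    cost = cost + l2_reg * T.sqr(W).sum()   # L2: sum of squared values

penalty = theano.function([], cost)
print(penalty())
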
Example #37
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset=DataSet,
                    nkerns=[cls1, cls2], batch_size=100):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print type(train_set_x)  

    #train_set_x.set_value(train_set_x.get_value(borrow=True)[:,:540])
    #valid_set_x.set_value(valid_set_x.get_value(borrow=True)[:,:540])
    #test_set_x.set_value(test_set_x.get_value(borrow=True)[:,:540])
    
    #train_set_x = train_set_x / 100
    #valid_set_x = valid_set_x / 100
    #test_set_x = test_set_x / 100
    

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    #n_test_batches = (n_test_batches/batch_size) + (n_test_batches % batch_size > 0)
  
    print (n_test_batches)
    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    Alr = T.scalar('Alr')
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ishape = (nFB, nFs)  # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    dFeatureV = 2*nFB*nFs
    xinp = x[:,:dFeatureV]
    
#    print (x.shape)
    
    layer0_input = xinp.reshape((batch_size, 2, nFB, nFs))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 2, nFB, nFs),
            filter_shape=(nkerns[0], 2, fsx, fsy), poolsize=(p1, 1))
    cl2x = (nFB - fsx + 1)/p1
    cl2y = (nFs - fsy + 1)
    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], cl2x, cl2y),
            filter_shape=(nkerns[1], nkerns[0], fsx, 1), poolsize=(p2, 1))
    hl1 = (cl2x - fsx + 1)/p2
    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (20,32*4*4) = (20,512)
    layer2_input = layer1.output.flatten(2)
    layer2_inputT = T.concatenate([layer2_input,x[:,dFeatureV:]],axis = 1)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_inputT, n_in=(nkerns[1] * hl1 * 1)+12,
                         n_out=nhu1, activation=T.tanh)

    layer22 = HiddenLayer(rng, input=layer2.output, n_in=nhu1,
                         n_out=nhu1, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer22.output, n_in=nhu1, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    #yPred = layer3.ypred(layer2.output)
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], [layer3.errors(y), layer3.y_pred],
             givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size]})
    

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer22.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        #updates.append((param_i, param_i - learning_rate * grad_i))
        updates.append((param_i, param_i - Alr * grad_i))

    train_model = theano.function([index, Alr], cost, updates=updates,
          givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size][:],
            y: train_set_y[index * batch_size: (index + 1) * batch_size][:]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    #best_params = None
    best_params = []
    best_validation_loss = numpy.inf
    prev_validation_loss = 200

    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    Alrc = 0.1
    AlrE = 0.00001
    epochC = 0 
    epoch = 0
    done_looping = False
    for param in params:
        best_params.append(param.get_value())
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        epochC = epochC + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index, Alrc)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                lossratio = (this_validation_loss - prev_validation_loss)/(prev_validation_loss+1)
                print (lossratio)
                print('epoch %i, minibatch %i/%i, validation error %f, lr %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100., Alrc))

                # if we got the best validation score until now
                #if this_validation_loss < best_validation_loss:
                if lossratio <= 0.0:
                    for i in range(len(params)):
                        best_params[i] = params[i].get_value()
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    prev_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    #tm =  test_model(0)
                   
                    yP = numpy.asarray([])
                    test_losses = [test_model(i)[0] for i in xrange(n_test_batches)]
                    for i in xrange(n_test_batches):
                        yP = numpy.concatenate((yP,test_model(i)[1]))
                    print yP.shape
                    test_score = numpy.mean(test_losses)
                    
                    #yP = yPred#yPred(layer2.output.owner.inputs[0].get_value())
                    y = test_set_y.owner.inputs[0].get_value()[:3000]
                    
                    print (yP.shape)
                    print (y.shape)
                    I1 = numpy.nonzero(y==0.0)
                    I2 = numpy.nonzero(y==1.0)
                    I3 = numpy.nonzero(y==2.0)
                    I4 = numpy.nonzero(y==3.0)
                    print (I1[0].shape)
                    print (I2[0].shape)
                    print (I3[0].shape)
                    print (I4[0].shape)
                    I11 = numpy.nonzero(yP[I1[0]]==0)
                    I12 = numpy.nonzero(yP[I1[0]]==1)
                    I13 = numpy.nonzero(yP[I1[0]]==2)
                    I14 = numpy.nonzero(yP[I1[0]]==3)
                    I21 = numpy.nonzero(yP[I2[0]]==0)
                    I22 = numpy.nonzero(yP[I2[0]]==1)
                    I23 = numpy.nonzero(yP[I2[0]]==2)
                    I24 = numpy.nonzero(yP[I2[0]]==3)
                    I31 = numpy.nonzero(yP[I3[0]]==0)
                    I32 = numpy.nonzero(yP[I3[0]]==1)
                    I33 = numpy.nonzero(yP[I3[0]]==2)
                    I34 = numpy.nonzero(yP[I3[0]]==3)
                    I41 = numpy.nonzero(yP[I4[0]]==0)
                    I42 = numpy.nonzero(yP[I4[0]]==1)
                    I43 = numpy.nonzero(yP[I4[0]]==2)
                    I44 = numpy.nonzero(yP[I4[0]]==3)

                    acc1 = float(float(I11[0].size)/float(I1[0].size))
                    acc2 = float(float(I22[0].size)/float(I2[0].size))
                    if n_out == 3:
                        acc3 = float(float(I33[0].size)/float(I3[0].size))
                        acc4 = 0
                    elif n_out == 4:
                        acc3 = float(float(I33[0].size)/float(I3[0].size))
                        acc4 = float(float(I44[0].size)/float(I4[0].size))
                    else:
                        acc3 = 0
                        acc4 = 0
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, acc4 = %f, I11 = %i, I12 = %i, I13 = %i, I14 = %i, I21 = %i, I22 = %i, I23 = %i, I24 = %i, I31 = %i, I32 = %i, I33 = %i, I34 = %i, I41 = %i, I42 = %i, I43 = %i, I44 = %i %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100., acc1 * 100., acc2 * 100., acc3 * 100, acc4 * 100, I11[0].size, I12[0].size, I13[0].size, I14[0].size, I21[0].size, I22[0].size, I23[0].size, I24[0].size, I31[0].size, I32[0].size, I33[0].size, I34[0].size, I41[0].size, I42[0].size, I43[0].size, I44[0].size))

                    #print(('     epoch %i, minibatch %i/%i, test error of best '
                    #       'model %f %%') %
                    #      (epoch, minibatch_index + 1, n_train_batches,
                    #       test_score * 100.))
                else:
                    if Alrc <= AlrE:
                        done_looping = True
                        break
                    elif epochC > 40:
                        Alrc = Alrc/2
                        for param, best_param in zip(params,best_params):
                            param.set_value(best_param)
                        epochC = 0
            #if patience <= iter:
            #    done_looping = True
            #    break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #38
0
    def fit(self,
            X,
            Y,
            learning_rate=10e-5,
            mu=0.9,
            decay=0.99,
            epochs=10,
            batch_sz=100,
            eps=10e-10,
            display_cost=False):
        #learning_rate=10e-7, mu=0.99, decay=0.999, epochs=100, batch_sz=30, l2=0.0, eps=10e-10
        learning_rate = np.float32(learning_rate)
        mu = np.float32(mu)
        decay = np.float32(decay)
        eps = np.float32(eps)
        '''
        In Theano we can't actually 'drop' nodes: that would result in a
        different computational graph. Instead we multiply nodes by 1s and 0s,
        so for each layer we need to create a 'mask' - an array of 0s and 1s.
        Theano graph nodes don't have values, so we can't multiply them by a
        numpy 'mask' array; instead we want Theano to generate random values
        every time the function is called, so we create an instance of a
        RandomStreams object:
        '''
        self.rng = RandomStreams()

        # first, make a validation set:
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid, Yvalid = X[-1000:, :], Y[-1000:]
        X, Y = X[:-1000, :], Y[:-1000]

        #initialize the hidden layers:
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []

        # the size of the first dimension of the first matrix:
        M1 = D
        count = 0  # for the id of the weights/biases
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2  # update the first dimension size for the next iteration
            count += 1

        # for the last weight/bias matrix (vector):
        W, b = init_weight_and_bias(M1, K)
        self.W = theano.shared(W, 'W%s' % count)
        self.b = theano.shared(b, 'b%s' % count)

        # collect all the parameters we are going to use during Gradient Descent:
        self.parameters = [self.W, self.b]
        for h in self.hidden_layers[::-1]:
            self.parameters += h.params

        # in order to use Momentum,
        # we are to keep track of all the changes (dW's and db's):
        dparams = [
            theano.shared(np.zeros_like(p.get_value(), dtype=np.float32))
            for p in self.parameters
        ]

        # for RMSProp,
        # we are to keep track of caches (cache_W's and cache_b's) as well:
        caches = [
            theano.shared(np.ones_like(p.get_value(), dtype=np.float32))
            for p in self.parameters
        ]

        # define theano variables and functions:
        thX = T.matrix('X')
        thY = T.ivector('Y')  # a vector of integers

        # since we do dropout, we drop the nodes only on training step,
        # when evaluating we just scale them;
        # so we need to define two expressions for the output and cost calculations:
        pY_train = self.forward_train(thX)
        pY_predict = self.forward_predict(thX)

        cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))
        cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))

        prediction = self.predict(thX)  # will do sort of T.argmax(pY, axis=1)

        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        # the updates for the train function:

        updates = [
            (cache, decay * cache +
             (np.float32(1.0) - decay) * T.grad(cost, p)**2)
            for p, cache in zip(self.parameters, caches)
        ] + [(dp,
              mu * dp - learning_rate * T.grad(cost, p) / T.sqrt(cache + eps))
             for dp, p, cache in zip(dparams, self.parameters, caches)
             ] + [(p, p + dp) for p, dp in zip(self.parameters, dparams)]

        #updates = rmsprop(cost, self.parameters, learning_rate, mu, decay, eps)

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # batch SGD:
        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j + 1) * batch_sz, :]
                Ybatch = Y[j * batch_sz:(j + 1) * batch_sz]

                train_op(Xbatch, Ybatch)

                if j % 20 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    print('\ni: %d,  j: %d, cost: %.6f, \nerror: %.6f' %
                          (i, j, c, e))

        if display_cost:
            plt.plot(costs)
            plt.show()
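
The three-part updates list compiled above (cache, velocity, parameter) is RMSProp with momentum. A plain-NumPy restatement of one step may make the algebra easier to follow; it is a sketch with the same symbols, not a drop-in replacement for the Theano version.

import numpy as np

def rmsprop_momentum_step(p, dp, cache, grad,
                          learning_rate=1e-4, mu=0.9, decay=0.99, eps=1e-10):
    # All three quantities are computed from their *previous* values, exactly
    # as Theano applies an updates list (simultaneously, not one after another).
    new_cache = decay * cache + (1.0 - decay) * grad ** 2
    new_dp = mu * dp - learning_rate * grad / np.sqrt(cache + eps)
    new_p = p + dp
    return new_p, new_dp, new_cache

# one toy step on a 2x2 weight matrix
p = np.zeros((2, 2)); dp = np.zeros_like(p); cache = np.ones_like(p)
grad = np.full_like(p, 0.5)
p, dp, cache = rmsprop_momentum_step(p, dp, cache, grad)
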
Example #39
0
mini_updates = []
micro_updates = []
last_upd = []
update = []

# shared variables
learning_rate = shared(float32(lr.init))
if use.mom:
    momentum = shared(float32(mom.momentum))
    drop.p_vid = shared(float32(drop.p_vid_val))
    drop.p_hidden = shared(float32(drop.p_hidden_val))

idx_mini = T.lscalar(name="idx_mini")  # minibatch index
idx_micro = T.lscalar(name="idx_micro")  # microbatch index
x = ndtensor(len(tr.in_shape))(name='x')  # video input
y = T.ivector(name='y')  # labels
x_ = _shared(empty(tr.in_shape))
y_ = _shared(empty(tr.batch_size))
y_int32 = T.cast(y_, 'int32')

# in shape: #frames * gray/depth * body/hand * 4 maps
import cPickle
f = open(os.path.join(load_path, 'SK_normalization.pkl'), 'rb')
SK_normalization = cPickle.load(f)
Mean1 = SK_normalization['Mean1']
Std1 = SK_normalization['Std1']

f = open('CNN_normalization.pkl', 'rb')
CNN_normalization = cPickle.load(f)
Mean_CNN = CNN_normalization['Mean_CNN']
Std_CNN = CNN_normalization['Std_CNN']
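
Example #39 only loads the skeleton and CNN normalization statistics; applying them to a batch is the usual (x - mean) / std transform. A small NumPy sketch, with the array names and the epsilon guard assumed rather than taken from the original code:

import numpy as np

def normalize_features(batch, mean, std, eps=1e-8):
    # Zero-mean, unit-variance scaling per feature using e.g. Mean1/Std1 above.
    # eps guards against features whose standard deviation is zero.
    return (batch - mean) / (std + eps)
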
Example #40
0
    def __init__(self, number_samples):

        # set up weights and biases
        d = 1  # depth of x
        n = number_samples

        # init
        Wx = np.asarray(
            rng.uniform(low=-np.sqrt(2. / (d + n_hidden)),
                        high=np.sqrt(2. / (d + n_hidden)),
                        size=(d, n_hidden)))
        self.Wx = theano.shared(Wx, name='Wx', borrow=True)

        Wh = np.asarray(
            rng.uniform(low=-np.sqrt(2. / (d + n_hidden)),
                        high=np.sqrt(2. / (d + n_hidden)),
                        size=(n_hidden, n_hidden)))
        self.Wh = theano.shared(Wh, name='Wh', borrow=True)

        bh = np.zeros(n_hidden)
        self.bh = theano.shared(bh, name='bh', borrow=True)

        ho = np.zeros(n_hidden)
        self.ho = theano.shared(ho, name='ho', borrow=True)

        Wo = np.asarray(
            rng.uniform(low=-np.sqrt(2. / (n_hidden + n_out)),
                        high=np.sqrt(2. / (n_hidden + n_out)),
                        size=(n_hidden, n_out)))
        self.Wo = theano.shared(Wo, name='Wo', borrow=True)

        bo = np.zeros(n_out)
        self.bo = theano.shared(bo, name='bo', borrow=True)

        # values to adjust with back propagation
        self.parameters = [
            self.Wx, self.Wh, self.bh, self.ho, self.Wo, self.bo
        ]

        # recurrence functions
        thX = T.fmatrix('x')
        thY = T.ivector('y')

        # feed forward equations
        def recurrence(x_t, h_t1):
            h_t = T.nnet.relu(
                T.dot(x_t, self.Wx) + T.dot(h_t1, self.Wh) + self.bh)
            y_t = T.nnet.softmax(T.dot(h_t, self.Wo) + self.bo)
            return h_t, y_t

        # loop over feed forward equations once for each bit in the sequence
        # send previous hidden output back through and collect prediction
        [h, y_predicted], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.ho, None],
            sequences=thX,
            n_steps=thX.shape[0],
        )

        # probability of y given x
        py_x = y_predicted[:, 0, :]
        prediction = T.argmax(py_x, axis=1)  # fetch most likely prediction

        # cost functions for gradients and tracking progress
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]),
                                  thY]))  # cross entropy
        gradients = T.grad(cost, self.parameters)  # derivatives

        updates = [(p, p - learning_rate * g)
                   for p, g in zip(self.parameters, gradients)]

        # training and prediction functions
        self.predict_op = theano.function(inputs=[thX], outputs=prediction)

        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=cost,
                                        updates=updates)
Example #41
0
def build_model(shared_params, options):
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    batch_size = options['batch_size']
    n_dim = options['n_dim']

    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # batch_size x T
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    empty_word = theano.shared(value=np.zeros((1, options['n_emb']),
                                              dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']],
                                 axis=0)
    input_emb = w_emb_extend[input_idx]

    # a trick here, set the maxpool_h/w to be large
    # maxpool_shape = (options['maxpool_h'], options['maxpool_w'])

    # turn those appending words into zeros
    # batch_size x T x n_emb
    input_emb = input_emb * input_mask[:, :, None]
    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    if options['use_unigram_conv']:
        unigram_conv_feat = fflayer(shared_params, input_emb, options,
                                    prefix='conv_unigram',
                                    act_func=options.get('sent_conv_act', 'tanh'))
        unigram_pool_feat = unigram_conv_feat.max(axis=1)
    if options['use_bigram_conv']:
        idx = T.concatenate([T.arange(input_emb.shape[1])[:-1],
                             T.arange(input_emb.shape[1])[1:]]).reshape((2, input_emb.shape[1] - 1)).transpose().flatten()
        bigram_emb = T.reshape(input_emb[:, idx, :], (input_emb.shape[0],
                                                      input_emb.shape[1] - 1,
                                                      2 * input_emb.shape[2]))
        bigram_conv_feat = fflayer(shared_params, bigram_emb,
                                   options, prefix='conv_bigram',
                                   act_func=options.get('sent_conv_act', 'tanh'))
        bigram_pool_feat = bigram_conv_feat.max(axis=1)
    if options['use_trigram_conv']:
        idx = T.concatenate([T.arange(input_emb.shape[1])[:-2],
                             T.arange(input_emb.shape[1])[1:-1],
                             T.arange(input_emb.shape[1])[2:]]).reshape((3, input_emb.shape[1] - 2)).transpose().flatten()
        trigram_emb = T.reshape(input_emb[:, idx, :], (input_emb.shape[0],
                                                      input_emb.shape[1] - 2,
                                                      3 * input_emb.shape[2]))
        trigram_conv_feat = fflayer(shared_params, trigram_emb,
                                    options, prefix='conv_trigram',
                                    act_func=options.get('sent_conv_act', 'tanh'))
        trigram_pool_feat = trigram_conv_feat.max(axis=1)  #

    pool_feat = T.concatenate([unigram_pool_feat,
                               bigram_pool_feat,
                               trigram_pool_feat], axis=1)

    image_feat_down = fflayer(shared_params, image_feat, options,
                              prefix='image_mlp',
                              act_func=options.get('image_mlp_act',
                                                   'tanh'))
    if options.get('use_before_attention_drop', False):
        image_feat_down = dropout_layer(image_feat_down, dropout, trng, drop_ratio)
        pool_feat = dropout_layer(pool_feat, dropout, trng, drop_ratio)

    # attention model begins here
    # first layer attention model
    image_feat_attention_1 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_1',
                                     act_func=options.get('image_att_mlp_act',
                                                          'tanh'))
    pool_feat_attention_1 = fflayer(shared_params, pool_feat, options,
                                    prefix='sent_att_mlp_1',
                                    act_func=options.get('sent_att_mlp_act',
                                                         'tanh'))
    combined_feat_attention_1 = image_feat_attention_1 + \
                                pool_feat_attention_1[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_1 = dropout_layer(combined_feat_attention_1,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_1 = fflayer(shared_params,
                                        combined_feat_attention_1, options,
                                        prefix='combined_att_mlp_1',
                                        act_func=options.get(
                                            'combined_att_mlp_act',
                                            'tanh'))
    prob_attention_1 = T.nnet.softmax(combined_feat_attention_1[:, :, 0])

    image_feat_ave_1 = (prob_attention_1[:, :, None] * image_feat_down).sum(axis=1)

    combined_hidden_1 = image_feat_ave_1 + pool_feat
    # second layer attention model

    image_feat_attention_2 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_2',
                                     act_func=options.get('image_att_mlp_act',
                                                          'tanh'))
    pool_feat_attention_2 = fflayer(shared_params, combined_hidden_1, options,
                                    prefix='sent_att_mlp_2',
                                    act_func=options.get('sent_att_mlp_act',
                                                         'tanh'))
    combined_feat_attention_2 = image_feat_attention_2 + \
                                pool_feat_attention_2[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_2 = dropout_layer(combined_feat_attention_2,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_2 = fflayer(shared_params,
                                        combined_feat_attention_2, options,
                                        prefix='combined_att_mlp_2',
                                        act_func=options.get(
                                            'combined_att_mlp_act', 'tanh'))
    prob_attention_2 = T.nnet.softmax(combined_feat_attention_2[:, :, 0])

    image_feat_ave_2 = (prob_attention_2[:, :, None] * image_feat_down).sum(axis=1)

    if options.get('use_final_image_feat_only', False):
        combined_hidden = image_feat_ave_2 + pool_feat
    else:
        combined_hidden = image_feat_ave_2 + combined_hidden_1


    for i in range(options['combined_num_mlp']):
        if options.get('combined_mlp_drop_%d'%(i), False):
            combined_hidden = dropout_layer(combined_hidden, dropout, trng,
                                            drop_ratio)
        if i == options['combined_num_mlp'] - 1:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d'%(i),
                                      act_func='linear')
        else:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d'%(i),
                                      act_func=options.get('combined_mlp_act_%d'%(i),
                                                           'tanh'))

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    # return image_feat, input_idx, input_mask, \
        # label, dropout, cost, accu
    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu, pred_label, \
        prob_attention_1, prob_attention_2
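
Each attention layer in build_model ends with the same pattern: a softmax over image regions followed by a weighted sum of the region features. The NumPy sketch below isolates that step; the shapes (batch x regions for the scores, batch x regions x dim for the features) are assumed from how the expressions above are used.

import numpy as np

def attend(scores, image_feat):
    # scores: (batch, regions) unnormalised attention logits
    # image_feat: (batch, regions, dim) region features
    scores = scores - scores.max(axis=1, keepdims=True)   # stabilise softmax
    prob = np.exp(scores)
    prob /= prob.sum(axis=1, keepdims=True)
    # weighted sum over regions, as in (prob_attention[:, :, None] * image_feat_down).sum(axis=1)
    return (prob[:, :, None] * image_feat).sum(axis=1)
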
Example #42
0
def test_mlp(learning_rate=0.01,
             L1_reg=0.00,
             L2_reg=0.0001,
             n_epochs=200,
             dataset='mnist.pkl.gz',
             batch_size=20,
             n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


   """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng,
                     input=x,
                     n_in=28 * 28,
                     n_hidden=n_hidden,
                     n_out=10)

    classifier.hiddenLayer.printWts()
    classifier.hiddenLayer2.printWts()

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' %
         ((end_time - start_time) / 60.)),
        file=sys.stderr)

    classifier.hiddenLayer.printWts()
    classifier.hiddenLayer2.printWts()
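
Example #42, like several of the earlier ones, feeds data to the compiled functions by passing only a minibatch index and letting givens slice shared variables. The stripped-down sketch below shows just that mechanism with toy data in place of MNIST; the variable names are illustrative.

import numpy
import theano
import theano.tensor as T

data = numpy.arange(20, dtype=theano.config.floatX).reshape(10, 2)
shared_x = theano.shared(data, borrow=True)

batch_size = 2
index = T.lscalar('index')
x = T.matrix('x')

# x is never passed in directly: givens substitutes a slice of the shared data
batch_mean = theano.function(
    inputs=[index],
    outputs=x.mean(),
    givens={x: shared_x[index * batch_size:(index + 1) * batch_size]})

print(batch_mean(0))   # mean of rows 0-1
print(batch_mean(3))   # mean of rows 6-7
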
Example #43
0
    def __init__(self,
                 num_emb,
                 emb_dim,
                 hidden_dim,
                 output_dim,
                 degree=2,
                 dep_types=3,
                 learning_rate=0.01,
                 momentum=0.9,
                 trainable_embeddings=True,
                 labels_on_nonroot_nodes=False,
                 eval_on_entities=True,
                 num_entities=2):
        assert emb_dim > 1 and hidden_dim > 1
        self.num_emb = num_emb
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.degree = degree
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.num_entities = num_entities
        self.arc_type = dep_types

        self.params = []
        self.embeddings = theano.shared(
            self.init_matrix([self.num_emb, self.emb_dim]))
        if trainable_embeddings:
            self.params.append(self.embeddings)

        self.x = T.ivector(name='x')  # word indices
        if labels_on_nonroot_nodes:
            print 'matrix!!!'
            self.y = T.fmatrix(
                name='y')  # output shape [None, self.output_dim]
            self.y_exists = T.fvector(name='y_exists')  # shape [None]
        else:
            #print 'vector!!!'
            # Modifying this part too for the -log_prob loss
            print 'scalar!!!'
            self.y = T.iscalar(name='y')
            #self.y = T.fvector(name='y')  # output shape [self.output_dim]

        self.num_words = self.x.shape[
            0]  # total number of nodes (leaves + internal) in tree
        emb_x = self.embeddings[self.x]
        #emb_x = emb_x * T.neq(self.x, -1).dimshuffle(0, 'x')  # zero-out non-existent embeddings

        if labels_on_nonroot_nodes:
            self.tree = T.imatrix(name='tree')  # shape [None, self.degree]
            self.tree_states = self.compute_tree(emb_x, self.tree)
            self.output_fn = self.create_output_fn_multi()
            self.pred_y = self.output_fn(self.tree_states)
            self.loss = self.loss_fn_multi(self.y, self.pred_y, self.y_exists)
        elif eval_on_entities:
            #self.tree = T.tensor3(name='tree')
            self.tree = T.matrix(name='tree')
            self.tree_states = self.compute_tree(emb_x, self.tree)
            self.output_fn = self.create_entity_output_fn()
            self.entities = [
                T.ivector(name='entt' + str(i))
                for i in range(self.num_entities)
            ]
            self.entity_tv = T.sum(self.tree_states[self.entities[0]], axis=0)
            for enidx in self.entities[1:]:
                self.entity_tv = T.concatenate(
                    [self.entity_tv,
                     T.sum(self.tree_states[enidx], axis=0)])
            self.pred_y = self.output_fn(self.entity_tv)
            self.loss = self.loss_fn(self.y, self.pred_y)
        else:
            self.tree = T.imatrix(name='tree')  # shape [None, self.degree]
            self.tree_states = self.compute_tree(emb_x, self.tree)
            self.final_state = self.tree_states[-1]
            self.output_fn = self.create_output_fn()
            self.pred_y = self.output_fn(self.final_state)
            self.loss = self.loss_fn(self.y, self.pred_y)

        self.tree_states = None
        updates = self.gradient_descent(self.loss)
        grads = T.grad(self.loss, self.params)

        train_inputs = [self.x, self.tree, self.y]
        pred_inputs = [self.x, self.tree]
        if labels_on_nonroot_nodes:
            train_inputs.append(self.y_exists)
        if eval_on_entities:
            train_inputs.extend(self.entities)
            pred_inputs.extend(self.entities)
        print 'train_inputs:', train_inputs
        print 'pred_inputs:', pred_inputs
        self._train = theano.function(train_inputs, [self.loss, self.pred_y],
                                      updates=updates)  #,
        #allow_input_downcast=True)
        self._predict = theano.function(pred_inputs, self.pred_y)  #,
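The eval_on_entities branch above pools the tree states of each entity's token indices by summation and concatenates the pooled vectors before the output layer. A small numpy sketch of that pooling step (shapes and index values are made up for illustration):

import numpy as np

hidden_dim = 4
tree_states = np.arange(6 * hidden_dim, dtype=np.float32).reshape(6, hidden_dim)
entities = [np.array([0, 2]), np.array([3, 4, 5])]  # token indices of each entity

# sum the hidden states over each entity's tokens, then concatenate the sums
entity_tv = np.concatenate([tree_states[idx].sum(axis=0) for idx in entities])
print(entity_tv.shape)  # (num_entities * hidden_dim,) == (8,)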
Example #44
0
    def __init__(self,
                 n_dim,
                 n_out,
                 n_chan=1,
                 n_superbatch=12800,
                 opt_alg='adam',
                 opt_params={
                     'lr': 1e-3,
                     'b1': 0.9,
                     'b2': 0.99
                 }):
        self.numpy_rng = np.random.RandomState(1234)
        self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))

        self.n_dim = n_dim
        self.n_out = n_out
        self.n_superbatch = n_superbatch
        self.alg = opt_alg
        self.n_class = 10

        lr = opt_params.get('lr')
        n_batch = opt_params.get('nb')

        train_set_x = theano.shared(
            np.empty((n_superbatch, n_chan, n_dim, n_dim),
                     dtype=theano.config.floatX),
            borrow=False,
        )
        val_set_x = theano.shared(
            np.empty((n_superbatch, n_chan, n_dim, n_dim),
                     dtype=theano.config.floatX),
            borrow=False,
        )
        train_set_y = theano.shared(
            np.empty((n_superbatch, ), dtype=theano.config.floatX),
            borrow=False,
        )
        val_set_y = theano.shared(
            np.empty((n_superbatch, ), dtype=theano.config.floatX),
            borrow=False,
        )
        train_set_y_int = T.cast(train_set_y, 'int32')
        val_set_y_int = T.cast(val_set_y, 'int32')

        train_rbm_px_mu = theano.shared(
            np.empty((n_superbatch, self.n_aux), dtype=theano.config.floatX),
            borrow=False,
        )

        X = T.tensor4(dtype=theano.config.floatX)
        S = T.tensor3(dtype=theano.config.floatX)
        Y = T.ivector()
        px_mu = T.matrix(dtype=theano.config.floatX)  # matches the shape of train_rbm_px_mu used in the givens below
        idx1, idx2 = T.lscalar(), T.lscalar()
        alpha = T.scalar(dtype=theano.config.floatX)  # learning rate
        self.inputs = (X, Y, idx1, idx2, S, px_mu)

        # ----------------------------
        # Begin RBM-only
        self.rbm_network = self.create_rbm_model(n_dim, n_out, n_chan)
        persistent_chain = theano.shared(
            np.zeros((n_batch, self.n_hidden), dtype=theano.config.floatX),
            borrow=True,
        )
        rbm_cost, rbm_acc, rbm_updates = self.get_rbm_objective_and_updates(
            alpha,
            lr=lr,
            persistent=persistent_chain,
        )
        self.rbm_objectives = (rbm_cost, rbm_acc)
        self.rbm_train = theano.function(
            [idx1, idx2, alpha],
            [rbm_cost, rbm_acc],
            updates=rbm_updates,
            givens={
                X: train_set_x[idx1:idx2],
                Y: train_set_y_int[idx1:idx2]
            },
            on_unused_input='warn',
        )
        # End RBM-only
        # ----------------------------
        # Begin DADGM-only
        tau = theano.shared(
            np.float32(5.0),
            name='temperature',
            allow_downcast=True,
            borrow=False,
        )
        self.tau = tau
        self.dadgm_network = self.create_dadgm_model(
            X,
            Y,
            n_dim,
            n_out,
            n_chan,
        )
        dadgm_loss, dadgm_acc = self.create_dadgm_objectives(False)
        self.dadgm_objectives = (dadgm_loss, dadgm_acc)
        dadgm_params = self.get_dadgm_params()
        dadgm_grads = self.create_dadgm_gradients(dadgm_loss, False)
        dadgm_updates = self.create_dadgm_updates(
            dadgm_grads,
            dadgm_params,
            alpha,
            opt_alg,
            opt_params,
        )
        self.dadgm_train = theano.function(
            [idx1, idx2, alpha],
            [dadgm_loss, dadgm_acc],
            updates=dadgm_updates,
            givens={
                X: train_set_x[idx1:idx2],
                Y: train_set_y_int[idx1:idx2],
                px_mu: train_rbm_px_mu,
            },
            on_unused_input='warn',
        )
        self.dadgm_loss = theano.function(
            [X, Y],
            [dadgm_loss, dadgm_acc],
            on_unused_input='warn',
        )
        # End DADGM-only
        # ----------------------------
        self.n_batch = n_batch
        # parameters for sampling
        self.n_chain = 100

        # save data variables
        self.train_set_x = train_set_x
        self.train_set_y = train_set_y
        self.val_set_x = val_set_x
        self.val_set_y = val_set_y
        self.train_rbm_px_mu = train_rbm_px_mu
        self.data_loaded = False
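The constructor above follows the "superbatch" pattern: a large block of data lives in shared variables, and the compiled functions receive only two integer indices, slicing the minibatch out of the shared block through givens. A stripped-down sketch of the same mechanism, assuming a working Theano install (toy data, not the model above):

import numpy as np
import theano
import theano.tensor as T

# a toy "superbatch" kept in a shared variable (i.e. resident on the device)
data = theano.shared(np.arange(20, dtype=theano.config.floatX).reshape(10, 2))

X = T.matrix(dtype=theano.config.floatX)
idx1, idx2 = T.lscalar(), T.lscalar()

# only the two integer indices cross the host/device boundary per call;
# the minibatch itself is sliced out of the shared superbatch via `givens`
batch_mean = theano.function([idx1, idx2], T.mean(X),
                             givens={X: data[idx1:idx2]})
print(batch_mean(0, 5))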
Example #45
0
from lasagne.layers import InputLayer, DenseLayer
import lasagne
from lasagne.updates import sgd, total_norm_constraint
import theano.tensor as T

x = T.matrix()
y = T.ivector()
l_in = InputLayer((5, 10))
l1 = DenseLayer(l_in, num_units=7, nonlinearity=T.nnet.softmax)
output = lasagne.layers.get_output(l1, x)
cost = T.mean(T.nnet.categorical_crossentropy(output, y))
all_params = lasagne.layers.get_all_params(l1)
all_grads = T.grad(cost, all_params)
scaled_grads = total_norm_constraint(all_grads, 5)
updates = sgd(scaled_grads, all_params, learning_rate=0.1)
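Note that total_norm_constraint rescales the whole list of gradients jointly so their combined L2 norm does not exceed the given maximum. A plain numpy sketch of that rescaling, illustrative only and not Lasagne's implementation:

import numpy as np

def clip_by_total_norm(grads, max_norm):
    # grads: list of numpy arrays; rescale jointly if the total norm exceeds max_norm
    total_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    scale = max_norm / max(total_norm, max_norm)
    return [g * scale for g in grads]

grads = [3.0 * np.ones((3, 3)), 2.0 * np.ones(4)]
clipped = clip_by_total_norm(grads, 5.0)
print(np.sqrt(sum((g ** 2).sum() for g in clipped)))  # ~5.0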
    def build_and_train(self,
                        X_train,
                        y_train,
                        X_val=None,
                        y_val=None,
                        display=False,
                        save_model=True,
                        aug_params=None):
        """
        Builds the model and runs the training loop.

        Parameters
        ----------
        X_train : numpy array
            Training data
        y_train : numpy array
            Training targets.
        X_val : numpy array, None, optional
            Validation data
        y_val : numpy array, None, optional
            Validation targets
        display : bool, optional
            Display on-the-fly plots of training and validation results.
        save_model : bool, optional
            Save model weights.
        aug_params : dict, None, optional
            Dict containing the data augmentation parameters.

        Returns
        -------
        Test function of the net.

        """
        # ======================================================================
        # Model compilation
        # ======================================================================

        print("Building model and compiling functions...")

        # Create Theano variables for input and target minibatch
        input_var = T.tensor4(
            'X', dtype=theano.config.floatX)  # shape (batchsize,3,224,224)
        target_var = T.ivector('y')  # shape (batchsize,)

        # Load model weights and metadata
        d = pickle.load(
            open(
                os.path.join(homedir, 'data', 'pretrained_weights',
                             'resnet50.pkl')))

        # Build the network and fill with pretrained weights except for the last fc layer
        net = build_model(input_var, self.output_dim)
        lasagne.layers.set_all_param_values(net['pool5'], d['values'][:-2])

        # create loss function and accuracy
        prediction = lasagne.layers.get_output(net['prob'])
        loss = lasagne.objectives.categorical_crossentropy(
            prediction, target_var)
        loss = loss.mean(
        ) + self.reg * lasagne.regularization.regularize_network_params(
            net['prob'], lasagne.regularization.l2)
        train_acc = T.mean(T.eq(T.argmax(prediction, axis=1), target_var),
                           dtype=theano.config.floatX)

        # Create parameter update expressions with fine tuning
        updates = {}
        for name, layer in net.items():
            layer_params = layer.get_params(trainable=True)
            if name == 'fc1000' or name == 'prob':
                layer_lr = self.lr
            else:
                layer_lr = self.lr * self.finetuning
            layer_updates = lasagne.updates.adam(loss,
                                                 layer_params,
                                                 learning_rate=layer_lr)
            updates.update(layer_updates)
        updates = collections.OrderedDict(updates)

        # Create a loss expression for validation/testing.
        test_prediction = lasagne.layers.get_output(net['prob'],
                                                    deterministic=True)
        test_loss = lasagne.objectives.categorical_crossentropy(
            test_prediction, target_var)
        test_loss = test_loss.mean()
        test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                          dtype=theano.config.floatX)

        # Compile training and validation functions
        train_fn = theano.function([input_var, target_var], [loss, train_acc],
                                   updates=updates)
        val_fn = theano.function([input_var, target_var],
                                 [test_loss, test_acc])
        test_fn = theano.function([input_var], test_prediction)

        # ======================================================================
        # Training routine
        # ======================================================================

        print("Starting training...")
        track = {
            'train_err': [],
            'train_acc': [],
            'val_err': [],
            'val_acc': []
        }

        if display:
            fig, (ax1, ax2) = plt.subplots(1, 2)
            line1, = ax1.plot([], [], 'r-')
            line2, = ax2.plot([], [], 'r-')

            ax1.set_xlabel('Epochs')
            ax1.set_ylabel('Training loss')
            ax1.set_yscale('log')
            ax1.set_title('Training loss')

            ax2.set_xlabel('Epochs')
            ax2.set_ylabel('Validation loss')
            ax2.set_yscale('log')
            ax2.set_title('Validation loss')

        # Batchsize and augmentation parameters
        if aug_params is None:
            aug_params = {}
        train_batchsize = min(len(y_train), self.batchsize)
        train_aug_params = aug_params.copy()
        train_aug_params.update({'mode': 'standard'})
        if X_val is not None:
            val_batchsize = min(len(y_val), self.batchsize)
            val_aug_params = aug_params.copy()
            val_aug_params.update({'mode': 'minimal', 'tags': None})

        for epoch in range(self.num_epochs):

            start_time = time.time()

            # Learning rate schedule decay
            if epoch in self.lr_decay_schedule:
                self.lr.set_value(self.lr.get_value() * self.lr_decay)
                print('############# Learning rate: {} ####################'
                      .format(self.lr.get_value()))

            # Full pass over training data
            train_err, train_batches = 0, 0
            for batch in iterate_minibatches(X_train,
                                             y_train,
                                             train_batchsize,
                                             shuffle=True,
                                             **train_aug_params):
                inputs, targets = batch[0], batch[1]
                tmp_train_err, tmp_train_acc = train_fn(inputs, targets)
                track['train_err'].append(tmp_train_err)
                track['train_acc'].append(tmp_train_acc)
                train_err += tmp_train_err
                train_batches += 1
                print 'Training epoch {} - {:.1f}% completed | Loss: {:.4f} ; Accuracy: {:.1f}%'.format(
                    epoch,
                    train_batches * self.batchsize * 100. / len(y_train),
                    float(tmp_train_err),
                    float(tmp_train_acc) * 100)
                if np.isnan(train_err):
                    print(
                        'Your net exploded, try decreasing the learning rate.')
                    return None

            # Full pass over the validation data (if any)
            if X_val is not None:
                val_err, val_batches = 0, 0
                for batch in iterate_minibatches(X_val,
                                                 y_val,
                                                 val_batchsize,
                                                 shuffle=False,
                                                 **val_aug_params):
                    inputs, targets = batch[0], batch[1]
                    tmp_val_err, tmp_val_acc = val_fn(inputs, targets)
                    track['val_err'].append(tmp_val_err)
                    track['val_acc'].append(tmp_val_acc)
                    val_err += tmp_val_err
                    val_batches += 1
                    print 'Validation epoch {} - {:.1f}% completed | Loss: {:.4f} ; Accuracy: {:.1f}%'.format(
                        epoch,
                        val_batches * self.batchsize * 100. / len(y_val),
                        float(tmp_val_err),
                        float(tmp_val_acc) * 100)

            # Print the results for this epoch
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, self.num_epochs,
                time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err /
                                                      train_batches))
            if X_val is not None:
                print("  validation loss:\t\t{:.6f}".format(val_err /
                                                            val_batches))

            # Display training and validation accuracy in plot
            if display:

                line1.set_xdata(np.append(line1.get_xdata(), epoch))
                line1.set_ydata(
                    np.append(line1.get_ydata(), train_err / train_batches))
                ax1.relim(), ax1.autoscale_view()

                if X_val is not None:
                    line2.set_xdata(np.append(line2.get_xdata(), epoch))
                    line2.set_ydata(
                        np.append(line2.get_ydata(), val_err / val_batches))
                    ax2.relim(), ax2.autoscale_view()

                fig.canvas.draw()

        # Save training information and net parameters
        print("Saving the model parameters and training information ...")

        train_info = {
            'training_params': {
                'output_dim': self.output_dim,
                'lr_init': self.lr_init,
                'lr_decay': float(self.lr_decay),
                'lr_schedule': self.lr_decay_schedule.tolist(),
                'reg': self.reg,
                'num_epochs': self.num_epochs,
                'batchsize': self.batchsize,
                'finetuning': self.finetuning
            }
        }

        a = inspect.getargspec(data_augmentation)
        augmentation_params = dict(
            zip(a.args[-len(a.defaults):],
                a.defaults))  # default augmentation params
        augmentation_params.update(aug_params)  # update with user's choice
        for k, v in augmentation_params.items():
            if type(v) == np.ndarray:
                augmentation_params[k] = np.array(v).tolist()
        train_info.update({'augmentation_params': augmentation_params})

        for k, v in track.items():
            track[k] = np.array(v).tolist()
        train_info.update(track)

        if save_model:
            filename = 'resnet50_' + str(self.output_dim) + 'classes_' + str(
                self.num_epochs) + 'epochs'
            with open(
                    os.path.join(homedir, 'conus_classification',
                                 'training_info', filename + '.json'),
                    'w') as outfile:
                json.dump(train_info, outfile)
            np.savez(
                os.path.join(homedir, 'conus_classification',
                             'training_weights', filename + '.npz'),
                *lasagne.layers.get_all_param_values(net['prob']))

        return test_fn
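iterate_minibatches is called above but not defined in this snippet. A minimal sketch of such a generator, assuming it only has to yield (inputs, targets) pairs; the real helper presumably also applies the augmentation options passed via **train_aug_params, which this sketch simply accepts and ignores:

import numpy as np

def iterate_minibatches(inputs, targets, batchsize, shuffle=False, **aug_params):
    # aug_params (e.g. 'mode', 'tags') are accepted but not used in this sketch
    assert len(inputs) == len(targets)
    indices = np.arange(len(inputs))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(inputs) - batchsize + 1, batchsize):
        batch = indices[start:start + batchsize]
        yield inputs[batch], targets[batch]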
Example #47
0
 def __init__(self, *args, **kwargs):
     self.k = TT.ivector('k')
     super(Classifier, self).__init__(*args, **kwargs)
     self.y = self.softmax(self.y)
Example #48
0
def single_layer_lstm(n_in, n_out):
    Wxb = theano.shared(np.random.randn(n_in, n_out), )
    Whb = theano.shared(np.random.randn(n_out, n_out), )
    bb = theano.shared(np.random.randn(n_out))

    Wxi = theano.shared(np.random.randn(n_in, n_out), )
    Whi = theano.shared(np.random.randn(n_out, n_out), )
    bi = theano.shared(np.random.randn(n_out))

    Wxf = theano.shared(np.random.randn(n_in, n_out), )
    Whf = theano.shared(np.random.randn(n_out, n_out), )
    bf = theano.shared(np.random.randn(n_out))

    Wxo = theano.shared(np.random.randn(n_in, n_out), )
    Who = theano.shared(np.random.randn(n_out, n_out), )
    bo = theano.shared(np.random.randn(n_out))

    Wo = theano.shared(np.random.randn(n_out, n_out))
    bout = theano.shared(np.random.randn(n_out))

    params = [Wxb, Whb, bb, Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo, Wo, bout]

    def step(x,htm1,ctm1,Wxb,Whb,bb,\
             Wxi,Whi,bi,\
             Wxf,Whf,bf,\
             Wxo,Who,bo,Wo,bout):
        z = T.tanh(T.dot(x, Wxb) + T.dot(htm1, Whb) + bb)
        i = T.nnet.sigmoid(T.dot(x, Wxi) + T.dot(htm1, Whi) + bi)
        f = T.nnet.sigmoid(T.dot(x, Wxf) + T.dot(htm1, Whf) + bf)
        c = i * z + f * ctm1
        o = T.nnet.sigmoid(T.dot(x, Wxo) + T.dot(htm1, Who) + bo)
        h = o * T.tanh(c)
        y = T.dot(h, Wo) + bout
        return [h, c, y]

    X = T.matrix()
    h0 = T.vector()
    c0 = T.vector()
    yt = T.ivector()
    lr = T.scalar()
    mom = T.scalar()

    [h, c, y], _ = theano.scan(step,
                               sequences=X,
                               outputs_info=[h0, c0, None],
                               non_sequences=[
                                   Wxb, Whb, bb, Wxi, Whi, bi, Wxf, Whf, bf,
                                   Wxo, Who, bo, Wo, bout
                               ])

    yout = T.nnet.softmax(y)
    L2 = T.scalar()
    L2 = 0
    for param in params:
        L2 += (param**2).sum()

    L2 = 0.001 * L2

    def loss(y_pred, y_true):
        return -T.mean(T.log(y_pred)[T.arange(y_true.shape[0]), y_true])

    #oloss = loss(yout,yt)
    #cost = theano.function( [X,h0,c0,yt], oloss )
    funch = theano.function([X, h0, c0], c)
    funcy = theano.function([X, h0, c0], y)

    oloss = loss(yout, yt) + L2
    cost = loss(yout, yt)
    gparams = []
    for param in params:
        gparams.append(T.grad(oloss, param))

    # zip pairs each parameter with its gradient in the update loop below
    updates_t = {}

    for param in params:
        updates_t[param] = theano.shared(value=np.zeros(
            param.get_value(borrow=True).shape, dtype=theano.config.floatX),
                                         name='updates')

    updates = {}
    for param, gparam in zip(params, gparams):
        weight_update = updates_t[param]
        upd = mom * weight_update - lr * gparam
        updates[weight_update] = upd
        updates[param] = param + upd
    """
    for param, gparam in zip(params, gparams):
        #mparam = theano.shared(param.get_value()*0.)
        upd = -lr*gparam# + mom*mparam# - 0.01*param# + 
        #updates[mparam] = upd
        updates[param] = param + upd
    """
    """            
        weight_update = updates[param]
        upd = -lr * gparam - 0.01*param
        updates[weight_update] = upd
        updates[param] = param + upd
    """

    #gWxo = T.grad(oloss,Wxo)
    #fgradwxo = theano.function( [X,h0,c0,yt], gWxo )
    trainer = theano.function([X, h0, c0, yt, lr, mom], [cost],
                              updates=updates)
    return funcy, trainer
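A possible way to drive the two functions returned by single_layer_lstm, assuming Theano's default float64 floatX so the float64 shared weights and inputs match (toy shapes, illustrative only):

import numpy as np

n_in, n_out, n_steps = 3, 5, 7
funcy, trainer = single_layer_lstm(n_in, n_out)

X = np.random.randn(n_steps, n_in)   # one sequence, shape (time, n_in)
h0 = np.zeros(n_out)
c0 = np.zeros(n_out)
yt = np.random.randint(0, n_out, size=n_steps).astype('int32')

print(funcy(X, h0, c0).shape)             # per-step outputs, (time, n_out)
print(trainer(X, h0, c0, yt, 0.01, 0.9))  # one momentum-SGD step, returns [cost]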
Example #49
0
File: cnn.py Project: ec1112/indiv
def trainConvNet(data_xy, inp_dim =10, n_epochs = 3, nkerns=[5, 10], batch_size=500, learning_rate=0.1):
	with open("metrics.txt", "a") as f:
		f.write("**********\n")
		f.write("Learning rate: {0}\n".format(learning_rate))
		train_x, train_y, test_x, test_y, valid_x, valid_y = data_xy

		n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
		n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size
		n_test_batches = test_x.get_value(borrow=True).shape[0] / batch_size
		print '...building the model'

		kern0_dim = 3
		kern1_dim = 2
		pool0_dim = 2
		pool1_dim = 1



		if inp_dim==20:
			kern0_dim = 3
			kern1_dim = 2
			pool0_dim = 2
			pool1_dim = 1

		if inp_dim==24:
			kern0_dim = 5
			kern1_dim = 3
			pool0_dim = 2
			pool1_dim = 1

		if inp_dim==30:
			kern0_dim = 7
			kern1_dim = 5
			pool0_dim = 2
			pool1_dim = 1
		




		index = T.lscalar()

		x = T.tensor4('x')
		y = T.ivector('y')
		rng = numpy.random.RandomState(23455)

		layer0_input = x.reshape((batch_size, THREE, inp_dim, inp_dim))

		layer0 = LeNetConvPoolLayer(
			rng, 
			input = layer0_input,
			image_shape=(batch_size, THREE, inp_dim, inp_dim),
			filter_shape=(nkerns[0], 3, kern0_dim, kern0_dim),
			poolsize=(pool0_dim, pool0_dim)
		)

		inp1_dim = (inp_dim-kern0_dim+1)/pool0_dim
		layer1 = LeNetConvPoolLayer(
			rng,
			input = layer0.output,
			image_shape=(batch_size, nkerns[0], inp1_dim, inp1_dim),
			filter_shape=(nkerns[1], nkerns[0], kern1_dim, kern1_dim),
			poolsize=(pool1_dim, pool1_dim)
		)

		layer2_input = layer1.output.flatten(2)

		inp2_dim = (inp1_dim-kern1_dim+1)/pool1_dim
		layer2 = HiddenLayer(
			rng,
			input=layer2_input,
			n_in=nkerns[1]*inp2_dim*inp2_dim,
			n_out=300,
			activation=T.tanh
		)

		layer3 = LogisticRegression(input=layer2.output, n_in=300, n_out=10)
		cost = layer3.negative_log_likelihood(y)


		test_model = theano.function([index], layer3.errors(y), givens={
				x: test_x[index*batch_size: (index+1)*batch_size],
				y: test_y[index*batch_size: (index+1)*batch_size]
			})

		validate_model = theano.function([index], layer3.errors(y), givens={
				x: valid_x[index*batch_size: (index+1)*batch_size],
				y: valid_y[index*batch_size: (index+1)*batch_size]
			})

		params = layer3.params + layer2.params + layer1.params + layer0.params

		grads  = T.grad(cost, params)

		updates = [
			(param_i, param_i - learning_rate * grad_i)
			for param_i, grad_i in zip(params, grads)
		] 
	
		train_model = theano.function([index], cost, updates=updates, givens={
				x: train_x[index*batch_size: (index+1)*batch_size],
				y: train_y[index*batch_size: (index+1)*batch_size]
			})

		print 'training... '

		patience = 10000
		patience_increase = 2
		improvement_threshold = 0.995
		validation_frequency = min(n_train_batches, patience / 2)
		best_validation_loss = numpy.inf
		best_iter = 0
		test_score = 0.
		start_time = timeit.default_timer()

		epoch = 0
		done_looping = False

		
		while (epoch < n_epochs) and (not done_looping):
			epoch = epoch + 1
			for minibatch_index in xrange(n_train_batches):
				iter = (epoch - 1) * n_train_batches + minibatch_index
				if iter % 100 == 0:
					print 'training @ iter = ', iter
				cost_ij = train_model(minibatch_index)

				if (iter + 1) % validation_frequency == 0:
					validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
					this_validation_loss = numpy.mean(validation_losses)
					print('epoch %i, minibatch %i/%i, validation error %f %%\n' %(epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.))
					f.write("Epoch: {0}\n".format(epoch)) 
					f.write("Validation loss: {0}\n".format(this_validation_loss*100))
					f.write("Cost: {0}\n".format(cost_ij))
					if this_validation_loss < best_validation_loss:

						if this_validation_loss < best_validation_loss * \
						   improvement_threshold:
							patience = max(patience, iter * patience_increase)

						best_validation_loss = this_validation_loss
						best_iter = iter

						test_losses = [
							test_model(i)
							for i in xrange(n_test_batches)
						]
						test_score = numpy.mean(test_losses)
						print(('     epoch %i, minibatch %i/%i, test error of '
						       'best model %f %%') %
						      (epoch, minibatch_index + 1, n_train_batches,
						       test_score * 100.))

				if patience <= iter:
					done_looping = True
					break

		end_time = timeit.default_timer()
		print('Optimization complete.')
		print('Best validation score of %f %% obtained at iteration %i, '
	          'with test performance %f %%' %
	          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
		print >> sys.stderr, ('The code for file ' +
	                          os.path.split(__file__)[1] +
	                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

		print ('saving params for patch width: %i...' %(inp_dim))
		save_file = open('param'+str(inp_dim)+'.pkl', 'wb')
		W0 = layer0.params[0]; b0 = layer0.params[1]
		W1 = layer1.params[0]; b1 = layer1.params[1]
		cPickle.dump(W0.get_value(borrow=True), save_file, -1)
		cPickle.dump(b0.get_value(borrow=True), save_file, -1)
		cPickle.dump(W1.get_value(borrow=True), save_file, -1)
		cPickle.dump(b1.get_value(borrow=True), save_file, -1)
		save_file.close()
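The snapshot written above consists of four consecutive pickle records (W0, b0, W1, b1). A short Python 2 sketch of reading them back in the same order (the filename mirrors the one written above):

import cPickle

inp_dim = 10  # must match the patch width used when saving
with open('param' + str(inp_dim) + '.pkl', 'rb') as f:
    W0 = cPickle.load(f)
    b0 = cPickle.load(f)
    W1 = cPickle.load(f)
    b1 = cPickle.load(f)
print(W0.shape, b0.shape, W1.shape, b1.shape)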
Example #50
0
import theano
from confusionmatrix import ConfusionMatrix
from lasagne.objectives import *
from lasagne.updates import *
import theano.tensor as T
from theano.tensor import *
from theano.tensor.signal import downsample
import lasagne
import numpy as np
import try_DP as DP
from theano.tensor import nnet
import lasagne.layers.dnn

dtensor5 = TensorType('float32', (False,)*5)
input_var = T.ftensor4('XY')
target_var = T.ivector('Y_train')
x1 = T.matrix('x1')
PS = 29

# Build Neural Network:
# Conv Net XY Plane
input = lasagne.layers.InputLayer((None, 1, PS, PS), input_var=input_var)

l_conv_1 = lasagne.layers.dnn.Conv2DDNNLayer(input, 20, (9,9))

l_maxpool_1 = lasagne.layers.dnn.Pool2DDNNLayer(l_conv_1, (3,3))

l_conv_2 = lasagne.layers.dnn.Conv2DDNNLayer(l_maxpool_1, 20,(5,5))

l_conv_3 = lasagne.layers.dnn.Conv2DDNNLayer(l_conv_2, 20, (3,3))
Example #51
0
def train_rep(learning_rate=0.002,
              L1_reg=0.0002,
              L2_reg=0.005,
              n_epochs=200,
              nkerns=[20, 50],
              batch_size=25):

    rng = numpy.random.RandomState(23455)

    train_dir = '../out/h5/'
    valid_dir = '../out/h5/'

    weights_dir = './weights/'

    print '... load input data'
    filename = train_dir + 'rep_train_data_1.gzip.h5'
    datasets = load_initial_data(filename)
    train_set_x, train_set_y, shared_train_set_y = datasets

    filename = valid_dir + 'rep_valid_data_1.gzip.h5'
    datasets = load_initial_data(filename)
    valid_set_x, valid_set_y, shared_valid_set_y = datasets

    mydatasets = load_initial_test_data()
    test_set_x, test_set_y, shared_test_set_y, valid_ds = mydatasets

    # compute number of minibatches for training, validation and testing
    n_all_train_batches = 30000
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_all_train_batches /= batch_size
    n_train_batches /= batch_size
    n_valid_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    # image size
    layer0_w = 50
    layer0_h = 50
    layer1_w = (layer0_w - 4) / 2
    layer1_h = (layer0_h - 4) / 2
    layer2_w = (layer1_w - 2) / 2
    layer2_h = (layer1_h - 2) / 2
    layer3_w = (layer2_w - 2) / 2
    layer3_h = (layer2_h - 2) / 2

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # image sizes
    batchsize = batch_size
    in_channels = 20
    in_width = 50
    in_height = 50
    #filter sizes
    flt_channels = 40
    flt_time = 20
    flt_width = 5
    flt_height = 5

    signals_shape = (batchsize, in_channels, in_height, in_width)
    filters_shape = (flt_channels, in_channels, flt_height, flt_width)

    layer0_input = x.reshape(signals_shape)

    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=signals_shape,
                                filter_shape=filters_shape,
                                poolsize=(2, 2))

    # TODO: in case flt_time < in_time the output dimension will be different
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, flt_channels,
                                             layer1_w, layer1_h),
                                filter_shape=(60, flt_channels, 3, 3),
                                poolsize=(2, 2))

    layer2 = LeNetConvPoolLayer(rng,
                                input=layer1.output,
                                image_shape=(batch_size, 60, layer2_w,
                                             layer2_h),
                                filter_shape=(90, 60, 3, 3),
                                poolsize=(2, 2))
    layer3_input = layer2.output.flatten(2)

    layer3 = HiddenLayer(rng,
                         input=layer3_input,
                         n_in=90 * layer3_w * layer3_h,
                         n_out=500,
                         activation=T.tanh)

    layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=8)

    classify = theano.function(
        [index],
        outputs=layer4.get_output_labels(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params

    # symbolic Theano variable that represents the L1 regularization term
    L1 = T.sum(abs(layer4.params[0])) + T.sum(abs(layer3.params[0])) + T.sum(
        abs(layer2.params[0])) + T.sum(abs(layer1.params[0])) + T.sum(
            abs(layer0.params[0]))
    # symbolic Theano variable that represents the squared L2 term
    L2_sqr = T.sum(layer4.params[0]**2) + T.sum(layer3.params[0]**2) + T.sum(
        layer2.params[0]**2) + T.sum(layer1.params[0]**2) + T.sum(
            layer0.params[0]**2)
    # the loss
    cost = layer4.negative_log_likelihood(y) + L1_reg * L1 + L2_reg * L2_sqr

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    start_time = time.clock()

    epoch = 0
    done_looping = False
    cost_ij = 0
    train_files_num = 600
    val_files_num = 100

    startc = time.clock()
    while (epoch < n_epochs) and (not done_looping):
        endc = time.clock()
        print('epoch %i, took %.2f minutes' % \
                                  (epoch, (endc - startc) / 60.))
        startc = time.clock()
        epoch = epoch + 1
        for nTrainSet in xrange(1, train_files_num + 1):
            # load next train data
            if nTrainSet % 50 == 0:
                print 'training @ nTrainSet =  ', nTrainSet, ', cost = ', cost_ij
            filename = train_dir + 'rep_train_data_' + str(
                nTrainSet) + '.gzip.h5'
            datasets = load_next_data(filename)
            ns_train_set_x, ns_train_set_y = datasets
            train_set_x.set_value(ns_train_set_x, borrow=True)
            shared_train_set_y.set_value(numpy.asarray(
                ns_train_set_y, dtype=theano.config.floatX),
                                         borrow=True)
            n_train_batches = train_set_x.get_value(borrow=True).shape[0]
            n_train_batches /= batch_size

            # train
            for minibatch_index in xrange(n_train_batches):

                # training itself
                # --------------------------------------
                cost_ij = train_model(minibatch_index)
                # -------------------------

        # at the end of each epoch run validation
        this_validation_loss = 0
        for nValSet in xrange(1, val_files_num + 1):
            filename = valid_dir + 'rep_valid_data_' + str(
                nValSet) + '.gzip.h5'
            datasets = load_next_data(filename)
            ns_valid_set_x, ns_valid_set_y = datasets
            valid_set_x.set_value(ns_valid_set_x, borrow=True)
            shared_valid_set_y.set_value(numpy.asarray(
                ns_valid_set_y, dtype=theano.config.floatX),
                                         borrow=True)
            n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
            n_valid_batches /= batch_size

            # compute zero-one loss on validation set
            validation_losses = [
                validate_model(i) for i in xrange(n_valid_batches)
            ]
            this_validation_loss += numpy.mean(validation_losses)
        this_validation_loss /= (val_files_num)
        print('epoch %i, minibatch %i/%i, validation error %f %%' % \
              (epoch, minibatch_index + 1, n_train_batches, \
               this_validation_loss * 100.))

        # save snapshots
        print 'saving weights state, epoch = ', epoch
        f = file(weights_dir + 'weights_epoch' + str(epoch) + '.save', 'wb')
        state_L0 = layer0.__getstate__()
        cPickle.dump(state_L0, f, protocol=cPickle.HIGHEST_PROTOCOL)
        state_L1 = layer1.__getstate__()
        cPickle.dump(state_L1, f, protocol=cPickle.HIGHEST_PROTOCOL)
        state_L2 = layer2.__getstate__()
        cPickle.dump(state_L2, f, protocol=cPickle.HIGHEST_PROTOCOL)
        state_L3 = layer3.__getstate__()
        cPickle.dump(state_L3, f, protocol=cPickle.HIGHEST_PROTOCOL)
        state_L4 = layer4.__getstate__()
        cPickle.dump(state_L4, f, protocol=cPickle.HIGHEST_PROTOCOL)
        f.close()

    end_time = time.clock()
    print('Optimization complete.')
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
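The L1 and squared-L2 terms above are spelled out layer by layer; the same penalties can be built with a loop over the weight matrices (here the layer*.params[0] entries). A standalone sketch with toy shared matrices standing in for the layer weights:

import numpy as np
import theano
import theano.tensor as T

# toy matrices standing in for layer4.params[0], ..., layer0.params[0]
weight_mats = [theano.shared(np.random.randn(4, 3)),
               theano.shared(np.random.randn(3, 2))]

L1 = sum(T.sum(abs(W)) for W in weight_mats)
L2_sqr = sum(T.sum(W ** 2) for W in weight_mats)
print(theano.function([], [L1, L2_sqr])())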
Example #52
0
    def __init__(self,
                 d_v,
                 d_e,
                 d_t,
                 optimizer,
                 optimizer_args,
                 np_rng,
                 th_rng,
                 n_classes=0,
                 encoder_layers=1,
                 generator_layers=0,
                 generator_transform=None,
                 use_interactions=False,
                 clip_gradients=False,
                 init_bias=None,
                 train_bias=False,
                 scale=6.0,
                 encode_labels=False,
                 l1_inter_factor=1.0,
                 time_penalty=False,
                 encoder_shortcut=False,
                 generator_shortcut=False):

        self.d_v = d_v  # vocabulary size
        self.d_e = d_e  # dimensionality of encoder
        self.d_t = d_t  # number of topics
        self.n_classes = n_classes  # number of classes
        assert encoder_layers == 1 or encoder_layers == 2
        self.n_encoder_layers = encoder_layers
        assert generator_layers == 0 or generator_layers == 1 or generator_layers == 2 or generator_layers == 4
        self.n_generator_layers = generator_layers

        # set various options
        self.generator_transform = generator_transform  # transform to apply after the generator
        self.use_interactions = use_interactions  # use interactions between topics and labels
        self.encode_labels = encode_labels  # feed labels into the encoder
        self.l1_inter_factor = l1_inter_factor  # factor by which to multiply L1 penalty on interactions
        self.encoder_shortcut = encoder_shortcut
        self.generator_shortcut = generator_shortcut

        # create parameter matrices and biases
        self.W_encoder_1 = common_theano.init_param('W_encoder_1', (d_e, d_v),
                                                    np_rng,
                                                    scale=scale)
        self.b_encoder_1 = common_theano.init_param('b_encoder_1', (d_e, ),
                                                    np_rng,
                                                    scale=0.0)

        if n_classes > 1:
            self.W_encoder_label = common_theano.init_param('W_encoder_label',
                                                            (d_e, n_classes),
                                                            np_rng,
                                                            scale=scale)
        else:
            self.W_encoder_label = common_theano.init_param(
                'W_encoder_label', (d_e, n_classes),
                np_rng,
                values=np.zeros((d_e, n_classes), dtype=np.float32))

        self.W_encoder_2 = common_theano.init_param('W_encoder_2', (d_e, d_e),
                                                    np_rng,
                                                    scale=scale)
        self.b_encoder_2 = common_theano.init_param('b_encoder_2', (d_e, ),
                                                    np_rng,
                                                    scale=0.0)

        self.W_encoder_shortcut = common_theano.init_param(
            'W_encoder_shortcut', (d_e, d_v), np_rng, scale=scale)

        self.W_mu = common_theano.init_param('W_mu', (d_t, d_e),
                                             np_rng,
                                             scale=scale)
        self.b_mu = common_theano.init_param('b_mu', (d_t, ),
                                             np_rng,
                                             scale=0.0)

        self.W_sigma = common_theano.init_param('W_sigma', (d_t, d_e),
                                                np_rng,
                                                scale=scale,
                                                values=np.zeros((d_t, d_e)))
        self.b_sigma = common_theano.init_param('b_sigma', (d_t, ),
                                                np_rng,
                                                scale=0.0,
                                                values=np.array([-4] * d_t))

        self.W_generator_1 = common_theano.init_param('W_generator_1',
                                                      (d_t, d_t),
                                                      np_rng,
                                                      scale=scale)
        self.b_generator_1 = common_theano.init_param('b_generator_1', (d_t, ),
                                                      np_rng,
                                                      scale=0.0)

        self.W_generator_2 = common_theano.init_param('W_generator_2',
                                                      (d_t, d_t),
                                                      np_rng,
                                                      scale=scale)
        self.b_generator_2 = common_theano.init_param('b_generator_2', (d_t, ),
                                                      np_rng,
                                                      scale=0.0)

        self.W_generator_3 = common_theano.init_param('W_generator_3',
                                                      (d_t, d_t),
                                                      np_rng,
                                                      scale=scale)
        self.b_generator_3 = common_theano.init_param('b_generator_3', (d_t, ),
                                                      np_rng,
                                                      scale=0.0)

        self.W_generator_4 = common_theano.init_param('W_generator_4',
                                                      (d_t, d_t),
                                                      np_rng,
                                                      scale=scale)
        self.b_generator_4 = common_theano.init_param('b_generator_4', (d_t, ),
                                                      np_rng,
                                                      scale=0.0)

        self.W_decoder = common_theano.init_param('W_decoder', (d_v, d_t),
                                                  np_rng,
                                                  scale=scale)
        self.b_decoder = common_theano.init_param('b_decoder', (d_v, ),
                                                  np_rng,
                                                  scale=0.0)

        self.W_decoder_label = common_theano.init_param('W_decoder_label',
                                                        (d_v, n_classes),
                                                        np_rng,
                                                        scale=scale)
        self.W_decoder_inter = common_theano.init_param('W_decoder_inter',
                                                        (d_v, d_t * n_classes),
                                                        np_rng,
                                                        scale=scale)

        # set the decoder bias to the background frequency
        if init_bias is not None:
            self.b_decoder = common_theano.init_param('b_decoder', (d_v, ),
                                                      np_rng,
                                                      values=init_bias)

        # create basic sets of parameters which we will use to tell the model what to update
        self.params = [
            self.W_encoder_1, self.b_encoder_1, self.W_mu, self.b_mu,
            self.W_sigma, self.b_sigma, self.W_decoder
        ]
        self.param_shapes = [(d_e, d_v), (d_e, ), (d_t, d_e), (d_t, ),
                             (d_t, d_e), (d_t, ), (d_v, d_t)]

        self.encoder_params = [
            self.W_encoder_1, self.b_encoder_1, self.W_mu, self.b_mu,
            self.W_sigma, self.b_sigma
        ]
        self.encoder_param_shapes = [(d_e, d_v), (d_e, ), (d_t, d_e), (d_t, ),
                                     (d_t, d_e), (d_t, )]

        self.generator_params = []
        self.generator_param_shapes = []

        # add additional parameters to sets, depending on configuration
        if train_bias:
            self.params.append(self.b_decoder)
            self.param_shapes.append((d_v, ))
            self.decoder_params = [self.W_decoder, self.b_decoder]
            self.decoder_param_shapes = [(d_v, d_t), (d_v, )]
        else:
            self.decoder_params = [self.W_decoder]
            self.decoder_param_shapes = [(d_v, d_t)]

        # add parameters for labels (covariates)
        if self.n_classes > 1:
            self.params.append(self.W_decoder_label)
            self.param_shapes.append((d_v, n_classes))
            self.decoder_params.extend([self.W_decoder_label])
            self.decoder_param_shapes.extend([(d_v, n_classes)])
            if use_interactions:
                self.params.append(self.W_decoder_inter)
                self.param_shapes.append((d_v, d_t * n_classes))
                self.decoder_params.extend([self.W_decoder_inter])
                self.decoder_param_shapes.extend([(d_v, d_t * n_classes)])
            if encode_labels:
                self.params.append(self.W_encoder_label)
                self.param_shapes.append((d_e, n_classes))
                self.encoder_params.extend([self.W_encoder_label])
                self.encoder_param_shapes.extend([(d_e, n_classes)])
        self.label_only_params = [self.W_decoder_label]
        self.label_only_param_shapes = [(d_v, n_classes)]

        # add encoder parameters depending on number of layers
        if self.n_encoder_layers > 1:
            self.params.extend([self.W_encoder_2, self.b_encoder_2])
            self.param_shapes.extend([(d_e, d_e), (d_e, )])
            self.encoder_params.extend([self.W_encoder_2, self.b_encoder_2])
            self.encoder_param_shapes.extend([(d_e, d_e), (d_e, )])
        if self.encoder_shortcut:
            self.params.extend([self.W_encoder_shortcut])
            self.param_shapes.extend([(d_e, d_v)])
            self.encoder_params.extend([self.W_encoder_shortcut])
            self.encoder_param_shapes.extend([(d_e, d_v)])

        # add generator parameters depending on number of layers
        if self.n_generator_layers > 0:
            self.params.extend([self.W_generator_1, self.b_generator_1])
            self.param_shapes.extend([(d_t, d_t), (d_t, )])
            self.generator_params.extend(
                [self.W_generator_1, self.b_generator_1])
            self.generator_param_shapes.extend([(d_t, d_t), (d_t, )])

        if self.n_generator_layers > 1:
            self.params.extend([self.W_generator_2, self.b_generator_2])
            self.param_shapes.extend([(d_t, d_t), (d_t, )])
            self.generator_params.extend(
                [self.W_generator_2, self.b_generator_2])
            self.generator_param_shapes.extend([(d_t, d_t), (d_t, )])

        if self.n_generator_layers > 2:
            self.params.extend([
                self.W_generator_3, self.b_generator_3, self.W_generator_4,
                self.b_generator_4
            ])
            self.param_shapes.extend([(d_t, d_t), (d_t, ), (d_t, d_t),
                                      (d_t, )])
            self.generator_params.extend([
                self.W_generator_3, self.b_generator_3, self.W_generator_4,
                self.b_generator_4
            ])
            self.generator_param_shapes.extend([(d_t, d_t), (d_t, ),
                                                (d_t, d_t), (d_t, )])

        # declare variables that will be given as inputs to functions to be declared below
        x = T.vector('x', dtype=theano.config.floatX)  # normalized vector of counts for one item
        y = T.vector('y', dtype=theano.config.floatX)  # vector of labels for one item
        indices = T.ivector('indices')  # vector of vocab indices (easier to evaluate log prob)
        lr = T.fscalar('lr')  # learning rate
        l1_strength = T.fscalar('l1_strength')  # strength of the L1 penalty
        kl_strength = T.fscalar('kl_strength')  # weight on the KL term

        n_words = T.shape(indices)
        # the two variables below are just for debugging
        n_words_print = theano.printing.Print('n_words')(
            T.shape(indices)[0])  # for debugging
        x_sum = theano.printing.Print('x_sum')(T.sum(x))  # for debugging

        # encode one item to mean and variance vectors
        mu, log_sigma_sq = self.encoder(x, y)

        # take a random sample from the corresponding multivariate normal
        h = self.sampler(mu, log_sigma_sq, th_rng)

        # compute the KL divergence from the prior
        KLD = -0.5 * T.sum(1 + log_sigma_sq - T.square(mu) -
                           T.exp(log_sigma_sq))

        # generate a document representation of dimensionality == n_topics
        r = self.generator(h)

        # decode back into a distribution over the vocabulary
        p_x_given_h = self.decoder(r, y)

        # evaluate the likelihood
        nll_term = -T.sum(
            T.log(p_x_given_h[T.zeros(n_words, dtype='int32'), indices]) +
            1e-32)

        # compute the loss
        loss = nll_term + KLD * kl_strength

        # add an L1 penalty to the decoder terms
        if time_penalty and n_classes > 1:
            penalty = common_theano.col_diff_L1(l1_strength,
                                                self.W_decoder_label,
                                                n_classes)
        else:
            penalty = common_theano.L1(l1_strength, self.W_decoder)
            if n_classes > 1:
                penalty += common_theano.L1(l1_strength, self.W_decoder_label)
                if use_interactions:
                    penalty += common_theano.L1(
                        l1_strength * self.l1_inter_factor,
                        self.W_decoder_inter)

        # declare some alternate function for decoding from the mean
        r_mu = self.generator(mu)
        p_x_given_x = self.decoder(r_mu, y)
        nll_term_mu = -T.sum(
            T.log(p_x_given_x[T.zeros(n_words, dtype='int32'), indices]) +
            1e-32)

        # declare some alternate functions for pretraining from a fixed document representation (r)
        pretrain_r = T.vector('pretrain_r', dtype=theano.config.floatX)
        p_x_given_pretrain_h = self.decoder(pretrain_r, y)
        pretrain_loss = -T.sum(
            T.log(p_x_given_pretrain_h[T.zeros(n_words, dtype='int32'),
                                       indices]) + 1e-32)

        # declare some alternate functions for only using labels
        p_x_given_y_only = self.decoder_label_only(y)
        nll_term_y_only = -T.sum(
            T.log(p_x_given_y_only[T.zeros(n_words, dtype='int32'), indices]) +
            1e-32)

        # compute gradients
        gradients = [
            T.cast(T.grad(loss + penalty, param, disconnected_inputs='warn'),
                   dtype=theano.config.floatX) for param in self.params
        ]
        encoder_gradients = [
            T.cast(T.grad(loss, param, disconnected_inputs='warn'),
                   dtype=theano.config.floatX) for param in self.encoder_params
        ]
        generator_gradients = [
            T.cast(T.grad(loss, param, disconnected_inputs='warn'),
                   dtype=theano.config.floatX)
            for param in self.generator_params
        ]
        decoder_gradients = [
            T.cast(T.grad(loss + penalty, param, disconnected_inputs='warn'),
                   dtype=theano.config.floatX) for param in self.decoder_params
        ]
        pretrain_gradients = [
            T.cast(T.grad(pretrain_loss + penalty,
                          param,
                          disconnected_inputs='warn'),
                   dtype=theano.config.floatX) for param in self.decoder_params
        ]
        label_only_gradients = [
            T.cast(T.grad(nll_term_y_only + penalty,
                          param,
                          disconnected_inputs='warn'),
                   dtype=theano.config.floatX)
            for param in self.label_only_params
        ]

        # optionally clip gradients
        if clip_gradients:
            gradients = common_theano.clip_gradients(gradients, 5)
            encoder_gradients = common_theano.clip_gradients(
                encoder_gradients, 5)
            generator_gradients = common_theano.clip_gradients(
                generator_gradients, 5)
            decoder_gradients = common_theano.clip_gradients(
                decoder_gradients, 5)
            pretrain_gradients = common_theano.clip_gradients(
                pretrain_gradients, 5)
            label_only_gradients = common_theano.clip_gradients(
                label_only_gradients, 5)

        # create the updates for various sets of parameters
        updates = optimizer(self.params, self.param_shapes, gradients, lr,
                            optimizer_args)
        encoder_updates = optimizer(self.encoder_params,
                                    self.encoder_param_shapes,
                                    encoder_gradients, lr, optimizer_args)
        generator_updates = optimizer(self.generator_params,
                                      self.generator_param_shapes,
                                      generator_gradients, lr, optimizer_args)
        decoder_updates = optimizer(self.decoder_params,
                                    self.decoder_param_shapes,
                                    decoder_gradients, lr, optimizer_args)
        other_updates = optimizer(
            self.encoder_params + self.generator_params,
            self.encoder_param_shapes + self.generator_param_shapes,
            encoder_gradients + generator_gradients, lr, optimizer_args)
        pretrain_updates = optimizer(self.decoder_params,
                                     self.decoder_param_shapes,
                                     pretrain_gradients, lr, optimizer_args)
        label_only_updates = optimizer(self.label_only_params,
                                       self.label_only_param_shapes,
                                       label_only_gradients, lr,
                                       optimizer_args)

        # declare the available methods for this class
        self.test_input = theano.function(inputs=[x, indices],
                                          outputs=[n_words_print, x_sum])
        self.train = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=updates,
            on_unused_input='ignore')
        self.train_encoder = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=encoder_updates,
            on_unused_input='ignore')
        self.train_generator = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=generator_updates,
            on_unused_input='ignore')
        self.train_decoder = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=decoder_updates,
            on_unused_input='ignore')
        self.train_not_decoder = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=other_updates,
            on_unused_input='ignore')
        self.pretrain_decoder = theano.function(
            inputs=[indices, y, pretrain_r, lr, l1_strength, kl_strength],
            outputs=[pretrain_loss],
            updates=pretrain_updates,
            on_unused_input='ignore')
        self.encode = theano.function(inputs=[x, y],
                                      outputs=[mu, log_sigma_sq],
                                      on_unused_input='ignore')
        self.decode = theano.function(inputs=[pretrain_r, y],
                                      outputs=[p_x_given_pretrain_h],
                                      on_unused_input='ignore')
        self.sample = theano.function(inputs=[x, y],
                                      outputs=h,
                                      on_unused_input='ignore')
        self.get_mean_doc_rep = theano.function(inputs=[x, y],
                                                outputs=r_mu,
                                                on_unused_input='ignore')
        self.encode_and_decode = theano.function(inputs=[x, y],
                                                 outputs=p_x_given_x,
                                                 on_unused_input='ignore')
        self.neg_log_likelihood = theano.function(inputs=[x, indices, y],
                                                  outputs=[nll_term, KLD],
                                                  on_unused_input='ignore')
        self.neg_log_likelihood_mu = theano.function(
            inputs=[x, indices, y],
            outputs=[nll_term_mu, KLD],
            on_unused_input='ignore')
        self.train_label_only = theano.function(
            inputs=[indices, y, lr, l1_strength],
            outputs=[nll_term_y_only, penalty],
            updates=label_only_updates)
        self.neg_log_likelihood_label_only = theano.function(
            inputs=[indices, y], outputs=nll_term_y_only)
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-2,
            mu=0.9,
            decay=0.9,
            epochs=10,
            batch_sz=100,
            show_fig=False):
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid = Xvalid.astype(np.float32)
        Yvalid = Yvalid.astype(np.int32)

        self.rng = RandomStreams()

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1
        W = np.random.randn(M1, K) * np.sqrt(2.0 / M1)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY_train = self.forward_train(thX)

        # this cost is for training
        cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))
        updates = momentum_updates(cost, self.params, learning_rate, mu)

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # for evaluation and prediction
        pY_predict = self.forward_predict(thX)
        cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))
        prediction = self.predict(thX)
        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                train_op(Xbatch, Ybatch)

                if j % 50 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                          "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
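The fit method above leans on a few helpers (momentum_updates, error_rate, shuffle) that are defined elsewhere in the example's codebase. Below is a minimal sketch of what such helpers might look like, assuming classical momentum SGD; the names and signatures are inferred from the calls above, not copied from the original.

import numpy as np
import theano
import theano.tensor as T


def momentum_updates(cost, params, learning_rate, mu):
    # Classical momentum: keep a velocity per parameter and move along it.
    grads = T.grad(cost, params)
    velocities = [theano.shared(np.zeros_like(p.get_value())) for p in params]
    updates = []
    for p, v, g in zip(params, velocities, grads):
        v_new = mu * v - learning_rate * g
        updates.append((v, v_new))
        updates.append((p, p + v_new))
    return updates


def error_rate(targets, predictions):
    # Fraction of misclassified samples.
    return np.mean(targets != predictions)


def shuffle(X, Y):
    # Shuffle features and labels in unison.
    idx = np.random.permutation(len(Y))
    return X[idx], Y[idx]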
Example #54
0
def train_conv_net(datasets,
                   U,
                   lr_decay=0.95,
                   img_w=300,
                   filter_hs=[3, 4, 5],
                   conv_non_linear="relu",
                   hidden_units=[100, 3],
                   shuffle_batch=True,
                   n_epochs=25,
                   sqr_norm_lim=9,
                   non_static=True,
                   batch_size=50,
                   activations=[Iden],
                   dropout_rate=[0.5]):
    """
    Train a simple conv net
    img_h = sentence length (padded where necessary)
    img_w = word vector length (300 for word2vec)
    filter_hs = filter window sizes    
    hidden_units = [x,y]: x is the number of feature maps (per filter window), and y is the size of the penultimate layer
    sqr_norm_lim = s^2 in the paper
    lr_decay = adadelta decay parameter
    """
    rng = np.random.RandomState(3435)
    img_h = len(datasets[0][0]) - 1
    filter_w = img_w
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    parameters = [("image shape", img_h, img_w),
                  ("filter shape", filter_shapes),
                  ("hidden_units", hidden_units), ("dropout", dropout_rate),
                  ("batch_size", batch_size), ("non_static", non_static),
                  ("learn_decay", lr_decay),
                  ("conv_non_linear", conv_non_linear),
                  ("non_static", non_static), ("sqr_norm_lim", sqr_norm_lim),
                  ("shuffle_batch", shuffle_batch)]
    print parameters

    #define model architecture
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    Words = theano.shared(value=U, name="Words")
    zero_vec_tensor = T.vector()
    zero_vec = np.zeros(img_w)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[
                                   (Words,
                                    T.set_subtensor(Words[0, :],
                                                    zero_vec_tensor))
                               ],
                               allow_input_downcast=True)
    layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], 1, x.shape[1], Words.shape[1]))
    conv_layers = []
    layer1_inputs = []
    print 'starting loop'
    for i in xrange(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = LeNetConvPoolLayer(rng,
                                        input=layer0_input,
                                        image_shape=(batch_size, 1, img_h,
                                                     img_w),
                                        filter_shape=filter_shape,
                                        poolsize=pool_size,
                                        non_linear=conv_non_linear)
        layer1_input = conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)
    layer1_input = T.concatenate(layer1_inputs, 1)
    hidden_units[0] = feature_maps * len(filter_hs)
    classifier = MLPDropout(rng,
                            input=layer1_input,
                            layer_sizes=hidden_units,
                            activations=activations,
                            dropout_rates=dropout_rate)

    print 'defining params'
    #define parameters of the model and update functions using adadelta
    params = classifier.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    if non_static:
        #if word vectors are allowed to change, add them as model parameters
        params += [Words]
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    #shuffle dataset and assign to mini batches. if dataset size is not a multiple of mini batches, replicate
    #extra data (at random)
    np.random.seed(3435)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]
    new_data = np.random.permutation(new_data)
    n_batches = new_data.shape[0] / batch_size
    n_train_batches = int(np.round(n_batches * 0.9))
    #divide train set into train/val sets
    test_set_x = datasets[1][:, :img_h]
    test_set_y = np.asarray(datasets[1][:, -1], "int32")
    train_set = new_data[:n_train_batches * batch_size, :]
    val_set = new_data[n_train_batches * batch_size:, :]
    train_set_x, train_set_y = shared_dataset(
        (train_set[:, :img_h], train_set[:, -1]))
    val_set_x, val_set_y = shared_dataset((val_set[:, :img_h], val_set[:, -1]))
    n_val_batches = n_batches - n_train_batches
    val_model = theano.function(
        [index],
        classifier.errors(y),
        givens={
            x: val_set_x[index * batch_size:(index + 1) * batch_size],
            y: val_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)

    #compile theano functions to get train/val/test errors
    test_model = theano.function(
        [index],
        classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)
    train_model = theano.function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)
    test_pred_layers = []
    test_size = test_set_x.shape[0]
    test_layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (test_size, 1, img_h, Words.shape[1]))
    for conv_layer in conv_layers:
        test_layer0_output = conv_layer.predict(test_layer0_input, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))
    test_layer1_input = T.concatenate(test_pred_layers, 1)
    test_y_pred = classifier.predict(test_layer1_input)

    test_error = T.mean(T.neq(test_y_pred, y))
    test_model_all = theano.function([x, y],
                                     test_error,
                                     allow_input_downcast=True)

    #start training over mini-batches

    print 'sizes: '
    print 'test: '
    print test_size
    print '... training'
    print 'n_train_batches: ' + str(n_train_batches)
    epoch = 0
    best_val_perf = 0
    val_perf = 0
    test_perf = 0
    cost_epoch = 0
    while (epoch < n_epochs):
        print 'epoch: ' + str(epoch)
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(
                    range(n_train_batches)):
                if minibatch_index >= n_train_batches: minibatch_index -= 1
                print 'if: minibatch_index: ' + str(minibatch_index)
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                if minibatch_index >= n_train_batches: minibatch_index -= 1
                print 'else: minibatch_index: ' + str(minibatch_index)

                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [test_model(i) for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean(train_losses)
        val_losses = [val_model(i) for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        print(
            'epoch: %i, training time: %.2f secs, train perf: %.2f %%, val perf: %.2f %%'
            % (epoch, time.time() - start_time, train_perf * 100.,
               val_perf * 100.))
        if val_perf >= best_val_perf:
            best_val_perf = val_perf
            test_loss = test_model_all(test_set_x, test_set_y)
            test_perf = 1 - test_loss
    return test_perf
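train_conv_net returns the test accuracy of the model that scored best on the held-out validation split. A hypothetical call might look like the following, assuming `datasets` holds padded word-index matrices with the label in the last column and `U` is a (vocab_size, 300) word2vec matrix; the values are illustrative only, not the original experiment.

test_perf = train_conv_net(datasets,
                           U,
                           lr_decay=0.95,
                           filter_hs=[3, 4, 5],
                           conv_non_linear="relu",
                           hidden_units=[100, 3],
                           shuffle_batch=True,
                           n_epochs=25,
                           batch_size=50,
                           dropout_rate=[0.5])
print "final test accuracy: " + str(test_perf)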
Example #55
0
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=183,
                 hidden_layers_sizes=[250, 250],
                 n_outs=1,
                 corruption_levels=[0.1, 0.1],
                 dropout_rate=0.1,
                 lambda1=0,
                 lambda2=0,
                 non_lin=None):
        """        
		:type numpy_rng: numpy.random.RandomState
		:param numpy_rng: numpy random number generator used to draw initial
					weights
		:type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
		:param theano_rng: Theano random generator; if None is given one is
						   generated based on a seed drawn from `rng`
		:type n_ins: int
		:param n_ins: dimension of the input to the Model

		:type hidden_layers_sizes: list of ints
		:param hidden_layers_sizes: sizes of intermediate layers. 

		:type n_outs: int
		:param n_outs:  dimension of the output of the network. Always 1 for a 
						regression problem.

		:type corruption_levels: list of float
		:param corruption_levels: amount of corruption to use for each layer

		:type dropout_rate: float
		:param dropout_rate: probability of dropping a hidden unit

		:type non_lin: function
		:param non_lin: nonlinear activation function used in all layers

		"""
        # Initializes parameters.
        self.hidden_layers = []
        self.dA_layers = []
        self.params = []
        self.dropout_masks = []
        self.n_layers = len(hidden_layers_sizes)
        self.L1 = 0
        self.L2_sqr = 0
        self.n_hidden = hidden_layers_sizes[0]
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # Allocates symbolic variables for the data.
        self.x = T.matrix('x', dtype='float32')
        self.o = T.ivector('o')
        self.at_risk = T.ivector('at_risk')
        self.is_train = T.iscalar('is_train')
        self.masks = [
            T.lmatrix('mask_' + str(i)) for i in range(self.n_layers)
        ]

        # Linear cox regression with no hidden layers.
        if self.n_layers == 0:
            self.risk_layer = RiskLayer(input=self.x,
                                        n_in=n_ins,
                                        n_out=n_outs,
                                        rng=numpy_rng)
        else:
            # Constructs the intermediate layers.
            for i in xrange(self.n_layers):
                if i == 0:
                    input_size = n_ins
                    layer_input = self.x
                else:
                    input_size = hidden_layers_sizes[i - 1]
                    layer_input = self.hidden_layers[-1].output

                if dropout_rate > 0:
                    hidden_layer = DropoutHiddenLayer(
                        rng=numpy_rng,
                        input=layer_input,
                        n_in=input_size,
                        n_out=hidden_layers_sizes[i],
                        activation=non_lin,
                        dropout_rate=dropout_rate,
                        is_train=self.is_train,
                        mask=self.masks[i])
                else:
                    hidden_layer = HiddenLayer(rng=numpy_rng,
                                               input=layer_input,
                                               n_in=input_size,
                                               n_out=hidden_layers_sizes[i],
                                               activation=non_lin)

                # Adds the layer to the stack of layers.
                self.hidden_layers.append(hidden_layer)
                self.params.extend(hidden_layer.params)

                # Constructs an autoencoder that shares weights with this layer.
                dA_layer = dA(numpy_rng=numpy_rng,
                              theano_rng=theano_rng,
                              input=layer_input,
                              n_visible=input_size,
                              n_hidden=hidden_layers_sizes[i],
                              W=hidden_layer.W,
                              bhid=hidden_layer.b,
                              non_lin=non_lin)
                self.dA_layers.append(dA_layer)

        if self.n_layers > 0:
            # Regularizes the weights of the last hidden layer.
            self.L1 += abs(hidden_layer.W).sum()
            self.L2_sqr += (hidden_layer.W**2).sum()

            # Adds a risk prediction layer on top of the stack.
            self.risk_layer = RiskLayer(input=self.hidden_layers[-1].output,
                                        n_in=hidden_layers_sizes[-1],
                                        n_out=n_outs,
                                        rng=numpy_rng)

        self.L1 += abs(self.risk_layer.W).sum()
        self.L2_sqr += (self.risk_layer.W**2).sum()
        self.params.extend(self.risk_layer.params)
        self.regularizers = lambda1 * self.L1 + lambda2 * self.L2_sqr
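The RiskLayer attached above is defined elsewhere; for survival data its training criterion is typically the negative Cox partial log-likelihood, which is where the `o` (event indicator) and `at_risk` vectors declared earlier come in. Here is a sketch of such a cost in Theano, assuming samples are sorted by survival time; it illustrates the technique and is not the original layer code.

import theano.tensor as T
import theano.tensor.extra_ops as Te


def negative_cox_log_likelihood(prediction, o, at_risk):
    # prediction: (n,) predicted log-hazard scores, sorted by survival time
    # o:          (n,) event indicator, 1 = event observed, 0 = censored
    # at_risk:    (n,) index of the first sample still at risk for each sample
    partial_sum = Te.cumsum(T.exp(prediction)[::-1])[::-1]  # risk-set sums
    log_at_risk = T.log(partial_sum[at_risk])
    diff = prediction - log_at_risk
    return -T.sum(diff * o)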
Example #56
0
    def __init__(self,
                 batch_size,
                 kernels,
                 input_dimensions,
                 convolution_dimensions,
                 pool_sizes,
                 stride_sizes,
                 layer_pattern,
                 relu_pattern,
                 dropout_rate,
                 rng_seed=None,
                 base_learning_rate=0.05,
                 momentum=0.8,
                 learning_decay_per_epoch=0.91,
                 l2_norm=0,
                 name="default",
                 param_index=0,
                 address='',
                 n_epochs=200,
                 batch_normalization_pattern=None,
                 batch_norm_learning_rate=0.1,
                 batch_norm_decay_per_epoch=0.95,
                 batchnorm_vals_filename=None,
                 batchnorm_slide_percent=0.):
        """
        batch_size - int - size of each batch
        kernels - int array - number of general units each layer (incl. input/output)
        input_dimensions - int array[2] -  dimensions of input
        convolution_dimensions - int array[2] array - dimensions of each convolution
        pool_sizes - int array[2] array - dimensions of pooling for each convolution
        stride_sizes - int array - length of strides for each convolutional layer (this overrides aspects of pooling behavior)
        layer_pattern - ['I','C',...,'C','F',...,'F','O'] - indicates pattern of layers
        relu_pattern - boolean array that describes if convolutional layers should be rectified; doesn't do anything for other types of layers (including input)
        dropout_rate - float - rate of dropout for network weights
        rng_seed - int - seed for random number generator; None defaults to random
        base_learning_rate - floatX - initial learning rate
        momentum - floatX - fraction of the previous update that carries over to the next iteration
        learning_decay_per_epoch - floatX - factor for decreasing learning rate over epochs
        name - string prefix used in the filenames of the pickled network
        param_index - integer determined a priori that indexes the parameter configuration and appears in the filename
        batchnorm_vals_filename - produced by a separate file; pre-defines the mean and sd of each layer of the network (using the sliding estimate below may be preferable)
        batchnorm_slide_percent - analogous to momentum, but for the running estimates of batch-normalization means and standard deviations
        """
        #initialize arrays containing basic information and hyperparameters

        self.layers = []
        self.uses_batch_normalization = bool(batch_normalization_pattern)
        self.batch_norm_pattern = batch_normalization_pattern
        self.batchnorm_vals_filename = batchnorm_vals_filename
        self.batchnorm_slide_percent = batchnorm_slide_percent
        if not self.uses_batch_normalization:
            self.batch_norm_pattern = [False for _ in relu_pattern]
        self.address = address
        #replace future instances of self.kernel
        self.kernels = kernels
        self.input_dimensions = input_dimensions
        self.output_size = kernels[-1:][0]
        self.inputs = []
        self.batch_size = batch_size
        self.x = x = T.ftensor4('x')
        self.y = y = T.ivector('y')
        self.rng = np.random.RandomState(rng_seed)
        self.name = name
        self.n_epochs = n_epochs
        self.shapes = [(input_dimensions[0], input_dimensions[1])]
        print "input shape: " + str(self.shapes)
        self.convolution_dimensions = convolution_dimensions
        self.rng_seed = rng_seed
        self.layer_pattern = layer_pattern
        self.current_batch_index = 0
        self.batch_size = batch_size
        self.pool_sizes = pool_sizes
        self.stride_sizes = stride_sizes
        self.relu_pattern = relu_pattern
        #if the rate is a float, each layer has the same rate
        if isinstance(dropout_rate, float):
            dropout_rate = [dropout_rate for _ in layer_pattern]
        self.dropout_rate = dropout_rate

        self.learning_decay_per_epoch = learning_decay_per_epoch
        self.l2_norm = l2_norm
        #get some info from prepare_image_data.py
        #files_list, outputs, y_dim = prepare_image_data.get_data()
        #self.files_list = files_list
        #self.y_dim = y_dim
        #self.outputs=outputs
        self.fetcher = prepare_image_data.fetcher(self.batch_size)
        #indexing information
        self.ratios = np.asarray([0.6, 0.2, 0.2])
        self.index = index = T.lscalar()
        #temporarily hardcoded
        self.n_train_batches = 400
        self.n_valid_batches = 120
        self.n_test_batches = 120
        self.cat_labels = self.fetcher.valid_names
        self.y_dim = len(self.cat_labels)
        self.momentum = theano.shared(np.float32(momentum))
        self.base_learning_rate = np.float32(base_learning_rate)
        self.learning_rate = theano.shared(
            np.float32(base_learning_rate * (1 - momentum)))
        self.index = index = T.lscalar()
        self.momentum_raw = momentum
        self.learning_rate_raw = self.learning_rate.get_value()
        if self.uses_batch_normalization:
            self.batch_norm_learning_rate_raw = batch_norm_learning_rate
            self.batch_norm_learning_rate = theano.shared(
                np.float32(self.batch_norm_learning_rate_raw))
        self.epoch = 0
        #initialize basic file shapes
        #recent change: changed kernel_sizes to self.kernels
        self.training_x = theano.shared(np.zeros(
            shape=(batch_size, self.kernels[0], input_dimensions[0],
                   input_dimensions[1]),
            dtype=theano.config.floatX),
                                        borrow=True)
        self.input = self.x.reshape((self.batch_size, self.kernels[0],
                                     self.shapes[0][0], self.shapes[0][1]))
        #updated database-based retrieval
        self.training_y = theano.shared(np.zeros(shape=self.batch_size,
                                                 dtype=np.int32),
                                        borrow=True)
        self.testing_x = theano.shared(np.zeros(
            shape=(self.batch_size, kernels[0], input_dimensions[0],
                   input_dimensions[1]),
            dtype=theano.config.floatX),
                                       borrow=True)
        self.testing_y = theano.shared(np.zeros(shape=self.batch_size,
                                                dtype=np.int32),
                                       borrow=True)
        self.validation_x = theano.shared(np.zeros(
            shape=(self.batch_size, kernels[0], input_dimensions[0],
                   input_dimensions[1]),
            dtype=theano.config.floatX),
                                          borrow=True)
        self.validation_y = theano.shared(np.zeros(shape=self.batch_size,
                                                   dtype=np.int32),
                                          borrow=True)
        #load fixed mean and sd values if file exists
        if self.batchnorm_vals_filename is not None:
            # pickle.load expects a file object rather than a filename
            with open(self.batchnorm_vals_filename, 'rb') as f:
                self.batchnorm_fixed_values = pickle.load(f)
        else:
            self.batchnorm_fixed_values = [
                None for _ in range(len(layer_pattern))
            ]
        ###begin creation of layers
        #I = "input";C = "Convolutional"; F = "Fully-Connected", O = "Output"
        for i, pattern in enumerate(layer_pattern):
            if pattern == "I":
                self.inputs.append(self.input)
                print 'inserted input'
            elif pattern == "C":

                self.layers.append(
                    NetConvPoolLayer(
                        self.rng,
                        input = self.inputs[i-1],
                        image_shape=(
                            batch_size,kernels[i-1],
                            self.shapes[i-1][0],
                            self.shapes[i-1][1]
                            ),
                        filter_shape=(
                            kernels[i],
                            kernels[i-1],
                            self.convolution_dimensions[i-1][0],
                            self.convolution_dimensions[i-1][1]),
                        poolsize = pool_sizes[i-1],
                        stride = stride_sizes[i-1],
                        dropout_percent = self.dropout_rate[i],
                        batch_norm = self.batch_norm_pattern[i],
                        batchnorm_slide_percent = self.batchnorm_slide_percent,
                        precalculated_batchnorm_values = self.\
                            batchnorm_fixed_values[i-1])
                    )
                x_new = (self.shapes[i - 1][0] -
                         self.convolution_dimensions[i - 1][0] + 1 -
                         (pool_sizes[i - 1][0] - stride_sizes[i - 1][0])) / \
                    stride_sizes[i - 1][0]
                y_new = (self.shapes[i - 1][1] -
                         self.convolution_dimensions[i - 1][1] + 1 -
                         (pool_sizes[i - 1][1] - stride_sizes[i - 1][1])) / \
                    stride_sizes[i - 1][1]
                self.inputs.append(self.layers[i - 1].output)
                self.shapes.append((x_new, y_new))
                print "self.shapes: " + str(self.shapes)
                print 'added convolution layer'
            elif pattern == "F":
                if layer_pattern[i - 1] == "C":
                    next_input = self.inputs[i - 1].flatten(2)
                else:
                    next_input = self.inputs[i - 1]
                self.layers.append(
                    HiddenLayer(self.rng,
                                input=next_input,
                                n_in=kernels[i - 1] * self.shapes[i - 1][0] *
                                self.shapes[i - 1][1],
                                n_out=kernels[i],
                                activation=T.tanh,
                                dropout_rate=self.dropout_rate[i]))
                self.inputs.append(self.layers[i - 1].output)
                #the shape is only used to determine dimensions of the next layer
                self.shapes.append((1, 1))  #see if this fixes issue
                print 'added fully-connected hidden layer, shape=%s' %\
                    str(self.shapes[-1])
            else:
                if layer_pattern[i - 1] == "C":
                    next_input = self.inputs[i - 1].flatten(2)
                else:
                    next_input = self.inputs[i - 1]
                self.layers.append(
                    LogisticRegression(input=next_input,
                                       n_in=kernels[i - 1],
                                       n_out=self.output_size,
                                       rng=self.rng,
                                       dropout_rate=self.dropout_rate[i]))
                last_index = i - 1
                print 'added logistic layer'
        zero = np.float32(0.)
        self.L2_penalty = theano.shared(np.float32(l2_norm))
        self.params = params  = [param for layer in self.layers \
                                     for param in layer.params]
        self.cost = self.layers[last_index].negative_log_likelihood(self.y) +\
            self.L2_penalty * (
        T.sum([T.sum(self.layers[q].W * self.layers[q].W)\
                   for q in range(len(self.layers))]))
        #updating functions (incl. momentum)
        #update 1 (only used for derivation in update #4)
        self.old_updates = [theano.shared(zero * param_i.get_value())\
                                for param_i in params]
        self.current_delta = [theano.shared(np.float32(zero * param_i.get_value()))\
                                  for param_i in params]
        self.grads = T.grad(self.cost, params)
        #update 2
        self.current_change_update = [
            (current_delta_i, self.learning_rate * grad_i +\
                 self.momentum * old_updates_i)\
                for current_delta_i,grad_i, old_updates_i in\
                zip(self.current_delta,self.grads,self.old_updates)
            ]
        #update 3
        updates = [
            ( param_i,param_i - current_delta_i) for param_i, current_delta_i in\
                zip(params,self.current_delta)]
        #self.updates = []
        #update 4 (derived from update #1)
        momentum_updates = [(old_updates_i, current_delta_i)\
                                for old_updates_i, current_delta_i in\
                                zip(self.old_updates,self.current_delta)]
        #self.momentum_updates = []
        #now batch-normalization updates when needed
        batchnorm_sliding_updates = []
        for layer in self.layers:
            if not isinstance(layer, NetConvPoolLayer):
                continue
            if layer.batchnorm_slide_percent != 0.:
                batchnorm_sliding_updates += [
                    (layer.sd_input_old, layer.sd_input),
                    # NOTE: the original pairs means_old with sd_input here;
                    # updating it from the layer's running mean is probably
                    # what was intended.
                    (layer.means_old, layer.sd_input)
                ]
        #combined updates
        self.all_updates = self.current_change_update + updates +\
            momentum_updates + batchnorm_sliding_updates
        #test model function
        self.test_model = theano.function([],
                                          self.layers[last_index].errors(
                                              self.y),
                                          givens={
                                              x: self.testing_x,
                                              y: self.testing_y
                                          })
        #validation model function
        self.validate_model = theano.function([],
                                              self.layers[last_index].errors(
                                                  self.y),
                                              givens={
                                                  x: self.validation_x,
                                                  y: self.validation_y
                                              })
        #training function
        self.train_model = theano.function([],
                                           self.cost,
                                           updates=self.all_updates,
                                           givens={
                                               x: self.training_x,
                                               y: self.training_y
                                           })
        self.patience = 20000
        self.patience_increase = 3
        self.improvement_threshold = 0.995
        self.validation_frequency = min(self.n_train_batches,
                                        self.patience // 2)
        self.best_validation_loss = np.inf
        self.best_iter = 0
        #DEPRECATED
        self.itermode = 'train'
        self.test_score = 0.
        self.start_time = timeit.default_timer()
        self.epoch = 0
        self.iter_i = 0  # renamed bc `iter` is reserved
        self.done_looping = False
        self.param_index = param_index
        #constant-defined stuff
        self.improvement_threshold = 0.995
        self.validation_frequency = min(self.n_train_batches,
                                        self.patience // 2)
        self.done_looping = False
        print 'initialized neural network object'
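The class above is only shown through its constructor, so here is a hypothetical instantiation to make the expected argument shapes concrete; the class name `ConvNet` and every value are assumptions chosen to be mutually consistent (five entries in `layer_pattern`, one convolution/pool/stride spec per 'C' layer).

net = ConvNet(batch_size=50,
              kernels=[3, 32, 64, 500, 10],
              input_dimensions=[64, 64],
              convolution_dimensions=[(5, 5), (5, 5)],
              pool_sizes=[(2, 2), (2, 2)],
              stride_sizes=[(2, 2), (2, 2)],
              layer_pattern=['I', 'C', 'C', 'F', 'O'],
              relu_pattern=[False, True, True, False, False],
              dropout_rate=0.3,
              rng_seed=1234,
              base_learning_rate=0.05,
              momentum=0.8)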
Example #57
0
def solving_logistic_regression(datapath, learning_rate=0.54, batch=500, n_epoch=30):
    ##for MNIST DATA LOADING PROCESS
    print "loading data...."
    mnist_data = upload_data(datapath)
    train, valid, test = mnist_data
    
    ##creating theano buffer for python data
    
    print 'converting data to theano shared variables'
    train_x, train_y = to_shared(train)
    valid_x, valid_y = to_shared(valid)
    test_x, test_y = to_shared(test)
    
    n_train_batch =  train[0].shape[0] // batch 
    n_valid_batch =  valid[0].shape[0] // batch
    n_test_batch  =  test[0].shape[0]  // batch
   
    
    x = T.matrix('x')
    y = T.ivector('y')
    index = T.iscalar('index')
       
    logistic = LogisticRegression(input = x,
                                  n_in = 784,
                                  n_out = 10)
    
    
    fun_valid = function(inputs  = [index],
                         outputs = logistic.error(y),
                         givens  = [(x,valid_x[index*batch:(index+1)*batch,:]),
                                    (y,valid_y[index*batch:(index+1)*batch])]
                        )   
       
    fun_test = function(inputs  = [index],
                        outputs = logistic.y_pred,
                        givens  = [(x,test_x[index*batch:(index+1)*batch,:])],
                       )    
        
    print "calaculating cost function"                
    cost = logistic.negative_log_likelihood(y) 
    
    g_W = T.grad(cost = cost,wrt = logistic.W)                                  
    g_b = T.grad(cost = cost,wrt = logistic.b)
                        
    updates = [(logistic.W, logistic.W - g_W*learning_rate),
               (logistic.b, logistic.b - g_b*learning_rate)]
               
    fun_train = function(inputs =[index],
                         outputs = logistic.params,
                         updates = updates,
                         givens = [(x,train_x[index*batch:(index+1)*batch,:]),
                                   (y,train_y[index*batch:(index+1)*batch])]
                         )
                  
                      

    ################
    #TRAINING MODEL#                      
    ################..........................................                     
    print 'training starts now -->'
    patience = 5000
    patience_increase = 2
    
    improvement = 0.96
    validation_frequency = min(n_train_batch, patience//2)    
  
    least_error = np.Inf
    epoch = 0
    done_looping = False
    
    print 'EPOCH counting .....'
    start_time = timeit.default_timer()
    while epoch < n_epoch and (not done_looping):
        for current_batch in range(n_train_batch):            
            total_batches = (epoch*n_train_batch) + current_batch
            fun_train(current_batch) 
            
            if (total_batches+1) % validation_frequency == 0:                
                this_error = [fun_valid(n) for n in range(n_valid_batch)]
                this_error = np.mean(this_error)
                
                if this_error < least_error*improvement:
                    least_error = this_error
                    patience =  max(patience,total_batches * patience_increase)
                    with open('/home/sameer/best_model.pkl', 'wb') as f:
                        pickle.dump(logistic, f)
                    
        if total_batches > patience:
            done_looping = True
        epoch += 1
        if total_batches != 0:
            print least_error
            print 'the convergence ratio is %f' %(patience/float(total_batches))
    
    end_time = timeit.default_timer()
    net_time = end_time - start_time
    print 'total time %f' %net_time
    print 'time per epoch %f' %(net_time/epoch)
    print 'the error is %f' %least_error
    print 'the total number of epochs %d' %epoch
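The `upload_data` and `to_shared` helpers used above are not shown. A plausible sketch of `to_shared`, following the usual Theano pattern of storing features as floatX and casting the labels to int32 so they can feed a T.ivector; the exact original implementation may differ.

import numpy as np
import theano
import theano.tensor as T


def to_shared(data_xy, borrow=True):
    # Wrap a (features, labels) pair in shared variables on the device.
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX),
                             borrow=borrow)
    # labels are stored as floatX but used as int32 indices
    return shared_x, T.cast(shared_y, 'int32')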
Example #58
0
import cPickle
import gzip

f = gzip.open('C:/nnets/mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()

n_train, n_test = map(lambda x: len(x[0]), [train_set, test_set])
dims = train_set[0].shape[1]
n_classes = len(set(train_set[1]))

import numpy
import theano
import theano.tensor as T

X = T.dmatrix()
y = T.ivector()

prepare_data = lambda x: (theano.shared(x[0].astype('float64')),
                          theano.shared(x[1].astype('int32')))
(training_x, training_y), (test_x, test_y), (validation_x, validation_y) = map(
    prepare_data, [train_set, test_set, valid_set])

W = theano.shared(numpy.zeros([dims, n_classes]))
b = theano.shared(numpy.zeros(n_classes))

y_hat = T.nnet.softmax(T.dot(X, W) + b)
y_pred = T.argmax(y_hat, axis=1)
test_error = T.mean(T.neq(y_pred, y))
training_error = -T.mean(T.log(y_hat)[T.arange(y.shape[0]), y])

learning_rate = 0.2
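The snippet stops after defining the errors and the learning rate. A minimal continuation under the same variable names, assuming plain gradient descent on the full training set (the original may well have used minibatches instead):

g_W, g_b = T.grad(training_error, [W, b])

train = theano.function(
    inputs=[],
    outputs=training_error,
    updates=[(W, W - learning_rate * g_W), (b, b - learning_rate * g_b)],
    givens={X: training_x, y: training_y})

test = theano.function(
    inputs=[],
    outputs=test_error,
    givens={X: test_x, y: test_y})

for epoch in range(100):
    train()
print 'test error: %f' % test()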
Example #59
0
    def fit(self,
            X,
            Y,
            learning_rate=1e-2,
            mu=0.99,
            reg=1e-12,
            epochs=400,
            batch_sz=20,
            print_period=1,
            show_fig=False):

        # X = X.astype(np.float32)
        Y = Y.astype(np.int32)

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1
        W = init_weight(M1, K)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # for momentum
        dparams = [
            theano.shared(np.zeros(p.get_value().shape)) for p in self.params
        ]

        # for rmsprop
        cache = [
            theano.shared(np.zeros(p.get_value().shape)) for p in self.params
        ]

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY = self.forward(thX)

        rcost = reg * T.sum([(p * p).sum() for p in self.params])
        cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
        prediction = self.predict(thX)
        grads = T.grad(cost, self.params)

        # momentum only
        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
        )

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                c, p = train_op(Xbatch, Ybatch)

                if j % print_period == 0:
                    costs.append(c)
                    e = np.mean(Ybatch != p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                          "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
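Note that the `cache` shared variables are created "for rmsprop" but the compiled updates use momentum only. If one wanted to actually use them, the update block above could be replaced by something like the following drop-in sketch; the decay and epsilon constants are assumptions.

        decay = 0.999
        eps = 1e-10
        updates = []
        for p, dp, c, g in zip(self.params, dparams, cache, grads):
            new_c = decay * c + (1 - decay) * g * g        # running mean of g^2
            new_dp = mu * dp - learning_rate * g / T.sqrt(new_c + eps)
            updates += [(c, new_c), (dp, new_dp), (p, p + new_dp)]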
Example #60
0
    def __init__(self,
                 embeddings,
                 height,
                 filter_hs,
                 conv_activation,
                 feature_maps,
                 output_units,
                 batch_size,
                 dropout_rates,
                 activations=[Iden]):
        """
        :param embeddings: word embeddings
        :param height: sentence length (padded as necessary)
        :param filter_hs: filter window sizes    
        :param conv_activation: activation function for the convolutional layer
        :param feature_maps: the number of feature maps (per filter window)
        :param output_units: number of output variables
        """
        rng = np.random.RandomState(3435)
        self.batch_size = batch_size

        # define model architecture
        self.index = T.lscalar()  # minibatch number
        self.x = T.imatrix('x')  # a minibatch of words
        self.y = T.ivector('y')  # corresponding outputs

        width = embeddings.shape[1]

        self.emb_layer = EmbeddingLayer(embeddings, name='Words')

        # inputs to the ConvNet go to all convolutional filters:
        image_shape = (batch_size, 1, height, width)  # e.g. (50, 1, 66, 300)
        layer0_input = self.emb_layer.output(self.x).reshape(image_shape)
        #(self.x.shape[0], 1, self.x.shape[1], width))
        self.conv_layers = []
        # outputs of the convolutional filters
        layer1_inputs = []
        filter_w = width
        for filter_h in filter_hs:
            filter_shape = (feature_maps, 1, filter_h, filter_w
                            )  # e.g. (100, 1, 7, 300)
            pool_size = (height - filter_h + 1, 1)  # e.g. (60, 1)
            conv_layer = LeNetConvPoolLayer(rng,
                                            image_shape=image_shape,
                                            filter_shape=filter_shape,
                                            poolsize=pool_size,
                                            non_linear=conv_activation)
            layer1_input = conv_layer.output(layer0_input).flatten(2)
            self.conv_layers.append(conv_layer)
            layer1_inputs.append(layer1_input)
        # inputs to the MLP
        layer1_input = T.concatenate(layer1_inputs, 1)
        layer_sizes = [feature_maps * len(filter_hs), output_units]
        # initialize MLPDropout
        MLPDropout.__init__(self,
                            rng,
                            input=layer1_input,
                            layer_sizes=layer_sizes,
                            activations=activations,
                            dropout_rates=dropout_rates)

        # add embeddings
        self.params += self.emb_layer.params
        # add parameters from convolutional layers
        for conv_layer in self.conv_layers:
            self.params += conv_layer.params
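Since the class above inherits from MLPDropout, training can reuse its dropout loss, much like Example #54 does. A hypothetical training setup follows; the class name `CNNSentenceClassifier`, the shared datasets `train_x`/`train_y`, and the reuse of `sgd_updates_adadelta` are assumptions for illustration, not part of the original file.

clf = CNNSentenceClassifier(embeddings, height=66, filter_hs=[3, 4, 5],
                            conv_activation="relu", feature_maps=100,
                            output_units=2, batch_size=50,
                            dropout_rates=[0.5])
cost = clf.dropout_negative_log_likelihood(clf.y)
updates = sgd_updates_adadelta(clf.params, cost, 0.95, 1e-6, 9)
train_model = theano.function(
    [clf.index], cost, updates=updates,
    givens={clf.x: train_x[clf.index * 50:(clf.index + 1) * 50],
            clf.y: train_y[clf.index * 50:(clf.index + 1) * 50]})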