def build(self):
        
        # input and output variables
        x = T.matrix('x')
        y = T.matrix('y')
        index = T.lscalar() 
        batch_count = T.lscalar() 
        LR = T.scalar('LR', dtype=theano.config.floatX)
        M = T.scalar('M', dtype=theano.config.floatX)

        # before the build, you work with symbolic variables
        # after the build, you work with numeric variables
        
        self.train_batch = theano.function(inputs=[index,LR,M], updates=self.model.updates(x,y,LR,M),givens={ 
                x: self.shared_x[index * self.batch_size:(index + 1) * self.batch_size], 
                y: self.shared_y[index * self.batch_size:(index + 1) * self.batch_size]},
                name = "train_batch", on_unused_input='warn')
        
        self.test_batch = theano.function(inputs=[index],outputs=self.model.errors(x,y),givens={
                x: self.shared_x[index * self.batch_size:(index + 1) * self.batch_size], 
                y: self.shared_y[index * self.batch_size:(index + 1) * self.batch_size]},
                name = "test_batch")
                
        if self.format == "DFXP" :  
            self.update_range = theano.function(inputs=[batch_count],updates=self.model.range_updates(batch_count), name = "update_range")
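# A minimal standalone sketch (not from the original source) of the pattern above:
# after compilation only numeric values (a batch index and a learning rate) are
# passed in, while `givens` slices a shared dataset symbolically. Names are made up.
import numpy as np
import theano
import theano.tensor as T

batch_size = 2
shared_x = theano.shared(np.arange(8, dtype=theano.config.floatX).reshape(4, 2))
w = theano.shared(np.zeros(2, dtype=theano.config.floatX), name='w')
x = T.matrix('x')
index = T.lscalar('index')
LR = T.scalar('LR', dtype=theano.config.floatX)
cost = T.sum(T.dot(x, w))
train_batch = theano.function(
    [index, LR], cost,
    updates=[(w, w - LR * T.grad(cost, w))],
    givens={x: shared_x[index * batch_size:(index + 1) * batch_size]})
train_batch(0, 0.01)  # numeric index and learning rate, as in the comment above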
Example #2
    def train_function_momentum(self, x1, x2, i1, i2, l1, l2, y, z):
        """Train model with momentum"""

        learning_rate = T.scalar('lr')  # learning rate to use
        regularization = T.scalar('reg')  # regularization to use
        momentum = T.scalar('mom')  # momentum to use

        cost, updates = self.get_cost_updates_momentum(learning_rate, regularization, momentum)

        train_fn = theano.function(
            inputs=[
                theano.Param(learning_rate, default=0.1),
                theano.Param(regularization, default=0.0),
                theano.Param(momentum, default=0.9)
            ],
            outputs=cost,
            updates=updates,
            givens={
                self.x1: x1,
                self.x2: x2,
                self.indices1: i1,
                self.indices2: i2,
                self.l1: l1,
                self.l2: l2,
                self.y: y,
                self.z: z
            },
            name='train_momentum',
            on_unused_input='warn'
        )

        return train_fn
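# Note (not from the original source): theano.Param was deprecated in later Theano
# releases in favour of theano.In. A minimal standalone sketch of the same
# default-input-value pattern used above:
import theano
import theano.tensor as T

lr = T.scalar('lr')
w = theano.shared(0.0, name='w')
step = theano.function([theano.In(lr, value=0.1)], w, updates=[(w, w - lr)])
step()     # uses the default lr=0.1
step(0.5)  # overrides it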
Example #3
    def get_train_fn(self, dataX, batch_size=1, k=1):
        """
        dataX: theano shared data

        dataY: theano shared label
        """
        learning_rate = T.scalar('lr')
        Beta = T.scalar('beta')
        Gamma = T.scalar('gamma')
        Sparseness = T.scalar('sparseness')

        cost, updates = self._get_cost_update(lr=learning_rate,
                                              beta=Beta,
                                              gamma=Gamma,
                                              s_constrain=Sparseness,
                                              k=k)

        index = T.lscalar('index')

        fn = theano.function(inputs=[index,
                                     theano.Param(learning_rate, default=0.01),
                                     theano.Param(Beta, default=0.1),
                                     theano.Param(Gamma, default=0.0001),
                                     theano.Param(Sparseness, default=0.05)],
                             outputs=cost,
                             updates=updates,
                             givens={self.x: dataX[index * batch_size:(index + 1) * batch_size]},
                             name='train_rbm_S_L2')
        return fn
Example #4
    def get_training_functions(self, x_lab_np=None, y_lab_np=None, x_unlab_np=None):
        # keep numpy copies and push them into shared variables
        self.x_lab_np = x_lab_np
        self.y_lab_np = y_lab_np
        self.x_unlab_np = x_unlab_np
        assert self.x_lab_np.shape[0] == len(self.y_lab_np)
        self.x_lab = self._shared_dataset(self.x_lab_np)
        self.y_lab = self._shared_dataset(self.y_lab_np)
        self.x_unlab = self._shared_dataset(self.x_unlab_np)
        self.alpha = float(self.x_lab_np.shape[0]) / self.x_unlab_np.shape[0]
        index_unlab = T.ivector('index_unlab')
        index_lab = T.ivector('index_lab')
        momentum = T.scalar('momentum')
        learning_rate = T.scalar('learning_rate')

        self.batch_size_lab = self.batch_size * self.alpha
        self.batch_size_unlab = self.batch_size * (1 - self.alpha)
        x_lab = T.matrix('x_lab')
        x_unlab = T.matrix('x_unlab')
        y_lab = T.ivector('y_lab')

        self.num_labels = self.x_lab_np.shape[0]
        self.num_unlabels = self.x_unlab_np.shape[0]
        self.num_samples = self.num_labels + self.num_unlabels

        num_batches = self.num_samples / float(self.batch_size)
        pretraining_fns = []
        for i in xrange(len(self.layers)):
            ssda = self.layers[i]
            cost, updates = ssda.get_cost_updates(self.x_lab, self.x_unlab, self.y_lab)
            train_fn = theano.function(inputs=[index_lab, index_unlab],
                                       outputs=[cost],
                                       updates=updates,
                                       givens={self.x_lab: self.x_lab[index_lab],
                                               self.x_unlab: self.x_unlab[index_unlab],
                                               self.y_lab: self.y_lab[index_lab]})
            pretraining_fns.append(train_fn)

        return pretraining_fns
Example #5
    def pretraining_functions(self, train_set_x, batch_size):

       
        index = T.lscalar('index') 
        corruption_level = T.scalar('corruption')  
        learning_rate = T.scalar('lr')  
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size
        
        pretrain_fns = []
        for dA in self.dA_layers:
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)
            fn = theano.function(
                inputs=[
                    index,
                    theano.In(corruption_level, value=0.1),
                    theano.In(learning_rate, value=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin: batch_end]
                }
            )
            
            pretrain_fns.append(fn)

        return pretrain_fns
Example #6
    def __init__(self, dnodex,dim):
        X = T.matrix()
        Y = T.matrix()

        eta = T.scalar()
        temperature=T.scalar()

        num_input = len(format(dnodex.npoi,'b'))
        num_hidden = dim
        num_output = len(format(dnodex.npoi,'b'))

        inputs = InputLayer(X, name="inputs")
        lstm1 = LSTMLayer(num_input, num_hidden, input_layer=inputs, name="lstm1")
        lstm2 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm1, name="lstm2")
        #lstm3 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm2, name="lstm3")
        softmax = SoftmaxLayer(num_hidden, num_output, input_layer=lstm2, name="yhat", temperature=temperature)

        Y_hat = softmax.output()

        self.layers = inputs, lstm1, lstm2, softmax

        params = get_params(self.layers)
        caches = make_caches(params)

        cost = T.mean(T.nnet.categorical_crossentropy(Y_hat, Y))
        updates = momentum(cost, params, caches, eta)

        self.train = theano.function([X, Y, eta, temperature], cost, updates=updates, allow_input_downcast=True)

        predict_updates = one_step_updates(self.layers)
        self.predict_char = theano.function([X, temperature], Y_hat, updates=predict_updates, allow_input_downcast=True)
Example #7
def _compile_func():
    beta = T.vector('beta')
    b = T.scalar('b')
    X = T.matrix('X')
    y = T.vector('y')
    C = T.scalar('C')
    params = [beta, b, X, y, C]
    cost = 0.5 * (T.dot(beta, beta) + b * b) + C * T.sum(
        T.nnet.softplus(
            -T.dot(T.diag(y), T.dot(X, beta) + b)
        )
    )
    # Function computing in one go the cost, its gradient
    # with regard to beta and with regard to the bias.
    cost_grad = theano.function(params,[
        cost,
        T.grad(cost, beta),
        T.grad(cost, b)
    ])

    # Function for computing element-wise sigmoid, used for
    # prediction.
    log_predict = theano.function(
        [beta, b, X],
        T.nnet.sigmoid(b + T.dot(X, beta)),
        on_unused_input='warn'
    )

    return (cost_grad, log_predict)
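# A hedged usage sketch (not in the original source): calling the two compiled
# functions with small NumPy arrays. Shapes and values here are made up.
import numpy as np
import theano

floatX = theano.config.floatX
cost_grad, log_predict = _compile_func()
beta0 = np.zeros(3, dtype=floatX)
X0 = np.random.randn(4, 3).astype(floatX)
y0 = np.array([1.0, -1.0, 1.0, -1.0], dtype=floatX)
cost, g_beta, g_b = cost_grad(beta0, 0.0, X0, y0, 1.0)   # arguments: (beta, b, X, y, C)
probs = log_predict(beta0, 0.0, X0)                      # sigmoid(b + X.beta)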
Example #8
    def build_model(self):
        ######################
        # BUILD ACTUAL MODEL #
        ######################
        logger.info('... building the model')

        U, W, V, bh, by = self.U, self.W, self.V, self.bh, self.by
        x = T.matrix('x')
        y = T.matrix('y')

        def forward_prop_step(x_t, s_tm1, U, W, bh):
            s_t = self.activation(T.dot(U, x_t) + T.dot(W, s_tm1) + bh)
            return s_t

        s, _ = theano.scan(
            forward_prop_step,
            sequences=x,
            outputs_info=[dict(initial=T.zeros(self.hidden_dim))],
            non_sequences=[U, W, bh],
            mode='DebugMode')

        p_y = T.nnet.softmax(T.dot(self.V, s[-1]) + by)
        prediction = T.argmax(p_y, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(p_y, y))
        self.cost = o_error + self.L1_reg * self.L1 + self.L2_reg * self.L2_sqr

        # Assign functions
        self.forward_propagation = theano.function([x], s[-1])
        self.predict = theano.function([x], prediction)
        self.ce_error = theano.function([x, y], o_error)

        l_r = T.scalar('l_r', dtype=theano.config.floatX)   # learning rate (may change)
        mom = T.scalar('mom', dtype=theano.config.floatX)   # momentum
        self.bptt, self.f_update = self.Momentum(x, y, l_r, mom)
Example #9
    def build_pretraining_function(self, train_set_x, batch_size):
        
        index = T.lscalar('index')
        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

        pretrain_fns = []

        for pretrain in self.pretrain_layers:

            cost, updates = pretrain.get_cost_updates(corruption_level, \
                                                            learning_rate)


            fn = theano.function(inputs=[index, corruption_level, \
                                                        learning_rate],
                    outputs=cost, 
                    updates=updates,
                    givens= {
                        self.x: train_set_x[index * batch_size: \
                                                    (index + 1) * batch_size]})

            pretrain_fns.append(fn)

        return pretrain_fns
Example #10
def more_complex_test():
    notimpl = NotImplementedOp()
    ifelseifelseif = IfElseIfElseIf()

    x1 = T.scalar('x1')
    x2 = T.scalar('x2')
    c1 = T.scalar('c1')
    c2 = T.scalar('c2')
    t1 = ifelse(c1, x1, notimpl(x2))
    t1.name = 't1'
    t2 = t1 * 10
    t2.name = 't2'
    t3 = ifelse(c2, t2, x1 + t1)
    t3.name = 't3'
    t4 = ifelseifelseif(T.eq(x1, x2), x1, T.eq(x1, 5), x2, c2, t3, t3 + 0.5)
    t4.name = 't4'

    f = function([c1, c2, x1, x2], t4, mode=Mode(linker='vm',
                                                 optimizer='fast_run'))
    if theano.config.vm.lazy is False:
        try:
            f(1, 0, numpy.array(10, dtype=x1.dtype), 0)
            assert False
        except NotImplementedOp.E:
            pass
    else:
        print(f(1, 0, numpy.array(10, dtype=x1.dtype), 0))
        assert f(1, 0, numpy.array(10, dtype=x1.dtype), 0) == 20.5
    print('... passed')
Example #11
    def __form_input_tensor(self, name):

        left_entity = T.scalar(name='le_' + name, dtype='int32')
        right_entity = T.scalar(name='re_' + name, dtype='int32')
        relation = T.scalar(name='rel_' + name, dtype='int32')

        return T.stack([left_entity, right_entity, relation])
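# A minimal standalone sketch (not from the original source) of the same pattern:
# three int32 scalars stacked into one symbolic triple.
import theano
import theano.tensor as T

le = T.scalar(name='le_demo', dtype='int32')
re = T.scalar(name='re_demo', dtype='int32')
rel = T.scalar(name='rel_demo', dtype='int32')
make_triple = theano.function([le, re, rel], T.stack([le, re, rel]))
make_triple(1, 2, 3)  # -> array([1, 2, 3], dtype=int32)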
Example #12
    def build_train_fn(self,):
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]
        
        self.gparams = T.grad(self.costs[0],self.params,consider_constant=self.consider_constant)
        if not self.momentum:
            print 'Building SGD optimization graph without momentum'
            updates = OrderedDict((i, i - self.lr_theano*j) for i, j in zip(self.params, self.gparams))
        else:
            print 'Building SGD optimization graph with momentum'
            updates = OrderedDict()
            for param,param_mom,gparam in zip(self.params,self.params_mom,self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc
        self.calc_cost = theano.function(self.inputs,self.costs)
        if self.updates_old:
            self.updates_old = copy.copy(self.updates_old)  # To avoid mutating the model dict if the updates dict belongs to the model class, a very unlikely case.
            self.updates_old.update(updates)
        else:
            self.updates_old = OrderedDict()
            self.updates_old.update(updates)

        self.f = theano.function(self.grad_inputs, self.costs, updates=self.updates_old)
Example #13
def test_reallocation():
    x = tensor.scalar('x')
    y = tensor.scalar('y')
    z = tensor.tanh(3 * x + y) + tensor.cosh(x + 5 * y)
    # The functionality is currently implemented only for the non-lazy, non-C VM.
    for l in [vm.VM_Linker(allow_gc=False, lazy=False, use_cloop=False),
              vm.VM_Linker(allow_gc=True, lazy=False, use_cloop=False)]:
        m = theano.compile.get_mode(theano.Mode(linker=l))
        m = m.excluding('fusion', 'inplace')

        f = theano.function([x, y], z, name="test_reduce_memory",
                            mode=m)
        output = f(1, 2)
        assert output
        storage_map = f.fn.storage_map

        def check_storage(storage_map):
            from theano.tensor.var import TensorConstant
            for i in storage_map:
                if not isinstance(i, TensorConstant):
                    keys_copy = list(storage_map.keys())[:]
                    keys_copy.remove(i)
                    for o in keys_copy:
                        if (storage_map[i][0] and
                                storage_map[i][0] is storage_map[o][0]):
                            return [True, storage_map[o][0]]
            return [False, None]

        assert check_storage(storage_map)[0]
        assert len(set(id(v) for v in
                       itervalues(storage_map))) < len(storage_map)
Example #14
def get_bivariate_normal_spec():
    X1,X2,mu,sigma = [T.scalar('X1'),T.scalar('X2'), T.vector('mu'), T.matrix('sigma')]
    GaussianDensitySpec = FunctionSpec(variables=[X1, X2, mu, sigma],
                                       output_expression = -0.5*T.dot(T.dot((T.concatenate([X1.dimshuffle('x'),X2.dimshuffle('x')])-mu).T,
                                                                            nlinalg.matrix_inverse(sigma)),
                                                                      (T.concatenate([X1.dimshuffle('x'),X2.dimshuffle('x')])-mu)))
    return GaussianDensitySpec
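# A hedged standalone sketch (not from the original source): FunctionSpec is
# project-specific, so this just compiles and evaluates the same quadratic form.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor import nlinalg

floatX = theano.config.floatX
X1, X2, mu, sigma = T.scalar('X1'), T.scalar('X2'), T.vector('mu'), T.matrix('sigma')
d = T.concatenate([X1.dimshuffle('x'), X2.dimshuffle('x')]) - mu
log_kernel = -0.5 * T.dot(T.dot(d.T, nlinalg.matrix_inverse(sigma)), d)
f = theano.function([X1, X2, mu, sigma], log_kernel)
f(0.0, 0.0, np.zeros(2, dtype=floatX), np.eye(2, dtype=floatX))  # ~ 0.0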
Example #15
def adam(loss, param_list):
    """
    Recommended default settings are
    α = 0.001, β1 = 0.9, β2 = 0.999 and eps = 1e-8.
    t is the timestep.
    """
    alpha = T.scalar("alpha")
    beta1 = T.scalar("beta1")
    beta2 = T.scalar("beta2")
    eps = T.scalar("eps")
    t = T.scalar("t")
    
    gparam_list = [T.grad(loss, p) for p in param_list]
    first_moment_list = [zero_shared(p.shape.eval()) for p in param_list]
    second_moment_list = [zero_shared(p.shape.eval()) for p in param_list]
    
    updates = OrderedDict()
    for param, gparam, first_moment, second_moment\
    in zip(param_list, gparam_list, first_moment_list, second_moment_list):
        m = beta1*first_moment + (1.-beta1)*gparam
        v = beta2*second_moment + (1.-beta2)*gparam*gparam
        m_hat = m / (1.-beta1**t)
        v_hat = v / (1.-beta2**t)
        updates[param] = param - alpha*m_hat / (T.sqrt(v_hat)+eps)
        updates[first_moment] = m
        updates[second_moment] = v
        
    opt_params = [alpha, beta1, beta2, eps, t]
    
    return updates, opt_params
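# A hedged usage sketch (not from the original source): wiring the returned updates
# and symbolic hyper-parameters into a compiled train step. It assumes adam() and
# its zero_shared helper are available as defined above; data shapes are made up.
import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
x = T.matrix('x')
w = theano.shared(np.zeros((3, 1), dtype=floatX), name='w')
loss = T.sum(T.sqr(T.dot(x, w) - 1.0))
updates, opt_params = adam(loss, [w])      # opt_params = [alpha, beta1, beta2, eps, t]
train = theano.function([x] + opt_params, loss, updates=updates)
# recommended defaults from the docstring, with the timestep t passed explicitly
train(np.ones((2, 3), dtype=floatX), 0.001, 0.9, 0.999, 1e-8, 1)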
Example #16
    def pretraining_functions(self, train_set_x, batch_size):

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
        # begining of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA in self.dA_layers:
            # get the cost and the updates list
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)
            # compile the theano function
            fn = theano.function(
                inputs=[
                    index,
                    theano.Param(corruption_level, default=0.2),
                    theano.Param(learning_rate, default=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin: batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
Example #17
def create_TrainFunc_tranPES(simfn, embeddings,  marge=0.5, alpha=1., beta=1.):

    # parse the embedding data
    embedding = embeddings[0] # D x N matrix
    lembedding = embeddings[1]

    # declare the symbolic variables for training triples
    hp = S.csr_matrix('head positive') # N x batchsize matrix
    rp = S.csr_matrix('relation')
    tp = S.csr_matrix('tail positive')

    hn = S.csr_matrix('head negative')
    tn = S.csr_matrix('tail negative')

    lemb = T.scalar('embedding learning rate')
    lremb = T.scalar('relation learning rate')

    subtensorE = T.ivector('batch entities set')
    subtensorR = T.ivector('batch link set')

    # Generate the training positive and negative triples
    hpmat = S.dot(embedding.E, hp).T #  batchsize x D dense matrix
    rpmat = S.dot(lembedding.E, rp).T
    tpmat = S.dot(embedding.E, tp).T

    hnmat = S.dot(embedding.E, hn).T
    tnmat = S.dot(embedding.E, tn).T

    # calculate the score
    pos = tranPES3(simfn, T.concatenate([hpmat, tpmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tpmat)


    negh = tranPES3(simfn, T.concatenate([hnmat, tpmat], axis=1).reshape((hnmat.shape[0], 2, hnmat.shape[1])).dimshuffle(0, 2, 1), hnmat, rpmat, tpmat)
    negt = tranPES3(simfn, T.concatenate([hpmat, tnmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tnmat)

    costh, outh = margeCost(pos, negh, marge)
    costt, outt = margeCost(pos, negt, marge)

    embreg = regEmb(embedding, subtensorE, alpha)
    lembreg = regLink(lembedding, subtensorR, beta)
    

    cost = costh + costt + embreg[0] + lembreg
    out = T.concatenate([outh, outt])
    outc = embreg[1]

    # list of inputs to the function
    list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR]

    # updating the embeddings using gradient descent
    emb_grad = T.grad(cost, embedding.E)
    New_embedding = embedding.E - lemb*emb_grad

    remb_grad = T.grad(cost, lembedding.E)
    New_rembedding = lembedding.E - lremb * remb_grad

    updates = OrderedDict({embedding.E: New_embedding, lembedding.E: New_rembedding})

    return theano.function(list_in, [cost, T.mean(out), T.mean(outc), embreg[0], lembreg],
                          updates=updates, on_unused_input='ignore')
Example #18
def directRNN():
    ####################### NumPy
    x0=0.5
    s=0.5
    times=[1,10,20,30,40,50]
    yhat=direct(x0, s, times)
    
    
    ############################### Symbolic
    x0_ = T.scalar("x0")
    c_= T.log((1-x0_)/x0_)
    times_ = T.ivector("times")
    S__=theano.shared(np.asarray(s, dtype = theano.config.floatX), 'S')
    yhat_= T.nnet.sigmoid(S__*times_/2-c_)
    Predict_ = theano.function(inputs=[x0_,times_], outputs=yhat_)
    
    
    ############################### Symbolic Recursive
    x0_ = T.scalar("x0")
    times_ = T.ivector("times")
    S__=theano.shared(np.asarray(s, dtype = theano.config.floatX), 'S')
#     predall_, updatesRecurrence_ = theano.scan(lambda x_prev, s: (s*x_prev*x_prev+s*x_prev +2*x_prev)/(2*s*x_prev+2), outputs_info=x0_,non_sequences=S__,n_steps=times_[-1])
    predall_, updatesRecurrence_ = theano.scan(lambda x_prev, s: x_prev+(s*x_prev*(1-x_prev))/(2*s*x_prev+2), outputs_info=x0_,non_sequences=S__,n_steps=times_[-1])
    pred_=predall_[times_-1] #we only have target at some generations e.g. 10,20,...
    Feedforward_ = theano.function(inputs=[x0_,times_], outputs=pred_, updates=updatesRecurrence_)

    ############################# Comparison
    x_0=0.5
    x_1=x_0+(s*x_0*(1-x_0))/(2*s*x_0+2)
    
    print '{:20s}{}'.format('NumPy', yhat)
    print '{:20s}{}'.format('Symbolic Direct', Predict_(x0,list(times)))
    print '{:20s}{}'.format('Symbolic Recursive', Feedforward_(x0,list(times)))
    print '{:20s}[ {} ]'.format('x_1', x_1)
Example #19
def build_nnet(layer_sizes, normalize_layers=False):
    X = T.vector(dtype='float32')
    t = T.scalar(dtype='int32')
    alpha = T.scalar(dtype='float32')
    t_onehot = extra.to_one_hot(t.reshape((1, 1)), 10)

    weights = []

    # We always want to normalize the inputs to the first layer
    Y, W = layer(normalize(X), 784, layer_sizes[0])
    weights.append(W)

    for l1, l2 in zip(layer_sizes[1:-1], layer_sizes[2:]):
        if normalize_layers:
            Y = normalize(Y)
        Y, W = layer(Y, l1, l2)
        weights.append(W)

    if normalize_layers:
        Y = normalize(Y)
    Y, W = layer(Y, layer_sizes[-1], 10, activation=nnet.softmax)
    weights.append(W)

    mse = T.mean(T.sqr(Y - t_onehot))
    updates = [(W, W - alpha * T.grad(cost=mse, wrt=W)) for W in weights]

    prediction = T.argmax(Y)
    confidence = T.max(Y)

    eval_nnet = theano.function(inputs=[X], outputs=[prediction, confidence])
    train_nnet = theano.function(inputs=[X, t, alpha], outputs=mse, updates=updates)

    return eval_nnet, train_nnet
Example #20
 def pretraining_functions(self, train_set_x, train_set_y, batch_size):
     index = tensor.lscalar('index')
     corruption_level = tensor.scalar('corruption')
     learning_rate = tensor.scalar('lr')
     switch = tensor.iscalar('switch')
     n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
     batch_begin = index * batch_size
     batch_end = batch_begin + batch_size
     pretrain_fns = []
     for sugar in self.sugar_layers:
         cost, updates = sugar.get_cost_updates(corruption_level,
                                             learning_rate,
                                             switch)
         fn = function(inputs=[index,
                                      Param(corruption_level, default=0.2),
                                      Param(learning_rate, default=0.1),
                                      Param(switch, default=1)],
                              outputs=[cost],
                              updates=updates,
                              givens={self.x: train_set_x[batch_begin:batch_end],
                                      self.y: train_set_y[batch_begin:batch_end]}, on_unused_input='ignore')
         pretrain_fns.append(fn)
     return pretrain_fns
Example #21
    def __init__(self, dnodex,inputdim,dim):
        X=T.ivector()
        Y=T.ivector()
        Z=T.lscalar()
        eta = T.scalar()
        temperature=T.scalar()
        self.dnodex=dnodex
        num_input = inputdim
        dnodex.umatrix=theano.shared(floatX(np.random.randn(*(self.dnodex.nuser,inputdim, inputdim))))
        dnodex.pmatrix=theano.shared(floatX(np.random.randn(*(self.dnodex.npoi,inputdim))))
        dnodex.p_l2_norm=(dnodex.pmatrix**2).sum()
        dnodex.u_l2_norm=(dnodex.umatrix**2).sum()
        num_hidden = dim
        num_output = inputdim
        inputs = InputPLayer(dnodex.pmatrix[X,:], dnodex.umatrix[Z,:,:], name="inputs")
        lstm1 = LSTMLayer(num_input, num_hidden, input_layer=inputs, name="lstm1")
        lstm2 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm1, name="lstm2")
        lstm3 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm2, name="lstm3")
        softmax = SoftmaxPLayer(num_hidden, num_output, dnodex.umatrix[Z,:,:], input_layer=lstm3, name="yhat", temperature=temperature)

        Y_hat = softmax.output()

        self.layers = inputs, lstm1,lstm2,lstm3,softmax
        params = get_params(self.layers)
        #caches = make_caches(params)

        cost = T.mean(T.nnet.categorical_crossentropy(Y_hat, T.dot(dnodex.pmatrix[Y,:],dnodex.umatrix[Z,:,:])))+eta*dnodex.p_l2_norm+eta*dnodex.u_l2_norm
        updates = PerSGD(cost,params,eta,X,Z,dnodex)#momentum(cost, params, caches, eta)

        self.train = theano.function([X,Y,Z, eta, temperature], cost, updates=updates, allow_input_downcast=True)

        predict_updates = one_step_updates(self.layers)
        self.predict_char = theano.function([X, Z, temperature], Y_hat, updates=predict_updates, allow_input_downcast=True)
Example #22
    def __init__(self, *args, learning_rate=0.001, decay=0.9, epsilon=1e-8,
                 **kwargs):
        super().__init__(*args, **kwargs)

        self.learning_rate = learning_rate
        self.decay = decay
        self.epsilon = epsilon

        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        squares = self.create_shadows('squares')

        new_squares = [decay*square + (1.0-decay)*T.sqr(g)
                       for g, square in zip(
                           self.grad.values(), squares.values())]

        ds = [-g*learning_rate/T.sqrt(square + self.epsilon)
              for g,square in zip(self.grad.values(), new_squares)]

        updates = [(p, p+d) for p,d in zip(self.params.values(), ds)] \
                + list(zip(squares.values(), new_squares))

        self.step1 = function(
                inputs=self.inputs+self.outputs+[
                    learning_rate, decay],
                default_mode=1,
                outputs=self.loss,
                name='RMSProp_step1',
                updates=updates)
Example #23
    def __init__(self,final_momentum=0.9, initial_momentum=0.5,momentum_switchover=5,times=[10,20,30,40,50],S=3,lr=1e-2,maxIter=10000,initS=0.0,numReplicates=3,theta=20,n=2000):
        times = np.asarray(times)
        self.times = times[times != 0].astype(np.float32)
        self.momentum_ = T.scalar('momentum', dtype=floatX)
        self.final_momentum=final_momentum; self.initial_momentum=initial_momentum;self.momentum_switchover=momentum_switchover;self.W=3;self.lr=lr;self.maxIter=maxIter;self.numReplicates=numReplicates;self.initS=initS;self.n=n;self.theta=theta
        self.lr_ = T.scalar();self.target_ = (T.matrix(),T.vector())[self.numReplicates==1]; self.times_ = T.ivector("times"); self.x0_ = T.scalar("x0");self.n_ = T.scalar("n");self.theta_ = T.scalar("theta")
        
        
        self.S__=theano.shared(np.asarray(self.initS, dtype = floatX), 'S')
        self.predall_, self.updatesRecurrence_ = theano.scan(lambda x_prev, s: (s*x_prev*x_prev+s*x_prev +2*x_prev)/(2*s*x_prev+2), outputs_info=self.x0_,non_sequences=self.S__,n_steps=self.times_[-1])
        self.pred_=Z(self.predall_[self.times_-1],self.n_,self.theta_) #we only have target at some generations e.g. 10,20,...
        self.Feedforward_ = theano.function(inputs=[self.x0_,self.times_,self.n_,self.theta_], outputs=self.pred_, updates=self.updatesRecurrence_)
        
        
        
        if self.numReplicates==1:
            self.cost_ = 0.5*((self.target_ - self.pred_)**2).mean(axis=0).sum()
        else:
            self.cost_=0
            for j in range(self.numReplicates):
                self.cost_ += 0.5*((self.target_[:,j] - self.pred_)**2).mean(axis=0).sum()
        self.Loss_ = theano.function(inputs=[self.target_,self.pred_], outputs=self.cost_)
        self.gW_ = T.grad(self.cost_, [self.S__])[0]
        self.weightUpdate__ = theano.shared(np.asarray(0, dtype = floatX))
        
        upd = self.momentum_ * self.weightUpdate__ - self.lr_ * self.gW_
        self.updatesW=[(self.weightUpdate__,  upd),(self.S__, self.S__ + upd)]

        self.Objective_ = theano.function([self.x0_, self.target_, self.lr_,self.times_,self.momentum_,self.n_,self.theta_], self.cost_, on_unused_input='warn',updates=self.updatesW,allow_input_downcast=True)
Example #24
    def pretraining_functions(self, train_set_x, batch_size):
    
       
        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch

        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
       
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size


        pretrain_fns = []
        for dA in self.dA_layers:
            cost, updates = dA.get_cost_updates(corruption_level, learning_rate)
            fn = theano.function(
                inputs=[
                    index,
                    corruption_level,
                    learning_rate],
                # http://stackoverflow.com/questions/35622784/what-is-the-right-way-to-pass-inputs-parameters-to-a-theano-function
                # inputs=[index, theano.In(corruption_level, value=0.2),
                #         theano.In(learning_rate, value=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin: batch_end]})
            pretrain_fns.append(fn)

        return pretrain_fns
Example #25
    def __theano_build__(self):
        params = self.params
        param_names = self.param_names
        hidden_dim = self.hidden_dim

        x1  = T.imatrix('x1')    # first sentence
        x2  = T.imatrix('x2')    # second sentence
        x1_mask = T.fmatrix('x1_mask')    #mask
        x2_mask = T.fmatrix('x2_mask')
        y   = T.ivector('y')     # label
        y_c = T.ivector('y_c')   # class weights 
        
        # Embdding words
        _E1 = params["E"].dot(params["W"][0]) + params["B"][0]
        _E2 = params["E"].dot(params["W"][1]) + params["B"][1]
        statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim])
        statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim])
        
        def rnn_cell(x, mx, ph, Wh):
            h = T.tanh(ph.dot(Wh) + x)
            h = mx[:, None] * h + (1-mx[:, None]) * ph
            return [h] 
            
        [h1], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex1, x1_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))],
            non_sequences=params["W"][2])
        
        [h2], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex2, x2_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=h1[-1])],
            non_sequences=params["W"][3])
       
        #predict
        _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) + h2[-1].dot(params["lrW"][1]) + params["lrb"])
        _p = T.argmax(_s, axis=1)
        _c = T.nnet.categorical_crossentropy(_s, y)
        _c = T.sum(_c * y_c)
        _l = T.sum(params["lrW"]**2)
        _cost = _c + 0.01 * _l
        
        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')
        
        # Gradients and updates
        _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay)
        
        # Assign functions
        self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads)
        self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c)
        self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s)
        self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p)
        self.sgd_step = theano.function(
            [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay],
            updates=_updates)
Example #26
    def pretraining_functions(self, train_set_x, batch_size, k , weight_cost):

        index = T.lscalar('index')  
        momentum = T.scalar('momentum')
        learning_rate = T.scalar('lr') 
        # number of mini-batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # start and end index of this mini-batch
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:
            r_cost, fe_cost, updates = rbm.get_cost_updates(batch_size, learning_rate,
                                                            momentum, weight_cost,
                                                            persistent=None, k = k)

            # compile the theano function
            fn = theano.function(inputs=[index,
                              theano.Param(learning_rate, default=0.0001),
                              theano.Param(momentum, default=0.5)],
                              outputs= [r_cost, fe_cost],
                              updates=updates,
                              givens={self.x: train_set_x[batch_begin:batch_end]})
            # append function to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
Example #27
    def __init__(self, num_input, num_cells=50, num_output=1, lr=0.01, rho=0.95):
        X = T.matrix('x')
        Y = T.matrix('y')
        eta = T.scalar('eta')
        alpha = T.scalar('alpha')

        self.num_input = num_input
        self.num_output = num_output
        self.num_cells = num_cells
        self.eta = eta

        inputs = InputLayer(X, name="inputs")
        lstm = LSTMLayer(num_input, num_cells, input_layer=inputs, name="lstm")
        fc = FullyConnectedLayer(num_cells, num_output, input_layer=lstm)
        Y_hat = T.mean(fc.output(), axis=2)
        layer = inputs, lstm, fc
        self.params = get_params(layer)
        self.caches = make_caches(self.params)
        self.layers = layer
        mean_cost = T.mean((Y - Y_hat)**2)
        last_cost = T.mean((Y[-1] - Y_hat[-1])**2)
        self.cost = alpha*mean_cost + (1-alpha)*last_cost
        """"
        self.updates = momentum(self.cost, self.params, self.caches, self.eta, clip_at=3.0)
        """
        self.updates,_,_,_,_ = create_optimization_updates(self.cost, self.params, method="adadelta", lr= lr, rho=rho)
        self.train = theano.function([X, Y, alpha], [self.cost, last_cost] ,\
                updates=self.updates, allow_input_downcast=True)
        self.costfn = theano.function([X, Y, alpha], [self.cost, last_cost],\
                allow_input_downcast=True)
        self.predict = theano.function([X], [Y_hat], allow_input_downcast=True)
Example #28
    def __init__(self, *args, learning_rate=0.01, momentum=0.9, **kwargs):
        super().__init__(*args, **kwargs)

        self.learning_rate = learning_rate
        self.momentum = momentum

        learning_rate = T.scalar('learning_rate')
        momentum = T.scalar('momentum')

        vs = self.create_shadows('v')

        updates1 = [(p, p + momentum*v)
                    for p,v in zip(self.params.values(), vs.values())]

        updates2 = [(v, momentum*v - learning_rate*grad)
                    for v,grad in zip(vs.values(), self.grad.values())] \
                 + [(p, p - learning_rate*grad)
                    for p,grad in zip(self.params.values(),
                                      self.grad.values())]

        self.step1 = theano.function(
                inputs=[momentum],
                outputs=[],
                name='Nesterov_step1',
                updates=updates1)

        self.step2 = function(
                inputs=self.inputs+self.outputs+[
                    learning_rate, momentum],
                default_mode=1,
                outputs=self.loss,
                name='Nesterov_step2',
                updates=updates2)
Example #29
def get_update(Ws_s, bs_s):
    x, fx = train.get_model(Ws_s, bs_s)

    # Ground truth (who won)
    y = T.vector('y')

    # Compute loss (just log likelihood of a sigmoid fit)
    y_pred = sigmoid(fx)
    loss = -( y * T.log(y_pred) + (1 - y) * T.log(1 - y_pred)).mean()

    # Metrics on the number of correctly predicted ones
    frac_correct = ((fx > 0) * y + (fx < 0) * (1 - y)).mean()

    # Updates
    learning_rate_s = T.scalar(dtype=theano.config.floatX)
    momentum_s = T.scalar(dtype=theano.config.floatX)
    updates = train.nesterov_updates(loss, Ws_s + bs_s, learning_rate_s, momentum_s)
    
    f_update = theano.function(
        inputs=[x, y, learning_rate_s, momentum_s],
        outputs=[loss, frac_correct],
        updates=updates,
        )

    return f_update
Example #30
    def compile_functions(self, x, y):
        
        mb = T.scalar('mb',dtype='int64')
        lr = T.scalar('lr') 
        index = T.scalar('index',dtype='int64')

        print("Compiling theano functions...\n") 
        t0 = time.time()
        self.feed_forward = theano.function([x],self.model.out) 
        
        self.cost = self.model.cross_entropy_SGD(y)
        self.error = self.model.error_SGD(y)

        grad_params = [T.grad(self.cost, param) for param in self.model.params]
        updates = [(param, param-lr*gparam) for param, gparam in zip(self.model.params, grad_params)]
        
        
        self.train_model = theano.function(
            inputs = [index, lr,mb],
            outputs = self.cost, 
            updates = updates,
            givens = {
                x: self.dataset.in_train[(index*mb):(index+1)*mb],
                y: self.dataset.obs_train[(index*mb):(index+1)*mb],
            }
        )

        self.error = theano.function(
            inputs = [x,y],
            outputs = self.error,
        ) 
        print("Functions compiled. Took {:.2f} seconds".format(time.time() - t0))
Example #31
def test_aggregation_buffer_name_uniqueness():
    x1 = tensor.scalar('x')
    x2 = tensor.scalar('x')
    assert_raises_regex(ValueError, 'unique', AggregationBuffer, [x1, x2])
Example #32
def augment_system(ode_func, n_states, n_theta):
    """
    Function to create augmented system.

    Take a function which specifies a set of differential equations and return
    a compiled function which allows for computation of gradients of the
    differential equation's solution with respect to the parameters.

    Uses float64 even if floatX=float32, because the scipy integrator always uses float64.

    Parameters
    ----------
    ode_func: function
        Differential equation.  Returns array-like.
    n_states: int
        Number of rows of the sensitivity matrix. (n_states)
    n_theta: int
        Number of ODE parameters

    Returns
    -------
    system: function
        Augmented system of differential equations.
    """

    # Present state of the system
    t_y = tt.vector("y", dtype="float64")
    t_y.tag.test_value = np.ones((n_states,), dtype="float64")
    # Parameter(s).  Should be vector to allow for generalization to multiparameter
    # systems of ODEs.  Is m dimensional because it includes all initial conditions as well as ode parameters
    t_p = tt.vector("p", dtype="float64")
    t_p.tag.test_value = np.ones((n_states + n_theta,), dtype="float64")
    # Time.  Allow for non-autonomous systems of ODEs to be analyzed
    t_t = tt.scalar("t", dtype="float64")
    t_t.tag.test_value = 2.459

    # Present state of the gradients:
    # Will always be 0 unless the parameter is the initial condition
    # Entry i,j is partial of y[i] wrt to p[j]
    dydp_vec = tt.vector("dydp", dtype="float64")
    dydp_vec.tag.test_value = make_sens_ic(n_states, n_theta, "float64")

    dydp = dydp_vec.reshape((n_states, n_states + n_theta))

    # Get symbolic representation of the ODEs by passing tensors for y, t and theta
    yhat = ode_func(t_y, t_t, t_p[n_states:])
    # Stack the results of the ode_func into a single tensor variable
    if not isinstance(yhat, (list, tuple)):
        yhat = (yhat,)
    t_yhat = tt.stack(yhat, axis=0)

    # Now compute gradients
    J = tt.jacobian(t_yhat, t_y)

    Jdfdy = tt.dot(J, dydp)

    grad_f = tt.jacobian(t_yhat, t_p)

    # This is the time derivative of dydp
    ddt_dydp = (Jdfdy + grad_f).flatten()

    system = theano.function(
        inputs=[t_y, t_t, t_p, dydp_vec], outputs=[t_yhat, ddt_dydp], on_unused_input="ignore"
    )

    return system
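# A hedged usage sketch (not part of the original source): the augmented system for
# a one-state, one-parameter ODE dy/dt = -p*y. It assumes the module-level imports
# used by augment_system (tt, theano, make_sens_ic) are available as above.
import numpy as np

def decay(y, t, p):
    return -p[0] * y[0]

system = augment_system(decay, n_states=1, n_theta=1)
y0 = np.array([1.0])               # current state
p = np.array([1.0, 0.5])           # [initial condition, ODE parameter]
dydp0 = np.array([1.0, 0.0])       # dy/dy0 = 1, dy/dp = 0 at t = 0
f_y, ddt_dydp = system(y0, 0.0, p, dydp0)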
Example #33
    def __init__(self,
                 n_dim,
                 n_out,
                 n_chan=1,
                 n_superbatch=12800,
                 opt_alg='adam',
                 opt_params={
                     'lr': 1e-3,
                     'b1': 0.9,
                     'b2': 0.99
                 }):
        self.numpy_rng = np.random.RandomState(1234)
        self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))

        self.n_dim = n_dim
        self.n_out = n_out
        self.n_superbatch = n_superbatch
        self.alg = opt_alg
        self.n_class = 10

        lr = opt_params.get('lr')
        n_batch = opt_params.get('nb')

        train_set_x = theano.shared(
            np.empty((n_superbatch, n_chan, n_dim, n_dim),
                     dtype=theano.config.floatX),
            borrow=False,
        )
        val_set_x = theano.shared(
            np.empty((n_superbatch, n_chan, n_dim, n_dim),
                     dtype=theano.config.floatX),
            borrow=False,
        )
        train_set_y = theano.shared(
            np.empty((n_superbatch, ), dtype=theano.config.floatX),
            borrow=False,
        )
        val_set_y = theano.shared(
            np.empty((n_superbatch, ), dtype=theano.config.floatX),
            borrow=False,
        )
        train_set_y_int = T.cast(train_set_y, 'int32')
        val_set_y_int = T.cast(val_set_y, 'int32')

        train_rbm_px_mu = theano.shared(
            np.empty((n_superbatch, self.n_aux), dtype=theano.config.floatX),
            borrow=False,
        )

        X = T.tensor4(dtype=theano.config.floatX)
        S = T.tensor3(dtype=theano.config.floatX)
        Y = T.ivector()
        px_mu = T.matrix(dtype=theano.config.floatX)  # matches the (n_superbatch, n_aux) shared variable supplied via givens
        idx1, idx2 = T.lscalar(), T.lscalar()
        alpha = T.scalar(dtype=theano.config.floatX)  # learning rate
        self.inputs = (X, Y, idx1, idx2, S, px_mu)

        # ----------------------------
        # Begin RBM-only
        self.rbm_network = self.create_rbm_model(n_dim, n_out, n_chan)
        persistent_chain = theano.shared(
            np.zeros((n_batch, self.n_hidden), dtype=theano.config.floatX),
            borrow=True,
        )
        rbm_cost, rbm_acc, rbm_updates = self.get_rbm_objective_and_updates(
            alpha,
            lr=lr,
            persistent=persistent_chain,
        )
        self.rbm_objectives = (rbm_cost, rbm_acc)
        self.rbm_train = theano.function(
            [idx1, idx2, alpha],
            [rbm_cost, rbm_acc],
            updates=rbm_updates,
            givens={
                X: train_set_x[idx1:idx2],
                Y: train_set_y_int[idx1:idx2]
            },
            on_unused_input='warn',
        )
        # End RBM-only
        # ----------------------------
        # Begin DADGM-only
        tau = theano.shared(
            np.float32(5.0),
            name='temperature',
            allow_downcast=True,
            borrow=False,
        )
        self.tau = tau
        self.dadgm_network = self.create_dadgm_model(
            X,
            Y,
            n_dim,
            n_out,
            n_chan,
        )
        dadgm_loss, dadgm_acc = self.create_dadgm_objectives(False)
        self.dadgm_objectives = (dadgm_loss, dadgm_acc)
        dadgm_params = self.get_dadgm_params()
        dadgm_grads = self.create_dadgm_gradients(dadgm_loss, False)
        dadgm_updates = self.create_dadgm_updates(
            dadgm_grads,
            dadgm_params,
            alpha,
            opt_alg,
            opt_params,
        )
        self.dadgm_train = theano.function(
            [idx1, idx2, alpha],
            [dadgm_loss, dadgm_acc],
            updates=dadgm_updates,
            givens={
                X: train_set_x[idx1:idx2],
                Y: train_set_y_int[idx1:idx2],
                px_mu: train_rbm_px_mu,
            },
            on_unused_input='warn',
        )
        self.dadgm_loss = theano.function(
            [X, Y],
            [dadgm_loss, dadgm_acc],
            on_unused_input='warn',
        )
        # End DADGM-only
        # ----------------------------
        self.n_batch = n_batch
        # parameters for sampling
        self.n_chain = 100

        # save data variables
        self.train_set_x = train_set_x
        self.train_set_y = train_set_y
        self.val_set_x = val_set_x
        self.val_set_y = val_set_y
        self.train_rbm_px_mu = train_rbm_px_mu
        self.data_loaded = False
Example #34
def single_layer_lstm(n_in, n_out):
    Wxb = theano.shared(np.random.randn(n_in, n_out), )
    Whb = theano.shared(np.random.randn(n_out, n_out), )
    bb = theano.shared(np.random.randn(n_out))

    Wxi = theano.shared(np.random.randn(n_in, n_out), )
    Whi = theano.shared(np.random.randn(n_out, n_out), )
    bi = theano.shared(np.random.randn(n_out))

    Wxf = theano.shared(np.random.randn(n_in, n_out), )
    Whf = theano.shared(np.random.randn(n_out, n_out), )
    bf = theano.shared(np.random.randn(n_out))

    Wxo = theano.shared(np.random.randn(n_in, n_out), )
    Who = theano.shared(np.random.randn(n_out, n_out), )
    bo = theano.shared(np.random.randn(n_out))

    Wo = theano.shared(np.random.randn(n_out, n_out))
    bout = theano.shared(np.random.randn(n_out))

    params = [Wxb, Whb, bb, Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo, Wo, bout]

    def step(x,htm1,ctm1,Wxb,Whb,bb,\
             Wxi,Whi,bi,\
             Wxf,Whf,bf,\
             Wxo,Who,bo,Wo,bout):
        z = T.tanh(T.dot(x, Wxb) + T.dot(htm1, Whb) + bb)
        i = T.nnet.sigmoid(T.dot(x, Wxi) + T.dot(htm1, Whi) + bi)
        f = T.nnet.sigmoid(T.dot(x, Wxf) + T.dot(htm1, Whf) + bf)
        c = i * z + f * ctm1
        o = T.nnet.sigmoid(T.dot(x, Wxo) + T.dot(htm1, Who) + bo)
        h = o * T.tanh(c)
        y = T.dot(h, Wo) + bout
        return [h, c, y]

    X = T.matrix()
    h0 = T.vector()
    c0 = T.vector()
    yt = T.ivector()
    lr = T.scalar()
    mom = T.scalar()

    [h, c, y], _ = theano.scan(step,
                               sequences=X,
                               outputs_info=[h0, c0, None],
                               non_sequences=[
                                   Wxb, Whb, bb, Wxi, Whi, bi, Wxf, Whf, bf,
                                   Wxo, Who, bo, Wo, bout
                               ])

    yout = T.nnet.softmax(y)
    L2 = T.scalar()
    L2 = 0
    for param in params:
        L2 += (param**2).sum()

    L2 = 0.001 * L2

    def loss(y_pred, y_true):
        return -T.mean(T.log(y_pred)[T.arange(y_true.shape[0]), y_true])

    #oloss = loss(yout,yt)
    #cost = theano.function( [X,h0,c0,yt], oloss )
    funch = theano.function([X, h0, c0], c)
    funcy = theano.function([X, h0, c0], y)

    oloss = loss(yout, yt) + L2
    cost = loss(yout, yt)
    gparams = []
    for param in params:
        gparams.append(T.grad(oloss, param))

    # zip pairs each parameter with its gradient in the update loop below
    updates_t = {}

    for param in params:
        updates_t[param] = theano.shared(value=np.zeros(
            param.get_value(borrow=True).shape, dtype=theano.config.floatX),
                                         name='updates')

    updates = {}
    for param, gparam in zip(params, gparams):
        weight_update = updates_t[param]
        upd = mom * weight_update - lr * gparam
        updates[weight_update] = upd
        updates[param] = param + upd
    """
    for param, gparam in zip(params, gparams):
        #mparam = theano.shared(param.get_value()*0.)
        upd = -lr*gparam# + mom*mparam# - 0.01*param# + 
        #updates[mparam] = upd
        updates[param] = param + upd
    """
    """            
        weight_update = updates[param]
        upd = -lr * gparam - 0.01*param
        updates[weight_update] = upd
        updates[param] = param + upd
    """

    #gWxo = T.grad(oloss,Wxo)
    #fgradwxo = theano.function( [X,h0,c0,yt], gWxo )
    trainer = theano.function([X, h0, c0, yt, lr, mom], [cost],
                              updates=updates)
    return funcy, trainer
Example #35
    def func_update_policy(self, Tmax, use_x0=False, accumulators=None):
        U = tensor.tensor3('U')  # Inputs
        Q = tensor.tensor3('Q')  # Noise

        if use_x0:
            x0_ = tensor.matrix('x0_')
        else:
            x0 = self.policy_net.params['x0']
            x0_ = tensor.alloc(x0, U.shape[1], x0.shape[0])

        log_z_0 = self.policy_net.get_outputs_0(x0_, log=True)
        r, log_z = self.policy_net.get_outputs(U, Q, x0_, log=True)

        # Learning rate
        lr = tensor.scalar('lr')

        A = tensor.tensor3('A')
        R = tensor.matrix('R')
        b = tensor.matrix('b')
        M = tensor.matrix('M')

        logpi_0 = tensor.sum(log_z_0 * A[0], axis=-1) * M[0]
        logpi_t = tensor.sum(log_z * A[1:], axis=-1) * M[1:]

        # Entropy
        #entropy_0 = tensor.sum(tensor.exp(log_z_0)*log_z_0, axis=-1)*M[0]
        #entropy_t = tensor.sum(tensor.exp(log_z)*log_z, axis=-1)*M[1:]
        #entropy   = (tensor.sum(entropy_0) + tensor.sum(entropy_t))/tensor.sum(M)

        #def f(x):
        #    return -x**2/2/self.sigma**2

        #logpi_0 = tensor.sum(f(A[0] - z_0), axis=-1)*M[0]
        #logpi_t = tensor.sum(f(A[1:] - z), axis=-1)*M[1:]

        # Enforce causality
        Mcausal = theanotools.zeros((Tmax - 1, Tmax - 1))
        for i in xrange(Mcausal.shape[0]):
            Mcausal[i, i:] = 1
        Mcausal = theanotools.shared(Mcausal, 'Mcausal')

        J0 = logpi_0 * R[0]
        J0 = tensor.mean(J0)
        J = (logpi_t.T).dot(Mcausal).dot(R[1:] * M[1:])
        J = tensor.nlinalg.trace(J) / J.shape[0]

        J += J0

        # Second term
        Jb0 = logpi_0 * b[0]
        Jb0 = tensor.mean(Jb0)
        Jb = logpi_t * b[1:]
        Jb = tensor.mean(tensor.sum(Jb, axis=0))

        J -= Jb0 + Jb

        # Objective function
        obj = -J + self.policy_net.get_regs(x0_, r, M)  # + 0.0005*entropy

        # SGD
        self.policy_sgd = Adam(self.policy_net.trainables,
                               accumulators=accumulators)
        if self.policy_net.type == 'simple':
            i = self.policy_net.index('Wrec')
            grads = tensor.grad(obj, self.policy_net.trainables)
            grads[i] += self.policy_net.get_dOmega_dWrec(-J, r)
            norm, grads, updates = self.policy_sgd.get_updates(obj,
                                                               lr,
                                                               grads=grads)
        else:
            norm, grads, updates = self.policy_sgd.get_updates(obj, lr)

        if use_x0:
            args = [x0_]
        else:
            args = []
        args += [U, Q, A, R, b, M, lr]

        return theano.function(args, norm, updates=updates)
Example #36
    def fit(self,
            X_train,
            Y_train,
            X_test=None,
            Y_test=None,
            validation_frequency=100):
        """ Fit model

        Pass in X_test, Y_test to compute test error and report during
        training.

        X_train : ndarray (n_seq x n_steps x n_in)
        Y_train : ndarray (n_seq x n_steps x n_out)

        validation_frequency : int
            in terms of number of sequences (or number of weight updates)
        """
        if X_test is not None:
            assert (Y_test is not None)
            self.interactive = True
            test_set_x, test_set_y = self.shared_dataset((X_test, Y_test))
        else:
            self.interactive = False

        train_set_x, train_set_y = self.shared_dataset((X_train, Y_train))

        n_train = train_set_x.get_value(borrow=True).shape[0]
        if self.interactive:
            n_test = test_set_x.get_value(borrow=True).shape[0]

        ######################
        # BUILD ACTUAL MODEL #
        ######################
        logger.info('... building the model')

        index = T.lscalar('index')  # index to a case
        # learning rate (may change)
        l_r = T.scalar('l_r', dtype=theano.config.floatX)
        mom = T.scalar('mom', dtype=theano.config.floatX)  # momentum

        cost = self.rnn.loss(self.y) \
            + self.L1_reg * self.rnn.L1 \
            + self.L2_reg * self.rnn.L2_sqr

        compute_train_error = theano.function(
            inputs=[
                index,
            ],
            outputs=self.rnn.loss(self.y),
            givens={
                self.x: train_set_x[index],
                self.y: train_set_y[index]
            },
            mode=theano.compile.MonitorMode(post_func=self.detect_nan))
        #mode=mode)

        if self.interactive:
            compute_test_error = theano.function(inputs=[
                index,
            ],
                                                 outputs=self.rnn.loss(self.y),
                                                 givens={
                                                     self.x: test_set_x[index],
                                                     self.y: test_set_y[index]
                                                 },
                                                 mode=mode)

        # compute the gradient of cost with respect to theta = (W, W_in, W_out)
        # gradients on the weights using BPTT
        gparams = []
        for param in self.rnn.params:
            gparam = T.grad(cost, param)
            gparams.append(gparam)

        updates = {}
        for param, gparam in zip(self.rnn.params, gparams):
            weight_update = self.rnn.updates[param]
            upd = mom * weight_update - l_r * gparam
            updates[weight_update] = upd
            updates[param] = param + upd

        # compiling a Theano function `train_model` that returns the
        # cost, but in the same time updates the parameter of the
        # model based on the rules defined in `updates`
        train_model = theano.function(inputs=[index, l_r, mom],
                                      outputs=cost,
                                      updates=updates,
                                      givens={
                                          self.x: train_set_x[index],
                                          self.y: train_set_y[index]
                                      },
                                      mode=mode)

        ###############
        # TRAIN MODEL #
        ###############
        logger.info('... training')
        epoch = 0

        while (epoch < self.n_epochs):
            epoch = epoch + 1
            for idx in xrange(n_train):
                effective_momentum = self.final_momentum \
                               if epoch > self.momentum_switchover \
                               else self.initial_momentum
                example_cost = train_model(idx, self.learning_rate,
                                           effective_momentum)

                # iteration number (how many weight updates have we made?)
                # epoch is 1-based, index is 0 based
                iter = (epoch - 1) * n_train + idx + 1

                if iter % validation_frequency == 0:
                    # compute loss on training set
                    train_losses = [
                        compute_train_error(i) for i in xrange(n_train)
                    ]
                    this_train_loss = np.mean(train_losses)

                    if self.interactive:
                        test_losses = [
                            compute_test_error(i) for i in xrange(n_test)
                        ]
                        this_test_loss = np.mean(test_losses)

                        logger.info('epoch %i, seq %i/%i, tr loss %f '
                                    'te loss %f lr: %f' % \
                        (epoch, idx + 1, n_train,
                         this_train_loss, this_test_loss, self.learning_rate))
                    else:
                        logger.info('epoch %i, seq %i/%i, train loss %f '
                                    'lr: %f' % \
                                    (epoch, idx + 1, n_train, this_train_loss,
                                     self.learning_rate))

            self.learning_rate *= self.learning_rate_decay
Example #37
    def ready(self):
        # input (where first dimension is time)
        self.x = T.matrix()
        # target (where first dimension is time)
        if self.output_type == 'real':
            self.y = T.matrix(name='y', dtype=theano.config.floatX)
        elif self.output_type == 'binary':
            self.y = T.matrix(name='y', dtype='int32')
        elif self.output_type == 'softmax':  # only vector labels supported
            self.y = T.vector(name='y', dtype='int32')
        else:
            raise NotImplementedError
        # initial hidden state of the RNN
        self.h0 = T.vector()
        # learning rate
        self.lr = T.scalar()

        if self.activation == 'tanh':
            activation = T.tanh
        elif self.activation == 'sigmoid':
            activation = T.nnet.sigmoid
        elif self.activation == 'relu':
            activation = lambda x: x * (x > 0)
        elif self.activation == 'cappedrelu':
            activation = lambda x: T.minimum(x * (x > 0), 6)
        else:
            raise NotImplementedError
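        # 'relu' is expressed as x * (x > 0), i.e. max(x, 0), and 'cappedrelu'
        # clips the output at 6 (a ReLU6-style nonlinearity); both are written
        # without T.nnet.relu, which is not available in older Theano releases.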

        self.rnn = RNN(input=self.x,
                       n_in=self.n_in,
                       n_hidden=self.n_hidden,
                       n_out=self.n_out,
                       activation=activation,
                       output_type=self.output_type,
                       use_symbolic_softmax=self.use_symbolic_softmax)

        if self.output_type == 'real':
            self.predict = theano.function(inputs=[
                self.x,
            ],
                                           outputs=self.rnn.y_pred,
                                           mode=mode)
        elif self.output_type == 'binary':
            self.predict_proba = theano.function(inputs=[
                self.x,
            ],
                                                 outputs=self.rnn.p_y_given_x,
                                                 mode=mode)
            self.predict = theano.function(inputs=[
                self.x,
            ],
                                           outputs=T.round(
                                               self.rnn.p_y_given_x),
                                           mode=mode)
        elif self.output_type == 'softmax':
            self.predict_proba = theano.function(inputs=[
                self.x,
            ],
                                                 outputs=self.rnn.p_y_given_x,
                                                 mode=mode)
            self.predict = theano.function(inputs=[
                self.x,
            ],
                                           outputs=self.rnn.y_out,
                                           mode=mode)
        else:
            raise NotImplementedError
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    dataset=DataSet,
                    nkerns=[cls1, cls2],
                    batch_size=100):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print(type(train_set_x))

    #train_set_x.set_value(train_set_x.get_value(borrow=True)[:,:540])
    #valid_set_x.set_value(valid_set_x.get_value(borrow=True)[:,:540])
    #test_set_x.set_value(test_set_x.get_value(borrow=True)[:,:540])

    #train_set_x = train_set_x / 100
    #valid_set_x = valid_set_x / 100
    #test_set_x = test_set_x / 100

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    #n_test_batches = (n_test_batches/batch_size) + (n_test_batches % batch_size > 0)

    print(n_test_batches)
    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    Alr = T.scalar('Alr')
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (nFB, nFs)  # shape of each input feature map

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape the first dFeatureV columns of each row into a 4D tensor of
    # shape (batch_size, iFMs, nFB, nFs), compatible with LeNetConvPoolLayer;
    # the remaining columns feed the separate hidden layer layer1H below.
    dFeatureV = iFMs * nFB * nFs
    xinp = x[:, :dFeatureV]

    #    print(x.shape)

    layer0_input = xinp.reshape((batch_size, iFMs, nFB, nFs))
    layer1H_input = x[:, dFeatureV:]
    # Construct the first convolutional pooling layer:
    # filtering reduces each feature map to (nFB - fsx + 1, nFs - fsy + 1)
    # and (1, p) max-pooling then shrinks the second dimension by a factor
    # of p, giving a 4D output of shape (batch_size, nkerns[0], cl2x, cl2y)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, iFMs, nFB, nFs),
                                filter_shape=(nkerns[0], iFMs, fsx, fsy),
                                poolsize=(1, p))
    cl2x = (nFB - fsx + 1) / 1
    cl2y = (nFs - fsy + 1) / p
    layer1H = HiddenLayer(rng,
                          input=layer1H_input,
                          n_in=27,
                          n_out=nhu1 / 4,
                          activation=T.tanh)
    # A second convolutional pooling layer was originally stacked here but is
    # commented out below; only layer0's output (flattened) and layer1H feed
    # the fully-connected layers.

    #layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
    #        image_shape=(batch_size, nkerns[0], cl2x, cl2y),
    #        filter_shape=(nkerns[1], nkerns[0], fsx, 1), poolsize=(p2, 1))
    #hl1 = (cl2x - fsx + 1)/p2
    hl1 = cl2x * cl2y
    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_features), so flatten layer0's 4D output into a
    # matrix of shape (batch_size, nkerns[0] * cl2x * cl2y)
    layer2_input = layer0.output.flatten(2)
    #layer2_inputT = T.concatenate([layer2_input,x[:,dFeatureV:]],axis = 1)
    layer2_inputT = T.concatenate([layer2_input, layer1H.output], axis=1)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_inputT,
                         n_in=(nkerns[0] * hl1 * 1) + nhu1 / 4,
                         n_out=nhu1 * 2,
                         activation=T.tanh)

    layer22 = HiddenLayer(rng,
                          input=layer2.output,
                          n_in=nhu1 * 2,
                          n_out=nhu1,
                          activation=T.tanh)

    layer23 = HiddenLayer(rng,
                          input=layer22.output,
                          n_in=nhu1,
                          n_out=nhu1,
                          activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer23.output, n_in=nhu1, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    #yPred = layer3.ypred(layer2.output)
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index], [layer3.errors(y), layer3.y_pred],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    #params = layer3.params + layer22.params + layer2.params + layer1.params + layer0.params
    params = layer3.params + layer23.params + layer22.params + layer2.params + layer1H.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        #updates.append((param_i, param_i - learning_rate * grad_i))
        updates.append((param_i, param_i - Alr * grad_i))

    train_model = theano.function(
        [index, Alr],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size][:],
            y: train_set_y[index * batch_size:(index + 1) * batch_size][:]
        })
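    # Passing the learning rate Alr as a function input (instead of baking a
    # constant into the graph) lets the training loop below halve it on
    # validation plateaus without recompiling train_model.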

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    #best_params = None
    best_params = []
    best_validation_loss = numpy.inf
    prev_validation_loss = 200

    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    Alrc = 0.1
    AlrE = 0.00001
    epochC = 0
    epoch = 0
    done_looping = False
    for param in params:
        best_params.append(param.get_value())
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        epochC = epochC + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index, Alrc)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                lossratio = (this_validation_loss -
                             prev_validation_loss) / (prev_validation_loss + 1)
                print(lossratio)
                print('epoch %i, minibatch %i/%i, validation error %f, lr %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100., Alrc))

                # if we got the best validation score until now
                #if this_validation_loss < best_validation_loss:
                if lossratio <= 0.0:
                    for i in range(len(params)):
                        best_params[i] = params[i].get_value()
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    prev_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    #tm =  test_model(0)

                    yP = numpy.asarray([])
                    test_losses = [
                        test_model(i)[0] for i in xrange(n_test_batches)
                    ]
                    for i in xrange(n_test_batches):
                        yP = numpy.concatenate((yP, test_model(i)[1]))
                    print(yP.shape)
                    test_score = numpy.mean(test_losses)

                    #yP = yPred#yPred(layer2.output.owner.inputs[0].get_value())
                    #y = test_set_y.owner.inputs[0].get_value()[:2300]
                    y = yP

                    print(yP.shape)
                    print(y.shape)
                    I1 = numpy.nonzero(y == 0.0)
                    I2 = numpy.nonzero(y == 1.0)
                    I3 = numpy.nonzero(y == 2.0)
                    I4 = numpy.nonzero(y == 3.0)

                    print(I1[0].shape)
                    print(I2[0].shape)
                    print(I3[0].shape)
                    print(I4[0].shape)
                    I11 = numpy.nonzero(yP[I1[0]] == 0)
                    I12 = numpy.nonzero(yP[I1[0]] == 1)
                    I13 = numpy.nonzero(yP[I1[0]] == 2)
                    I14 = numpy.nonzero(yP[I1[0]] == 3)
                    I21 = numpy.nonzero(yP[I2[0]] == 0)
                    I22 = numpy.nonzero(yP[I2[0]] == 1)
                    I23 = numpy.nonzero(yP[I2[0]] == 2)
                    I24 = numpy.nonzero(yP[I2[0]] == 3)
                    I31 = numpy.nonzero(yP[I3[0]] == 0)
                    I32 = numpy.nonzero(yP[I3[0]] == 1)
                    I33 = numpy.nonzero(yP[I3[0]] == 2)
                    I34 = numpy.nonzero(yP[I3[0]] == 3)
                    I41 = numpy.nonzero(yP[I4[0]] == 0)
                    I42 = numpy.nonzero(yP[I4[0]] == 1)
                    I43 = numpy.nonzero(yP[I4[0]] == 2)
                    I44 = numpy.nonzero(yP[I4[0]] == 3)

                    acc1 = 100  #float(float(I11[0].size)/float(I1[0].size))
                    acc2 = 100  #float(float(I22[0].size)/float(I2[0].size))
                    if n_out == 3:
                        acc3 = 100  #float(float(I33[0].size)/float(I3[0].size))
                        acc4 = 0
                    elif n_out == 4:
                        acc3 = float(float(I33[0].size) / float(I3[0].size))
                        acc4 = float(float(I44[0].size) / float(I4[0].size))
                    else:
                        acc3 = 0
                        acc4 = 0
                    print((
                        '     epoch %i, minibatch %i/%i, test error of '
                        'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, acc4 = %f, I11 = %i, I12 = %i, I13 = %i, I14 = %i, I21 = %i, I22 = %i, I23 = %i, I24 = %i, I31 = %i, I32 = %i, I33 = %i, I34 = %i, I41 = %i, I42 = %i, I43 = %i, I44 = %i %%'
                    ) % (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100., acc1 * 100., acc2 * 100.,
                         acc3 * 100, acc4 * 100, I11[0].size, I12[0].size,
                         I13[0].size, I14[0].size, I21[0].size, I22[0].size,
                         I23[0].size, I24[0].size, I31[0].size, I32[0].size,
                         I33[0].size, I34[0].size, I41[0].size, I42[0].size,
                         I43[0].size, I44[0].size))

                    #print(('     epoch %i, minibatch %i/%i, test error of best '
                    #       'model %f %%') %
                    #      (epoch, minibatch_index + 1, n_train_batches,
                    #       test_score * 100.))
                else:
                    if Alrc <= AlrE:
                        done_looping = True
                        break
                    elif epochC > 40:
                        Alrc = Alrc / 2
                        for param, best_param in zip(params, best_params):
                            param.set_value(best_param)
                        epochC = 0
            #if patience <= iter:
            #    done_looping = True
            #    break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    #print >> sys.stderr, ('The code for file ' +
    #                      os.path.split(__file__)[1] +
    #                      ' ran for %.2fm' % ((end_time - start_time) / 60.))
    OF = open(outFile, 'a')
    print(DataSet,
          n_out,
          fsx,
          fsy,
          p,
          cls1,
          cls2,
          nhu1,
          nFB,
          nFs,
          iFMs,
          nhus,
          batch_size,
          test_score * 100.,
          acc1 * 100.,
          acc2 * 100.,
          acc3 * 100,
          acc4 * 100,
          I11[0].size,
          I12[0].size,
          I13[0].size,
          I14[0].size,
          I21[0].size,
          I22[0].size,
          I23[0].size,
          I24[0].size,
          I31[0].size,
          I32[0].size,
          I33[0].size,
          I34[0].size,
          I41[0].size,
          I42[0].size,
          I43[0].size,
          I44[0].size,
          file=OF)

    OF.close()
Example #39
def train(
        dim_word=100,
        dim_word_src=200,
        enc_dim=1000,
        dec_dim=1000,  # the number of LSTM units
        patience=-1,  # early stopping patience
        max_epochs=5000,
        finish_after=-1,  # finish after this many updates
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=100000,  # source vocabulary size
        n_words=100000,  # target vocabulary size
        maxlen=1000,  # maximum length of the description
        maxlen_trg=1000,  # maximum length of the description
        maxlen_sample=1000,
        optimizer='rmsprop',
        batch_size=[1, 2, 3, 4],
        valid_batch_size=16,
        sort_size=20,
        save_path=None,
        save_file_name='model',
        save_best_models=0,
        dispFreq=100,
        validFreq=100,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=-1,
        pbatchFreq=-1,
        verboseFreq=10000,
        datasets=[
            'data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
        ],
        valid_datasets=[
            '../data/dev/newstest2011.en.tok',
            '../data/dev/newstest2011.fr.tok'
        ],
        dictionaries=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
        ],
        source_word_level=0,
        target_word_level=0,
        use_dropout=False,
        re_load=False,
        re_load_old_setting=False,
        uidx=None,
        eidx=None,
        cidx=None,
        layers=None,
        save_every_saveFreq=0,
        save_burn_in=20000,
        use_bpe=0,
        init_params=None,
        build_model=None,
        build_sampler=None,
        gen_sample=None,
        **kwargs):

    # Model options
    model_options = locals().copy()
    del model_options['init_params']
    del model_options['build_model']
    del model_options['build_sampler']
    del model_options['gen_sample']

    # load dictionaries and invert them
    # dictionaries[0] : src
    # dictionaries[1] : trg
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    # ii, dd : 0 = source, 1 = target
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = cPickle.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    print 'Building model'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    file_name = '%s%s.npz' % (save_path, save_file_name)
    best_file_name = '%s%s.best.npz' % (save_path, save_file_name)
    opt_file_name = '%s%s%s.npz' % (save_path, save_file_name, '.grads')
    best_opt_file_name = '%s%s%s.best.npz' % (save_path, save_file_name,
                                              '.grads')
    model_name = '%s%s.pkl' % (save_path, save_file_name)
    params = init_params(model_options)
    cPickle.dump(model_options, open(model_name, 'wb'))
    history_errs = [[], [], [], []]

    # reload options
    # reload : False
    if re_load and os.path.exists(file_name):
        print 'You are reloading your experiment.. do not panic dude..'
        if re_load_old_setting:
            with open(model_name, 'rb') as f:
                model_options = cPickle.load(f)
        params = load_params(file_name, params)
        # reload history
        model = numpy.load(file_name)
        history_errs = list(lst.tolist() for lst in model['history_errs'])
        if uidx is None:
            uidx = model['uidx']
        if eidx is None:
            eidx = model['eidx']
        if cidx is None:
            try:
                cidx = model['cidx']
            except:
                cidx = 0
    else:
        if uidx is None:
            uidx = 0
        if eidx is None:
            eidx = 0
        if cidx is None:
            cidx = 0

    print 'Loading data'

    train = MultiTextIterator(source=datasets[0],
                              target=datasets[1],
                              source_dict=dictionaries[0],
                              target_dict=dictionaries[1],
                              n_words_source=n_words_src,
                              n_words_target=n_words,
                              source_word_level=source_word_level,
                              target_word_level=target_word_level,
                              batch_size=batch_size,
                              sort_size=sort_size)

    valid = [
        TextIterator(source=valid_dataset[0],
                     target=valid_dataset[1],
                     source_dict=dictionaries[0],
                     target_dict=dictionaries[1],
                     n_words_source=n_words_src,
                     n_words_target=n_words,
                     source_word_level=source_word_level,
                     target_word_level=target_word_level,
                     batch_size=valid_batch_size,
                     sort_size=sort_size) for valid_dataset in valid_datasets
    ]

    # create shared variables for parameters
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    # NOTE : this is where we build the model
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler...\n',
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)
    #print 'Done'

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    # NOTE : f_log_probs : [x, x_mask, y, y_mask], cost
    print 'Done'

    if re_load:  # NOTE : this whole thing is False
        use_noise.set_value(0.)
        valid_scores = []
        for ii, vv in enumerate(valid):

            valid_errs = pred_probs(f_log_probs,
                                    prepare_data,
                                    model_options,
                                    vv,
                                    verboseFreq=verboseFreq)
            valid_err = valid_errs.mean()

            if numpy.isnan(valid_err):
                import ipdb
                ipdb.set_trace()

            print 'Reload sanity check: Valid ', valid_err

    cost = cost.mean()

    # apply L2 regularization on weights
    # decay_c : 0
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    # alpha_c : 0
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    # NOTE : why is this not referenced somewhere later?
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    if clip_c > 0:
        grads, not_finite, clipped = gradient_clipping(grads, tparams, clip_c)
    else:
        not_finite = 0
        clipped = 0
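    # gradient_clipping is assumed here to rescale the gradients whenever
    # their global norm exceeds clip_c, returning flags for non-finite
    # (not_finite) and rescaled (clipped) gradients; both are checked in the
    # training loop below.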

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    if re_load and os.path.exists(file_name):
        if clip_c > 0:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr,
                tparams,
                grads,
                inps,
                cost=cost,
                not_finite=not_finite,
                clipped=clipped,
                file_name=opt_file_name)
        else:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr, tparams, grads, inps, cost=cost, file_name=opt_file_name)
    else:
        # re_load = False, clip_c = 1
        if clip_c > 0:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr,
                tparams,
                grads,
                inps,
                cost=cost,
                not_finite=not_finite,
                clipped=clipped)
        else:
            f_grad_shared, f_update, toptparams = eval(optimizer)(lr,
                                                                  tparams,
                                                                  grads,
                                                                  inps,
                                                                  cost=cost)

            # f_grad_shared = theano.function(inp, [cost, not_finite, clipped], updates=gsup, profile=profile)

            # f_update = theano.function([lr], [], updates=updates,
            #                   on_unused_input='ignore', profile=profile)
            # toptparams

    print 'Done'

    print 'Optimization'
    best_p = None
    bad_counter = 0

    # will never be true
    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    # Training loop
    ud_start = time.time()
    estop = False

    if re_load:
        # IndexError: index 14 is out of bounds for axis 1 with size 13
        print "Checkpointed minibatch number: %d" % cidx
        for cc in xrange(cidx):
            if numpy.mod(cc, 1000) == 0:
                print "Jumping [%d / %d] examples" % (cc, cidx)
            train.next()

    for epoch in xrange(max_epochs):
        time0 = time.time()
        n_samples = 0
        NaN_grad_cnt = 0
        NaN_cost_cnt = 0
        clipped_cnt = 0
        update_idx = 0
        if re_load:
            re_load = 0
        else:
            cidx = 0

        for x, y in train:
            # NOTE : x, y are [sen1, sen2, sen3 ...] where sen_i are of different length
            update_idx += 1
            cidx += 1
            uidx += 1
            use_noise.set_value(1.)

            # NOTE : n_x <= batch_size
            x, x_mask, y, y_mask, n_x = prepare_data(x,
                                                     y,
                                                     maxlen=maxlen,
                                                     maxlen_trg=maxlen_trg,
                                                     n_words_src=n_words_src,
                                                     n_words=n_words)
            n_samples += n_x

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                uidx = max(uidx, 0)
                continue

            # compute cost, grads and copy grads to shared variables

            if clip_c > 0:
                cost, not_finite, clipped = f_grad_shared(x, x_mask, y, y_mask)
            else:
                cost = f_grad_shared(x, x_mask, y, y_mask)

            if clipped:
                clipped_cnt += 1

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                import ipdb
                ipdb.set_trace()
                NaN_cost_cnt += 1

            if not_finite:
                import ipdb
                ipdb.set_trace()
                NaN_grad_cnt += 1
                continue

            # do the update on parameters
            f_update(lrate)

            if numpy.isnan(cost) or numpy.isinf(cost):
                continue

            if float(NaN_grad_cnt) > max_epochs * 0.5 or float(
                    NaN_cost_cnt) > max_epochs * 0.5:
                print 'Too many NaNs, abort training'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud = time.time() - ud_start
                wps = n_samples / float(time.time() - time0)
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'NaN_in_grad', NaN_grad_cnt,\
                      'NaN_in_cost', NaN_cost_cnt, 'Gradient_clipped', clipped_cnt, 'UD ', ud, "%.2f sentence/s" % wps
                ud_start = time.time()

            if numpy.mod(uidx, pbatchFreq) == 0 and pbatchFreq != -1:
                pbatch(x, worddicts_r[0])

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0 and sampleFreq != -1:

                gen_list = [
                    0, batch_size[0], batch_size[0] + batch_size[1],
                    batch_size[0] + batch_size[1] + batch_size[2]
                ]
                gen_list = [ii for ii in gen_list if ii < n_x]

                for jj in gen_list:
                    # jj = min(5, n_samples)
                    stochastic = True
                    use_noise.set_value(0.)

                    # x : maxlen X n_samples
                    sample, score = gen_sample(tparams,
                                               f_init,
                                               f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               trng=trng,
                                               k=1,
                                               maxlen=maxlen_sample,
                                               stochastic=stochastic,
                                               argmax=False)
                    print
                    print 'Source ', jj, ': ',
                    if source_word_level:
                        for vv in x[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[0]:
                                if use_bpe:
                                    print(worddicts_r[0][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[0][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        source_ = []
                        for vv in x[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[0]:
                                source_.append(worddicts_r[0][vv])
                            else:
                                source_.append('UNK')
                        print "".join(source_)
                    print 'Truth ', jj, ' : ',
                    if target_word_level:
                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                if use_bpe:
                                    print(worddicts_r[1][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[1][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        truth_ = []
                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                truth_.append(worddicts_r[1][vv])
                            else:
                                truth_.append('UNK')
                        print "".join(truth_)
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    if target_word_level:
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                if use_bpe:
                                    print(worddicts_r[1][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[1][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        sample_ = []
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                sample_.append(worddicts_r[1][vv])
                            else:
                                sample_.append('UNK')
                        print "".join(sample_)
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                valid_scores = []
                for ii, vv in enumerate(valid):
                    use_noise.set_value(0.)
                    # NOTE : when validation, don't pass maxlen, maxlen_trg
                    # meaning, don't limit sentence lengths...
                    # sort of makes sense i suppose?
                    valid_errs = pred_probs(
                        f_log_probs,
                        prepare_data,
                        model_options,
                        vv,
                        verboseFreq=verboseFreq,
                    )
                    valid_err = valid_errs.mean()
                    valid_scores.append(valid_err)
                    history_errs[ii].append(valid_err)

                    # patience == -1, never happens
                    if len(history_errs[ii]) > patience and valid_err >= \
                            numpy.array(history_errs[ii])[:-patience].min() and patience != -1:
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

                    if numpy.isnan(valid_err):
                        import ipdb
                        ipdb.set_trace()

                cnt = 0
                for ii in xrange(4):
                    if uidx == 0 or valid_scores[ii] <= numpy.array(
                            history_errs[ii]).min():
                        cnt += 1

                if len(history_errs[0]) > 1:
                    if numpy.sum(valid_scores) <= numpy.sum(
                        [aa[:-2] for aa in history_errs]):
                        less_sum = True
                    else:
                        less_sum = False
                else:
                    less_sum = True

                if cnt >= 2 and less_sum:
                    best_p = unzip(tparams)
                    best_optp = unzip(toptparams)
                    bad_counter = 0

                if saveFreq != validFreq and save_best_models:
                    numpy.savez(best_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cidx,
                                **best_p)
                    numpy.savez(best_opt_file_name, **best_optp)

                print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format(
                    valid_scores[0], valid_scores[1], valid_scores[2],
                    valid_scores[3])

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if not os.path.exists(save_path):
                    os.mkdir(save_path)

                params = unzip(tparams)
                optparams = unzip(toptparams)
                numpy.savez(file_name,
                            history_errs=history_errs,
                            uidx=uidx,
                            eidx=eidx,
                            cidx=cidx,
                            **params)
                numpy.savez(opt_file_name, **optparams)

                if save_every_saveFreq and (uidx >= save_burn_in):
                    this_file_name = '%s%s.%d.npz' % (save_path,
                                                      save_file_name, uidx)
                    this_opt_file_name = '%s%s%s.%d.npz' % (
                        save_path, save_file_name, '.grads', uidx)
                    numpy.savez(this_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cidx,
                                **params)
                    numpy.savez(this_opt_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cidx,
                                **optparams)
                    if best_p is not None and saveFreq != validFreq:
                        this_best_file_name = '%s%s.%d.best.npz' % (
                            save_path, save_file_name, uidx)
                        numpy.savez(this_best_file_name,
                                    history_errs=history_errs,
                                    uidx=uidx,
                                    eidx=eidx,
                                    cidx=cidx,
                                    **best_p)
                print 'Done...',
                print 'Saved to %s' % file_name

            # finish after this many updates
            if uidx >= finish_after and finish_after != -1:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples
        lang_nos = (4535523, 12122376, 1926115, 2326893)
        lang_done = [x * update_idx for x in batch_size]
        lang_rem = [x - y for x, y in zip(lang_nos, lang_done)]
        print "Remaining : DE({}), CS({}), FI({}), RU({})".format(
            lang_rem[0], lang_rem[1], lang_rem[2], lang_rem[3])
        eidx += 1

        if estop:
            break

    use_noise.set_value(0.)

    valid_scores = []
    for ii, vv in enumerate(valid):
        valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                               vv).mean()
        valid_scores.append(valid_err)

    print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format(
        valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3])

    params = unzip(tparams)
    optparams = unzip(toptparams)
    file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx)
    opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads',
                                       uidx)
    numpy.savez(file_name,
                history_errs=history_errs,
                uidx=uidx,
                eidx=eidx,
                cidx=cidx,
                **params)
    numpy.savez(opt_file_name, **optparams)
    if best_p is not None and saveFreq != validFreq:
        best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx)
        best_opt_file_name = '%s%s%s.%d.best.npz' % (save_path, save_file_name,
                                                     '.grads', uidx)
        numpy.savez(best_file_name,
                    history_errs=history_errs,
                    uidx=uidx,
                    eidx=eidx,
                    cidx=cidx,
                    **best_p)
        numpy.savez(best_opt_file_name, **best_optp)

    return valid_err
Example #40
    def fit(self, X, y):
        batchsize = self.batchsize

        n_valid = int(min(self.validset_max_examples, self.validset_fraction * X.shape[0]))
        # increase to a multiple of batchsize
        while n_valid % batchsize:
            n_valid += 1

        n_train = X.shape[0] - n_valid

        # decrease to a multiple of batchsize
        while n_train % batchsize:
            n_train -= 1

        if self.center_and_normalize and self.copy_X:
            X = X.copy()

        train_features = X[:n_train]
        valid_features = X[n_train:]
        train_labels = y[:n_train]
        valid_labels = y[n_train:]

        if self.center_and_normalize:
            print("Computing mean and std.dev")

            #this loop seems more memory efficient than numpy
            m= np.zeros(train_features.shape[1])
            msq= np.zeros(train_features.shape[1])
            for i in xrange(train_features.shape[0]):
                alpha = 1.0 / (i+1)
                v = train_features[i]
                m = alpha * v + (1-alpha)*m
                msq = alpha * v*v + (1-alpha)*msq

            self.X_mean_ = theano.shared(m.astype(X.dtype))
            self.X_std_ = theano.shared(
                    np.maximum(
                        self.min_feature_std,
                        np.sqrt(msq - m*m)).astype(X.dtype))

            X -= self.X_mean_.get_value()
            X /= self.X_std_.get_value()

        x_i = tensor.matrix(dtype=X.dtype)
        y_i = tensor.vector(dtype=y.dtype)
        lr = tensor.scalar(dtype=X.dtype)

        feature_logreg = LogisticRegression.new(x_i,
                n_in = train_features.shape[1], n_out=self.n_classes,
                dtype=x_i.dtype)

        if self.loss_fn=='log':
            traincost = feature_logreg.nll(y_i).sum()
        elif self.loss_fn=='hinge':
            raw_output = tensor.dot(feature_logreg.input, feature_logreg.w)+feature_logreg.b
            traincost = multi_hinge_margin(raw_output, y_i).sum()
        else:
            raise NotImplementedError(self.loss_fn)
        traincost = traincost + abs(feature_logreg.w).sum() * self.l1_regularization
        traincost = traincost + (feature_logreg.w**2).sum() * self.l2_regularization
        train_logreg_fn = theano.function([x_i, y_i, lr],
                [feature_logreg.nll(y_i).mean(),
                    feature_logreg.errors(y_i).mean()],
                updates=pylearn.gd.sgd.sgd_updates(
                    params=feature_logreg.params,
                    grads=tensor.grad(traincost, feature_logreg.params),
                    stepsizes=[lr/batchsize,lr/(10*batchsize)]))
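        # sgd_updates pairs each parameter with its gradient and a step size:
        # here (assuming feature_logreg.params is [w, b]) the weights get
        # lr/batchsize and the biases lr/(10*batchsize); dividing by batchsize
        # matches the summed (rather than averaged) cost defined above.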

        test_logreg_fn = theano.function([x_i, y_i],
                feature_logreg.errors(y_i))

        if self.center_and_normalize:
            feature_logreg_test = LogisticRegression(
                    (x_i - self.X_mean_)/self.X_std_,
                    feature_logreg.w,
                    feature_logreg.b)
            self.predict_fn_ = theano.function([x_i], feature_logreg_test.argmax)
        else:
            self.predict_fn_ = theano.function([x_i], feature_logreg.argmax)

        best_epoch = -1
        best_epoch_valid = -1
        best_epoch_train = -1
        best_epoch_test = -1
        valid_rate=-1
        test_rate=-1
        train_rate=-1

        for epoch in xrange(self.n_epochs):
            # validate
            # Marc'Aurelio, you crazy!!
            # the division by batchsize is done in the cost function
            e_lr = np.float32(self.learnrate / max(1.0, np.floor(max(1.,
                (epoch+1)/float(self.anneal_epoch))-2)))
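            # e_lr stays at self.learnrate until roughly epoch
            # 4 * self.anneal_epoch, after which the divisor grows by one
            # every self.anneal_epoch epochs (a step-wise annealing schedule).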

            if n_valid:
                l01s = []
                for i in xrange(n_valid/batchsize):
                    x_i = valid_features[i*batchsize:(i+1)*batchsize]
                    y_i = valid_labels[i*batchsize:(i+1)*batchsize]

                    #lr=0.0 -> no learning, safe for validation set
                    l01 = test_logreg_fn((x_i), y_i)
                    l01s.append(l01)
                valid_rate = 1-np.mean(l01s)
                #print('Epoch %i validation accuracy: %f'%(epoch, valid_rate))

                if valid_rate > best_epoch_valid:
                    best_epoch = epoch
                    best_epoch_test = test_rate
                    best_epoch_valid = valid_rate
                    best_epoch_train = train_rate

                print('Epoch=%i best epoch %i valid %f test %f best train %f current train %f'%(
                    epoch, best_epoch, best_epoch_valid, best_epoch_test, best_epoch_train, train_rate))
                if epoch > self.anneal_epoch and epoch > 2*best_epoch:
                    break
            else:
                print('Epoch=%i current train %f'%( epoch, train_rate))

            #train
            l01s = []
            nlls = []
            for i in xrange(n_train/batchsize):
                x_i = train_features[i*batchsize:(i+1)*batchsize]
                y_i = train_labels[i*batchsize:(i+1)*batchsize]
                nll, l01 = train_logreg_fn((x_i), y_i, e_lr)
                nlls.append(nll)
                l01s.append(l01)
            train_rate = 1-np.mean(l01s)
    def __init__(self,
                 feature_count,
                 transformer,
                 k=8,
                 stdev=0.1,
                 X_format="dense"):
        # ************************************************************
        # * Option Processing
        # ************************************************************

        self.X_format = str(X_format).lower()
        if self.X_format not in _SUPPORTED_FORMATS:
            raise ValueError("Unsupported format: {}").format(X_format)

        d = feature_count

        # ************************************************************
        # * Symbolic Variables
        # ************************************************************

        # design matrix
        if X_format == "dense":
            self.X = T.matrix()
        elif X_format == "csr":
            self.X = S.csr_matrix()
        elif X_format == "csc":
            self.X = S.csc_matrix()
        self.y = T.vector()  # response
        self.s = T.vector()  # sample weights
        self.e = T.scalar()  # current epoch

        # ************************************************************
        # * Model Parameters
        # ************************************************************

        # bias term (intercept)
        w0_init = np.zeros(1)
        self.w0 = theano.shared(w0_init, allow_downcast=True)
        # first order coefficients
        w1_init = np.zeros(d)
        self.w1 = theano.shared(w1_init, allow_downcast=True)
        # interaction factors
        v_init = stdev * np.random.randn(k, d)
        self.v = theano.shared(v_init, allow_downcast=True)

        # ************************************************************
        # * The Model
        # ************************************************************

        dot = T.dot
        mul = T.mul
        if X_format in ("csc", "csr"):
            dot = S.dot
            mul = S.mul

        # The formula for pairwise interactions is from the bottom left
        # of page 997 of Rendle 2010, "Factorization Machines."
        # This version scales linearly in k and d, as opposed to O(d^2).
        interactions = 0.5 * T.sum((dot(self.X, T.transpose(self.v)) ** 2) \
                                   - dot(mul(self.X, self.X), T.transpose(self.v ** 2)), axis=1)
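        # For each factor row v_f this computes
        #     0.5 * ((sum_i v_{f,i} x_i)^2 - sum_i v_{f,i}^2 x_i^2)
        # and sums over f, which equals the pairwise interaction term
        # sum_{i<j} <v_i, v_j> x_i x_j without ever forming the d x d matrix.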
        self.y_hat = self.w0[0] + dot(self.X, self.w1) + interactions
        self.y_hat = transformer.transform(self.y_hat)

        # ************************************************************
        # * Prediction
        # ************************************************************

        self.theano_predict = theano.function(inputs=[self.X],
                                              outputs=self.y_hat,
                                              allow_input_downcast=True)
        h1 *= castx(srng.binomial(n=1, p=0.5, size=h1.shape))
    else: 
        h1 *= 0.5

    h2 = activation(T.dot(h1, params["W2_d"]) + params["b2_d"])

    if trainMode: 
        h2 *= castx(srng.binomial(n=1, p=0.5, size=h2.shape))
    else: 
        h2 *= 0.5

    y = T.dot(T.concatenate([h2], axis = 1), params["W3_d"]) + params["b3_d"]
    return T.nnet.sigmoid(y)


learning_rate = T.scalar()
x = T.matrix()
#z = T.matrix()

#z = srng.normal(avg = 0,std = 1, size = (100, var_dimensionality))
z2 = srng.binomial(size = (100,var_dimensionality / 4), n = 1, p = 0.5, dtype = 'float32')
z3 = srng.multinomial(size = (100,), n = 1, pvals = [1.0 / (var_dimensionality / 4)] * (var_dimensionality / 4), dtype = 'float32')
z4 = srng.multinomial(size = (100,), n = 1, pvals = [1.0 / (var_dimensionality / 4)] * (var_dimensionality / 4), dtype = 'float32')
z5 = srng.multinomial(size = (100,), n = 1, pvals = [1.0 / (var_dimensionality / 4)] * (var_dimensionality / 4), dtype = 'float32')

z = T.concatenate([z2,z3,z4,z5], axis = 1)
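# The latent code z is built from a Bernoulli(0.5) block (z2) and three
# one-hot blocks drawn from uniform multinomials (z3-z5), each of size
# var_dimensionality / 4, so z has var_dimensionality columns in total;
# presumably it is fed to a generator network defined elsewhere in the script.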

#z = T.erfinv(z)

#Value between 0 and 1 corresponding to the probability that a point belongs to the true data distribution
discriminator_true_value = discriminator_network(x, discriminator_params, trainMode = True)
import theano
from theano import tensor as T
import numpy as np

trX = np.linspace(-1, 1, 101)
print(trX)
trY = 2 * trX + np.random.randn(*trX.shape) * 0.33
print(trY)

X = T.scalar()
Y = T.scalar()

def model(X, w):
    return X * w

w = theano.shared(np.asarray(-1000., dtype=theano.config.floatX))
y = model(X, w)

cost = T.mean(T.sqr(y - Y))
gradient = T.grad(cost=cost, wrt=w)
updates = [[w, w - gradient * 0.01]]

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)

for i in range(100):
    for x, y in zip(trX, trY):
        """print (x, y)"""
        v = train(x, y)
        """print (v)"""

"""print(v)"""
Example #44
def train_mlp_probe(train_labels, train_samples, test_labels, test_samples,
                    hyperparams):

    batch_size = hyperparams['batch_size']
    learning_rate = hyperparams['learning_rate']
    n_epochs = hyperparams['n_epochs']
    lambda_reg = hyperparams['lambda_reg']
    num_hidden = hyperparams['num_hidden']
    num_hidden_2 = hyperparams['num_hidden_2']

    borrow = True

    arr = np.arange(train_labels.shape[0])
    np.random.shuffle(arr)

    train_samples_x = train_samples[arr, :]
    train_samples_y = train_labels

    if len(train_labels.shape) == 1:
        train_samples_y.shape = (train_samples_y.shape[0], 1)

    train_samples_y = train_samples_y[arr, :]

    train_set_x = theano.shared(np.asarray(train_samples_x,
                                           dtype=theano.config.floatX),
                                borrow=borrow)
    train_set_y = theano.shared(np.asarray(train_samples_y,
                                           dtype=theano.config.floatX),
                                borrow=borrow)

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.matrix('y')  # the targets are presented as a matrix of
    # real-valued outputs (one row per example)
    learning_rate_t = T.scalar('learning_rate')  # symbolic learning rate

    num_out = train_set_y.shape[1].eval()
    num_in = train_samples_x.shape[1]

    # construct the logistic regression class
    # classifier = LogisticRegressionCrossEnt(input=x, n_in=num_in, n_out=num_out, lambda_reg=lambda_reg)

    # for random weight initialisation
    rng = np.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng,
                     input=x,
                     n_in=num_in,
                     n_hidden=num_hidden,
                     n_hidden_2=num_hidden_2,
                     n_out=num_out)
    # the cost we minimize during training is the euclidean (squared) loss of
    # the model plus an L2 penalty on the weights, in symbolic form
    cost = classifier.euclidean_loss(y) + lambda_reg * classifier.L2_sqr

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = []
    # given two list the zip A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of
    # same length, zip generates a list C of same size, where each element
    # is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(classifier.params, gparams):
        updates.append((param, param - learning_rate_t * gparam))

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index, learning_rate_t],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    #print '... training the model'
    # early-stopping parameters
    start_time = time.clock()

    validation_scores = np.array([])
    costs = np.array([])

    moving_scores = np.array([])
    moving_costs = np.array([])

    done_looping = False
    epoch = 0

    best_validation_score = -np.inf
    best_cost = np.inf

    validation_improved_in = 0
    cost_improved_in = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        costs_epoch = np.array([])
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index, learning_rate)

            costs_epoch = np.hstack([costs_epoch, minibatch_avg_cost])

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

        W1 = classifier.params[0].eval()
        b1 = classifier.params[1].eval()

        W2 = classifier.params[2].eval()
        b2 = classifier.params[3].eval()

        W3 = classifier.params[4].eval()
        b3 = classifier.params[5].eval()

        # Evaluate the model on the current epoch
        _, _, _, _, f1, prec, rec = test_mlp(test_labels, test_samples,
                                             (W1, b1, W2, b2, W3, b3))

        curr_f1 = np.mean(f1)

        if np.isnan(curr_f1):
            best_validation_score = 0
            break

        W = classifier.params[0].eval()
        if np.isnan(np.sum(W)):
            best_validation_score = 0
            break

        validation_scores = np.hstack([validation_scores, curr_f1])

        epoch_cost = np.mean(costs_epoch)
        costs = np.hstack([costs, epoch_cost])

        if (epoch <= 10):
            # print 'Epoch - %d, cost - %f, F1 - %f' % (epoch, epoch_cost, curr_f1)
            moving_costs = np.hstack([moving_costs, epoch_cost])
            moving_scores = np.hstack([moving_scores, curr_f1])

        else:

            moving_costs = np.hstack([moving_costs, np.mean(costs[-10:])])
            moving_scores = np.hstack(
                [moving_scores,
                 np.mean(validation_scores[-10:])])

            if moving_costs[-1] < best_cost:
                best_cost = moving_costs[-1]
                cost_improved_in = 0
            else:
                cost_improved_in += 1

            if moving_scores[-1] > best_validation_score:

                W1_best = classifier.params[0].eval()
                b1_best = classifier.params[1].eval()

                W2_best = classifier.params[2].eval()
                b2_best = classifier.params[3].eval()

                W3_best = classifier.params[4].eval()
                b3_best = classifier.params[5].eval()

                best_validation_score = moving_scores[-1]
                score_improved_in = 0
                validation_improved_in = 0
            else:
                score_improved_in += 1
                validation_improved_in += 1

            if score_improved_in > 10:
                print 'Rate reduced'
                learning_rate /= 1.5
                score_improved_in = 0

            # If the score has not improved for some time, terminate early
            if validation_improved_in > 60:
                print 'Early termination'
                break

            print 'Epoch - %d, cost - %f (%f, %d), F1 - %f (%f, %d)' % \
               (epoch, epoch_cost, moving_costs[-1], cost_improved_in, curr_f1, moving_scores[-1], score_improved_in)

        # if(epoch > 10)
        #    costs

    end_time = time.clock()
    print 'Optimization complete with best validation score of %f ' % np.max(
        validation_scores)
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))

    #plot(moving_costs)
    #plot(moving_scores)
    #draw()
    #show()

    return (W1_best, b1_best, W2_best, b2_best, W3_best, b3_best)
code and theano.py"""

#### Libraries
# Third Party Libraries
import matplotlib.pyplot as plt
import numpy as np
import theano
import theano.tensor as T

rng = np.random.RandomState(23455)

# Parameter initialization
# Symbolic variables
X = T.matrix(name='X', dtype=theano.config.floatX)
y = T.vector(name='y', dtype=theano.config.floatX)
lr = T.scalar(name='learn_rate', dtype=theano.config.floatX)

# Variables that will be updated, hence declared as `theano.shared`
theta = theano.shared(name='theta',
                      value=rng.uniform(-1.0, 1.0,
                                        size=(3)).astype(theano.config.floatX))
bias = theano.shared(name='bias',
                     value=rng.uniform(13, 17,
                                       size=(1,
                                             1)).astype(theano.config.floatX),
                     broadcastable=(True, True))

# ADAM Parameters
beta1 = T.scalar(name='beta1', dtype=theano.config.floatX)
beta2 = T.scalar(name='beta2', dtype=theano.config.floatX)
eps = T.scalar(name='eps', dtype=theano.config.floatX)
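
# A minimal sketch of wiring the symbols above into Adam updates; the
# least-squares cost below is an assumption made only for illustration.
cost = T.mean((T.dot(X, theta) + bias[0, 0] - y) ** 2)

one = np.asarray(1., dtype=theano.config.floatX)
t_prev = theano.shared(np.asarray(0., dtype=theano.config.floatX), name='t')
t = t_prev + one

updates = [(t_prev, t)]
for p in [theta, bias]:
    g = T.grad(cost, p)
    # per-parameter first and second moment estimates
    m = theano.shared(p.get_value() * 0., broadcastable=p.broadcastable)
    v = theano.shared(p.get_value() * 0., broadcastable=p.broadcastable)
    m_t = beta1 * m + (one - beta1) * g           # biased first moment
    v_t = beta2 * v + (one - beta2) * g ** 2      # biased second moment
    m_hat = m_t / (one - beta1 ** t)              # bias-corrected estimates
    v_hat = v_t / (one - beta2 ** t)
    updates += [(m, m_t), (v, v_t),
                (p, p - lr * m_hat / (T.sqrt(v_hat) + eps))]

adam_step = theano.function([X, y, lr, beta1, beta2, eps], cost,
                            updates=updates)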
Example #46
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=100,  # the number of GRU units
        encoder='tree_lstm',  # encoder model
        decoder='tree_lstm',  # decoder model 
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        decay_c=0.,  # L2 regularization penalty
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words=100000,  # vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='adadelta',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        dispFreq=100,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        use_dropout=False,
        reload_=False,
        verbose=False,  # print verbose debugging information (slows training)
        datasets=[],
        valid_datasets=[],
        test_datasets=[],
        dictionary='',
        embedding='',  # pretrained embedding file, such as word2vec or GloVe
):

    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    # Model options
    model_options = locals().copy()

    # load dictionary and invert them
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    # reload options
    if reload_ and os.path.exists(saveto):
        print 'Reload options'
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    logger.debug(pprint.pformat(model_options))

    print 'Loading data'
    train = TextIterator(datasets[0],
                         datasets[1],
                         datasets[2],
                         dictionary,
                         n_words=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    train_valid = TextIterator(datasets[0],
                               datasets[1],
                               datasets[2],
                               dictionary,
                               n_words=n_words,
                               batch_size=valid_batch_size,
                               shuffle=False)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         valid_datasets[2],
                         dictionary,
                         n_words=n_words,
                         batch_size=valid_batch_size,
                         shuffle=False)
    test = TextIterator(test_datasets[0],
                        test_datasets[1],
                        test_datasets[2],
                        dictionary,
                        n_words=n_words,
                        batch_size=valid_batch_size,
                        shuffle=False)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options, worddicts)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print 'Reload parameters'
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    trng, use_noise, \
        x1, x1_mask, x1_left_mask, x1_right_mask, \
        x2, x2_mask, x2_left_mask, x2_right_mask, \
        y, \
        opt_ret, \
        cost, \
        f_pred, f_prods = \
        build_model(tparams, model_options)
    inps = [x1, x1_mask, x1_left_mask, x1_right_mask, \
            x2, x2_mask, x2_left_mask, x2_right_mask, \
            y]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads
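        # i.e. rescale the whole gradient vector to L2 norm clip_c whenever its
        # norm exceeds clip_c; smaller gradients are left untouched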
        if verbose:
            print 'Building function of gradient\'s norm'
            f_norm_g = theano.function(inps, tensor.sqrt(g2))

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'
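    # For reference, a minimal optimizer in the two-function style expected by
    # the call above (a sketch only; the real sgd/adadelta helpers are defined
    # elsewhere in this codebase):
    #
    #   def sgd(lr, tparams, grads, inps, cost):
    #       # shared buffers that hold the gradients between the two calls
    #       gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
    #                  for k, p in tparams.iteritems()]
    #       # f_grad_shared: compute cost and store gradients in gshared
    #       f_grad_shared = theano.function(inps, cost,
    #                                       updates=zip(gshared, grads))
    #       # f_update: apply one SGD step using the stored gradients
    #       pup = [(p, p - lr * g)
    #              for p, g in zip(itemlist(tparams), gshared)]
    #       f_update = theano.function([lr], [], updates=pup)
    #       return f_grad_shared, f_update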

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        print 'Reload history error'
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    valid_acc_record = []
    test_acc_record = []
    best_epoch_num = 0
    lr_change_list = []
    wait_counter = 0
    wait_N = 1

    for eidx in xrange(max_epochs):
        n_samples = 0
        for x1, x2, y in train:
            n_samples += len(x1)
            uidx += 1
            use_noise.set_value(1.)
            x1, x2, y = prepare_data(x1, x2, y)

            inps = [x1[0], x1[1], x1[2], x1[3], x2[0], x2[1], x2[2], x2[3], y]

            if x1 is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(*inps)
            if verbose:
                if clip_c > 0.:
                    norm_g = f_norm_g(*inps)

            # do the update on parameters
            f_update(lrate)
            ud = time.time() - ud_start
            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return None

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                logger.debug('Epoch {0} Update {1} Cost {2} UD {3}'.format(
                    eidx, uidx, cost, ud))
                if verbose:
                    if clip_c > 0.:
                        logger.debug('Grad {0}'.format(norm_g))

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_cost = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid).mean()
                valid_acc = pred_acc(f_pred, prepare_data, model_options,
                                     valid)
                valid_err = 1.0 - valid_acc
                history_errs.append(valid_err)
                test_cost = pred_probs(f_log_probs, prepare_data,
                                       model_options, test).mean()
                test_acc = pred_acc(f_pred, prepare_data, model_options, test)

                print 'Valid cost', valid_cost
                print 'Valid accuracy', valid_acc
                print 'Test cost', test_cost
                print 'Test accuracy', test_acc
                print 'lrate:', lrate

                valid_acc_record.append(valid_acc)
                test_acc_record.append(test_acc)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    best_epoch_num = eidx
                    wait_counter = 0

                if valid_err > numpy.array(history_errs).min():
                    wait_counter += 1

                if wait_counter >= wait_N:
                    print 'wait_counter max, need to halve the lr'
                    bad_counter += 1
                    wait_counter = 0
                    print 'bad_counter: ' + str(bad_counter)
                    lrate = lrate * 0.5
                    lr_change_list.append(eidx)
                    print 'lrate change to: ' + str(lrate)
                    zipp(best_p, tparams)

                if bad_counter > patience:
                    print 'Early Stop!'
                    estop = True
                    break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    with open('record.csv', 'w') as f:
        f.write(str(best_epoch_num) + '\n')
        f.write(','.join(map(str, lr_change_list)) + '\n')
        f.write(','.join(map(str, valid_acc_record)) + '\n')
        f.write(','.join(map(str, test_acc_record)) + '\n')

    use_noise.set_value(0.)

    print '=' * 80
    print 'Final Result'
    print '=' * 80
    train_cost = pred_probs(f_log_probs, prepare_data, model_options,
                            train_valid).mean()
    train_acc = pred_acc(f_pred, prepare_data, model_options, train_valid)
    print 'Train cost', train_cost
    print 'Train accuracy', train_acc
    valid_cost = pred_probs(f_log_probs, prepare_data, model_options,
                            valid).mean()
    valid_acc = pred_acc(f_pred, prepare_data, model_options, valid)
    print 'Valid cost', valid_cost
    print 'Valid accuracy', valid_acc
    test_cost = pred_probs(f_log_probs, prepare_data, model_options,
                           test).mean()
    test_acc = pred_acc(f_pred, prepare_data, model_options, test)
    print 'Test cost', test_cost
    print 'Test accuracy', test_acc
    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)
    logger.debug('Done')

    return None
def make_scalar():
    """
    Returns a new Theano scalar.
    """

    return T.scalar()
Example #48
0
def build_nlm_model(rng,
                    model_params,
                    self_norm_coeff,
                    act_func,
                    dropout,
                    is_test=0):
    """
  Adapted from this tutorial: http://deeplearning.net/tutorial/mlp.html
  """
    # symbolic variables
    x = T.matrix('x')
    y = T.ivector('y')  # GPU stores values in float32, so now we have to convert to int32
    lr = T.scalar('lr')

    # classifier
    if act_func == 'tanh':
        sys.stderr.write('# act_func=tanh\n')
        activation = T.tanh
    elif act_func == 'relu':
        sys.stderr.write('# act_func=rectifier\n')
        activation = rectifier
    elif act_func == 'leakyrelu':
        sys.stderr.write('# act_func=leaky rectifier\n')
        activation = leaky_rect
    else:
        sys.stderr.write(
            '! Unknown activation function %s, not tanh or relu\n' %
            (act_func))
        sys.exit(1)

    sys.stderr.write('# self_norm_coeff=%f\n' % self_norm_coeff)

    classifier = NLM(rng, x, model_params, self_norm_coeff, activation,
                     dropout, is_test)

    if is_test == 1:
        return (classifier, x, y)

    # cost
    cost = classifier.nll(y)
    if self_norm_coeff > 0:
        cost = cost + self_norm_coeff * classifier.mean_square_log_norm
        mean_abs_log_norm = classifier.mean_abs_log_norm

    # grad
    gparams = []
    #clip_range = 0.1
    grad_norm = 0.0
    for param in classifier.params:
        gparam = T.grad(cost, param)
        grad_norm += (gparam**2).sum()
        #gparam = T.clip(T.grad(cost, param), -clip_range, clip_range) # clip gradients
        gparams.append(gparam)
    grad_norm = T.sqrt(grad_norm)

    # grad norm is small overall
    #max_grad_norm = 5
    #if T.gt(grad_norm, max_grad_norm):
    #  lr = lr * max_grad_norm / grad_norm

    # update
    updates = []
    for param, gparam in zip(classifier.params, gparams):
        updates.append((param, param - lr * gparam))

    if self_norm_coeff > 0:
        return (classifier, x, y, lr, cost, grad_norm, mean_abs_log_norm,
                updates)
    else:
        return (classifier, x, y, lr, cost, grad_norm, updates)
Example #49
0
                            which_sources=('sp', ))
data_stream = ScaleAndShift(data_stream,
                            scale=1 / f0_std,
                            shift=-f0_mean / f0_std,
                            which_sources=('f0', ))
data_stream = Mapping(data_stream, _zero_for_unvoiced)
data_stream = Mapping(data_stream, _transpose)
data_stream = SegmentSequence(data_stream, 8 * seq_size, add_flag=True)
data_stream = ForceFloatX(data_stream)
valid_stream = data_stream

#################
# Model
#################

start_flag = tensor.scalar('start_flag')
x = tensor.tensor3('sp')
#x = tensor.tensor3('features')

f0 = tensor.matrix('f0')
voiced = tensor.matrix('voiced')

f0s = f0.dimshuffle(0, 1, 'x')
voiceds = voiced.dimshuffle(0, 1, 'x')

context = tensor.concatenate([f0s, voiceds], 2)

activations_x = [Rectifier()] * depth_x

dims_x = [frame_size] + [hidden_size_mlp_x]*(depth_x-1) + \
         [hidden_size_recurrent]
Example #50
0
    def __init__(self,
                 objective,
                 params,
                 inputs=None,
                 param_constrainers=None,
                 max_iter=-1,
                 lr_scalers=None,
                 verbose=0,
                 tol=None,
                 init_alpha=None,
                 min_init_alpha=1e-3,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 gradients=None,
                 gradient_updates=None,
                 line_search_mode=None,
                 accumulate=False,
                 theano_function_mode=None):

        self.__dict__.update(locals())
        del self.self

        if line_search_mode is None:
            if init_alpha is None:
                init_alpha = (.001, .005, .01, .05, .1)
        else:
            assert line_search_mode == 'exhaustive'
            if init_alpha is None:
                init_alpha = (.5, 1.)

        self.init_alpha = tuple([float(elem) for elem in init_alpha])

        if inputs is None:
            inputs = []

        if param_constrainers is None:
            param_constrainers = []

        obj = objective

        self.verbose = verbose

        # TODO: remove verbose statements (handled by logging)
        if self.verbose > 0:
            logger.setLevel(logging.DEBUG)

        param_to_grad_sym = OrderedDict()
        param_to_grad_shared = OrderedDict()
        updates = OrderedDict()
        if self.gradient_updates is not None:
            updates.update(self.gradient_updates)

        self.params = [param for param in params]

        for param in params:
            if self.gradients is not None and param in self.gradients:
                g = self.gradients[param]
            else:
                g = grad(objective, param)
            param_to_grad_sym[param] = g
            if param.name is not None:
                param_name = param.name
            else:
                param_name = 'anon_param'
            grad_name = 'BatchGradientDescent.grad_' + param_name
            grad_shared = sharedX(param.get_value() * 0., name=grad_name)
            param_to_grad_shared[param] = grad_shared
            updates[grad_shared] = g

        self.param_to_grad_shared = param_to_grad_shared

        if self.verbose:
            logger.debug('batch gradient class compiling gradient function')
        t1 = time.time()
        if self.accumulate:
            self._compute_grad = Accumulator(inputs, updates=updates)
        else:
            self._compute_grad = function(
                inputs,
                updates=updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
        if self.verbose:
            t2 = time.time()
            logger.debug('done. Took {0}'.format(t2 - t1))

        if self.verbose:
            logger.debug('batch gradient class compiling objective function')
        if self.accumulate:
            self.obj = Accumulator(inputs, obj)
        else:
            self.obj = function(inputs,
                                obj,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent.obj')

        if self.verbose:
            logger.debug('done')

        self.param_to_cache = OrderedDict()
        alpha = T.scalar(name='alpha')
        alpha.tag.test_value = np.cast[alpha.dtype](.01)
        cache_updates = OrderedDict()
        goto_updates = OrderedDict()
        for param in params:
            if param.name is None:
                param_name = 'anon_param'
            else:
                param_name = param.name
            cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
            self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                                 name=cache_name)
            cache_updates[self.param_to_cache[param]] = param
            cached = self.param_to_cache[param]
            g = self.param_to_grad_shared[param]
            if lr_scalers is not None and param in lr_scalers:
                scaled_alpha = alpha * lr_scalers[param]
            else:
                scaled_alpha = alpha
            mul = scaled_alpha * g
            diff = cached - mul
            goto_updates[param] = diff
        self._cache_values = function(
            [],
            updates=cache_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._cache_values')
        assert isinstance(param_constrainers, (list, tuple))
        for param_constrainer in param_constrainers:
            param_constrainer(goto_updates)
        self._goto_alpha = function([alpha],
                                    updates=goto_updates,
                                    mode=self.theano_function_mode,
                                    name='BatchGradientDescent._goto_alpha')

        norm = T.sqrt(
            sum([
                T.sqr(elem).sum()
                for elem in self.param_to_grad_shared.values()
            ]))
        norm.name = 'BatchGradientDescent.norm'
        normalize_grad_updates = OrderedDict()
        for grad_shared in self.param_to_grad_shared.values():
            normalize_grad_updates[grad_shared] = grad_shared / norm

        # useful for monitoring
        self.ave_grad_size = sharedX(0.)
        self.new_weight = sharedX(1.)
        normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (
            1. - self.new_weight) * self.ave_grad_size

        self._normalize_grad = function(
            [],
            norm,
            updates=normalize_grad_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._normalize_grad')

        if self.conjugate:
            grad_shared = self.param_to_grad_shared.values()

            grad_to_old_grad = OrderedDict()
            for elem in grad_shared:
                grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                                 'old_' + elem.name)

            self._store_old_grad = function(
                [norm],
                updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                     for g_ in grad_to_old_grad]),
                mode=self.theano_function_mode,
                name='BatchGradientDescent._store_old_grad')

            grad_ordered = list(grad_to_old_grad.keys())
            old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

            def dot_product(x, y):
                return sum([(x_elem * y_elem).sum()
                            for x_elem, y_elem in safe_zip(x, y)])

            beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                    (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
            assert beta_pr.ndim == 0

            beta = T.maximum(beta_pr, 0.)

            #beta_pr is the Polak-Ribiere formula for beta.
            #According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
            #but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
            #(ie, it is meant to revert to steepest descent when you have traveled far enough that
            #the objective function is behaving non-quadratically enough that the conjugate gradient
            #formulas aren't working anymore)

            #http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
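            #In symbols, with g_k the current gradient and g_{k-1} the previous
            #one: beta_PR = g_k.(g_k - g_{k-1}) / (g_{k-1}.g_{k-1}), and the
            #conjugate direction becomes d_k = -g_k + max(0, beta_PR) * d_{k-1}.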

            assert grad not in grad_to_old_grad

            make_conjugate_updates = [(g_, g_ + beta * grad_to_old_grad[g_])
                                      for g_ in grad_ordered]

            mode = self.theano_function_mode
            if mode is not None and hasattr(mode, 'record'):
                for v, u in make_conjugate_updates:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                            + var_descriptor(v) + '\n')
                    mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                            + var_descriptor(u) + '\n')

            self._make_conjugate = function(
                [],
                updates=make_conjugate_updates,
                mode=self.theano_function_mode,
                name='BatchGradientDescent._make_conjugate')

            if mode is not None and hasattr(mode, 'record'):
                for output in self._make_conjugate.maker.fgraph.outputs:
                    mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                            + var_descriptor(output) + '\n')

        if tol is None:
            if objective.dtype == "float32":
                self.tol = 1e-6
            else:
                self.tol = 3e-7
        else:
            self.tol = tol

        self.ave_step_size = sharedX(0.)
        self.ave_grad_mult = sharedX(0.)
Example #51
0
    def __init__(self,
            window_size,
            n_quadratic_filters,
            activation_function,
            reconstruction_cost_function,
            tie_weights=False,
#            _input,
#            _targ
            ):
        super(ConvolutionalMLP, self).__init__()

        #self.lr = module.Member(T.scalar())
        self.lr = (T.scalar())

        self.inputs = [T.dmatrix() for i in range(window_size)]
        self.targ = T.lvector()

        self.input_representations = []
        self.input_representations.append(QDAA(
                            input=self.inputs[0],
                            tie_weights=tie_weights,
                            n_quadratic_filters=n_quadratic_filters,
                            activation_function=activation_function,
                            reconstruction_cost_function = reconstruction_cost_function
                        )
        )

        for i in self.inputs[1:]:
            self.input_representations.append(
                            QDAA(
                                input=i,
                                tie_weights=tie_weights,
                                n_quadratic_filters=n_quadratic_filters,
                                activation_function=activation_function,
                                reconstruction_cost_function = reconstruction_cost_function,
                                _w1 = self.input_representations[0].w1,
                                _w2 = self.input_representations[0].w2,
                                _b1 = self.input_representations[0].b1,
                                _b2 = self.input_representations[0].b2,
                                _qfilters = self.input_representations[0].qfilters
                            )
            )
            assert self.input_representations[-1].w1 is \
                    self.input_representations[0].w1

        self.input_representation = T.concatenate(
            [i.hidden for i in self.input_representations], axis=1)
        self.hidden = QDAA(
                        input=self.input_representation,
                        tie_weights=tie_weights,
                        n_quadratic_filters=n_quadratic_filters,
                        activation_function=activation_function,
                        reconstruction_cost_function = reconstruction_cost_function
                    )
        self.output = Module_Nclass(x=self.hidden.hidden, targ=self.targ)

        input_pretraining_params = [
                        self.input_representations[0].w1,
                        self.input_representations[0].w2,
                        self.input_representations[0].b1,
                        self.input_representations[0].b2
                        ] + self.input_representations[0].qfilters
        hidden_pretraining_params = [
                        self.hidden.w1,
                        self.hidden.w2,
                        self.hidden.b1,
                        self.hidden.b2
                        ] + self.hidden.qfilters
        input_pretraining_cost = sum(i.ncost
                                     for i in self.input_representations)
        hidden_pretraining_cost = self.hidden.ncost
        input_pretraining_gradients = T.grad(input_pretraining_cost,
                input_pretraining_params)
        hidden_pretraining_gradients = T.grad(
            hidden_pretraining_cost, hidden_pretraining_params)
        pretraining_updates = \
                dict((p, p - self.lr * g) for p, g in \
                zip(input_pretraining_params, input_pretraining_gradients) \
                + zip(hidden_pretraining_params, hidden_pretraining_gradients))

        self.pretraining_update = module.Method(self.inputs,
                [input_pretraining_cost, hidden_pretraining_cost],
                pretraining_updates)

        finetuning_params = \
                        [self.input_representations[0].w1, self.input_representations[0].b1] + self.input_representations[0].qfilters + \
                        [self.hidden.w1, self.hidden.b1] + self.hidden.qfilters + \
                        [self.output.w, self.output.b]
        finetuning_cost = self.output.cost
        finetuning_gradients = T.grad(finetuning_cost, finetuning_params)
        finetuning_updates = dict((p, p - self.lr * g) for p,
             g in zip(finetuning_params, finetuning_gradients))
        self.finetuning_update = module.Method(self.inputs + [self.targ],
                                               self.output.cost,
                                               finetuning_updates)
Example #52
0
    #           = arg_i - log sum_j exp(arg_j)
    return example_costs.mean()


confidence = ymf1_arg - ((1 - yb) * ymf1_arg).max(axis=1).dimshuffle(0, 'x')
misclass_cost = -(confidence * yb).sum(axis=1).mean()


mf1_cost = - log_p_yb ( ymf1_arg) + \
             l1wd * T.sqr(mf1mod.W1).sum() +\
             l2wd * T.sqr(mf1mod.W2).sum() +\
             l3wd * T.sqr(mf1mod.W3).sum()

updates = {}

alpha = T.scalar()
alpha.tag.test_value = 1e-4

tv = T.scalar()
momentum = 1. - 1. / tv

for cost, params in [(mf1_cost, mf1mod.params())]:
    for param in params:
        inc = sharedX(np.zeros(param.get_value().shape))
        updates[inc] = momentum * inc - alpha * T.grad(cost, param)
        updates[param] = param + updates[inc]
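# i.e. the classic heavy-ball / momentum update, with momentum = 1 - 1/tv:
#   inc   <- momentum * inc - alpha * dC/dparam
#   param <- param + inc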

from theano import function

func = function([idx, alpha, tv], [mf1_cost], updates=updates)
# test value
x = np.eye(1000, dtype=theano.config.floatX)

tic = time.time()
A = compute_norm_lines(x)
print('It took %f seconds' % (time.time() - tic))

# comparison with numpy
tic = time.time()
B = np.sqrt((x**2).sum(1))
print('It took %f seconds' % (time.time() - tic))

print '-' * 50

coefficients = theano.tensor.vector("coefficients")
x = T.scalar("x")

max_coefficients_supported = 10000

# Generate the components of the polynomial
components, updates = theano.scan(
    fn=lambda coefficient, power, free_variable: coefficient *
    (free_variable**power),
    sequences=[coefficients,
               theano.tensor.arange(max_coefficients_supported)],
    non_sequences=x)

# Sum them up
polynomial = components.sum()

# Compile a function
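# A minimal sketch of the compilation step, plus a quick check on the
# polynomial 1 + 0*x + 2*x**2 at x = 3 (expected value 19.0).
calculate_polynomial = theano.function(inputs=[coefficients, x],
                                       outputs=polynomial)

test_coefficients = np.asarray([1, 0, 2], dtype=theano.config.floatX)
print calculate_polynomial(test_coefficients, 3)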
Example #54
0
    def __init__(self,
            input=None,
#            regularize = False,
            tie_weights=False,
            n_quadratic_filters=1,
            _w1=None,
            _w2=None,
            _b1=None,
            _b2=None,
            _qfilters=None,
            activation_function=NN.sigmoid,
            reconstruction_cost_function=cross_entropy):
        """
        :param input: WRITEME

        :param regularize: WRITEME

        :param tie_weights: WRITEME

        :param activation_function: WRITEME

        :param reconstruction_cost_function: Should return one cost per example (row)

        :todo: Default noise level for all daa levels

        """
        super(QuadraticDenoisingAA, self).__init__()

        self.random = T.RandomStreams()

        # MODEL CONFIGURATION
#        self.regularize = regularize
        self.tie_weights = tie_weights
        self.activation_function = activation_function
        self.reconstruction_cost_function = reconstruction_cost_function

        # ACQUIRE/MAKE INPUT
        if not input:
            input = T.matrix('input')
        #self.input = theano.External(input)
        self.input = (input)

        # HYPER-PARAMETERS
        #self.lr = theano.Member(T.scalar())
        self.lr = (T.scalar())

        # PARAMETERS
        if _qfilters is None:
            #self.qfilters = [theano.Member(T.dmatrix('q%i'%i)) for i in xrange(n_quadratic_filters)]
            self.qfilters = [(T.dmatrix('q%i' % i))
                 for i in xrange(n_quadratic_filters)]
        else:
            #self.qfilters = [theano.Member(q) for q in _qfilters]
            self.qfilters = [(q) for q in _qfilters]

        #self.w1 = theano.Member(T.matrix('w1')) if _w1 is None else theano.Member(_w1)
        if _w1 is None:
            self.w1 = (T.matrix('w1'))
        else:
            self.w1 = (_w1)
        if _w2 is None:
            if not tie_weights:
                #self.w2 = theano.Member(T.matrix())
                self.w2 = (T.matrix())
            else:
                self.w2 = self.w1.T
        else:
            #self.w2 = theano.Member(_w2)
            self.w2 = (_w2)
        #self.b1 = theano.Member(T.vector('b1')) if _b1 is None else theano.Member(_b1)
        if _b1 is None:
            self.b1 = (T.vector('b1'))
        else:
            self.b1 = (_b1)
        #self.b2 = theano.Member(T.vector('b2')) if _b2 is None else theano.Member(_b2)
        if _b2 is None:
            self.b2 = (T.vector('b2'))
        else:
            self.b2 = (_b2)

#        # REGULARIZATION COST
#        self.regularization = self.build_regularization()

        ### NOISELESS ###
        # HIDDEN LAYER
        def _act(x):
            if len(self.qfilters) > 0:
                qsum = 10e-10   # helps to control the gradient in the square-root below
                for qf in self.qfilters:
                    qsum = qsum + T.dot(x, qf) ** 2

                return T.dot(x, self.w1) + self.b1 + T.sqrt(qsum)
            else:
                return T.dot(x, self.w1) + self.b1

        self.hidden_activation = _act(self.input)  # noise-free hidden
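        # hidden_activation is W1.x + b1 plus, when quadratic filters are
        # present, sqrt(eps + sum_i dot(x, Q_i)**2) with eps = 10e-10, i.e. a
        # linear term plus the L2 norm of the quadratic filter responses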

        self.hidden = self.hid_activation_function(self.hidden_activation)

        # RECONSTRUCTION LAYER
        self.output_activation = T.dot(self.hidden, self.w2) + self.b2
        self.output = self.out_activation_function(self.output_activation)

        # RECONSTRUCTION COST
        self.reconstruction_costs = self.build_reconstruction_costs(self.output)
        self.reconstruction_cost = T.mean(self.reconstruction_costs)

        # TOTAL COST
        self.cost = self.reconstruction_cost
#        if self.regularize:
#            self.cost = self.cost + self.regularization

        ### WITH NOISE ###
        self.corrupted_input = self.build_corrupted_input()

        # HIDDEN LAYER
        self.nhidden_activation = _act(self.corrupted_input)
        self.nhidden = self.hid_activation_function(self.nhidden_activation)

        # RECONSTRUCTION LAYER
        self.noutput_activation = T.dot(self.nhidden, self.w2) + self.b2
        self.noutput = self.out_activation_function(self.noutput_activation)

        # RECONSTRUCTION COST
        self.nreconstruction_costs = self.build_reconstruction_costs(self.noutput)
        self.nreconstruction_cost = T.mean(self.nreconstruction_costs)

        # TOTAL COST
        self.ncost = self.nreconstruction_cost
#        if self.regularize:
#            self.ncost = self.ncost + self.regularization

        # GRADIENTS AND UPDATES
        if self.tie_weights:
            self.params = [self.w1, self.b1, self.b2] + self.qfilters
        else:
            self.params = [self.w1, self.w2, self.b1, self.b2] + self.qfilters

        gradients = T.grad(self.ncost, self.params)
        updates = dict((p, p - self.lr * g)
                       for p, g in zip(self.params, gradients))
Example #55
0
    def build(
            self,
            initial_stepsize,
            n_steps,
            target_acceptance_rate=.65,
            stepsize_dec=0.98,
            stepsize_min=0.0001,
            stepsize_max=0.5,
            stepsize_inc=1.02,
            # used in the geometric average of the acceptance rate;
            # 1.0 would mean not moving the average at all
            avg_acceptance_slowness=0.9,
            seed=12345,
            init_state=None):

        if init_state is None:
            init_h = np.random.normal(
                0, 1, size=[self.n_sam * self.batch_size,
                            self.hdim]).astype(np.float32)
        else:
            init_h = init_state
            print('load init_state')
        init_m = np.random.randn(self.n_sam * self.batch_size,
                                 self.hdim).astype(np.float32)

        # For HMC
        # h denotes current states
        self.h = sharedX(init_h)
        # m denotes momentum
        t = T.scalar()
        self.generated = self.generate(self.h)
        lld = T.reshape(-self.energy_fn(self.h), [self.n_sam, self.batch_size])
        self.eval_lld = theano.function([t],
                                        lld,
                                        givens={
                                            self.obs: self.obs_val,
                                            self.t: t
                                        })

        # allocate shared variables
        stepsize = sharedX(initial_stepsize)
        avg_acceptance_rate = sharedX(target_acceptance_rate)
        s_rng = TT.shared_randomstreams.RandomStreams(seed)

        # define graph for an `n_steps` HMC simulation
        accept, final_pos = hmc_move(s_rng, self.h, self.energy_fn, stepsize,
                                     n_steps)

        # define the dictionary of updates, to apply on every `simulate` call
        simulate_updates = hmc_updates(
            self.h,
            stepsize,
            avg_acceptance_rate,
            final_pos=final_pos,
            accept=accept,
            stepsize_min=stepsize_min,
            stepsize_max=stepsize_max,
            stepsize_inc=stepsize_inc,
            stepsize_dec=stepsize_dec,
            target_acceptance_rate=target_acceptance_rate,
            avg_acceptance_slowness=avg_acceptance_slowness)

        self.step = theano.function([t], [accept],
                                    updates=simulate_updates,
                                    givens={
                                        self.obs: self.obs_val,
                                        self.t: t
                                    })
Example #56
0
    def build_corrupted_input(self):
        #self.noise_level = theano.Member(T.scalar())
        self.noise_level = (T.scalar())
        return self.random.binomial(T.shape(self.input), 1, 1 - self.noise_level) * self.input
    def __init__(self,
                 n_x,
                 n_a,
                 n_z,
                 n_y,
                 qa_hid,
                 qz_hid,
                 qy_hid,
                 px_hid,
                 pa_hid,
                 nonlinearity=rectify,
                 px_nonlinearity=None,
                 x_dist='bernoulli',
                 batchnorm=False,
                 seed=1234):
        """
        Initialize a skip deep generative model consisting of
        discriminative classifier q(y|a,x),
        generative model P p(a|z,y) and p(x|a,z,y),
        inference model Q q(a|x) and q(z|a,x,y).
        Weights are initialized using the Glorot and Bengio (2010) initialization scheme.
        :param n_x: Number of inputs.
        :param n_a: Number of auxiliary.
        :param n_z: Number of latent.
        :param n_y: Number of classes.
        :param qa_hid: List of number of deterministic hidden q(a|x).
        :param qz_hid: List of number of deterministic hidden q(z|a,x,y).
        :param qy_hid: List of number of deterministic hidden q(y|a,x).
        :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y).
        :param nonlinearity: The transfer function used in the deterministic layers.
        :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'.
        :param batchnorm: Boolean value for batch normalization.
        :param seed: The random seed.
        """
        super(SDGM, self).__init__(n_x, qz_hid + px_hid, n_a + n_z,
                                   nonlinearity)
        self.x_dist = x_dist
        self.n_y = n_y
        self.n_x = n_x
        self.n_a = n_a
        self.n_z = n_z
        self.batchnorm = batchnorm
        self._srng = RandomStreams(seed)

        # Decide Glorot initialization of weights.
        init_w = 1e-3
        hid_w = ""
        if nonlinearity == rectify or nonlinearity == softplus:
            hid_w = "relu"

        # Define symbolic variables for theano functions.
        self.sym_beta = T.scalar('beta')  # scaling constant beta
        self.sym_x_l = T.matrix('x')  # labeled inputs
        self.sym_t_l = T.matrix('t')  # labeled targets
        self.sym_x_u = T.matrix('x')  # unlabeled inputs
        self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
        self.sym_samples = T.iscalar('samples')  # MC samples
        self.sym_z = T.matrix('z')  # latent variable z
        self.sym_a = T.matrix('a')  # auxiliary variable a

        # Assist methods for collecting the layers
        def dense_layer(layer_in,
                        n,
                        dist_w=init.GlorotNormal,
                        dist_b=init.Normal):
            dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w),
                               None)
            if batchnorm:
                dense = BatchNormLayer(dense)
            return NonlinearityLayer(dense, self.transf)

        def stochastic_layer(layer_in, n, samples, nonlin=None):
            mu = DenseLayer(layer_in, n, init.Normal(init_w),
                            init.Normal(init_w), nonlin)
            logvar = DenseLayer(layer_in, n, init.Normal(init_w),
                                init.Normal(init_w), nonlin)
            return SampleLayer(mu, logvar, eq_samples=samples,
                               iw_samples=1), mu, logvar

        # Input layers
        l_x_in = InputLayer((None, n_x))
        l_y_in = InputLayer((None, n_y))

        # Auxiliary q(a|x)
        l_qa_x = l_x_in
        for hid in qa_hid:
            l_qa_x = dense_layer(l_qa_x, hid)
        l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(
            l_qa_x, n_a, self.sym_samples)

        # Classifier q(y|a,x)
        l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_qy = ReshapeLayer(l_qa_to_qy,
                                  (-1, self.sym_samples, 1, qy_hid[0]))
        l_x_to_qy = DenseLayer(l_x_in, qy_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
        l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]),
                               (-1, qy_hid[0]))
        if batchnorm:
            l_qy_xa = BatchNormLayer(l_qy_xa)
        l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
        if len(qy_hid) > 1:
            for hid in qy_hid[1:]:
                l_qy_xa = dense_layer(l_qy_xa, hid)
        l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(),
                             init.Normal(init_w), softmax)

        # Recognition q(z|x,a,y)
        l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_qz = ReshapeLayer(l_qa_to_qz,
                                  (-1, self.sym_samples, 1, qz_hid[0]))
        l_x_to_qz = DenseLayer(l_x_in, qz_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
        l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
        l_qz_axy = ReshapeLayer(
            ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]),
            (-1, qz_hid[0]))
        if batchnorm:
            l_qz_axy = BatchNormLayer(l_qz_axy)
        l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
        if len(qz_hid) > 1:
            for hid in qz_hid[1:]:
                l_qz_axy = dense_layer(l_qz_axy, hid)
        l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(
            l_qz_axy, n_z, 1)

        # Generative p(a|z,y)
        l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
        l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qz_to_pa = ReshapeLayer(l_qz_to_pa,
                                  (-1, self.sym_samples, 1, pa_hid[0]))
        l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]),
                               [-1, pa_hid[0]])
        if batchnorm:
            l_pa_zy = BatchNormLayer(l_pa_zy)
        l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
        if len(pa_hid) > 1:
            for hid in pa_hid[1:]:
                l_pa_zy = dense_layer(l_pa_zy, hid)
        l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

        # Generative p(x|a,z,y)
        l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_px = ReshapeLayer(l_qa_to_px,
                                  (-1, self.sym_samples, 1, px_hid[0]))
        l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
        l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qz_to_px = ReshapeLayer(l_qz_to_px,
                                  (-1, self.sym_samples, 1, px_hid[0]))
        l_px_azy = ReshapeLayer(
            ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]),
            [-1, px_hid[0]])
        if batchnorm:
            l_px_azy = BatchNormLayer(l_px_azy)
        l_px_azy = NonlinearityLayer(l_px_azy, self.transf)
        if len(px_hid) > 1:
            for hid in px_hid[1:]:
                l_px_azy = dense_layer(l_px_azy, hid)

        if x_dist == 'bernoulli':
            l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(),
                                  init.Normal(init_w), sigmoid)
        elif x_dist == 'multinomial':
            l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(),
                                  init.Normal(init_w), softmax)
        elif x_dist == 'gaussian':
            l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(
                l_px_azy, n_x, 1, px_nonlinearity)

        # Reshape all the model layers to have the same size
        self.l_x_in = l_x_in
        self.l_y_in = l_y_in
        self.l_a_in = l_qa_x

        self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
        self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
        self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))

        self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
        self.l_qz_mu = ReshapeLayer(l_qz_axy_mu,
                                    (-1, self.sym_samples, 1, n_z))
        self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar,
                                        (-1, self.sym_samples, 1, n_z))

        self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))

        self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
        self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
        self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar,
                                        (-1, self.sym_samples, 1, n_a))

        self.l_px = ReshapeLayer(l_px_azy, (-1, self.sym_samples, 1, n_x))
        self.l_px_mu = ReshapeLayer(l_px_zy_mu,
                                    (-1, self.sym_samples, 1,
                                     n_x)) if x_dist == "gaussian" else None
        self.l_px_logvar = ReshapeLayer(
            l_px_zy_logvar,
            (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None

        # Predefined functions
        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_qy, self.sym_x_l,
                             deterministic=True).mean(axis=(1, 2))
        self.f_qy = theano.function(inputs, outputs)

        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_qa, self.sym_x_l,
                             deterministic=True).mean(axis=(1, 2))
        self.f_qa = theano.function(inputs, outputs)

        inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
        outputs = get_output(self.l_pa, inputs, deterministic=True)
        self.f_pa = theano.function(
            [self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        inputs = {
            l_qa_x: self.sym_a,
            l_qz_axy: self.sym_z,
            l_y_in: self.sym_t_l
        }
        outputs = get_output(self.l_px, inputs, deterministic=True)
        self.f_px = theano.function(
            [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        # Define model parameters
        self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
        self.trainable_model_params = get_all_params(
            [self.l_qy, self.l_pa, self.l_px], trainable=True)
    def __theano_build__(self):
        E, V, U, W, b, c, W_att, b_att = self.E, self.V, self.U, self.W, self.b , self.c, self.W_att, self.b_att

        x_a = T.ivector('x_a')
        x_b = T.ivector('x_b')
        y = T.lvector('y')

        def forward_direction_step(x_t,s_t_prev):
            # Word embedding layer
            x_e = E[:,x_t]
            # GRU layer 1
            z_t = T.nnet.hard_sigmoid(U[0].dot(x_e)+W[0].dot(s_t_prev)+b[0])
            r_t = T.nnet.hard_sigmoid(U[1].dot(x_e)+W[1].dot(s_t_prev)+b[1])
            c_t = T.tanh(U[2].dot(x_e)+W[2].dot(s_t_prev*r_t)+b[2])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t*s_t_prev
            # directly return the hidden state as intermediate output
            return [s_t]
        
        
        def backward_direction_step(x_t,s_t_prev):
            # Word embedding layer
            x_e = E[:,x_t]
            # GRU layer 2
            z_t = T.nnet.hard_sigmoid(U[3].dot(x_e)+W[3].dot(s_t_prev)+b[3])
            r_t = T.nnet.hard_sigmoid(U[4].dot(x_e)+W[4].dot(s_t_prev)+b[4])
            c_t = T.tanh(U[5].dot(x_e)+W[5].dot(s_t_prev*r_t)+b[5])
            s_t = (T.ones_like(z_t) - z_t) * c_t + z_t*s_t_prev
            # directly return the hidden state as intermediate output
            return [s_t]
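
        # Both step functions implement a standard GRU cell (with hard_sigmoid
        # as a cheap approximation of the logistic sigmoid):
        #   z_t = sigmoid(U_z.x_t + W_z.s_{t-1} + b_z)        update gate
        #   r_t = sigmoid(U_r.x_t + W_r.s_{t-1} + b_r)        reset gate
        #   c_t = tanh(U_c.x_t + W_c.(r_t * s_{t-1}) + b_c)   candidate state
        #   s_t = (1 - z_t) * c_t + z_t * s_{t-1}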



        # sentence a vector (states) forward direction 
        a_s_f , updates = theano.scan(
                forward_direction_step,
                sequences=x_a,
                truncate_gradient=self.bptt_truncate,
                outputs_info=T.zeros(self.hidden_dim))

        # sentence b vector (states) backward direction
        a_s_b , updates = theano.scan(
                backward_direction_step,
                sequences=x_a[::-1],
                truncate_gradient=self.bptt_truncate,
                outputs_info=T.zeros(self.hidden_dim))
            
        # sentence b vector (states) forward direction 
        b_s_f , updates = theano.scan(
                forward_direction_step,
                sequences=x_b,
                truncate_gradient=self.bptt_truncate,
                outputs_info=T.zeros(self.hidden_dim))
        
        # sentence b vector (states) backward direction 
        b_s_b , updates = theano.scan(
                backward_direction_step,
                sequences=x_b[::-1],
                truncate_gradient=self.bptt_truncate,
                outputs_info=T.zeros(self.hidden_dim))


        # concatenate forward and backward hidden states for each sentence
        a_s = T.concatenate([a_s_f,a_s_b[::-1]],axis=1)
        b_s = T.concatenate([b_s_f,b_s_b[::-1]],axis=1)

        def soft_attention(h_i):
            return T.tanh(W_att.dot(h_i)+b_att)
        
        def weight_attention(h_i,a_j):
            return h_i*a_j

        a_att, updates = theano.scan(
                soft_attention,
                sequences=a_s
                )
        b_att, updates = theano.scan(
                soft_attention,
                sequences=b_s
                )

        # softmax over the per-word attention scores
        # (a_att and b_att hold one score per token of sentence a / sentence b)
        a_att = T.exp(a_att)
        a_att = a_att.flatten()
        a_att = a_att / a_att.sum()

        b_att = T.exp(b_att)
        b_att = b_att.flatten()
        b_att = b_att / b_att.sum()

        a_s_att,updates = theano.scan(
                weight_attention,
                sequences=[a_s,a_att]
                )
        b_s_att,updates = theano.scan(
                weight_attention,
                sequences=[b_s,b_att]
                )
        # eps = np.asarray([1.0e-10]*self.label_dim,dtype=theano.config.floatX)

        # semantic similarity 
        # s_sim = manhattan_distance(a_s[-1],b_s[-1])

        # For classification, each sentence vector is an attention-weighted sum
        # of its word states (computed with theano.scan above):
        #   a(w_i)   = tanh(W_att.dot(w_i) + b_att)
        #   alpha_i  = exp(a(w_i)) / sum_j exp(a(w_j))
        #   sentence = sum_i alpha_i * w_i
        sena = a_s_att.sum(axis=0)
        senb = b_s_att.sum(axis=0)

        combined_s = T.concatenate([sena,senb],axis=0)




        # softmax class
        o = T.nnet.softmax(V.dot(combined_s)+c)[0]

        # in case o contains zeros, which would cause inf/nan in the cross-entropy
        eps = np.asarray([1.0e-10]*self.label_dim,dtype=theano.config.floatX)
        o = o + eps
        om = o.reshape((1,o.shape[0]))
        prediction = T.argmax(om,axis=1)
        o_error = T.nnet.categorical_crossentropy(om,y)


        # cost 
        cost = T.sum(o_error)

        # updates
        updates = sgd_updates_adadelta(norm=0,params=self.params,cost=cost)

        # monitor parameter
        mV = V * T.ones_like(V)
        mc = c * T.ones_like(c)
        mU = U * T.ones_like(U)
        mW = W * T.ones_like(W)

        gV = T.grad(cost,V)
        gc = T.grad(cost,c)
        gU = T.grad(cost,U)
        gW = T.grad(cost,W)

        mgV = gV * T.ones_like(gV)
        mgc = gc * T.ones_like(gc)
        mgU = gU * T.ones_like(gU)
        mgW = gW * T.ones_like(gW)




        # Assign functions
        self.comsen = theano.function([x_a,x_b],[a_att,b_att])
        self.monitor = theano.function([x_a,x_b],[sena,senb,mV,mc,mU,mW])
        self.monitor_grad = theano.function([x_a,x_b,y],[mgV,mgc,mgU,mgW])
        self.predict = theano.function([x_a,x_b],om)
        self.predict_class = theano.function([x_a,x_b],prediction)
        self.ce_error = theano.function([x_a,x_b,y],cost)
        # self.bptt = theano.function([x,y],[dE,dU,dW,db,dV,dc])

        # SGD-style hyperparameters; currently unused, since adadelta keeps its
        # own per-parameter step sizes
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # one adadelta step per (x_a, x_b, y) triple; re-enable NanGuardMode
        # below when hunting for NaNs
        self.sgd_step = theano.function(
                [x_a,x_b,y],
                [],
                updates=updates
                # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
                )
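        # Illustrative training loop (hypothetical names; only the compiled
        # functions above come from this class, the data handling is assumed):
        #   for x_a, x_b, y in training_pairs:
        #       model.sgd_step(x_a, x_b, y)           # one adadelta update
        #   probs = model.predict(x_a, x_b)           # class probabilities
        #   label = model.predict_class(x_a, x_b)     # argmax class index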
    def fit(self,
            X_train,
            Y_train,
            X_test=None,
            Y_test=None,
            validate_every=100,
            optimizer='sgd',
            compute_zero_one=False,
            show_norms=True,
            show_output=True):
        """ Fit model

        Pass in X_test, Y_test to compute test error and report during
        training.

        X_train : ndarray (T x n_in)
        Y_train : ndarray (T x n_out)

        validation_frequency : int
            in terms of number of epochs

        optimizer : string
            Optimizer type.
            Possible values:
                'sgd'  : batch stochastic gradient descent
                'cg'   : nonlinear conjugate gradient algorithm
                         (scipy.optimize.fmin_cg)
                'bfgs' : quasi-Newton method of Broyden, Fletcher, Goldfarb,
                         and Shanno (scipy.optimize.fmin_bfgs)
                'l_bfgs_b' : Limited-memory BFGS (scipy.optimize.fmin_l_bfgs_b)

        compute_zero_one : bool
            in the case of binary output, compute zero-one error in addition to
            cross-entropy error
        show_norms : bool
            Show L2 norms of individual parameter groups while training.
        show_output : bool
            Show the model output on first training case while training.
        """
        if X_test is not None:
            assert (Y_test is not None)
            self.interactive = True
            test_set_x, test_set_y = self.shared_dataset((X_test, Y_test))
        else:
            self.interactive = False

        train_set_x, train_set_y = self.shared_dataset((X_train, Y_train))

        if compute_zero_one:
            assert(self.output_type == 'binary' \
                   or self.output_type == 'softmax')
        # compute number of minibatches for training
        # note that cases are the second dimension, not the first
        n_train = train_set_x.get_value(borrow=True).shape[1]
        n_train_batches = int(np.ceil(1.0 * n_train / self.batch_size))
        if self.interactive:
            n_test = test_set_x.get_value(borrow=True).shape[1]
            n_test_batches = int(np.ceil(1.0 * n_test / self.batch_size))

        # validate_every is specified in terms of epochs
        validation_frequency = validate_every * n_train_batches

        ######################
        # BUILD ACTUAL MODEL #
        ######################
        logger.info('... building the model')

        index = T.lscalar('index')  # index to a [mini]batch
        n_ex = T.lscalar('n_ex')  # total number of examples
        # learning rate (may change)
        l_r = T.scalar('l_r', dtype=theano.config.floatX)
        mom = T.scalar('mom', dtype=theano.config.floatX)  # momentum

        cost = self.rnn.loss(self.y) \
            + self.L1_reg * self.rnn.L1 \
            + self.L2_reg * self.rnn.L2_sqr

        # Proper implementation of variable-batch size evaluation
        # Note that classifier.errors() returns the mean error
        # But the last batch may be a smaller size
        # So we keep around the effective_batch_size (whose last element may
        # be smaller than the rest)
        # And weight the reported error by the batch_size when we average
        # Also, by keeping batch_start and batch_stop as symbolic variables,
        # we make the theano function easier to read
        batch_start = index * self.batch_size
        batch_stop = T.minimum(n_ex, (index + 1) * self.batch_size)
        effective_batch_size = batch_stop - batch_start

        get_batch_size = theano.function(inputs=[index, n_ex],
                                         outputs=effective_batch_size)
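        # Example of the weighting scheme: with n_ex = 103 and batch_size = 20
        # the effective batch sizes are [20, 20, 20, 20, 20, 3]; averaging the
        # per-batch mean losses with these weights reproduces the exact mean
        # over all 103 examples, whereas an unweighted mean over batches would
        # overweight the 3 examples in the final batch.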

        compute_train_error = theano.function(
            inputs=[index, n_ex],
            outputs=self.rnn.loss(self.y),
            givens={
                self.x: train_set_x[:, batch_start:batch_stop],
                self.y: train_set_y[:, batch_start:batch_stop]
            },
            mode=mode)

        if compute_zero_one:
            compute_train_zo = theano.function(
                inputs=[index, n_ex],
                outputs=self.rnn.errors(self.y),
                givens={
                    self.x: train_set_x[:, batch_start:batch_stop],
                    self.y: train_set_y[:, batch_start:batch_stop]
                },
                mode=mode)

        if self.interactive:
            compute_test_error = theano.function(
                inputs=[index, n_ex],
                outputs=self.rnn.loss(self.y),
                givens={
                    self.x: test_set_x[:, batch_start:batch_stop],
                    self.y: test_set_y[:, batch_start:batch_stop]
                },
                mode=mode)

            if compute_zero_one:
                compute_test_zo = theano.function(
                    inputs=[index, n_ex],
                    outputs=self.rnn.errors(self.y),
                    givens={
                        self.x: test_set_x[:, batch_start:batch_stop],
                        self.y: test_set_y[:, batch_start:batch_stop]
                    },
                    mode=mode)

        self.get_norms = {}
        for param in self.rnn.params:
            self.get_norms[param] = theano.function(
                inputs=[], outputs=self.rnn.l2_norms[param], mode=mode)

        # compute the gradient of cost with respect to theta using BPTT
        gtheta = T.grad(cost, self.rnn.theta)

        if optimizer == 'sgd':

            updates = {}
            theta = self.rnn.theta
            theta_update = self.rnn.theta_update
            # careful here, update to the shared variable
            # cannot depend on an updated other shared variable
            # since updates happen in parallel
            # so we need to be explicit
            upd = mom * theta_update - l_r * gtheta
            updates[theta_update] = upd
            updates[theta] = theta + upd
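            # Equivalent update rule (classical momentum):
            #   v_t     = mom * v_{t-1} - l_r * grad
            #   theta_t = theta_{t-1} + v_t
            # theta_update holds v, so the velocity persists between minibatches.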

            # compiling a Theano function `train_model` that returns the
            # cost, but in the same time updates the parameter of the
            # model based on the rules defined in `updates`
            train_model = theano.function(
                inputs=[index, n_ex, l_r, mom],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[:, batch_start:batch_stop],
                    self.y: train_set_y[:, batch_start:batch_stop]
                },
                mode=mode)

            ###############
            # TRAIN MODEL #
            ###############
            logger.info('... training')
            epoch = 0

            while (epoch < self.n_epochs):
                epoch = epoch + 1
                effective_momentum = self.final_momentum \
                                     if epoch > self.momentum_switchover \
                                     else self.initial_momentum

                for minibatch_idx in xrange(n_train_batches):
                    minibatch_avg_cost = train_model(minibatch_idx, n_train,
                                                     self.learning_rate,
                                                     effective_momentum)

                    # iteration number (how many weight updates have we made?)
                    # epoch is 1-based, index is 0 based
                    iter = (epoch - 1) * n_train_batches + minibatch_idx + 1

                    if iter % validation_frequency == 0:
                        # compute loss on training set
                        train_losses = [
                            compute_train_error(i, n_train)
                            for i in xrange(n_train_batches)
                        ]
                        train_batch_sizes = [
                            get_batch_size(i, n_train)
                            for i in xrange(n_train_batches)
                        ]

                        this_train_loss = np.average(train_losses,
                                                     weights=train_batch_sizes)

                        if compute_zero_one:
                            train_zero_one = [
                                compute_train_zo(i, n_train)
                                for i in xrange(n_train_batches)
                            ]

                            this_train_zero_one = np.average(
                                train_zero_one, weights=train_batch_sizes)

                        if self.interactive:
                            test_losses = [
                                compute_test_error(i, n_test)
                                for i in xrange(n_test_batches)
                            ]

                            test_batch_sizes = [
                                get_batch_size(i, n_test)
                                for i in xrange(n_test_batches)
                            ]

                            this_test_loss = np.average(
                                test_losses, weights=test_batch_sizes)

                            if compute_zero_one:
                                test_zero_one = [
                                    compute_test_zo(i, n_test)
                                    for i in xrange(n_test_batches)
                                ]

                                this_test_zero_one = np.average(
                                    test_zero_one, weights=test_batch_sizes)

                            if compute_zero_one:
                                logger.info('epoch %i, mb %i/%i, tr loss %f, '
                                            'tr zo %f, te loss %f '
                                            'te zo %f lr: %f' % \
                                        (epoch, minibatch_idx + 1,
                                         n_train_batches,
                                         this_train_loss, this_train_zero_one,
                                         this_test_loss, this_test_zero_one,
                                         self.learning_rate))
                            else:
                                logger.info('epoch %i, mb %i/%i, tr loss %f '
                                            'te loss %f lr: %f' % \
                                (epoch, minibatch_idx + 1, n_train_batches,
                                 this_train_loss, this_test_loss,
                                 self.learning_rate))

                        else:
                            if compute_zero_one:
                                logger.info(
                                    'epoch %i, mb %i/%i, train loss %f'
                                    ' train zo %f '
                                    'lr: %f' %
                                    (epoch, minibatch_idx + 1, n_train_batches,
                                     this_train_loss, this_train_zero_one,
                                     self.learning_rate))
                            else:
                                logger.info(
                                    'epoch %i, mb %i/%i, train loss %f'
                                    ' lr: %f' %
                                    (epoch, minibatch_idx + 1, n_train_batches,
                                     this_train_loss, self.learning_rate))

                        self.optional_output(train_set_x, show_norms,
                                             show_output)

                self.learning_rate *= self.learning_rate_decay

                if self.snapshot_every is not None:
                    if (epoch + 1) % self.snapshot_every == 0:
                        date_obj = datetime.datetime.now()
                        date_str = date_obj.strftime('%Y-%m-%d-%H:%M:%S')
                        class_name = self.__class__.__name__
                        fname = '%s.%s-snapshot-%d.pkl' % (class_name,
                                                           date_str, epoch + 1)
                        fabspath = os.path.join(self.snapshot_path, fname)
                        self.save(fpath=fabspath)

        elif optimizer == 'cg' or optimizer == 'bfgs' \
                 or optimizer == 'l_bfgs_b':
            # compile a theano function that returns the cost of a minibatch
            batch_cost = theano.function(
                inputs=[index, n_ex],
                outputs=cost,
                givens={
                    self.x: train_set_x[:, batch_start:batch_stop],
                    self.y: train_set_y[:, batch_start:batch_stop]
                },
                mode=mode,
                name="batch_cost")

            # compile a theano function that returns the gradient of the
            # minibatch with respect to theta
            batch_grad = theano.function(
                inputs=[index, n_ex],
                outputs=T.grad(cost, self.rnn.theta),
                givens={
                    self.x: train_set_x[:, batch_start:batch_stop],
                    self.y: train_set_y[:, batch_start:batch_stop]
                },
                mode=mode,
                name="batch_grad")

            # creates a function that computes the average cost on the training
            # set
            def train_fn(theta_value):
                self.rnn.theta.set_value(theta_value, borrow=True)
                train_losses = [
                    batch_cost(i, n_train) for i in xrange(n_train_batches)
                ]
                train_batch_sizes = [
                    get_batch_size(i, n_train) for i in xrange(n_train_batches)
                ]
                return np.average(train_losses, weights=train_batch_sizes)

            # creates a function that computes the average gradient of cost
            # with respect to theta
            def train_fn_grad(theta_value):
                self.rnn.theta.set_value(theta_value, borrow=True)

                train_grads = [
                    batch_grad(i, n_train) for i in xrange(n_train_batches)
                ]
                train_batch_sizes = [
                    get_batch_size(i, n_train) for i in xrange(n_train_batches)
                ]

                return np.average(train_grads,
                                  weights=train_batch_sizes,
                                  axis=0)

            # validation function, prints useful output after each iteration
            def callback(theta_value):
                self.epoch += 1
                if (self.epoch) % validate_every == 0:
                    self.rnn.theta.set_value(theta_value, borrow=True)
                    # compute loss on training set
                    train_losses = [
                        compute_train_error(i, n_train)
                        for i in xrange(n_train_batches)
                    ]
                    train_batch_sizes = [
                        get_batch_size(i, n_train)
                        for i in xrange(n_train_batches)
                    ]

                    this_train_loss = np.average(train_losses,
                                                 weights=train_batch_sizes)

                    if compute_zero_one:
                        train_zero_one = [
                            compute_train_zo(i, n_train)
                            for i in xrange(n_train_batches)
                        ]

                        this_train_zero_one = np.average(
                            train_zero_one, weights=train_batch_sizes)

                    if self.interactive:
                        test_losses = [
                            compute_test_error(i, n_test)
                            for i in xrange(n_test_batches)
                        ]

                        test_batch_sizes = [
                            get_batch_size(i, n_test)
                            for i in xrange(n_test_batches)
                        ]

                        this_test_loss = np.average(test_losses,
                                                    weights=test_batch_sizes)

                        if compute_zero_one:
                            test_zero_one = [
                                compute_test_zo(i, n_test)
                                for i in xrange(n_test_batches)
                            ]

                            this_test_zero_one = np.average(
                                test_zero_one, weights=test_batch_sizes)

                        if compute_zero_one:
                            logger.info('epoch %i, tr loss %f, '
                                        'tr zo %f, te loss %f '
                                            'te zo %f' % \
                                        (self.epoch, this_train_loss,
                                         this_train_zero_one, this_test_loss,
                                         this_test_zero_one))
                        else:
                            logger.info('epoch %i, tr loss %f, te loss %f' % \
                                        (self.epoch, this_train_loss,
                                         this_test_loss))

                    else:
                        if compute_zero_one:
                            logger.info('epoch %i, train loss %f'
                                        ', train zo %f ' % \
                                        (self.epoch, this_train_loss,
                                         this_train_zero_one))
                        else:
                            logger.info('epoch %i, train loss %f ' % \
                                        (self.epoch, this_train_loss))

                    self.optional_output(train_set_x, show_norms, show_output)

            ###############
            # TRAIN MODEL #
            ###############
            logger.info('... training')
            # using scipy conjugate gradient optimizer
            import scipy.optimize
            if optimizer == 'cg':
                of = scipy.optimize.fmin_cg
            elif optimizer == 'bfgs':
                of = scipy.optimize.fmin_bfgs
            elif optimizer == 'l_bfgs_b':
                of = scipy.optimize.fmin_l_bfgs_b
            logger.info("Optimizing using %s..." % of.__name__)
            start_time = time.clock()

            # keep track of epochs externally
            # these get updated through callback
            self.epoch = 0

            # interface to l_bfgs_b is different than that of cg, bfgs
            # however, this will be changed in scipy 0.11
            # unified under scipy.optimize.minimize
            if optimizer == 'cg' or optimizer == 'bfgs':
                best_theta = of(
                    f=train_fn,
                    x0=self.rnn.theta.get_value(),
                    # x0=np.zeros(self.rnn.theta.get_value().shape,
                    #             dtype=theano.config.floatX),
                    fprime=train_fn_grad,
                    callback=callback,
                    disp=1,
                    retall=1,
                    maxiter=self.n_epochs)
            elif optimizer == 'l_bfgs_b':
                best_theta, f_best_theta, info = of(
                    func=train_fn,
                    x0=self.rnn.theta.get_value(),
                    fprime=train_fn_grad,
                    iprint=validate_every,
                    maxfun=self.n_epochs)  # max number of feval

            end_time = time.clock()

            print "Optimization time: %f" % (end_time - start_time)

        else:
            raise NotImplementedError
    def build_model(self,
                    train_set_unlabeled,
                    train_set_labeled,
                    test_set,
                    validation_set=None):
        """
        Build the auxiliary deep generative model from the initialized hyperparameters.
        Define the lower bound term and compile it into a training function.
        :param train_set_unlabeled: Unlabeled train set containing variables x, t.
        :param train_set_labeled: Labeled train set containing variables x, t.
        :param test_set: Test set containing variables x, t.
        :param validation_set: Validation set containing variables x, t.
        :return: train, test, validation function and dicts of arguments.
        """
        super(SDGM, self).build_model(train_set_unlabeled, test_set,
                                      validation_set)

        sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0],
                                                dtype=theano.config.floatX),
                                     borrow=True)
        sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1],
                                                dtype=theano.config.floatX),
                                     borrow=True)
        n = self.sh_train_x.shape[0].astype(
            theano.config.floatX)  # no. of data points
        n_l = sh_train_x_l.shape[0].astype(
            theano.config.floatX)  # no. of labeled data points

        # Define the layers for the density estimation used in the lower bound.
        l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu,
                                           self.l_qa_logvar)
        l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                           self.l_qz_logvar)
        l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)

        l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
        l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu,
                                           self.l_pa_logvar)
        if self.x_dist == 'bernoulli':
            l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
        elif self.x_dist == 'multinomial':
            l_log_px = MultinomialLogDensityLayer(self.l_px, self.l_x_in)
        elif self.x_dist == 'gaussian':
            l_log_px = GaussianLogDensityLayer(self.l_x_in, self.l_px_mu,
                                               self.l_px_logvar)

        def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
            lb = log_px + log_py + log_pz + log_pa - log_qa - log_qz
            return lb

        # Lower bound for labeled data
        out_layers = [
            l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy
        ]
        inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out
        # Prior p(y) expecting that all classes are evenly distributed
        py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
        log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l,
                           log_py_l, log_px_zy_l)
        lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
        # Scale the supervised cross entropy with the beta constant; the n/n_l
        # factor rescales the labeled contribution to the size of the full data set.
        log_qy_ax_l *= self.sym_beta * (n / n_l)
        # Collect the lower bound term and mean over the sampling dimensions.
        lb_l += log_qy_ax_l.mean(axis=(1, 2))

        # Lower bound for unlabeled data
        bs_u = self.sym_x_u.shape[0]

        # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y
        # Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form
        #               x_repeat                     t_repeat
        #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
        t_eye = T.eye(self.n_y, k=0)
        t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u,
                                                            axis=1).reshape(
                                                                (-1, self.n_y))
        x_u = self.sym_x_u.reshape(
            (1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape(
                (-1, self.n_x))
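        # Resulting shapes: t_u is (n_y * bs_u, n_y) and x_u is (n_y * bs_u, n_x),
        # aligned so every unlabeled example is paired with every possible class,
        # exactly as in the example matrices sketched above.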

        # Since the expectation of var a is outside the integration we calculate E_q(a|x) first
        a_x_u = get_output(self.l_qa,
                           self.sym_x_u,
                           batch_norm_update_averages=True,
                           batch_norm_use_averages=False)
        a_x_u_rep = a_x_u.reshape(
            (1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y,
                                                           axis=0).reshape(
                                                               (-1, self.n_a))
        out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
        inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out
        # Prior p(y) expecting that all classes are evenly distributed
        py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
        log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u,
                           log_py_u, log_px_zy_u)
        lb_u = lb_u.reshape(
            (self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2))
        inputs = {
            self.l_x_in: self.sym_x_u,
            self.l_a_in: a_x_u.reshape((-1, self.n_a))
        }
        y_u = get_output(self.l_qy,
                         inputs,
                         batch_norm_update_averages=True,
                         batch_norm_use_averages=False).mean(axis=(1, 2))
        y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
        y_u /= T.sum(y_u, axis=1, keepdims=True)
        lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)
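        # This is the usual marginalisation over the unknown label:
        #   lb_u = sum_y q(y|a,x) * L(x,y) + H(q(y|a,x)),
        # i.e. the expected bound under q(y|a,x) plus the entropy of q(y|a,x).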

        if self.batchnorm:
            # TODO: implement the BN layer correctly.
            inputs = {
                self.l_x_in: self.sym_x_u,
                self.l_y_in: y_u,
                self.l_a_in: a_x_u
            }
            get_output(out_layers,
                       inputs,
                       weighting=None,
                       batch_norm_update_averages=True,
                       batch_norm_use_averages=False)

        # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
        weight_priors = 0.0
        for p in self.trainable_model_params:
            if 'W' not in str(p):
                continue
            weight_priors += log_normal(p, 0, 1).sum()

        # Collect the lower bound and scale it with the weight priors.
        elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
        lb_labeled = -lb_l.mean()
        lb_unlabeled = -lb_u.mean()

        grads_collect = T.grad(elbo, self.trainable_model_params)
        params_collect = self.trainable_model_params
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        clip_grad, max_norm = 1, 5
        mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
        mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
        updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1,
                       sym_beta2)
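        # Gradient conditioning: rescale the full gradient so its global L2 norm
        # is at most max_norm (5), then clip each element to
        # [-clip_grad, clip_grad] = [-1, 1] before the Adam update.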

        # Training function
        indices = self._srng.choice(size=[self.sym_bs_l],
                                    a=sh_train_x_l.shape[0],
                                    replace=False)
        x_batch_l = sh_train_x_l[indices]
        t_batch_l = sh_train_t_l[indices]
        x_batch_u = self.sh_train_x[self.batch_slice]
        if self.x_dist == 'bernoulli':  # Sample bernoulli input.
            x_batch_u = self._srng.binomial(size=x_batch_u.shape,
                                            n=1,
                                            p=x_batch_u,
                                            dtype=theano.config.floatX)
            x_batch_l = self._srng.binomial(size=x_batch_l.shape,
                                            n=1,
                                            p=x_batch_l,
                                            dtype=theano.config.floatX)

        givens = {
            self.sym_x_l: x_batch_l,
            self.sym_x_u: x_batch_u,
            self.sym_t_l: t_batch_l
        }
        inputs = [
            self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
            self.sym_lr, sym_beta1, sym_beta2, self.sym_samples
        ]
        outputs = [elbo, lb_labeled, lb_unlabeled]
        f_train = theano.function(inputs=inputs,
                                  outputs=outputs,
                                  givens=givens,
                                  updates=updates)

        # Default training args. Note that these can be changed during or prior to training.
        self.train_args['inputs']['batchsize_unlabeled'] = 100
        self.train_args['inputs']['batchsize_labeled'] = 100
        self.train_args['inputs']['beta'] = 0.1
        self.train_args['inputs']['learningrate'] = 3e-4
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['inputs']['samples'] = 1
        self.train_args['outputs']['lb'] = '%0.4f'
        self.train_args['outputs']['lb-labeled'] = '%0.4f'
        self.train_args['outputs']['lb-unlabeled'] = '%0.4f'

        # Validation and test function
        y = get_output(self.l_qy, self.sym_x_l,
                       deterministic=True).mean(axis=(1, 2))
        class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
        givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
        f_test = theano.function(inputs=[self.sym_samples],
                                 outputs=[class_err],
                                 givens=givens)

        # Test args.  Note that these can be changed during or prior to training.
        self.test_args['inputs']['samples'] = 1
        self.test_args['outputs']['test'] = '%0.2f%%'

        f_validate = None
        if validation_set is not None:
            givens = {
                self.sym_x_l: self.sh_valid_x,
                self.sym_t_l: self.sh_valid_t
            }
            f_validate = theano.function(inputs=[self.sym_samples],
                                         outputs=[class_err],
                                         givens=givens)
        # Default validation args. Note that these can be changed during or prior to training.
        self.validate_args['inputs']['samples'] = 1
        self.validate_args['outputs']['validation'] = '%0.2f%%'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args