Example #1
0
    def __call__(self, model, X):
        """Build the noise-contrastive estimation cost for the batch X.

        Noise examples are drawn from `self.noise`, `self.noise_per_clean`
        of them per row of X.  Both sets are scored with `self.G`.  The
        result is based on equation 3 of the paper, negated because the
        paper maximizes that quantity while this cost is minimized.

        :param model: model object forwarded unchanged to `self.G`
        :param X: symbolic batch of data; its first dimension is taken as
            the number of examples
        :return: symbolic scalar named 'NCE(<name of X>)'
        """
        num_noise = X.shape[0] * self.noise_per_clean
        Y = self.noise.random_design_matrix(num_noise)

        # -softplus(-g) is log(sigmoid(g)); -softplus(g) is log(1 - sigmoid(g)).
        data_term = T.mean(-T.nnet.softplus(-self.G(X, model)))
        noise_term = T.mean(-T.nnet.softplus(self.G(Y, model)))

        rval = -data_term - noise_term

        # Fall back to a generic label when the input variable is unnamed.
        label = 'X' if X.name is None else X.name
        rval.name = 'NCE(' + label + ')'

        return rval
Example #2
0
    def __init__(self, x, y, in_size, out_size, prefix='lr_'):
        """Set up a softmax classification layer on top of x.

        :param x: symbolic input, multiplied on the right by W
        :param y: symbolic class indices, used to pick one probability per
            row and to compute the error rate
        :param in_size: number of rows of W
        :param out_size: number of columns of W and length of b
        :param prefix: string prepended to the keys of `self.params`
        """
        float_type = theano.config.floatX
        bound = np.sqrt(6. / (in_size + out_size))

        # W is drawn before b, so the order of draws from NumPy's global RNG
        # stays fixed.
        w_init = np.random.uniform(low=-bound, high=bound,
                                   size=(in_size, out_size))
        self.W = theano.shared(value=w_init.astype(float_type),
                               name='W', borrow=True)

        b_init = np.random.uniform(low=-bound, high=bound, size=(out_size,))
        self.b = theano.shared(value=b_init.astype(float_type),
                               name='b', borrow=True)

        # Class probabilities and the most probable class per row.
        self.y_given_x = T.nnet.softmax(T.dot(x, self.W) + self.b)
        self.y_d = T.argmax(self.y_given_x, axis=1)

        # Mean negative log-probability assigned to the target class.
        log_probs = T.log(self.y_given_x)
        self.loss = -T.mean(log_probs[T.arange(y.shape[0]), y])

        # Fraction of rows whose predicted class differs from the target.
        self.error = T.mean(T.neq(self.y_d, y))

        self.params = {prefix + 'W': self.W, prefix + 'b': self.b}
Example #3
0
File: rbm.py Project: MarcCote/iRBM
    def get_updates(self, v):
        """Assemble the updates for one contrastive-divergence training step.

        :param v: symbolic visible batch, used as the start of the chain
        :return: OrderedDict holding the updates returned by `self.CD`, the
            updates returned by `self.learning_rate`, and one gradient step
            per parameter
        """
        # Contrastive divergence: run the chain starting from the data.
        chain_end, updates_CD = self.CD(self, chain_start=v, cdk=self.CDk)

        # Free-energy gap between the data and the end of the chain, plus
        # the regularization term.
        data_energy = T.mean(self.free_energy(v), axis=0)
        chain_energy = T.mean(self.free_energy(chain_end), axis=0)
        cost = data_energy - chain_energy
        cost += self.regularization

        # chain_end is held constant so the gradient does not flow through
        # the Gibbs sampling.
        gparams = T.grad(cost, self.parameters, consider_constant=[chain_end])
        gradients = dict(zip(self.parameters, gparams))

        # Per-parameter learning rates derived from the gradients.
        lr, updates_lr = self.learning_rate(gradients)

        updates = OrderedDict()
        for extra_updates in (updates_CD, updates_lr):
            updates.update(extra_updates)

        # Plain gradient step for every parameter.
        for param, gparam in gradients.items():
            updates[param] = param - lr[param] * gparam

        return updates
Example #4
0
    def get_cost_updates(self, contraction_level, learning_rate, cost_measure="cross_entropy"):
        """Compute the cost and the updates for one training step of the cA.

        :param contraction_level: weight of the Jacobian penalty in the cost
        :param learning_rate: step size of the gradient-descent updates
        :param cost_measure: reconstruction loss to use, either
            "cross_entropy" or "euclidean"
        :return: tuple (cost, updates) where `updates` is a list of
            (param, new_value) pairs
        :raises ValueError: if `cost_measure` is not one of the two
            supported names
        """
        # Reject unknown names up front: otherwise self.L_rec would be left
        # unset (or stale from an earlier call) and the failure would surface
        # later in a much less obvious place.
        if cost_measure not in ("cross_entropy", "euclidean"):
            raise ValueError(
                "Unknown cost_measure: %r (expected 'cross_entropy' or "
                "'euclidean')" % (cost_measure,))

        y = self.get_hidden_values(self.x)
        z = self.get_reconstructed_input(y)
        J = self.get_jacobian(y, self.W)

        # Reconstruction loss: summed over axis 1, then averaged.
        if cost_measure == "cross_entropy":
            self.L_rec = T.mean(- T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1 - z), axis=1))
        else:
            self.L_rec = T.mean(T.sum((self.x - z) ** 2, axis=1))

        # Squared Jacobian entries, averaged over the number of
        # samples/minibatch.
        self.L_jacob = T.mean(T.sum(J ** 2) / self.n_batchsize)

        cost = self.L_rec + contraction_level * self.L_jacob

        # Gradients of the cost of the `cA` with respect to its parameters,
        # turned into plain gradient-descent updates.
        gparams = T.grad(cost, self.params)
        updates = [(param, param - learning_rate * gparam)
                   for param, gparam in zip(self.params, gparams)]

        return (cost, updates)
Example #5
0
    def get_lossfun(self, l1, l2):
        """
        Generate a loss function

        The loss is the mean negative log-likelihood when the last layer of
        `self.ff_net` is a softmax, and the mean squared error otherwise.
        Optional L1/L2 penalties are added on top.

        :param l1: weight of L1 term, None for no L1 term
        :param l2: weight of L2 term, None for no L2 term
        :return: symbolic loss expression
        """
        ends_in_softmax = self.ff_net.layers[-1].activation_name == 'softmax'
        output = self.ff_net.get_learning_passthrough(self.x)

        if ends_in_softmax:
            # minimize negative log-likelihood of the target entries
            q = -T.mean(T.log(output)[T.arange(self.y.shape[0]), self.y])
        else:
            # minimize error function
            q = T.mean((output - self.y) ** 2)

        for weight, term_name in ((l1, 'l1'), (l2, 'l2')):
            if weight is None:
                continue
            try:
                q = q + getattr(self.ff_net, term_name) * weight
            except AttributeError:
                # An AttributeError while adding the term means the term is
                # skipped silently.
                pass

        return q
Example #6
0
 def ml_cost(self, pos_v, neg_v):
     """Build the cost as mean free energy of pos_v minus that of neg_v.

     :param pos_v: positive-phase visible batch
     :param neg_v: negative-phase samples; only the first
         `self.batch_size` rows enter the cost
     :return: `costmod.Cost` wrapping the cost, the model parameters and
         the list [pos_v, neg_v]
     """
     pos_term = T.mean(self.free_energy_v(pos_v))
     # Only the temperature 1 samples are used to compute the gradient.
     neg_v_first = neg_v[:self.batch_size]
     neg_term = T.mean(self.free_energy_v(neg_v_first))
     return costmod.Cost(pos_term - neg_term, self.params(), [pos_v, neg_v])
Example #7
0
 def _test_layer_stats(self, layer_output):
     """
     DESCRIPTION:
         Called for every batch of the test or valid set; the final result
         is the mean of the per-batch results over an epoch.
     PARAM:
         layer_output: the output from the layer
     RETURN:
         A list of tuples [('name_a', var_a), ('name_b', var_b)] in which
         every var is a scalar
     """
     # Euclidean length of every column of W.
     col_lengths = T.sqrt((self.W ** 2).sum(axis=0))

     stats = [('max_col_length', T.max(col_lengths)),
              ('mean_col_length', T.mean(col_lengths)),
              ('min_col_length', T.min(col_lengths))]

     stats.extend([('output_max', T.max(layer_output)),
                   ('output_mean', T.mean(layer_output)),
                   ('output_min', T.min(layer_output))])

     # max/mean/min of the raw parameters, W first and then b.
     for label, tensor in (('W', self.W), ('b', self.b)):
         stats.append(('max_' + label, T.max(tensor)))
         stats.append(('mean_' + label, T.mean(tensor)))
         stats.append(('min_' + label, T.min(tensor)))

     return stats
Example #8
0
    def get_cost_updates(self, learning_rate, lam = 0.0001, beta=3, rho = 0.1):
        """
        Compute the cost and the updates for one training step of the
        autoencoder.

        :type learning_rate: scalar
        :param learning_rate: rate which weighs the gradient step

        :type lam: scalar
        :param lam: regularization parameter for the cost function

        :param beta: weight meant for the sparsity term; not used while that
            term is left out of the cost
        :param rho: target mean hidden activation of the sparsity term

        :rtype: pair (cost, updates)
        :return: the symbolic cost and a list of (param, new_value) pairs
        """
        hidden = self.get_hidden_values(self.X)
        recon = self.get_output(hidden)

        # Sparsity proxy |rho - mean(h)|.  It is built but deliberately not
        # added to the cost (the `+ beta*KL` term is disabled).
        KL = T.abs_(rho - T.mean(hidden))

        # Mean squared reconstruction error plus weight decay on both
        # weight matrices.
        weight_decay = (self.Wvis ** 2).sum() + (self.Whid ** 2).sum()
        recon_error = T.mean((recon - self.X) ** 2)
        cost = 0.5*recon_error + 0.5*lam*weight_decay

        # Plain gradient-descent updates.
        gparams = T.grad(cost, self.params)
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - learning_rate * gparam))

        return cost, updates
def batchnorm(X, rescale=None, reshift=None, u=None, s=None, e=1e-8):
    """
    Batch-normalize X, with optional scale/shift parameters and optional
    fixed statistics (u and s) for inference.

    The convolutional version (4-D input, statistics per index of axis 1)
    or the fully connected version (2-D input, statistics per column) is
    chosen from `X.ndim`.

    :param X: symbolic tensor with 2 or 4 dimensions
    :param rescale: optional scale parameter, applied as exp(0.2 * rescale)
    :param reshift: optional shift parameter; scale and shift are applied
        only when both are given
    :param u: optional precomputed mean
    :param s: optional precomputed variance; the fixed statistics are used
        only when both `u` and `s` are given, otherwise both are computed
        from X
    :param e: small constant added to the variance before the square root
    :return: the normalized (and optionally rescaled/reshifted) tensor
    :raises NotImplementedError: if X has neither 2 nor 4 dimensions
    """
    g = rescale
    b = reshift
    # Same rule for both layouts: a lone `u` or `s` is ignored and the
    # statistics come from the batch.  The 2-D branch used to crash on a
    # None operand in that case.
    use_given_stats = u is not None and s is not None
    apply_affine = g is not None and b is not None

    if X.ndim == 4:
        if use_given_stats:
            # use normalization params given a priori
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            # compute normalization params from input
            b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        # batch normalize
        X = (X - b_u) / T.sqrt(b_s + e)
        if apply_affine:
            # apply rescale and reshift
            X = X*T.exp(0.2*g.dimshuffle('x', 0, 'x', 'x')) + b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        if not use_given_stats:
            # compute normalization params from input
            u = T.mean(X, axis=0)
            s = T.mean(T.sqr(X - u), axis=0)
        # batch normalize
        X = (X - u) / T.sqrt(s + e)
        if apply_affine:
            # apply rescale and reshift
            X = X*T.exp(0.2*g) + b
    else:
        raise NotImplementedError
    return X
Example #10
0
    def _get_cost_update(self, lr=0.1, persistent=None, k=1):
        """Build the monitoring cost and the updates of one CD-k step.

        :param lr: learning rate, cast to floatX before use
        :param persistent: accepted but not used by this implementation
        :param k: number of Gibbs steps run by the scan
        :return: tuple (monitoring_cost, updates); `updates` is the update
            dictionary returned by the scan, extended with one
            gradient-descent entry per parameter
        """
        # Positive phase: the chain starts from the data.
        chain_start = self.x

        scan_outputs, updates = theano.scan(
            self._gibbs_vhv,
            outputs_info=[None, None, chain_start],
            n_steps=k)
        pre_sigmoid_nvs, nv_means, nv_samples = scan_outputs

        # Only the sample at the end of the chain is needed.
        chain_end = nv_samples[-1]

        # Contrastive loss: free-energy gap between data and chain end.
        data_energy = T.mean(self.free_energy(chain_start))
        chain_energy = T.mean(self.free_energy(chain_end))
        cost = data_energy - chain_energy

        # The gradient must not flow through the Gibbs sampling.
        gparams = T.grad(cost, self.params, consider_constant=[chain_end])

        for param, gparam in zip(self.params, gparams):
            # make sure that the learning rate is of the right dtype
            step_size = T.cast(lr, dtype=theano.config.floatX)
            updates[param] = param - gparam * step_size

        monitoring_cost = self._get_reconstruction_cost(pre_sigmoid_nvs[-1])

        return monitoring_cost, updates
Example #11
0
    def get_monitoring_channels(self, V):
        """Return a dict of monitoring channels describing the parameters.

        The model is switched with `compile_mode()` while the channels are
        built and always switched back with `deploy_mode()`.

        :param V: input batch; not used by the channels built here
        :return: dict mapping `self.monitoring_channel_prefix` + channel
            name to a symbolic value; empty unless `self.monitor_params`
            is true
        """
        try:
            self.compile_mode()

            channels = {}

            if self.monitor_params:
                for param in self.get_params():
                    base = param.name
                    channels[base + '_min'] = full_min(param)
                    channels[base + '_mean'] = T.mean(param)
                    channels[base + '_max'] = full_max(param)

                    # Norm statistics only for parameters whose name
                    # contains 'W'.
                    if 'W' not in base:
                        continue
                    norms = theano_norms(param)
                    channels[base + '_norms_min'] = T.min(norms)
                    channels[base + '_norms_mean'] = T.mean(norms)
                    channels[base + '_norms_max'] = T.max(norms)

            return dict((self.monitoring_channel_prefix + key, value)
                        for key, value in channels.items())
        finally:
            self.deploy_mode()
Example #12
0
    def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None):
        """Build the symbolic losses and accuracies and compile the functions.

        Sets `self._train`, `self._train_with_acc`, `self._predict`,
        `self._test` and `self._test_with_acc`.

        :param optimizer: optimizer spec, resolved with `optimizers.get`
        :param loss: objective spec, resolved with `objectives.get`
        :param class_mode: "categorical" or "binary"; selects how accuracy
            is computed
        :param theano_mode: forwarded as `mode` to every `theano.function`
        :raises Exception: if `class_mode` is not a recognized value
        """
        self.optimizer = optimizers.get(optimizer)

        self.loss = objectives.get(loss)
        weighted_loss = weighted_objective(objectives.get(loss))

        # Inputs and outputs of the model, in train and test configuration.
        self.X_train = self.get_input(train=True)
        self.X_test = self.get_input(train=False)
        self.y_train = self.get_output(train=True)
        self.y_test = self.get_output(train=False)

        # Target and weights, shaped like the training output.
        self.y = T.zeros_like(self.y_train)
        self.weights = T.ones_like(self.y_train)

        train_loss = weighted_loss(self.y, self.y_train, self.weights)
        test_loss = weighted_loss(self.y, self.y_test, self.weights)

        train_loss.name = 'train_loss'
        test_loss.name = 'test_loss'
        self.y.name = 'y'

        if class_mode == "categorical":
            def accuracy(prediction):
                return T.mean(T.eq(T.argmax(self.y, axis=-1),
                                   T.argmax(prediction, axis=-1)))
        elif class_mode == "binary":
            def accuracy(prediction):
                return T.mean(T.eq(self.y, T.round(prediction)))
        else:
            raise Exception("Invalid class mode:" + str(class_mode))

        train_accuracy = accuracy(self.y_train)
        test_accuracy = accuracy(self.y_test)
        self.class_mode = class_mode
        self.theano_mode = theano_mode

        # Each regularizer wraps the training loss in turn.
        for r in self.regularizers:
            train_loss = r(train_loss)
        updates = self.optimizer.get_updates(self.params, self.constraints, train_loss)

        # The model input is either one variable or a list of variables.
        if type(self.X_train) == list:
            train_inputs = self.X_train
            test_inputs = self.X_test
        else:
            train_inputs = [self.X_train]
            test_inputs = [self.X_test]
        train_ins = train_inputs + [self.y, self.weights]
        test_ins = test_inputs + [self.y, self.weights]
        predict_ins = test_inputs

        def make_function(inputs, outputs, **extra):
            # Shared settings of every compiled function.
            return theano.function(inputs, outputs, allow_input_downcast=True,
                                   mode=theano_mode, **extra)

        self._train = make_function(train_ins, train_loss, updates=updates)
        self._train_with_acc = make_function(train_ins, [train_loss, train_accuracy],
                                             updates=updates)
        self._predict = make_function(predict_ins, self.y_test)
        self._test = make_function(test_ins, test_loss)
        self._test_with_acc = make_function(test_ins, [test_loss, test_accuracy])
Example #13
0
	def cost_updates(self,lr,data,k=1):
		"""Build the monitoring cost and momentum updates of one CD-k step.

		:param lr: learning rate multiplying every gradient
		:param data: symbolic visible batch
		:param k: number of steps run by the scan over `self.gibbs_hvh`
		:return: tuple (monitoring_cost, updates); `updates` is a list of
			(shared_variable, new_value) pairs, parameters first and
			then the stored previous changes

		NOTE(review): the updates returned by `self.h.sample` and by
		`theano.scan` are not part of the returned list -- confirm that
		dropping them is intended.
		"""
		ph_scores = T.dot(data,self.W) + self.h_bias
		_ph_probs, ph_samples, _ph_updates = self.h.sample(ph_scores)

		# The chain starts from the sampled hidden state.
		chain_start = ph_samples

		scan_outputs, _scan_updates = theano.scan(
			self.gibbs_hvh,
			outputs_info=[None,None,None,None,None,chain_start],
			n_steps=k
		)
		(nv_activation_scores, _nv_probs, nv_samples,
		 _nh_scores, _nh_probs, _nh_samples) = scan_outputs

		chain_end = nv_samples[-1]
		data_energy = T.mean(self.free_energy(data))
		chain_energy = T.mean(self.free_energy(chain_end))
		cost = data_energy - chain_energy + self.regularisation()

		# chain_end is treated as a constant when differentiating.
		gparams = T.grad(cost,self.tunables,consider_constant=[chain_end])

		alpha = T.cast(self.momentum,dtype=theano.config.floatX)

		# New parameter value: subtract the momentum-smoothed change.
		param_updates = []
		for gparam, param, prev_chg in zip(gparams,self.tunables,self.deltas):
			param_updates.append((param, param - (alpha * prev_chg + gparam * lr)))

		# Remember that change for the next step.
		delta_updates = []
		for prev_chg, gparam in zip(self.deltas,gparams):
			delta_updates.append((prev_chg, alpha * prev_chg + gparam * lr))

		updates = param_updates + delta_updates

		monitoring_cost = self.reconstruction_cost(updates,nv_activation_scores[-1],data)

		return monitoring_cost,updates
 def error_classification(self,target):
     """Return the mean classification error against one-hot style targets.

     A softmax is applied to every slice of `self.output` along its first
     axis, the results are averaged over that axis, and the arg-max along
     axis 1 is stored in `self.y_pred`.

     :param target: symbolic targets; the label is their arg-max on axis 1
     :return: symbolic mean of (prediction != label)
     """
     def softmax_step(a):
         return T.nnet.softmax(a)

     step_probs, _ = theano.scan(fn=softmax_step,
                                 sequences=[self.output])
     mean_probs = T.mean(step_probs, 0)
     self.y_pred = T.argmax(mean_probs, axis=1)
     true_label = T.argmax(target, axis=1)
     return T.mean(T.neq(self.y_pred, true_label))
    def add_regularization(self, layer):
        """Return the absolute-error reconstruction penalty for `layer`.

        With the 'forward' strategy the reconstructions come from
        `layer.reconstruct_x()` / `layer.reconstruct_y()`.  With 'backward'
        they are the forward outputs multiplied by the transposed weights.
        Any other strategy yields 0.

        :param layer: layer exposing x, y and the reconstruction inputs
        :return: sum over x and y of the mean (over rows) of the summed
            absolute differences, or 0
        """
        strategy = self._recon_strategy

        if strategy == 'forward':
            pairs = [(layer.x, layer.reconstruct_x()),
                     (layer.y, layer.reconstruct_y())]
        elif strategy == 'backward':
            pairs = [(layer.x, Tensor.dot(layer.output_forward_x, layer.Wx.T)),
                     (layer.y, Tensor.dot(layer.output_forward_y, layer.Wy.T))]
        else:
            pairs = []

        regularization = 0
        for original, recon in pairs:
            row_error = (abs(original - recon)).sum(axis=1, dtype=Tensor.config.floatX)
            regularization += Tensor.mean(row_error)

        return regularization
Example #16
0
def unet_crossentropy_loss_sampled(y_true, y_pred):
    print 'unet_crossentropy_loss_sampled'
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0-epsilon))
    y_true = T.flatten(y_true)
    # this seems to work
    # it is super ugly though and I am sure there is a better way to do it
    # but I am struggling with theano to cooperate
    # filter the right indices
    indPos = T.nonzero(y_true)[0] # no idea why this is a tuple
    indNeg = T.nonzero(1-y_true)[0]
    # shuffle
    n = indPos.shape[0]
    indPos = indPos[srng.permutation(n=n)]
    n = indNeg.shape[0]
    indNeg = indNeg[srng.permutation(n=n)]
    # take equal number of samples depending on which class has less
    n_samples = T.cast(T.min([T.sum(y_true), T.sum(1-y_true)]), dtype='int64')

    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]
    loss_vector = -T.mean(T.log(y_pred_clipped[indPos])) - T.mean(T.log(1-y_pred_clipped[indNeg]))
    average_loss = T.mean(loss_vector)
    print 'average_loss:', average_loss
    return average_loss
Example #17
0
def stddev_bias(x, eps, axis=0):
    """Return sqrt(mean((x - mu)**2 + eps)) where mu = mean(x + eps, axis).

    NOTE(review): only the mean honours `axis`; the variance is averaged
    over every element, so the result is a scalar -- confirm that this is
    intended.

    :param x: symbolic input
    :param eps: constant added inside both means
    :param axis: axis of the mean `mu`
    :return: symbolic standard deviation
    """
    centre = T.mean(x + eps, axis=axis)
    centre.name = "std_mean"

    spread = T.mean((x - centre)**2 + eps)
    spread.name = "std_variance"

    return T.sqrt(spread)
Example #18
0
def image_categorical_crossentropy(output, target, from_logits=False):
    """Mean categorical cross-entropy over rows of 256 entries.

    Both tensors are reshaped to (-1, 256) before the cross-entropy is
    taken.  The per-row values are regrouped by the first dimension of
    `output` and averaged.

    :param output: predicted probabilities, clipped to
        [_EPSILON, 1 - _EPSILON]
    :param target: target distribution with the same layout as `output`
    :param from_logits: accepted but not used
    :return: symbolic scalar loss
    """
    clipped = T.clip(output, _EPSILON, 1.0 - _EPSILON)
    flat_shape = (-1, 256)
    row_loss = T.nnet.categorical_crossentropy(K.reshape(clipped, flat_shape),
                                               K.reshape(target, flat_shape))
    # One row per entry of the first dimension, then mean of the row means.
    per_sample = K.reshape(row_loss, (K.shape(clipped)[0], -1))
    return T.mean(T.mean(per_sample, axis=1))
Example #19
0
def unet_crossentropy_loss(y_true, y_pred):
    """Binary cross-entropy averaged over axis 1 and then over the rest.

    :param y_true: symbolic targets
    :param y_pred: symbolic predictions, clipped to [1e-4, 1 - 1e-4] before
        the logarithms are taken
    :return: symbolic scalar loss
    """
    weight_class_1 = 1.
    epsilon = 1.0e-4
    y_pred_clipped = T.clip(y_pred, epsilon, 1.0-epsilon)

    positive_term = weight_class_1*y_true * T.log(y_pred_clipped)
    negative_term = (1-y_true) * T.log(1-y_pred_clipped)

    per_row = -T.mean(positive_term + negative_term, axis=1)
    return T.mean(per_row)
Example #20
0
    def finetune_cost_updates(self, center, mu, learning_rate):
        """Compute the fine-tuning costs and the momentum updates.

        :param center: target compared with the network output
        :param mu: momentum coefficient
        :param learning_rate: step size multiplying the gradients
        :return: tuple ((cost1, cost2, cost3, grad_, param_), updates).
            cost1 is the full cost that is differentiated, cost2 the
            weighted reconstruction part and cost3 their difference.
            grad_ and param_ stack the L2 norms of the gradients and of
            the parameters.
        """
        # Squared distance between the network output and the center,
        # summed over axis 1 (one entry per example of a minibatch).
        network_output = self.get_output()
        center_dist = T.sum(T.pow(center - network_output, 2), axis=1)

        # Reconstruction error of the network, summed the same way.
        z = self.get_network_reconst()
        reconst_err = T.sum(T.pow(self.x - z, 2), axis = 1)

        L = self.beta*center_dist + self.lbd*reconst_err

        cost1 = T.mean(L)
        cost2 = self.lbd*T.mean(reconst_err)
        cost3 = cost1 - cost2

        # Gradients of the full cost with respect to the parameters.
        gparams = T.grad(cost1, self.params)

        updates = []
        grad_values = []
        param_norm = []
        for param, delta, gparam in zip(self.params, self.delta, gparams):
            # The velocity update comes first, then the parameter update
            # that uses the old velocity.
            updates.extend([
                (delta, mu*delta - learning_rate * gparam),
                (param, param + mu*mu*delta - (1+mu)*learning_rate*gparam),
            ])
            grad_values.append(gparam.norm(L=2))
            param_norm.append(param.norm(L=2))

        grad_ = T.stack(*grad_values)
        param_ = T.stack(*param_norm)
        return ((cost1, cost2, cost3, grad_, param_), updates)
Example #21
0
    def get_output_for(self, input, init=False, **kwargs):
        """Append features built from pairwise batch distances to the input.

        :param input: symbolic batch; flattened to two dimensions when it
            has more
        :param init: when true, also fill `self.init_updates` with
            data-dependent updates for `self.log_weight_scale` and `self.b`
        :return: the (flattened) input concatenated with the new features
            along axis 1
        """
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        activation = T.tensordot(input, self.W, [[1], [0]])

        # Absolute differences between every pair of batch entries, summed
        # over axis 2.
        pairwise = abs(activation.dimshuffle(0,1,2,'x') - activation.dimshuffle('x',1,2,0))
        # A large constant where both batch indices coincide, so that
        # exp(-.) of those entries is negligible.
        self_offset = 1e6 * T.eye(input.shape[0]).dimshuffle(0,'x',1)
        abs_dif = T.sum(pairwise, axis=2) + self_offset

        if init:
            # Rescale by half the mean smallest distance and record the
            # matching update of the log weight scale.
            mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2),axis=0)
            abs_dif = abs_dif / mean_min_abs_dif.dimshuffle('x',0,'x')
            self.init_updates = [(self.log_weight_scale, self.log_weight_scale-T.log(mean_min_abs_dif).dimshuffle(0,'x'))]

        f = T.sum(T.exp(-abs_dif),axis=2)

        if init:
            # Centre the features over the batch and store the offset as
            # the bias update.
            mf = T.mean(f,axis=0)
            f = f - mf.dimshuffle('x',0)
            self.init_updates.append((self.b, -mf))
        else:
            f = f + self.b.dimshuffle('x',0)

        return T.concatenate([input, f], axis=1)
Example #22
0
    def get_cost_updates(self, contraction_level, learning_rate):
        """Compute the cost and the updates for one training step of the cA.

        :param contraction_level: weight of the Jacobian penalty in the cost
        :param learning_rate: step size of the gradient-descent updates
        :return: tuple (cost, updates) where `updates` is a list of
            (param, new_value) pairs
        """
        y = self.get_hidden_values(self.x)
        z = self.get_reconstructed_input(y)
        J = self.get_jacobian(y, self.W)

        # Cross-entropy of the reconstruction, summed over axis 1; with
        # minibatches this is a vector with one entry per example.
        self.L_rec = - T.sum(self.x * T.log(z) +
                             (1 - self.x) * T.log(1 - z),
                             axis=1)

        # Squared Jacobian entries averaged over the number of
        # samples/minibatch.  This must be a true division: floor division
        # rounds the penalty down to a whole number, so it no longer varies
        # smoothly with the weights.
        self.L_jacob = T.sum(J ** 2) / self.n_batchsize

        # Average the per-example reconstruction costs and add the weighted
        # contraction penalty.
        cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob)

        # Gradients of the cost of the `cA` with respect to its parameters,
        # turned into plain gradient-descent updates.
        gparams = T.grad(cost, self.params)
        updates = [(param, param - learning_rate * gparam)
                   for param, gparam in zip(self.params, gparams)]

        return (cost, updates)
Example #23
0
    def __init__(self, fin, f1, nin1, f2, nin2, f3, nin3, expand, h1, outputs,
                 lr, C, pDropConv=0.2, pDropHidden=0.5):
        """Create the parameters and the symbolic train/eval costs.

        :param fin: first "previous layer" size handed to the first NIN block
        :param f1, f2, f3: leading sizes of the three NIN blocks
        :param nin1, nin2, nin3: third entry of each NIN shape tuple
        :param expand: expansion factor passed to `layerNINParams`; also
            multiplies the input size of the following block
        :param h1: size of the first 1x1 convolution
        :param outputs: size of the last 1x1 convolution
        :param lr: learning rate, stored on the instance
        :param C: weight of the regularization term in both costs
        :param pDropConv: dropout rate passed to `model` for the
            convolutional part during training
        :param pDropHidden: dropout rate passed to `model` for the hidden
            part during training
        """
        # Hyperparameters
        self.lr = lr
        self.C = C
        self.pDropConv = pDropConv
        self.pDropHidden = pDropHidden

        # All parameters to optimize (connection weights and biases) are
        # collected in lists.
        # Convolution layers: per the original note, w = (feature maps of
        # this layer, feature maps of the previous layer, kernel rows,
        # kernel columns) and b = (feature maps of this layer).
        nin_shapes = [
            (f1, fin, nin1, 3, 3),
            (f2, f1 * expand, nin2, 3, 3),
            (f3, f2 * expand, nin3, 3, 3),
        ]
        self.paramsNIN = [layerNINParams(shape, expand) for shape in nin_shapes]

        # Global average pooling layers
        conv_shapes = [
            (h1, f3 * expand, 1, 1),
            (outputs, h1, 1, 1),
        ]
        self.paramsConv = [layerConvParams(shape) for shape in conv_shapes]
        self.params = self.paramsNIN + self.paramsConv

        # Define the Theano symbolic variables and build the expressions.
        self.X = T.tensor4('X')
        self.Y = T.matrix('Y')

        # Training-set cost function (dropout enabled)
        YDropProb = model(self.X, self.params, pDropConv, pDropHidden)
        self.trNeqs = basicUtils.neqs(YDropProb, self.Y)
        trCrossEntropy = categorical_crossentropy(YDropProb, self.Y)
        trPenalty = C * basicUtils.regularizer(flatten(self.params))
        self.trCost = T.mean(trCrossEntropy) + trPenalty

        # Test/validation-set cost function (dropout rates set to zero)
        YFullProb = model(self.X, self.params, 0., 0.)
        self.vateNeqs = basicUtils.neqs(YFullProb, self.Y)
        self.YPred = T.argmax(YFullProb, axis=1)
        vateCrossEntropy = categorical_crossentropy(YFullProb, self.Y)
        vatePenalty = C * basicUtils.regularizer(flatten(self.params))
        self.vateCost = T.mean(vateCrossEntropy) + vatePenalty
Example #24
0
def test_minres_with_jacobi():
    """Check minres with a diagonal preconditioner against Linv_x_*.

    The preconditioner is the diagonal returned by
    `natural.generic_compute_L_diag`, shifted by 0.1.
    """
    vv = theano.shared(v, name='v')
    gg = theano.shared(g, name='g')
    hh = theano.shared(h, name='h')

    # Right-hand side of the system, one entry per parameter group.
    rhs = [T.dot(v.T,g) / M,
           T.dot(g.T,h) / M,
           T.mean(v, axis=0),
           T.mean(g, axis=0),
           T.mean(h, axis=0)]

    Ms = [diag_term + 0.1
          for diag_term in natural.generic_compute_L_diag([vv,gg,hh])]

    def apply_L(xw, xv, xa, xb, xc):
        return natural.compute_Lx(vv,gg,hh,xw,xv,xa,xb,xc)

    newgrads = minres.minres(
            apply_L,
            rhs,
            rtol=1e-5,
            damp = 0.,
            maxiter = 10000,
            Ms = Ms,
            profile=0)[0]

    f = theano.function([], newgrads)
    [new_dw, new_dv, new_da, new_db, new_dc] = f()

    expected = [Linv_x_w, Linv_x_v, Linv_x_a, Linv_x_b, Linv_x_c]
    solved = [new_dw, new_dv, new_da, new_db, new_dc]
    for want, got in zip(expected, solved):
        numpy.testing.assert_almost_equal(want, got, decimal=1)
Example #25
0
def test_linearcg():
    """Check that linear_cg reproduces the reference solutions Linv_x_*."""
    vv = theano.shared(v, name='v')
    gg = theano.shared(g, name='g')
    hh = theano.shared(h, name='h')

    # Right-hand side of the system, one entry per parameter group.
    rhs = [T.dot(v.T,g) / M,
           T.dot(g.T,h) / M,
           T.mean(v, axis=0),
           T.mean(g, axis=0),
           T.mean(h, axis=0)]

    def apply_L(xw, xv, xa, xb, xc):
        return natural.compute_Lx(vv,gg,hh,xw,xv,xa,xb,xc)

    newgrads = lincg.linear_cg(
            apply_L,
            rhs,
            rtol=1e-5,
            maxiter = 30,
            damp = 0.,
            floatX = floatX,
            profile=0)

    f = theano.function([], newgrads)
    [new_dw, new_dv, new_da, new_db, new_dc] = f()

    expected = [Linv_x_w, Linv_x_v, Linv_x_a, Linv_x_b, Linv_x_c]
    solved = [new_dw, new_dv, new_da, new_db, new_dc]
    for want, got in zip(expected, solved):
        numpy.testing.assert_almost_equal(want, got, decimal=1)
Example #26
0
    def plotUpdate(self,updates):
        '''
        Collect max/min/mean of the pending change of every tracked weight.

        type updates: dict
        para updates: update dictionary, indexed by the shared parameters

        return: list [max, min, mean] of vectors with `self.deep*2+1`
        entries: for layer i, slot 2*i holds w and slot 2*i+1 holds b; the
        last slot holds the classifier's w
        '''
        n_slots = self.deep*2+1
        maxdict=T.zeros(shape=(n_slots,))
        mindict=T.zeros(shape=(n_slots,))
        meandict=T.zeros(shape=(n_slots,))

        # (slot, parameter) pairs in output order.
        tracked = []
        for i in xrange(self.deep):
            tracked.append((2*i, self.layers[i].w))
            tracked.append((2*i+1, self.layers[i].b))
        tracked.append((self.deep*2, self.classifier.w))

        for slot, param in tracked:
            # Change that the update would apply to this parameter.
            change = updates[param]-param
            maxdict=T.set_subtensor(maxdict[slot],T.max(change))
            mindict=T.set_subtensor(mindict[slot],T.min(change))
            meandict=T.set_subtensor(meandict[slot],T.mean(change))

        return [maxdict,mindict,meandict]
Example #27
0
def test_minres_with_xinit():
    """Solve the linear system with ``minres.minres`` from a random start.

    The operator is ``natural.compute_Lx`` applied to shared copies of the
    module-level arrays ``v``, ``g`` and ``h``.  The solver is given an
    explicit random ``xinit`` and its result is compared, to one decimal,
    against the module-level ``Linv_x_*`` values (presumably a reference
    solution computed elsewhere in the file -- confirm).
    """
    rng = numpy.random.RandomState(123412)

    # shared copies consumed by the linear operator
    vv = theano.shared(v, name='v')
    gg = theano.shared(g, name='g')
    hh = theano.shared(h, name='h')

    # right-hand side, one entry per unknown block
    rhs = [
        T.dot(v.T, g) / M,
        T.dot(g.T, h) / M,
        T.mean(v, axis=0),
        T.mean(g, axis=0),
        T.mean(h, axis=0),
    ]

    # random starting point; shapes are drawn in the same order as the blocks
    block_shapes = [(N0, N1), (N1, N2), (N0,), (N1,), (N2,)]
    drawn = [rng.rand(*shape) for shape in block_shapes]
    xinit = [block.astype(floatX) for block in drawn]

    def apply_L(xw, xv, xa, xb, xc):
        return natural.compute_Lx(vv, gg, hh, xw, xv, xa, xb, xc)

    # minres returns a sequence; only its first element (the solution) is used
    newgrads = minres.minres(
        apply_L,
        rhs,
        rtol=1e-5,
        damp=0.,
        maxiter=10000,
        xinit=xinit,
        profile=0)[0]

    f = theano.function([], newgrads)
    new_dw, new_dv, new_da, new_db, new_dc = f()

    checks = [
        (Linv_x_w, new_dw),
        (Linv_x_v, new_dv),
        (Linv_x_a, new_da),
        (Linv_x_b, new_db),
        (Linv_x_c, new_dc),
    ]
    for expected, actual in checks:
        numpy.testing.assert_almost_equal(expected, actual, decimal=1)
Example #28
0
    def forward(self,input_org,train=True,update_batch_stat=True,finetune=False):
        """
        Batch-normalise ``input_org`` and return a tensor of the same shape.

        The input is reshaped to three dimensions ``(ldim, cdim, rdim)`` as
        given by ``self._internal_shape``; statistics are taken over axes 0
        and 2, i.e. one mean/variance per index of the middle axis.

        :param input_org: symbolic input tensor
        :param train: if true, normalise with the statistics of the current
                      batch; otherwise use ``self.est_mean``/``self.est_var``
        :param update_batch_stat: (training only) if true, attach default
                      updates that refresh the running estimates
        :param finetune: (with ``update_batch_stat``) if true, use a ratio of
                      ``1 - 1/(finetune_N + 1)`` and increment the counter;
                      otherwise use ``self.moving_avg_ratio`` and reset the
                      counter to 0
        :return: normalised tensor reshaped to ``input_org.shape``
        """
        # single-argument parenthesised form prints the same text on Python 2 and 3
        print("Layer/BatchNormalization")
        ldim,cdim,rdim = self._internal_shape(input_org)
        input = input_org.reshape((ldim,cdim,rdim))
        if (train):
            # per-batch statistics, kept broadcastable against ``input``
            mean = T.mean(input, axis=(0, 2), keepdims=True )
            var = T.mean((input-mean)**2, axis=(0, 2), keepdims=True)

            if(update_batch_stat):
                # The running statistics are updated through ``default_update``
                # set on clones of the stored variables.
                finetune_N = theano.clone(self.finetune_N, share_inputs=False)
                if(finetune):
                    finetune_N.default_update = finetune_N+1
                    ratio = T.cast(1-1.0/(finetune_N+1),theano.config.floatX)
                else:
                    finetune_N.default_update = 0
                    ratio = self.moving_avg_ratio
                # number of values averaged per statistic
                m = ldim*rdim
                # m/(m-1) rescaling applied to the batch variance before it
                # enters the running variance estimate
                scale = T.cast(m/(m-1.0),theano.config.floatX)
                est_mean = theano.clone(self.est_mean, share_inputs=False)
                est_var = theano.clone(self.est_var, share_inputs=False)
                est_mean.default_update = T.cast(ratio*self.est_mean + (1-ratio)*mean,theano.config.floatX)
                est_var.default_update = T.cast(ratio*self.est_var + (1-ratio)*scale*var,theano.config.floatX)
                # Adding zero times the clones leaves the values unchanged.
                # NOTE(review): presumably this makes the clones part of the
                # output graph so their default updates are applied -- confirm.
                mean += 0 * est_mean
                var += 0 * est_var
            output = self._pbc(self.gamma) * (input - self._pbc(mean)) \
                     / T.sqrt(1e-6+self._pbc(var)) + self._pbc(self.beta)

        else:
            # inference: normalise with the stored running estimates
            output = self._pbc(self.gamma) * (input - self._pbc(self.est_mean)) \
                     / T.sqrt(1e-6+self._pbc(self.est_var)) + self._pbc(self.beta)

        return output.reshape(input_org.shape)
Example #29
0
    def negative_log_likelihood(self, y):
        r""" Return the mean of the negative log-likelihood of the prediction
        of this model under a given target distribution.

        .. math::
            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
                    \ell (\theta=\{W,b\}, \mathcal{D})

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        :return: symbolic scalar, the negated mean log-probability

        Note: we use the mean instead of the sum so that
              the learning rate is less dependent on the batch size
        """
        if self.is_binary:
            # This value used to be computed and thrown away (no ``return``),
            # so the binary branch silently fell through to the indexed form.
            # NOTE(review): this expression averages over every entry of
            # ``p_y_given_x`` and does not use ``y`` -- confirm that this is
            # the intended binary loss.
            return -T.mean(T.log(self.p_y_given_x))

        # T.log(self.p_y_given_x) is indexed with one (row, label) pair per
        # example: row i, column y[i].  The mean of the selected
        # log-probabilities is the mean log-likelihood of the minibatch;
        # its negation is returned.
        example_rows = T.arange(y.shape[0])
        return -T.mean(T.log(self.p_y_given_x)[example_rows, y])
Example #30
0
File: pl4.py Project: wuhu/pll
    def create_learn_function(self):
        """Build ``self.learn_function`` and keep the optimiser in ``self.rp``.

        The objective handed to ``net3.Momentum`` is
        ``do_update * (mean(losses) + regularisation)``, where the
        regularisation term is ``alpha`` times the mean of the per-parameter
        mean squares.  ``self.constant_representations`` is optimised too,
        but is appended only after the regularisation term has been built.

        The compiled function takes ``[lr, alpha, do_update]`` and returns
        the mean loss.
        """
        # every clause contributes its losses (the original filter was always true)
        per_clause = [c.get_lin_losses() for c in self.clauses]
        losses = sum(per_clause, [])

        arities = [p.arity for p in self.predicates.values()]

        # shared hidden layers are included only if some predicate needs them
        ws = []
        if any(a == 1 for a in arities):
            ws.extend([l.sum_layer.w for l in cortex.hidden_x.layers])
            ws.extend([l.sum_layer.b for l in cortex.hidden_x.layers])
        if any(a == 2 for a in arities):
            ws.extend([l.sum_layer.w for l in cortex.hidden_xy.layers])
            ws.extend([l.sum_layer.b for l in cortex.hidden_xy.layers])

        # per-predicate output layers
        for p in self.predicates.values():
            ws.extend([p.out_layer.w, p.out_layer.b])

        alpha = theano.tensor.fscalar()
        mean_squares = [tensor.mean(w ** 2) for w in ws]
        regularisation = alpha * tensor.mean(mean_squares)

        # appended after the penalty was built, so it is not in the penalty
        ws.append(self.constant_representations)

        do_update = theano.tensor.bscalar()
        objective = do_update * (tensor.mean(losses) + regularisation)
        rp = net3.Momentum(ws, objective)
        updates, lr = rp.get_updates()
        self.rp = rp
        self.learn_function = function(
            [lr, alpha, do_update],
            tensor.mean(losses),
            updates=updates,
            on_unused_input="ignore")
Example #31
0
 def cross_entropy(self):
     """Store softmax(self.x) on the instance and return the mean
     categorical cross-entropy of it against ``self.y``."""
     probabilities = T.nnet.softmax(self.x)
     self.prob_of_y_given_x = probabilities
     per_example = T.nnet.categorical_crossentropy(
         self.prob_of_y_given_x, self.y)
     return T.mean(per_example)
    def create_objectives(self, deterministic=False):
        """Build the symbolic training objective.

        The first network input is flattened to two dimensions and every row
        is repeated ``self.n_sample`` times.  The layers in ``self.network``
        are then evaluated and an evidence lower bound is formed as the mean
        of ``log p(a, x, z) - log q(z, a | x)``.

        :param deterministic: forwarded to ``lasagne.layers.get_output``
        :return: tuple ``(-elbo, T.mean(qz_logsigma))``
        """
        # network input, flattened to (rows, features)
        flat_x = self.inputs[0].flatten(2)

        # repeat each row once per Monte Carlo sample
        n_sam = self.n_sample
        n_out = flat_x.shape[1]
        tiled = flat_x.dimshuffle(0, 'x', 1).repeat(n_sam, axis=1)
        x = tiled.reshape((-1, n_out))

        # unpack the layers of the network
        l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \
        l_qa, l_qz = self.network
        l_qa_in, l_px_in = self.input_layers

        # outputs of the q-layers
        q_layers = [l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, l_qa, l_qz]
        qz_mu, qz_logsigma, qa_mu, qa_logsigma, a, z = \
            lasagne.layers.get_output(q_layers, deterministic=deterministic)

        # outputs of the p(a|z) layers, fed with the sampled z
        pa_mu, pa_logsigma = lasagne.layers.get_output(
            [l_pa_mu, l_pa_logsigma],
            {l_px_in: z},
            deterministic=deterministic,
        )

        # log p(x|z) for the configured observation model
        if self.model == 'bernoulli':
            px_mu = lasagne.layers.get_output(
                l_px_mu, {l_px_in: z}, deterministic=deterministic)
            log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
        elif self.model == 'gaussian':
            px_mu, px_logsigma = lasagne.layers.get_output(
                [l_px_mu, l_px_logsigma],
                {l_px_in: z},
                deterministic=deterministic,
            )
            log_px_given_z = log_normal2(x, px_mu, px_logsigma).sum(axis=1)

        # entropy term: log q(z, a | x)
        log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
        log_qz_given_ax = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1)
        log_qza_given_x = log_qz_given_ax + log_qa_given_x

        # log-probability term: prior on z (zeros / ones) and p(a|z)
        floatX = theano.config.floatX
        z_prior_sigma = T.cast(T.ones_like(qz_logsigma), dtype=floatX)
        z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=floatX)
        log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
        log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)

        log_paxz = log_pa_given_z + log_px_given_z + log_pz

        # evidence lower bound
        elbo = T.mean(log_paxz - log_qza_given_x)

        # no separate accuracy metric is used right now; the second value is
        # the mean of qz_logsigma
        return -elbo, T.mean(qz_logsigma)
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-2,
            mu=0.9,
            decay=0.9,
            epochs=10,
            batch_sz=100,
            show_fig=False):
        """Train on ``(X, Y)`` in mini-batches and report on ``(Xvalid, Yvalid)``.

        Hidden layers are created from ``self.hidden_layer_sizes``, followed
        by an output layer with one unit per distinct label in ``Y``.
        Parameter updates come from ``momentum_updates``.  Every 50 batches
        the validation cost and error rate are printed, and the cost is
        recorded; the recorded costs are plotted when ``show_fig`` is true.

        ``decay`` is accepted but not used in this method.
        """
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid = Xvalid.astype(np.float32)
        Yvalid = Yvalid.astype(np.int32)

        self.rng = RandomStreams()

        # hidden layers: each layer's output size is the next layer's input size
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        fan_in = D
        for layer_id, fan_out in enumerate(self.hidden_layer_sizes):
            self.hidden_layers.append(HiddenLayer(fan_in, fan_out, layer_id))
            fan_in = fan_out

        # output (logistic regression) layer
        W = np.random.randn(fan_in, K) * np.sqrt(2.0 / fan_in)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # all trainable parameters, output layer first
        self.params = [self.W, self.b]
        for layer in self.hidden_layers:
            self.params += layer.params

        # symbolic inputs
        thX = T.matrix('X')
        thY = T.ivector('Y')

        # training cost: mean negative log-probability of the target labels
        pY_train = self.forward_train(thX)
        picked_train = pY_train[T.arange(thY.shape[0]), thY]
        cost = -T.mean(T.log(picked_train))
        updates = momentum_updates(cost, self.params, learning_rate, mu)

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # evaluation graph: cost and predictions from the prediction pass
        pY_predict = self.forward_predict(thX)
        picked_predict = pY_predict[T.arange(thY.shape[0]), thY]
        cost_predict = -T.mean(T.log(picked_predict))
        prediction = self.predict(thX)
        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                start = j * batch_sz
                stop = start + batch_sz
                train_op(X[start:stop], Y[start:stop])

                if j % 50 != 0:
                    continue
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                      "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
import theano.tensor as T

# Softmax regression set-up: symbolic inputs, shared data sets, model,
# losses and per-parameter update rules.

# symbolic mini-batch inputs: float64 matrix and int32 label vector
X = T.dmatrix()
y = T.ivector()

# Wrap a two-element data set in Theano shared variables: element 0 is cast
# to float64 and element 1 to int32.
# NOTE(review): presumably each set is a (features, labels) pair -- confirm.
prepare_data = lambda x: (theano.shared(x[0].astype('float64')),
                          theano.shared(x[1].astype('int32')))
(training_x, training_y), (test_x, test_y), (validation_x, validation_y) = map(
    prepare_data, [train_set, test_set, valid_set])

# model parameters, initialised to zero
W = theano.shared(numpy.zeros([dims, n_classes]))
b = theano.shared(numpy.zeros(n_classes))

# class probabilities and arg-max prediction
y_hat = T.nnet.softmax(T.dot(X, W) + b)
y_pred = T.argmax(y_hat, axis=1)
# fraction of predictions that differ from the labels
test_error = T.mean(T.neq(y_pred, y))
# mean negative log-probability assigned to the correct label
training_error = -T.mean(T.log(y_hat)[T.arange(y.shape[0]), y])

learning_rate = 0.2
params = [W, b]
beta = .9
updates = []
for p in params:
    # running average of the squared gradient; starts as an array of ones
    # with the shape of the parameter
    ms = theano.shared(1. + 0. * p.get_value())
    updates += [
        # gradient step divided by the square root of the running average
        (p, p - learning_rate * T.grad(training_error, p) / T.sqrt(ms)),
        # exponential moving average of the squared gradient (weight beta)
        (ms, beta * ms + (1 - beta) * T.sqr(T.grad(training_error, p)))
    ]

# symbolic int32 vector, used as the input of the function compiled below
idx = T.ivector()
training_function = theano.function(inputs=[idx],
Example #35
0
 def negative_log_likelihood(self, y):
     """Return the negated mean of ``self.p_y_given_x[i, y[i]]`` over the
     entries of ``y``.

     NOTE(review): no logarithm is applied here.  Unless ``p_y_given_x``
     already holds log-probabilities, this is the negative mean probability
     rather than a log-likelihood -- confirm against the caller.
     """
     scores = self.p_y_given_x
     example_rows = T.arange(y.shape[0])
     selected = scores[example_rows, y]
     return -T.mean(selected)
Example #36
0
def downsample2d_nearest_neighbour(x, scale=2):
    """Shrink the last two axes of a 4-D tensor by ``scale``.

    The tensor is reshaped so that each ``scale x scale`` spatial block gets
    its own pair of axes, and those axes are averaged away.  Despite the
    function name, each output value is the mean of its block, not a single
    picked sample.

    :param x: 4-D tensor; the sizes of axes 2 and 3 are expected to be
              multiples of ``scale`` (the reshape fails otherwise)
    :param scale: integer down-sampling factor applied to axes 2 and 3
    :return: tensor whose axes 2 and 3 are ``scale`` times smaller
    """
    n_batch, n_chan, height, width = (
        x.shape[0], x.shape[1], x.shape[2], x.shape[3])
    # Floor division keeps the new dimensions integral.  A plain ``/`` is
    # true division on Python 3 and would produce float dimensions.
    blocks = x.reshape(
        (n_batch, n_chan, height // scale, scale, width // scale, scale))
    blocks = T.mean(blocks, axis=5)  # average within each block, along width
    blocks = T.mean(blocks, axis=3)  # average within each block, along height
    return blocks
Example #37
0
 def mean_squared_error(self):
     """Return the mean of the squared element-wise difference between
     ``self.x`` and ``self.y``."""
     residual = self.x - self.y
     return T.mean(residual ** 2)
Example #38
0
 def accuracy(self, y):
     """Return the accuracy for the mini-batch: the mean of the element-wise
     equality between ``y`` and ``self.y_out``."""
     matches = T.eq(y, self.y_out)
     return T.mean(matches)
Example #39
0
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-4,
            mu=0.9,
            decay=0.9,
            epochs=8,
            batch_sz=100,
            show_fig=False):
        '''
        Train on (X, Y) in mini-batches, validating on (Xvalid, Yvalid)
        along the way.

        The network is built from ``self.hidden_layer_sizes`` plus an output
        layer with one unit per distinct label in Y.  Parameters are updated
        with a running average of squared gradients (weight ``decay``)
        combined with momentum (coefficient ``mu``).  There are
        ``N // batch_sz`` batches per epoch; every 50 batches the validation
        cost and error rate are printed and the cost is recorded.  The
        recorded costs are plotted when ``show_fig`` is true.
        '''
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid = Xvalid.astype(np.float32)
        Yvalid = Yvalid.astype(np.int32)

        self.rng = RandomStreams()

        # hidden layers: the first takes the D input features, each later
        # one takes the previous layer's output size; the layer ID is its index
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        fan_in = D
        for layer_id, fan_out in enumerate(self.hidden_layer_sizes):
            self.hidden_layers.append(HiddenLayer(fan_in, fan_out, layer_id))
            fan_in = fan_out

        # output layer: last hidden layer to K classes
        W = np.random.randn(fan_in, K) * np.sqrt(2.0 / fan_in)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # all trainable parameters, output layer first
        self.params = [self.W, self.b]
        for layer in self.hidden_layers:
            self.params += layer.params

        # symbolic inputs and the training-mode probabilities of Y given X
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY_train = self.forward_train(thX)

        # training cost: mean negative log-probability of the target labels
        picked_train = pY_train[T.arange(thY.shape[0]), thY]
        cost = -T.mean(T.log(picked_train))

        # gradient of the cost with respect to every parameter
        grads = T.grad(cost, self.params)

        # per-parameter state: momentum starts at zeros, the squared-gradient
        # cache starts at ones; both have the shape and dtype of the parameter
        dparams = [
            theano.shared(np.zeros_like(p.get_value())) for p in self.params
        ]
        cache = [
            theano.shared(np.ones_like(p.get_value())) for p in self.params
        ]

        # symbolic expressions for the next cache and momentum values
        new_cache = [
            decay * c + (1 - decay) * g * g for c, g in zip(cache, grads)
        ]
        new_dparams = [
            mu * dp - learning_rate * g / T.sqrt(new_c + 1e-10)
            for new_c, dp, g in zip(new_cache, dparams, grads)
        ]

        # (variable, new value) pairs: cache, then momentum, then the
        # parameters themselves (weights and biases of every layer)
        updates = []
        updates.extend(zip(cache, new_cache))
        updates.extend(zip(dparams, new_dparams))
        updates.extend(
            (p, p + new_dp) for p, new_dp in zip(self.params, new_dparams))

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # evaluation graph: cost and predictions from the prediction pass
        pY_predict = self.forward_predict(thX)
        picked_predict = pY_predict[T.arange(thY.shape[0]), thY]
        cost_predict = -T.mean(T.log(picked_predict))
        prediction = self.predict(thX)
        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                start = j * batch_sz
                stop = start + batch_sz

                # one call applies every update pair defined above
                train_op(X[start:stop], Y[start:stop])

                if j % 50 != 0:
                    continue
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                      "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #40
0
 def negative_log_likelihood(self):
     """Store softmax(self.x) on the instance and return the negated mean of
     the log-probabilities selected by ``self.y`` (one per row)."""
     self.prob_of_y_given_x = T.nnet.softmax(self.x)
     log_probs = T.log(self.prob_of_y_given_x)
     example_rows = T.arange(self.y.shape[0])
     return -T.mean(log_probs[example_rows, self.y])
Example #41
0
    def __init__(self, n_in, n_out, state_bounds, action_bounds, reward_bound,
                 settings_):
        """
            In order to get this to work we need to be careful not to update the actor parameters
            when updating the critic. This can be an issue when concatenating networks together.
            The first first network becomes a part of the second. However you can still access the first
            network by itself but an updates on the second network will effect the first network.
            Care needs to be taken to make sure only the parameters of the second network are updated.
        """

        super(DeepDPGDQ, self).__init__(n_in, n_out, state_bounds,
                                        action_bounds, reward_bound, settings_)

        batch_size = self.getSettings()['batch_size']
        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, self._state_length)
        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size,
                                                    self._state_length)
        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)
        Action = T.dmatrix("Action")
        Action.tag.test_value = np.random.rand(batch_size, self._action_length)
        # create a small convolutional neural network
        inputLayerActA = lasagne.layers.InputLayer((None, self._state_length),
                                                   State)

        l_hid1ActA = lasagne.layers.DenseLayer(
            inputLayerActA,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2ActA = lasagne.layers.DenseLayer(
            l_hid1ActA,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3ActA = lasagne.layers.DenseLayer(
            l_hid2ActA,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outActA = lasagne.layers.DenseLayer(
            l_hid3ActA,
            num_units=self._action_length,
            nonlinearity=lasagne.nonlinearities.linear)

        inputLayerA = lasagne.layers.InputLayer((None, self._state_length),
                                                State)

        concatLayer = lasagne.layers.ConcatLayer(
            [inputLayerA, self._l_outActA])

        l_hid1A = lasagne.layers.DenseLayer(
            concatLayer,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2A = lasagne.layers.DenseLayer(
            l_hid1A,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3A = lasagne.layers.DenseLayer(
            l_hid2A,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outA = lasagne.layers.DenseLayer(
            l_hid3A, num_units=1, nonlinearity=lasagne.nonlinearities.linear)
        # self._b_o = init_b_weights((self._action_length,))

        # self.updateTargetModel()
        inputLayerActB = lasagne.layers.InputLayer((None, self._state_length),
                                                   State)

        l_hid1ActB = lasagne.layers.DenseLayer(
            inputLayerActB,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2ActB = lasagne.layers.DenseLayer(
            l_hid1ActB,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3ActB = lasagne.layers.DenseLayer(
            l_hid2ActB,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outActB = lasagne.layers.DenseLayer(
            l_hid3ActB,
            num_units=self._action_length,
            nonlinearity=lasagne.nonlinearities.linear)

        inputLayerB = lasagne.layers.InputLayer((None, self._state_length),
                                                State)
        concatLayerB = lasagne.layers.ConcatLayer(
            [inputLayerB, self._l_outActB])

        l_hid1B = lasagne.layers.DenseLayer(
            concatLayerB,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2B = lasagne.layers.DenseLayer(
            l_hid1B,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3B = lasagne.layers.DenseLayer(
            l_hid2B,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outB = lasagne.layers.DenseLayer(
            l_hid3B, num_units=1, nonlinearity=lasagne.nonlinearities.linear)

        ################################################################################\
        inputLayerActA = lasagne.layers.InputLayer((None, self._state_length),
                                                   State)

        l_hid1ActA = lasagne.layers.DenseLayer(
            inputLayerActA,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2ActA = lasagne.layers.DenseLayer(
            l_hid1ActA,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3ActA = lasagne.layers.DenseLayer(
            l_hid2ActA,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outActATarget = lasagne.layers.DenseLayer(
            l_hid3ActA,
            num_units=self._action_length,
            nonlinearity=lasagne.nonlinearities.linear)

        inputLayerA = lasagne.layers.InputLayer((None, self._state_length),
                                                State)

        concatLayer = lasagne.layers.ConcatLayer(
            [inputLayerA, self._l_outActA])

        l_hid1A = lasagne.layers.DenseLayer(
            concatLayer,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2A = lasagne.layers.DenseLayer(
            l_hid1A,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3A = lasagne.layers.DenseLayer(
            l_hid2A,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outATarget = lasagne.layers.DenseLayer(
            l_hid3A, num_units=1, nonlinearity=lasagne.nonlinearities.linear)
        # self._b_o = init_b_weights((self._action_length,))

        # self.updateTargetModel()
        inputLayerActB = lasagne.layers.InputLayer((None, self._state_length),
                                                   State)

        l_hid1ActB = lasagne.layers.DenseLayer(
            inputLayerActB,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2ActB = lasagne.layers.DenseLayer(
            l_hid1ActB,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3ActB = lasagne.layers.DenseLayer(
            l_hid2ActB,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outActBTarget = lasagne.layers.DenseLayer(
            l_hid3ActB,
            num_units=self._action_length,
            nonlinearity=lasagne.nonlinearities.linear)

        inputLayerB = lasagne.layers.InputLayer((None, self._state_length),
                                                State)
        concatLayerB = lasagne.layers.ConcatLayer(
            [inputLayerB, self._l_outActB])

        l_hid1B = lasagne.layers.DenseLayer(
            concatLayerB,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2B = lasagne.layers.DenseLayer(
            l_hid1B,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3B = lasagne.layers.DenseLayer(
            l_hid2B,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outBTarget = lasagne.layers.DenseLayer(
            l_hid3B, num_units=1, nonlinearity=lasagne.nonlinearities.linear)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = self.getSettings()['learning_rate']
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']

        self._weight_update_steps = self.getSettings(
        )['steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = self.getSettings()['regularization_weight']

        self._states_shared = theano.shared(
            np.zeros((batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._next_states_shared = theano.shared(
            np.zeros((batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                             broadcastable=(False, True))

        self._actions_shared = theano.shared(
            np.zeros((batch_size, self._action_length),
                     dtype=theano.config.floatX), )

        self._q_valsActA = lasagne.layers.get_output(self._l_outActA, State)
        self._q_valsActB = lasagne.layers.get_output(self._l_outActB, State)
        # self._q_valsActB2 = lasagne.layers.get_output(self._l_outActB, State)

        inputs_ = {
            State: self._states_shared,
            Action: self._q_valsActA,
        }
        self._q_valsA = lasagne.layers.get_output(self._l_outA, inputs_)
        inputs_ = {
            ResultState: self._next_states_shared,
            Action: self._q_valsActB,
        }
        self._q_valsA_B = lasagne.layers.get_output(self._l_outBTarget,
                                                    inputs_)
        inputs_ = {
            State: self._states_shared,
            Action: self._q_valsActB,
        }
        self._q_valsB = lasagne.layers.get_output(self._l_outB, inputs_)
        inputs_ = {
            State: self._next_states_shared,
            Action: self._q_valsActA,
        }
        self._q_valsB_A = lasagne.layers.get_output(self._l_outATarget,
                                                    inputs_)

        self._q_func = self._q_valsA
        self._q_funcAct = self._q_valsActA
        self._q_funcB = self._q_valsB
        self._q_funcActB = self._q_valsActB2

        # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True)

        self._target = (Reward + self._discount_factor * self._q_valsA_B)
        self._diff = self._target - self._q_valsA

        self._targetB = (Reward + self._discount_factor * self._q_valsB_A)
        self._diffB = self._target - self._q_valsB

        self._loss = 0.5 * self._diff**2 + (
            self._decay_weight *
            lasagne.regularization.regularize_network_params(
                self._l_outA, lasagne.regularization.l2))
        self._loss = T.mean(self._loss)

        self._lossB = 0.5 * self._diffB**2 + (
            self._decay_weight *
            lasagne.regularization.regularize_network_params(
                self._l_outB, lasagne.regularization.l2))
        self._lossB = T.mean(self._lossB)

        # assert len(lasagne.layers.helper.get_all_params(self._l_outA)) == 16
        # Need to remove the action layers from these params
        self._params = lasagne.layers.helper.get_all_params(
            self._l_outA
        )[-len(lasagne.layers.helper.get_all_params(self._l_outActA)):]
        self._paramsB = lasagne.layers.helper.get_all_params(
            self._l_outB
        )[-len(lasagne.layers.helper.get_all_params(self._l_outActB)):]
        print("******Number of Layers is: " +
              str(len(lasagne.layers.helper.get_all_params(self._l_outA))))
        print("******Number of Action Layers is: " +
              str(len(lasagne.layers.helper.get_all_params(self._l_outActA))))
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._l_outActA)
        self._actionParamsB = lasagne.layers.helper.get_all_params(
            self._l_outActB)
        self._givens_ = {
            State: self._states_shared,
            # ResultState: self._next_states_shared,
            Reward: self._rewards_shared,
            # Action: self._actions_shared,
        }
        self._actGivens = {
            State: self._states_shared,
            # ResultState: self._next_states_shared,
            # Reward: self._rewards_shared,
            # Action: self._actions_shared,
        }

        # SGD update
        #updates_ = rmsprop(loss, params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        # TD update
        # minimize Value function error
        self._updates_ = rmsprop(
            T.mean(self._q_func) +
            (self._decay_weight *
             lasagne.regularization.regularize_network_params(
                 self._l_outA, lasagne.regularization.l2)), self._params,
            self._learning_rate * -T.mean(self._diff), self._rho,
            self._rms_epsilon)

        self._updates_B = rmsprop(
            T.mean(self._q_funcB) +
            (self._decay_weight *
             lasagne.regularization.regularize_network_params(
                 self._l_outB, lasagne.regularization.l2)), self._paramsB,
            self._learning_rate * -T.mean(self._diffB), self._rho,
            self._rms_epsilon)

        # actDiff1 = (Action - self._q_valsActB) #TODO is this correct?
        # actDiff = (actDiff1 - (Action - self._q_valsActA))
        # actDiff = ((Action - self._q_valsActB2)) # Target network does not work well here?
        #self._actDiff = ((Action - self._q_valsActA)) # Target network does not work well here?
        #self._actLoss = 0.5 * self._actDiff ** 2 + (1e-4 * lasagne.regularization.regularize_network_params( self._l_outActA, lasagne.regularization.l2))
        #self._actLoss = T.mean(self._actLoss)

        # actionUpdates = rmsprop(actLoss +
        #    (1e-4 * lasagne.regularization.regularize_network_params(
        #        self._l_outActA, lasagne.regularization.l2)), actionParams,
        #            self._learning_rate * 0.01 * (-actLoss), self._rho, self._rms_epsilon)

        # Maximize wrt q function

        # theano.gradient.grad_clip(x, lower_bound, upper_bound) # // TODO
        self._actionUpdates = rmsprop(
            T.mean(self._q_func) +
            (self._decay_weight *
             lasagne.regularization.regularize_network_params(
                 self._l_outActA, lasagne.regularization.l2)),
            self._actionParams, self._learning_rate * 0.1, self._rho,
            self._rms_epsilon)

        self._actionUpdatesB = rmsprop(
            T.mean(self._q_funcB) +
            (self._decay_weight *
             lasagne.regularization.regularize_network_params(
                 self._l_outActB, lasagne.regularization.l2)),
            self._actionParamsB, self._learning_rate * 0.1, self._rho,
            self._rms_epsilon)

        self._train = theano.function([], [self._loss, self._q_valsA],
                                      updates=self._updates_,
                                      givens=self._givens_)
        self._trainB = theano.function([], [self._lossB, self._q_valsB],
                                       updates=self._updates_B,
                                       givens=self._givens_)
        self._trainActor = theano.function([], [self._q_valsA],
                                           updates=self._actionUpdates,
                                           givens=self._actGivens)
        self._trainActorB = theano.function([], [self._q_valsB],
                                            updates=self._actionUpdatesB,
                                            givens=self._actGivens)
        self._q_val = theano.function([],
                                      self._q_valsA,
                                      givens={State: self._states_shared})
        self._q_valB = theano.function([],
                                       self._q_valsB,
                                       givens={State: self._states_shared})
        self._q_action = theano.function([],
                                         self._q_valsActA,
                                         givens={State: self._states_shared})
        self._q_actionB = theano.function([],
                                          self._q_valsActB,
                                          givens={State: self._states_shared})
        # self._q_actionB = theano.function([], self._q_valsActB, givens={State: self._states_shared})

        inputs_ = [
            State,
            Reward,
            # ResultState
        ]
        self._bellman_error = theano.function(inputs=inputs_,
                                              outputs=self._diff,
                                              allow_input_downcast=True)
        self._bellman_errorB = theano.function(inputs=inputs_,
                                               outputs=self._diffB,
                                               allow_input_downcast=True)
Example #42
0
 def cost(self, net):
     """Return the mean negative log-likelihood of the targets ``net.y``.

     The log of ``self.output_dropout`` is indexed row by row with the
     column given by ``net.y``; the negated mean of those entries is
     returned.
     """
     n_rows = net.y.shape[0]
     log_output = T.log(self.output_dropout)
     target_log_output = log_output[T.arange(n_rows), net.y]
     return -T.mean(target_log_output)
Example #43
0
    def train(self, X, Y, **kwargs):
        """
        Trains the model on the provided data.

        The inputs and targets are standardised, a neural network is fitted
        to them with Adam, and the output of the layer preceding the
        network's final layer is then used as the design matrix for one
        Bayesian linear regression model per hyperparameter sample. The
        fitted models are stored in ``self.models``.

        Parameters
        ----------
        X: np.ndarray (N, D)
            Input datapoints. The dimensionality of X is (N, D),
            with N as the number of points and D is the number of features.
        Y: np.ndarray (N, T)
            The corresponding target values.
            The dimensionality of Y is (N, T), where N has to
            match the number of points of X and T is the number of objectives
        **kwargs:
            Accepted but not read anywhere in this method.
        """
        # Normalize inputs
        self.X = X
        # np.mean / np.std are called without an axis, so a single mean and a
        # single standard deviation over all entries of X are used.
        self.X_mean = np.mean(X)
        self.X_std = np.std(X)
        self.norm_X = (X - self.X_mean) / self.X_std

        # Never use a minibatch larger than the number of data points.
        if self.X.shape[0] <= self.batch_size:
            batch_size = self.X.shape[0]
        else:
            batch_size = self.batch_size

        # Normalize outputs (again with one mean / std over all entries)
        self.Y_mean = np.mean(Y)
        self.Y_std = np.std(Y)
        self.Y = (Y - self.Y_mean) / self.Y_std
        #self.Y = Y
        start_time = time.time()

        # Create the neural network
        features = X.shape[1]

        # The learning rate lives in a shared variable so that it can be
        # changed between epochs without recompiling the training function.
        self.learning_rate = theano.shared(
            np.array(self.init_learning_rate, dtype=theano.config.floatX))
        self.network = self._build_net(self.input_var, features)

        prediction = lasagne.layers.get_output(self.network)

        # Define loss function for training
        # (mean squared error, scaled by a constant factor of 1 / 0.001)
        loss = T.mean(T.square(prediction - self.target_var)) / 0.001

        # Add l2 regularization for the weights
        l2_penalty = self.l2 * lasagne.regularization.regularize_network_params(
            self.network, lasagne.regularization.l2)
        loss += l2_penalty
        # NOTE(review): loss looks like it is already a scalar at this point,
        # in which case this mean does not change its value - confirm.
        loss = loss.mean()

        params = lasagne.layers.get_all_params(self.network, trainable=True)

        updates = lasagne.updates.adam(loss,
                                       params,
                                       learning_rate=self.learning_rate)

        logging.debug("... compiling theano functions")
        self.train_fn = theano.function([self.input_var, self.target_var],
                                        loss,
                                        updates=updates,
                                        allow_input_downcast=True)

        # Start training
        # lc receives the mean training loss of every epoch; it is not read
        # again inside this method.
        lc = np.zeros([self.num_epochs])
        for epoch in range(self.num_epochs):

            epoch_start_time = time.time()

            # Full pass over the training data:
            train_err = 0
            train_batches = 0

            for batch in self.iterate_minibatches(self.norm_X,
                                                  self.Y,
                                                  batch_size,
                                                  shuffle=True):
                inputs, targets = batch
                train_err += self.train_fn(inputs, targets)
                train_batches += 1

            lc[epoch] = train_err / train_batches
            logging.debug("Epoch {} of {}".format(epoch + 1, self.num_epochs))
            curtime = time.time()
            epoch_time = curtime - epoch_start_time
            total_time = curtime - start_time
            logging.debug("Epoch time {:.3f}s, "
                          "total time {:.3f}s".format(epoch_time, total_time))
            logging.debug("Training loss:\t\t{:.5g}".format(train_err /
                                                            train_batches))

            #Adapt the learning rate
            # NOTE(review): the condition is already true for epoch 0, and the
            # value written is always init_learning_rate * 0.1 (it does not
            # compound), so every epoch after the first runs at that fixed
            # rate. Confirm this is the intended schedule.
            if epoch % self.adapt_epoch == 0:
                self.learning_rate.set_value(
                    np.float32(self.init_learning_rate * 0.1))

        # Design matrix
        # Outputs are requested for every layer except the final one; the
        # last of those, evaluated on the normalised inputs, becomes Theta.
        layers = lasagne.layers.get_all_layers(self.network)
        self.Theta = lasagne.layers.get_output(layers[:-1],
                                               self.norm_X)[-1].eval()

        if self.do_optimize:
            if self.do_mcmc:
                # n_hypers walkers over 2 dimensions - presumably the logs of
                # the two hyperparameters, given the np.exp applied below.
                self.sampler = emcee.EnsembleSampler(
                    self.n_hypers, 2, self.marginal_log_likelihood)

                # Do a burn-in in the first iteration
                if not self.burned:
                    # Initialize the walkers by sampling from the prior
                    self.p0 = self.prior.sample_from_prior(self.n_hypers)
                    # Run MCMC sampling
                    self.p0, _, _ = self.sampler.run_mcmc(
                        self.p0, self.burnin_steps)

                    self.burned = True

                # Start sampling
                pos, _, _ = self.sampler.run_mcmc(self.p0, self.chain_length)

                # Save the current position, it will be the startpoint in
                # the next iteration
                self.p0 = pos

                # Take the last samples from each walker
                self.hypers = np.exp(self.sampler.chain[:, -1])
            else:
                # Optimize hyperparameters of the Bayesian linear regression
                # (starting from a random 2-dimensional point; the result is
                # exponentiated before being stored)
                res = optimize.fmin(self.nll, np.random.rand(2))
                self.hypers = [[np.exp(res[0]), np.exp(res[1])]]
        else:

            # No optimisation requested: use the configured alpha / beta.
            self.hypers = [[self.alpha, self.beta]]

        logging.info("Hypers: %s" % self.hypers)
        self.models = []
        for sample in self.hypers:

            # Instantiate a model for each hyperparameter configuration
            model = BayesianLinearRegression(alpha=sample[0],
                                             beta=sample[1],
                                             basis_func=None)
            # Theta already holds the network features, hence basis_func=None
            # above and no further hyperparameter optimisation here.
            model.train(self.Theta, self.Y, do_optimize=False)

            self.models.append(model)
def build_objective(model, deterministic=False, epsilon=1e-12):
    """Return the mean binary cross-entropy between model output and targets.

    The output of ``model.l_out`` is clipped to ``[epsilon, 1 - epsilon]``
    before the cross-entropy is taken; the output of ``model.l_target`` is
    flattened and used as the target.

    ``deterministic`` is forwarded to ``get_output`` for ``model.l_out``
    only.
    """
    raw_prediction = nn.layers.get_output(model.l_out,
                                          deterministic=deterministic)
    target_output = nn.layers.get_output(model.l_target)
    flat_targets = T.flatten(target_output)
    # Clipping keeps the prediction inside [epsilon, 1 - epsilon].
    clipped_prediction = T.clip(raw_prediction, epsilon, 1. - epsilon)
    per_element_loss = T.nnet.binary_crossentropy(clipped_prediction,
                                                  flat_targets)
    return T.mean(per_element_loss)
Example #45
0
    def __init__(self, rng, input, model_params, self_norm_coeff, activation,
                 dropout, is_test):
        """Assemble the network: embedding layer, hidden layers, softmax.

        Parameters
        ----------
        rng:
            Random number generator handed on to the linear and hidden layers.
        input:
            Symbolic input handed to the linear (embedding) layer.
        model_params:
            6-tuple ``(ngram_size, linear_W_emb, hidden_Ws, hidden_bs,
            softmax_W, softmax_b)``. ``hidden_Ws`` and ``hidden_bs`` are
            indexed in parallel, one entry per hidden layer.
        self_norm_coeff:
            Forwarded to the softmax layer; when it is positive and
            ``is_test == 0`` the log-normaliser statistics are exposed too.
        activation:
            Activation handed to every hidden layer.
        dropout:
            Dropout setting handed to every hidden layer.
        is_test:
            ``1`` additionally exposes ``ind_ll``; ``0`` (together with a
            positive ``self_norm_coeff``) exposes the log-norm statistics.

        Raises
        ------
        AssertionError
            If consecutive layer dimensions do not match.
        """
        (self.ngram_size, linear_W_emb, hidden_Ws, hidden_bs, softmax_W,
         softmax_b) = model_params

        (in_vocab_size, emb_dim) = linear_W_emb.shape
        (softmax_in, softmax_out) = softmax_W.shape
        context_size = self.ngram_size - 1

        self.emb_dim = emb_dim
        self.in_vocab_size = in_vocab_size

        # linear embedding layer
        sys.stderr.write(
            '# linear layer: in_vocab_size=%d, emb_dim=%d, context_size=%d\n' %
            (in_vocab_size, emb_dim, context_size))
        self.linearLayer = LinearLayer(rng, input, emb_dim, context_size,
                                       in_vocab_size, linear_W_emb)

        # hidden layers
        self.hidden_layers = []
        # Width expected by the next layer; starts at the concatenated
        # context embeddings.
        cur_hidden_in = emb_dim * context_size
        self.num_hidden_layers = len(hidden_Ws)
        sys.stderr.write('# hidden layers=%d\n' % self.num_hidden_layers)
        hidden_params = []
        prev_layer = self.linearLayer
        for ii in range(self.num_hidden_layers):
            hidden_W = hidden_Ws[ii]
            hidden_b = hidden_bs[ii]
            (hidden_in, hidden_out) = hidden_W.shape
            assert cur_hidden_in == hidden_in, '! hidden layer %d: cur_hidden_in %d != hidden_in %d\n' % (
                ii + 1, cur_hidden_in, hidden_in)

            sys.stderr.write(
                '  hidden layer %d: hidden_in=%d, hidden_out=%d\n' %
                (ii + 1, hidden_in, hidden_out))
            hidden_layer = HiddenLayer(rng, prev_layer.output, hidden_in,
                                       hidden_out, activation, hidden_W,
                                       hidden_b, dropout)
            self.hidden_layers.append(hidden_layer)
            hidden_params = hidden_params + hidden_layer.params

            cur_hidden_in = hidden_out
            prev_layer = hidden_layer

        # softmax
        # The message takes exactly two values. Passing a third (the loop
        # index) made a failing assert raise TypeError - or NameError when
        # there are no hidden layers - instead of AssertionError.
        assert cur_hidden_in == softmax_in, '! softmax layer: cur_hidden_in %d != softmax_in %d\n' % (
            cur_hidden_in, softmax_in)
        sys.stderr.write('# softmax layer: softmax_in=%d, softmax_out=%d\n' %
                         (softmax_in, softmax_out))
        self.softmaxLayer = SoftmaxLayer(
            self.hidden_layers[self.num_hidden_layers - 1].output, softmax_W,
            softmax_b, self_norm_coeff, is_test)

        # L1
        #self.L1 = abs(self.hidden_layer.W).sum() + abs(self.softmaxLayer.W).sum()

        # L2
        #self.L2 = (self.hidden_layer.W ** 2).sum() + (self.softmaxLayer.W ** 2).sum()

        # nll
        self.nll = self.softmaxLayer.nll

        # sum_ll
        self.sum_ll = self.softmaxLayer.sum_ll

        # ind_ll (only exposed in test mode)
        if is_test == 1:
            self.ind_ll = self.softmaxLayer.ind_ll

        if is_test == 0 and self_norm_coeff > 0:
            # to observe how much we compressed log |Z(x)|
            self.mean_abs_log_norm = T.mean(
                T.abs_(self.softmaxLayer.log_norm))
            # for cost function (log Z(x))^2
            self.mean_square_log_norm = T.mean(
                self.softmaxLayer.log_norm ** 2)

        # params
        self.params = self.linearLayer.params + hidden_params + self.softmaxLayer.params
Example #46
0
def main(train_file,
         val_file,
         savename,
         modelFile,
         num_epochs=500,
         alpha=0.1,
         margin=25,
         base=0.01,
         mb_size=50,
         momentum=0.9,
         synsets=None):
    """Train the CNN, validate after every epoch and save the model.

    Parameters
    ----------
    train_file, val_file:
        Passed to ``get_traindata`` / ``get_valdata`` to obtain the example
        addresses and labels.
    savename:
        Passed to ``save_errors`` for the running error / accuracy logs.
    modelFile:
        Passed to ``save_model``; the network is saved after every epoch.
    num_epochs:
        Number of passes over the training data.
    alpha:
        Regularisation weight; it is divided by the number of training
        examples before being handed to ``regularization``.
    margin, base:
        Passed to ``get_learning_rate`` together with the epoch index.
    mb_size:
        Minibatch size handed to ``data_and_label_generator``.
    momentum:
        Momentum for the clipped Nesterov update.
    synsets:
        Passed to ``get_traindata``.

    The process exits via ``sys.exit()`` as soon as a training minibatch
    produces a NaN loss.
    """
    print("Loading data...")
    print('Alpha: %f' % (alpha, ))
    print('Save name: %s' % (savename, ))
    tr_addresses, tr_labels = get_traindata(train_file, synsets)
    vl_addresses, vl_labels = get_valdata(val_file)
    N = len(tr_addresses)
    print('Num training examples: %i' % (N, ))
    print('Alpha/N: %e' % (alpha / N, ))
    # Variables
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    learning_rate = T.fscalar('learning_rate')
    im_shape = (227, 227)
    max_grad = 1.
    print("Building model and compiling functions...")
    network = build_cnn(im_shape, input_var=input_var)
    # Losses and updates
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean() + regularization(prediction, alpha / N).mean()
    # NOTE(review): `deterministic=False` is interpreted by get_all_params as
    # a parameter-tag filter, not as a forward-pass mode; `trainable=True`
    # may have been intended - confirm.
    params = lasagne.layers.get_all_params(network, deterministic=False)
    #updates = lasagne.updates.nesterov_momentum(loss, params,
    #                                learning_rate=learning_rate,
    #                                momentum=momentum)
    updates = clipped_nesterov_momentum(loss,
                                        params,
                                        learning_rate,
                                        max_grad,
                                        momentum=momentum)
    # Validation and testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    train_acc = T.mean(T.eq(T.argmax(prediction, axis=1), target_var),
                       dtype=theano.config.floatX)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    # Theano functions
    train_fn = theano.function([input_var, target_var, learning_rate],
                               [loss, train_acc],
                               updates=updates)
    val_fn = theano.function([input_var, target_var], test_acc)
    print("Starting training...")
    # We iterate over epochs:
    start_time = time.time()
    for epoch in range(num_epochs):
        # Separate per-epoch timer: start_time keeps measuring total elapsed
        # time for the progress line, while the epoch summary must report
        # only this epoch's duration.
        epoch_start_time = time.time()
        # In each epoch, we do a full pass over the training data.
        # The numeric rate gets its own name so the symbolic `learning_rate`
        # input above is not shadowed.
        epoch_lr = get_learning_rate(epoch, margin, base)
        train_err = 0
        train_batches = 0
        running_error = []
        running_acc = []
        trdlg = data_and_label_generator(tr_addresses,
                                         tr_labels,
                                         im_shape,
                                         mb_size,
                                         shuffle=True)
        for batch in threaded_gen(trdlg, num_cached=500):
            inputs, targets = batch
            local_train_err, local_train_acc = train_fn(
                inputs, targets, epoch_lr)
            train_err += local_train_err
            train_batches += 1
            # Abort the whole run on the first NaN loss.
            if np.isnan(local_train_err):
                sys.exit()
            running_error.append(local_train_err)
            running_acc.append(local_train_acc)
            # Flush the running logs every 257 minibatches. Entries left
            # over at the end of an epoch are dropped when the lists are
            # re-created at the start of the next epoch.
            if train_batches % 257 == 0:
                save_errors(savename, running_error, err_type='error')
                save_errors(savename, running_acc, err_type='acc')
                running_error = []
                running_acc = []
            h, m, s = theTime(start_time)
            sys.stdout.write(
                'Time: %d:%02d:%02d Minibatch: %i Training Error: %f\r' %
                (h, m, s, train_batches, train_err / train_batches))
            sys.stdout.flush()
        # Terminate the '\r'-rewritten progress line. A bare `print` does
        # nothing when print is a function, so write the newline explicitly.
        sys.stdout.write('\n')
        val_acc = 0
        val_batches = 0
        running_val_acc = []
        vldlg = data_and_label_generator(vl_addresses, vl_labels, im_shape,
                                         mb_size)
        for batch in threaded_gen(vldlg, num_cached=50):
            inputs, targets = batch
            val_acc += val_fn(inputs, targets)
            val_batches += 1
            sys.stdout.write('Minibatch: %i Validation Accuracy: %f\r' %
                             (val_batches, val_acc / val_batches * 100))
            sys.stdout.flush()
        running_val_acc.append(val_acc / val_batches)
        save_errors(savename, running_val_acc, err_type='val_acc')
        sys.stdout.write('\n')
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - epoch_start_time))
        print("  train loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  valid acc:\t\t{:.6f}".format(val_acc / val_batches * 100.))
        save_model(network, modelFile)
Example #47
0
 def L2_cost(self, y):
     """Return the mean of the squared error summed along axis 1.

     The error is ``self.output - y``.
     """
     residual = self.output - y
     summed_squares = T.sum(residual ** 2, axis=1)
     return T.mean(summed_squares)
Example #48
0
 def nll(self, y):
     """Return the mean negative log-likelihood of the labels ``y``.

     For every row of ``self.log_p_y_given_x`` the entry in the column
     given by ``y`` is selected; the negated mean of those entries is
     returned.
     """
     row_index = T.arange(y.shape[0])
     selected_log_probs = self.log_p_y_given_x[row_index, y]
     return -T.mean(selected_log_probs)
Example #49
0
    def fit(self,
            X,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=500,
            show_fig=False):
        """Train the recurrent language model on the sequences in ``X``.

        Parameters
        ----------
        X:
            Collection of sequences of word indices. Each sequence is used as
            a Python list (``[0] + X[j]``); index 0 is prepended as the start
            token of the input and index 1 appended as the end token of the
            target.
        learning_rate:
            Step size of the momentum update (the default ``10e-1`` is 1.0).
        mu:
            Momentum coefficient.
        reg:
            Not read anywhere in this method.
        activation:
            Hidden-layer activation; stored as ``self.f``.
        epochs:
            Number of passes over ``X``.
        show_fig:
            If true, plot the per-epoch cost once training has finished.

        The dimensions ``self.D``, ``self.M`` and ``self.V`` must already be
        set on the instance. The method creates the shared parameters,
        ``self.predict_op`` and ``self.train_op``, and prints the cost and
        the fraction of correctly predicted tokens after every epoch.
        """
        N = len(X)
        D = self.D
        M = self.M
        V = self.V
        self.f = activation

        # initial weights
        # (shapes: We is V x D, Wx is D x M, Wh is M x M, Wo is M x V)
        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)

        # make them theano shared
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [
            self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo
        ]

        thX = T.ivector('X')
        Ei = self.We[thX]  # will be a TxD matrix
        thY = T.ivector('Y')

        # sentence input:
        # [START, w1, w2, ..., wn]
        # sentence target:
        # [w1,    w2, w3, ..., END]

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        # Only h is fed back between steps (initial value self.h0); y is
        # collected per step without being fed back.
        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=Ei,
            n_steps=Ei.shape[0],
        )

        # Drop the middle axis of y to get one row of word probabilities per
        # time step.
        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        # Mean negative log-probability assigned to the target words.
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        # One zero-initialised momentum buffer per parameter.
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        # Momentum update: each parameter moves by the new velocity, and the
        # velocity itself is stored for the next step.
        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=[cost, prediction],
                                        updates=updates)

        costs = []
        # Total number of target tokens: every sentence plus its END token.
        n_total = sum((len(sentence) + 1) for sentence in X)
        for i in xrange(epochs):
            X = shuffle(X)
            n_correct = 0
            # From here on `cost` is the running numeric cost of this epoch;
            # the symbolic cost above is already compiled into train_op.
            cost = 0
            for j in xrange(N):
                # problem! many words --> END token are overrepresented
                # result: generated lines will be very short
                # we will try to fix in a later iteration
                # BAD! magic numbers 0 and 1...
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]

                # we set 0 to start and 1 to end
                c, p = self.train_op(input_sequence, output_sequence)
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
            print "i:", i, "cost:", cost, "correct rate:", (float(n_correct) /
                                                            n_total)
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #50
0
 def softmax_cost(self, y):
     """Return the mean over axis-1 sums of the element-wise cross-entropy.

     Each element contributes
     ``-(y * log(output) + (1 - y) * log(1 - output))``.
     """
     on_term = y * T.log(self.output)
     off_term = (1 - y) * T.log(1 - self.output)
     summed_loss = -T.sum(on_term + off_term, axis=1)
     return T.mean(summed_loss)
Example #51
0
# Sanity check: evaluate the last hidden layer on the test batch and print
# its name and output shape.
print l_last_hid.name, lasagne.layers.get_output(l_last_hid, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(
    {x_sym: Tdata, xmask_sym: Tmask}).shape

# Softmax classification layer with NUM_CLASS units on top of the last
# hidden layer.
l_softmax = lasagne.layers.DenseLayer(l_last_hid, num_units=NUM_CLASS,
                                      nonlinearity=lasagne.nonlinearities.softmax,
                                      name='SoftmaxOutput')

# Same shape check for the softmax layer, followed by the parameter count
# of the network up to that layer.
print l_softmax.name, lasagne.layers.get_output(l_softmax, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(
    {x_sym: Tdata, xmask_sym: Tmask}).shape
print lasagne.layers.count_params(l_softmax)

# Training-mode (non-deterministic) network output.
output_train = lasagne.layers.get_output(l_softmax, inputs={l_in: x_sym, l_mask_enc: xmask_sym}, deterministic=False)

# cost function: categorical cross-entropy against the flattened targets,
# averaged over all entries
total_cost = T.nnet.categorical_crossentropy(output_train, y_sym.flatten())
mean_cost = T.mean(total_cost)

# accuracy function
# NOTE(review): the cost uses y_sym.flatten() while the comparison below uses
# y_sym as-is; if y_sym is not 1-D the two do not line up - confirm.
argmax = T.argmax(output_train, axis=-1)
eq = T.eq(argmax,y_sym)
acc = T.mean(eq)

all_parameters = lasagne.layers.get_all_params([l_softmax], trainable=True)

# List every trainable parameter together with the shape of its value.
print "Trainable Model Parameters"
print "-"*40
for param in all_parameters:
    print param, param.get_value().shape
print "-"*40

# Gradients of the mean cost with respect to every trainable parameter.
all_grads = T.grad(mean_cost, all_parameters)
Example #52
0
# Select the test portion of the data by index.
X_te = X[test_ix]
y_te = y[test_ix]

# New softmax output layer with one unit per class, stacked on net['fc7'].
output_layer = DenseLayer(net['fc7'],
                          num_units=len(CLASSES),
                          nonlinearity=softmax)

# Symbolic inputs: a 4-D tensor for the data, an int vector for the labels.
X_sym = T.tensor4()
y_sym = T.ivector()

# Mean categorical cross-entropy of the network output against the labels.
prediction = lasagne.layers.get_output(output_layer, X_sym)
loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym)
loss = loss.mean()

# Fraction of examples whose arg-max prediction equals the label.
acc = T.mean(T.eq(T.argmax(prediction, axis=1), y_sym),
             dtype=theano.config.floatX)

# Nesterov-momentum updates for all trainable parameters below output_layer.
params = lasagne.layers.get_all_params(output_layer, trainable=True)
updates = lasagne.updates.nesterov_momentum(loss,
                                            params,
                                            learning_rate=0.0001,
                                            momentum=0.9)

# train_fn applies the updates and returns the loss; val_fn returns loss and
# accuracy without updating; pred_fn returns the raw predictions.
# NOTE(review): val_fn and pred_fn reuse the `prediction` built without
# deterministic=True - confirm that is intended if the network has
# stochastic layers.
train_fn = theano.function([X_sym, y_sym], loss, updates=updates)
val_fn = theano.function([X_sym, y_sym], [loss, acc])
pred_fn = theano.function([X_sym], prediction)


def batches(iterable, N):
    chunk = []
    for item in iterable:
Example #53
0
    # Softmax output layer with nb_classes units on top of `convpool`.
    network = lasagne.layers.DenseLayer(
        convpool,
        num_units=nb_classes,
        nonlinearity=lasagne.nonlinearities.softmax)

    # Training objective: mean categorical cross-entropy, optimised with
    # Adadelta over all trainable parameters.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adadelta(loss, params, learning_rate=1.0)

    # Evaluation objective: same loss plus accuracy, computed from the
    # deterministic forward pass.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    ##compilation
    # train_fn = theano.function([l_in.input_var, target_var,l_mask.input_var], loss, updates=updates)
    # train_fn applies the updates and returns the training loss.
    train_fn = theano.function([l_in.input_var, target_var],
                               loss,
                               updates=updates)

    # val_fn returns [loss, accuracy]; pred_fn returns the deterministic
    # predictions. Neither applies updates.
    val_fn = theano.function([l_in.input_var, target_var],
                             [test_loss, test_acc])
    pred_fn = theano.function([l_in.input_var], test_prediction)
    # Bookkeeping initialised for the code that follows this fragment
    # (presumably early stopping, judging by the names - confirm).
    patience = 30
    best_valid = 0
    best_valid_epoch = 0
    best_weights = None
    train_history = {}
def train_and_eval_gauss_logit_SB_VAE(
    dataset,
    hidden_layer_sizes,
    hidden_layer_types,
    latent_size,
    activations,
    prior_mu,
    prior_sigma,
    n_epochs,
    batch_size,
    lookahead,
    adam_lr,
    experiment_dir,
    output_file_base_name,
    random_seed):
    """Train a Gauss_Logit_SB_VAE with early stopping and report test error.

    Parameters
    ----------
    dataset:
        Dataset identifier. It must contain ``"mnist_plus_rot"``, ``"mnist"``
        or ``"svhn_pca"`` (checked in that order) and is also handed to the
        matching loader. Anything else prints a message and exits.
    hidden_layer_sizes:
        List of hidden layer widths; the input size is prepended to it.
    hidden_layer_types, activations, latent_size:
        Forwarded to the ``Gauss_Logit_SB_VAE`` constructor.
    prior_mu, prior_sigma:
        Forwarded to ``model.calc_kl_divergence``.
    n_epochs:
        Maximum number of training epochs.
    batch_size:
        Minibatch size; each split contributes ``size // batch_size`` full
        batches, any remainder is not visited.
    lookahead:
        Training stops once more than this many epochs have passed since
        the best validation error.
    adam_lr:
        Learning rate passed to ``get_adam_updates``.
    experiment_dir:
        Directory receiving the results text file and the pickled best
        parameters.
    output_file_base_name:
        Currently not used; the output file names are fixed.
    random_seed:
        Seed for the ``numpy`` random state handed to the model.

    Side effects
    ------------
    Writes ``gauss_logit_SB_VAE_results_.txt`` and
    ``gauss_logit_SB_VAE_params_.pkl`` into ``experiment_dir`` and prints
    progress. If a NaN training value is encountered, the run reports the
    test error of the best parameters so far (when there are any) and then
    calls ``exit()``.
    """
    rng = np.random.RandomState(random_seed)

    # LOAD DATA
    if "mnist_plus_rot" in dataset:
        datasets = load_mnist_w_rotations(dataset, target_as_one_hot=True, flatten=False, split=(70000, 10000, 20000))
        input_layer_size = 28*28
        layer_sizes = [input_layer_size] + hidden_layer_sizes
        out_activation = Sigmoid
        neg_log_likelihood_fn = calc_binaryVal_negative_log_likelihood
        print("Dataset: MNIST+rot")
    elif "mnist" in dataset:
        # We follow the approach used in [2] to split the MNIST dataset.
        datasets = load_mnist(dataset, target_as_one_hot=True, flatten=True, split=(45000, 5000, 10000))
        input_layer_size = 28*28
        layer_sizes = [input_layer_size] + hidden_layer_sizes
        out_activation = Sigmoid
        neg_log_likelihood_fn = calc_binaryVal_negative_log_likelihood
        print("Dataset: MNIST")
    elif "svhn_pca" in dataset:
        datasets = load_svhn_pca(dataset, target_as_one_hot=True, train_valid_split=(65000, 8257))
        input_layer_size = 500
        layer_sizes = [input_layer_size] + hidden_layer_sizes
        out_activation = Identity
        neg_log_likelihood_fn = calc_realVal_negative_log_likelihood
        print("Dataset: SVHN (PCA reduced)")
    else:
        print("no data found...")
        exit()

    # Only the inputs are needed; the targets of each split are discarded.
    train_set_x, _ = datasets[0]
    valid_set_x, _ = datasets[1]
    test_set_x, _ = datasets[2]

    train_set_size = int(train_set_x.shape[0].eval())
    valid_set_size = int(valid_set_x.shape[0].eval())
    test_set_size = int(test_set_x.shape[0].eval())
    print('Datasets loaded ({:,} train | {:,} valid | {:,} test)'.format(train_set_size, valid_set_size, test_set_size))

    # compute number of minibatches for training, validation and testing
    # (explicit floor division: the counts are used as range() bounds)
    n_train_batches = train_set_size // batch_size
    n_test_batches = test_set_size // batch_size
    n_valid_batches = valid_set_size // batch_size

    # BUILD MODEL
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')
    # construct the Gaussian Variational Autoencoder
    model = Gauss_Logit_SB_VAE(rng=rng, input=x, batch_size=batch_size, layer_sizes=layer_sizes, layer_types=hidden_layer_types,
                         activations=activations, latent_size=latent_size, out_activation=out_activation)

    # Build the expression for the cost function.
    data_ll_term = neg_log_likelihood_fn(x, model.x_recon)
    kl = model.calc_kl_divergence(prior_mu=prior_mu, prior_sigma=prior_sigma)

    # Compose into final costs
    cost = T.mean( data_ll_term + kl )

    updates = get_adam_updates(cost=cost, params=model.params, lr=adam_lr)

    # Compile theano function for testing.
    test_model = theano.function(
        inputs = [index],
        outputs = T.mean(neg_log_likelihood_fn(x, model.x_recon)),
        givens = {x: test_set_x[index * batch_size:(index + 1) * batch_size]})

    # Compile theano function for validation.
    valid_model = theano.function(
        inputs = [index],
        outputs = T.mean(neg_log_likelihood_fn(x, model.x_recon)),
        givens = {x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    # Compile theano function for training.
    train_model = theano.function(
        inputs = [index],
        outputs = [data_ll_term.mean(), kl.mean()],
        updates = updates,
        givens = {x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    # TRAIN MODEL #
    print('Training for {} epochs ...'.format(n_epochs))

    best_params = None
    best_valid_error = np.inf
    best_iter = 0
    # NOTE(review): total time is measured with time.clock() while the
    # per-epoch time below uses time.time(); the two clocks are not
    # comparable on every platform - confirm which one is wanted.
    start_time = time.clock()
    results_file_name = pjoin(experiment_dir, "gauss_logit_SB_VAE_results_.txt")
    results_file = open(results_file_name, 'w')

    stop_training = False
    for epoch_counter in range(n_epochs):
        if stop_training:
            break

        # Train this epoch
        epoch_start_time = time.time()
        avg_training_nll_tracker = 0.
        avg_training_kl_tracker = 0.

        for minibatch_index in range(n_train_batches):
            avg_training_nll, avg_training_kl = train_model(minibatch_index)

            # check for NaN, test model anyway even if one is detected
            if np.isnan(avg_training_nll) or np.isnan(avg_training_kl):
                # Take the end time here: this branch used to read
                # `end_time` before it was ever assigned.
                end_time = time.clock()
                print("found NaN...aborting training...")
                results_file.write("found NaN...aborting training... \n\n")
                # best_params stays None if no epoch ever improved on the
                # initial validation error; there is nothing to restore then.
                if epoch_counter > 0 and best_params is not None:
                    for param, best_param in zip(model.params, best_params):
                        param.set_value(best_param)
                    test_error = sum([test_model(i) for i in range(n_test_batches)]) / n_test_batches
                    results = "Ended due to NaN! best epoch {}, best valid error {:.4f}, test error {:.4f}, training time {:.2f}m"
                    results = results.format(best_iter, best_valid_error, test_error, (end_time-start_time)/60)
                    print(results)
                    results_file.write(results + "\n")
                results_file.close()
                exit()

            avg_training_nll_tracker += avg_training_nll
            avg_training_kl_tracker += avg_training_kl

        epoch_end_time = time.time()

        # Compute some infos about training.
        avg_training_nll_tracker /= (minibatch_index+1)
        avg_training_kl_tracker /= (minibatch_index+1)

        # Compute validation error
        valid_error = sum([valid_model(i) for i in range(n_valid_batches)])/n_valid_batches

        results = "epoch {}, training loss (NLL) {:.4f}, training kl divergence {:.4f}, valid error {:.4f}, time {:.2f} "

        if valid_error < best_valid_error:
            best_iter = epoch_counter
            best_valid_error = valid_error
            results += " ***"
            # Save progression
            best_params = [param.get_value().copy() for param in model.params]
            # Close the pickle file deterministically instead of leaking the
            # handle.
            with open(pjoin(experiment_dir, 'gauss_logit_SB_VAE_params_.pkl'), 'wb') as params_file:
                cPickle.dump(best_params, params_file, protocol=cPickle.HIGHEST_PROTOCOL)
        elif epoch_counter-best_iter > lookahead:
            stop_training = True

        # Report and save progress.
        results = results.format(epoch_counter, avg_training_nll_tracker, avg_training_kl_tracker, valid_error, (epoch_end_time-epoch_start_time)/60)
        print(results)
        results_file.write(results + "\n")
        results_file.flush()

    end_time = time.clock()

    # Reload best model (skipped when no epoch produced a best snapshot).
    if best_params is not None:
        for param, best_param in zip(model.params, best_params):
            param.set_value(best_param)

    # Compute test error on best epoch
    test_error = sum([test_model(i) for i in range(n_test_batches)])/n_test_batches

    results = "Done! best epoch {}, best valid error {:.4f}, test error {:.4f}, training time {:.2f}m"
    results = results.format(best_iter, best_valid_error, test_error, (end_time-start_time)/60)
    print(results)
    results_file.write(results + "\n")
    results_file.close()

    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')
Example #55
0
 def get_cost(self, y):
     """Return the mean binary cross-entropy of `self.p_y_given_x` against `y`.

     `y` holds the targets; each element contributes
     ``-(y*log(p) + (1-y)*log(1-p))`` and the result is averaged over all
     elements with `T.mean`.
     """
     p = self.p_y_given_x
     # Log-likelihood contributions of the "on" and "off" targets.
     on_term = y * T.log(p)
     off_term = (1 - y) * T.log(1 - p)
     return -T.mean(on_term + off_term)
    def build_functions(self, train=False, debug=False, logger=logger_RNNtools):
        """Compile the Theano functions used to evaluate (and optionally train) the network.

        Always sets on `self`: `valid_targets_fn`, `valid_predictions_fn`,
        `top1_acc_fn`, `top3_acc_fn`, `validate_fn` and `cost_pointwise_fn`.
        With `debug=True` it additionally sets `predictions_fn`, `output_fn`
        and `valid_network_output_fn`; with `train=True` it additionally sets
        `updates` and `train_fn`.

        Parameters
        ----------
        train : bool, optional
            If True, build Adam updates for all trainable parameters and
            compile `train_fn`, which takes the learning rate as last input.
        debug : bool, optional
            If True, compile extra inspection functions and run them on
            `self.X`, `self.masks` and `self.Y`, logging shapes and values.
            On an exception in these checks the traceback is printed and a
            pdb session is opened.
        logger : optional
            Object whose `debug` method receives the diagnostic output.
        """
        # LSTM in lasagne: see https://github.com/craffel/Lasagne-tutorial/blob/master/examples/recurrent.py
        # and also         http://colinraffel.com/talks/hammer2015recurrent.pdf
        target_var = self.audio_targets_var  # T.imatrix('audio_targets')

        network_output = L.get_output(self.network_lout_batch)
        # Flattened output, presumably (batch_size * batch_max_seq_length, nb_phonemes) -- TODO confirm
        network_output_flattened = L.get_output(self.network_lout)

        # Predicted class = index of the highest output along axis 2.
        # NOTE(review): axes are presumably (batch, time frames, phoneme classes) -- confirm.
        predictions = T.argmax(network_output, axis=2)

        if debug:
            self.predictions_fn = theano.function([self.audio_inputs_var, self.audio_masks_var], predictions,
                                                  name='predictions_fn')
            predicted = self.predictions_fn(self.X, self.masks)
            logger.debug('predictions_fn(X).shape: %s', predicted.shape)

            self.output_fn = theano.function([self.audio_inputs_var, self.audio_masks_var], network_output,
                                             name='output_fn')
            n_out = self.output_fn(self.X, self.masks)
            logger.debug('network_output.shape: \t%s', n_out.shape)

        # Indices (which example, which frame in the example) of the non-zero mask entries.
        valid_indices_example, valid_indices_seqNr = self.audio_masks_var.nonzero()
        valid_indices_fn = theano.function([self.audio_masks_var], [valid_indices_example, valid_indices_seqNr],
                                           name='valid_indices_fn')

        # Indexing with both index vectors yields a FLATTENED array of all valid predictions
        # of the batch (not one row per example). To recover per-example predictions the
        # caller has to split it using the number of valid frames per example.
        valid_predictions = predictions[valid_indices_example, valid_indices_seqNr]
        valid_targets = target_var[valid_indices_example, valid_indices_seqNr]
        self.valid_targets_fn = theano.function([self.audio_masks_var, target_var], valid_targets,
                                                name='valid_targets_fn')
        self.valid_predictions_fn = theano.function([self.audio_inputs_var, self.audio_masks_var],
                                                    valid_predictions, name='valid_predictions_fn')

        # Network output restricted to the valid (unmasked) frames.
        valid_network_output = network_output[valid_indices_example, valid_indices_seqNr]
        if debug:
            self.valid_network_output_fn = theano.function([self.audio_inputs_var, self.audio_masks_var],
                                                           valid_network_output)

        # Accuracy over the valid frames only.
        top1_acc = T.mean(lasagne.objectives.categorical_accuracy(valid_network_output, valid_targets, top_k=1))
        self.top1_acc_fn = theano.function(
                [self.audio_inputs_var, self.audio_masks_var, self.audio_targets_var], top1_acc)
        top3_acc = T.mean(lasagne.objectives.categorical_accuracy(valid_network_output, valid_targets, top_k=3))
        self.top3_acc_fn = theano.function(
                [self.audio_inputs_var, self.audio_masks_var, self.audio_targets_var], top3_acc)

        # NOTE: an earlier variant based on a lasagne SliceLayer only worked with
        # batch_size == 1 and has been dropped in favour of the mask indexing above.

        if debug:
            try:
                valid_example, valid_seqNr = valid_indices_fn(self.masks)
                logger.debug('valid_inds(masks).shape: %s', valid_example.shape)

                valid_output = self.valid_network_output_fn(self.X, self.masks)
                logger.debug("all valid outputs of this batch: ")
                logger.debug('valid_output.shape: %s', valid_output.shape)

                valid_preds = self.valid_predictions_fn(self.X, self.masks)
                logger.debug("all valid predictions of this batch: ")
                logger.debug('valid_preds.shape: %s', valid_preds.shape)
                logger.debug('valid_preds, value: \n%s', valid_preds)

                valid_targs = self.valid_targets_fn(self.masks, self.Y)
                logger.debug('valid_targets.shape: %s', valid_targs.shape)
                logger.debug('valid_targets, value: \n%s', valid_targs)

                top1 = self.top1_acc_fn(self.X, self.masks, self.Y)
                logger.debug("top 1 accuracy: %s", top1*100.0)

                top3 = self.top3_acc_fn(self.X, self.masks, self.Y)
                logger.debug("top 3 accuracy: %s", top3*100.0)

            except Exception:
                print('caught this error: ' + traceback.format_exc())
                import pdb
                pdb.set_trace()

        ## from https://groups.google.com/forum/#!topic/lasagne-users/os0j3f_Th5Q
        # Pad your vector of labels and then mask the cost:
        # It's important to pad the label vectors with something valid such as zeros,
        # since they will still have to give valid costs that can be multiplied by the mask.
        # The shape of predictions, targets and mask should match:
        # predictions as (batch_size*max_seq_len, n_features), the other two as
        # (batch_size*max_seq_len,) -> hence the flattened network output is used here.
        cost_pointwise = lasagne.objectives.categorical_crossentropy(network_output_flattened, target_var.flatten())
        cost = lasagne.objectives.aggregate(cost_pointwise, self.audio_masks_var.flatten())

        # L2 weight decay on all regularizable parameters of the network.
        weight_decay = 1e-5
        weightsl2 = lasagne.regularization.regularize_network_params(self.network_lout, lasagne.regularization.l2)
        cost += weight_decay * weightsl2

        self.validate_fn = theano.function([self.audio_inputs_var, self.audio_masks_var,
                                            self.audio_targets_var],
                                           [cost, top1_acc, top3_acc], name='validate_fn')
        self.cost_pointwise_fn = theano.function([self.audio_inputs_var, self.audio_masks_var, target_var],
                                                 cost_pointwise, name='cost_pointwise_fn')

        if debug:
            logger.debug('cost pointwise: %s', self.cost_pointwise_fn(self.X, self.masks, self.Y))

            try:
                evaluate_cost = self.validate_fn(self.X, self.masks, self.Y)
            except Exception:
                print('caught this error: ' + traceback.format_exc())
                # Import here: an `import pdb` elsewhere in this function makes `pdb`
                # a local name, so it must be bound on this path as well.
                import pdb
                pdb.set_trace()
            else:
                # Only report when the evaluation succeeded; otherwise
                # `evaluate_cost` does not exist.
                logger.debug('cost:     {:.3f}'.format(float(evaluate_cost[0])))
                logger.debug('accuracy: {:.3f}'.format(float(evaluate_cost[1]*100.0)))
                logger.debug('top 3 accuracy: {:.3f}'.format(float(evaluate_cost[2]*100.0)))

        if train:
            LR = T.scalar('LR', dtype=theano.config.floatX)
            # Retrieve all trainable parameters from the network
            all_params = L.get_all_params(self.network_lout, trainable=True)
            self.updates = lasagne.updates.adam(loss_or_grads=cost, params=all_params, learning_rate=LR)
            self.train_fn = theano.function([self.audio_inputs_var, self.audio_masks_var,
                                             target_var, LR],
                                            [cost, top1_acc, top3_acc], updates=self.updates, name='train_fn')
Example #57
0
    def train(self, savefile, task, recover=True):
        """
        Train the RNN.

        Builds the initial weights, the symbolic Theano graph of the network,
        the loss and the regularization terms from the parameters in `self.p`,
        prints a summary of the settings, and hands everything to `SGD` for the
        actual optimization.

        Parameters
        ----------

        savefile : str
                   Path of the save file, passed on to `SGD.train`. It is deleted
                   first if it exists and `recover` is `False`.

        task : function
               Task specification, passed on to `Dataset`.

        recover : bool, optional
                  If `True`, will attempt to recover from a previously saved run.

        Raises
        ------

        ValueError
            If Dale's law is used (`self.p['ei']` is not `None`) and
            `self.p['ei_positive_func']` is neither `'abs'` nor `'rectify'`.

        """
        N     = self.p['N']
        Nin   = self.p['Nin']
        Nout  = self.p['Nout']
        alpha = self.p['dt']/self.p['tau']

        # Initialize settings
        settings = OrderedDict()

        # Check if file already exists
        if not recover:
            if os.path.isfile(savefile):
                os.remove(savefile)

        #---------------------------------------------------------------------------------
        # Are we using GPUs?
        #---------------------------------------------------------------------------------

        if theanotools.get_processor_type() == 'gpu':
            settings['GPU'] = 'enabled'
        else:
            settings['GPU'] = 'no'

        #---------------------------------------------------------------------------------
        # Random number generator
        #---------------------------------------------------------------------------------

        settings['init seed'] = self.p['seed']
        rng = np.random.RandomState(self.p['seed'])

        #---------------------------------------------------------------------------------
        # Weight initialization
        #---------------------------------------------------------------------------------

        settings['distribution (Win)']  = self.p['distribution_in']
        settings['distribution (Wrec)'] = self.p['distribution_rec']
        settings['distribution (Wout)'] = self.p['distribution_out']

        # Win_0 only exists when the network has inputs.
        if Nin > 0:
            Win_0 = self.init_weights(rng, self.p['Cin'], N, Nin,
                                      self.p['distribution_in'])
        Wrec_0 = self.init_weights(rng, self.p['Crec'],
                                   N, N, self.p['distribution_rec'])
        Wout_0 = self.init_weights(rng, self.p['Cout'],
                                   Nout, N, self.p['distribution_out'])

        #---------------------------------------------------------------------------------
        # Enforce Dale's law on the initial weights
        #---------------------------------------------------------------------------------

        settings['Nin/N/Nout'] = '{}/{}/{}'.format(Nin, N, Nout)

        if self.p['ei'] is not None:
            Nexc = len(np.where(self.p['ei'] > 0)[0])
            Ninh = len(np.where(self.p['ei'] < 0)[0])
            settings['Dale\'s law'] = 'E/I = {}/{}'.format(Nexc, Ninh)

            if Nin > 0:
                Win_0 = abs(Win_0) # If Dale, assume inputs are excitatory
            Wrec_0 = abs(Wrec_0)
            Wout_0 = abs(Wout_0)
        else:
            settings['Dale\'s law'] = 'no'

        #---------------------------------------------------------------------------------
        # Fix spectral radius
        #---------------------------------------------------------------------------------

        # Compute spectral radius
        C = self.p['Crec']
        if C is not None:
            Wrec_0_full = C.mask_plastic*Wrec_0 + C.mask_fixed
        else:
            Wrec_0_full = Wrec_0
        if self.p['ei'] is not None:
            Wrec_0_full = Wrec_0_full*self.p['ei']
        rho = RNN.spectral_radius(Wrec_0_full)

        # Scale Wrec to have fixed spectral radius
        if self.p['ei'] is not None:
            R = self.p['rho0']/rho
        else:
            R = 1.1/rho
        Wrec_0 *= R
        if C is not None:
            # NOTE(review): this rescales the `mask_fixed` attribute of
            # self.p['Crec'] itself (in place if it is a NumPy array), so the
            # change persists after train() returns -- confirm this is intended.
            C.mask_fixed *= R

        # Check spectral radius
        if C is not None:
            Wrec_0_full = C.mask_plastic*Wrec_0 + C.mask_fixed
        else:
            Wrec_0_full = Wrec_0
        if self.p['ei'] is not None:
            Wrec_0_full = Wrec_0_full*self.p['ei']
        rho = RNN.spectral_radius(Wrec_0_full)
        settings['initial spectral radius'] = '{:.2f}'.format(rho)

        #---------------------------------------------------------------------------------
        # Others
        #---------------------------------------------------------------------------------

        brec_0 = self.p['brec']*np.ones(N)
        bout_0 = self.p['bout']*np.ones(Nout)
        x0_0   = self.p['x0']*np.ones(N)

        #---------------------------------------------------------------------------------
        # RNN parameters
        #---------------------------------------------------------------------------------

        if Nin > 0:
            Win = theanotools.shared(Win_0, name='Win')
        else:
            Win = None
        Wrec = theanotools.shared(Wrec_0, name='Wrec')
        Wout = theanotools.shared(Wout_0, name='Wout')
        brec = theanotools.shared(brec_0, name='brec')
        bout = theanotools.shared(bout_0, name='bout')
        x0   = theanotools.shared(x0_0,   name='x0')

        #---------------------------------------------------------------------------------
        # Parameters to train
        #---------------------------------------------------------------------------------

        trainables = []
        if Win is not None:
            trainables += [Win]
        trainables += [Wrec]
        if Wout is not None:
            trainables += [Wout]

        if self.p['train_brec']:
            settings['train recurrent bias'] = 'yes'
            trainables += [brec]
        else:
            settings['train recurrent bias'] = 'no'

        if self.p['train_bout']:
            settings['train output bias'] = 'yes'
            trainables += [bout]
        else:
            settings['train output bias'] = 'no'

        # In continuous mode it doesn't make sense to train x0, which is forgotten
        if self.p['mode'] == 'continuous':
            self.p['train_x0'] = False

        if self.p['train_x0']:
            settings['train initial conditions'] = 'yes'
            trainables += [x0]
        else:
            settings['train initial conditions'] = 'no'

        #---------------------------------------------------------------------------------
        # Weight matrices
        #---------------------------------------------------------------------------------

        # Effective weights: plastic part of the trainable matrix plus the fixed part,
        # whenever a connectivity specification is given.

        # Input
        if Nin > 0:
            if self.p['Cin'] is not None:
                C = self.p['Cin']
                settings['sparseness (Win)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                                .format(C.p, C.p_plastic))

                Cin_mask_plastic = theanotools.shared(C.mask_plastic)
                Cin_mask_fixed   = theanotools.shared(C.mask_fixed)

                Win_ = Cin_mask_plastic*Win + Cin_mask_fixed
                Win_.name = 'Win_'
            else:
                Win_ = Win

        # Recurrent
        if self.p['Crec'] is not None:
            C = self.p['Crec']
            settings['sparseness (Wrec)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                             .format(C.p, C.p_plastic))

            Crec_mask_plastic = theanotools.shared(C.mask_plastic)
            Crec_mask_fixed   = theanotools.shared(C.mask_fixed)

            Wrec_ = Crec_mask_plastic*Wrec + Crec_mask_fixed
            Wrec_.name = 'Wrec_'
        else:
            Wrec_ = Wrec

        # Output
        if self.p['Cout'] is not None:
            C = self.p['Cout']
            settings['sparseness (Wout)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                             .format(C.p, C.p_plastic))

            Cout_mask_plastic = theanotools.shared(C.mask_plastic)
            Cout_mask_fixed   = theanotools.shared(C.mask_fixed)

            Wout_ = Cout_mask_plastic*Wout + Cout_mask_fixed
            Wout_.name = 'Wout_'
        else:
            Wout_ = Wout

        #---------------------------------------------------------------------------------
        # Dale's law
        #---------------------------------------------------------------------------------

        if self.p['ei'] is not None:
            # Function to keep matrix elements positive
            if self.p['ei_positive_func'] == 'abs':
                settings['E/I positivity function'] = 'absolute value'
                make_positive = abs
            elif self.p['ei_positive_func'] == 'rectify':
                settings['E/I positivity function'] = 'rectify'
                make_positive = theanotools.rectify
            else:
                raise ValueError("Unknown ei_positive_func.")

            # Assume inputs are excitatory
            if Nin > 0:
                Win_ = make_positive(Win_)

            # E/I
            ei    = theanotools.shared(self.p['ei'], name='ei')
            Wrec_ = make_positive(Wrec_)*ei

            Wout_ = make_positive(Wout_)*ei

        #---------------------------------------------------------------------------------
        # Variables to save
        #---------------------------------------------------------------------------------

        # The first slot is None when there is no input weight matrix.
        if Nin > 0:
            save_values = [Win_]
        else:
            save_values = [None]
        save_values += [Wrec_, Wout_, brec, bout, x0]

        #---------------------------------------------------------------------------------
        # Activation functions
        #---------------------------------------------------------------------------------

        f_hidden, d_f_hidden = theanotools.hidden_activations[self.p['hidden_activation']]
        settings['hidden activation'] = self.p['hidden_activation']

        act = self.p['output_activation']
        f_output = theanotools.output_activations[act]

        # The loss is chosen to match the output activation.
        if act == 'sigmoid':
            settings['output activation/loss'] = 'sigmoid/binary cross entropy'
            f_loss = theanotools.binary_crossentropy
        elif act == 'softmax':
            settings['output activation/loss'] = 'softmax/categorical cross entropy'
            f_loss = theanotools.categorical_crossentropy
        else:
            settings['output activation/loss'] = act + '/squared'
            f_loss = theanotools.L2

        #---------------------------------------------------------------------------------
        # RNN
        #---------------------------------------------------------------------------------

        # Dims: time, trials, units
        # u[:,:,:Nin]  contains the inputs (including baseline and noise),
        # u[:,:,Nin:]  contains the recurrent noise
        u   = T.tensor3('u')
        x0_ = T.alloc(x0, u.shape[1], x0.shape[0])

        if Nin > 0:
            def rnn(u_t, x_tm1, r_tm1, WinT, WrecT):
                x_t = ((1 - alpha)*x_tm1
                       + alpha*(T.dot(r_tm1, WrecT)        # Recurrent
                                + brec                     # Bias
                                + T.dot(u_t[:,:Nin], WinT) # Input
                                + u_t[:,Nin:])             # Recurrent noise
                       )
                r_t = f_hidden(x_t)

                return [x_t, r_t]

            [x, r], _ = theano.scan(fn=rnn,
                                    outputs_info=[x0_, f_hidden(x0_)],
                                    sequences=u,
                                    non_sequences=[Win_.T, Wrec_.T])
        else:
            def rnn(u_t, x_tm1, r_tm1, WrecT):
                x_t = ((1 - alpha)*x_tm1
                       + alpha*(T.dot(r_tm1, WrecT) # Recurrent
                                + brec              # Bias
                                + u_t[:,Nin:]) # Recurrent noise
                       )
                r_t = f_hidden(x_t)

                return [x_t, r_t]

            [x, r], _ = theano.scan(fn=rnn,
                                    outputs_info=[x0_, f_hidden(x0_)],
                                    sequences=u,
                                    non_sequences=[Wrec_.T])

        #---------------------------------------------------------------------------------
        # Running mode
        #---------------------------------------------------------------------------------

        if self.p['mode'] == 'continuous':
            settings['mode'] = 'continuous'

            if self.p['n_gradient'] != 1:
                print("[ Trainer.train ] In continuous mode,"
                      " so we're setting n_gradient to 1.")
                self.p['n_gradient'] = 1

            # NOTE(review): x0_ is not read again in this method after this
            # assignment -- confirm whether it was meant to be passed to SGD.
            x0_ = x[-1]
        else:
            settings['mode'] = 'batch'

        #---------------------------------------------------------------------------------
        # Readout
        #---------------------------------------------------------------------------------

        z = f_output(T.dot(r, Wout_.T) + bout)

        #---------------------------------------------------------------------------------
        # Deduce whether the task specification contains an output mask -- use a
        # temporary dataset so it doesn't affect the training.
        #---------------------------------------------------------------------------------

        dataset = Dataset(1, task, self.floatX, self.p, name='gradient')
        if dataset.has_output_mask():
            settings['output mask'] = 'yes'
        else:
            settings['output mask'] = 'no'

        #---------------------------------------------------------------------------------
        # Loss
        #---------------------------------------------------------------------------------

        # (time, trials, outputs)
        target = T.tensor3('target')

        # Set mask
        mask     = target[:,:,Nout:]
        masknorm = T.sum(mask)

        # Input-output pairs
        inputs = [u, target]
        # target[:,:,:Nout] contains the target outputs, &
        # target[:,:,Nout:] contains the mask.

        # Loss, not including the regularization terms
        loss = T.sum(f_loss(z, target[:,:,:Nout])*mask)/masknorm

        # Root-mean-squared error
        error = T.sqrt(T.sum(theanotools.L2(z, target[:,:,:Nout])*mask)/masknorm)

        #---------------------------------------------------------------------------------
        # Regularization terms
        #---------------------------------------------------------------------------------

        regs = 0

        #---------------------------------------------------------------------------------
        # L1 weight regularization
        #---------------------------------------------------------------------------------

        # Win is None when there are no inputs, so only penalize it when it exists
        # (same guard as for the L2 term below).
        if Nin > 0:
            lambda1 = self.p['lambda1_in']
            if lambda1 > 0:
                settings['L1 weight regularization (Win)'] = ('lambda1_in = {}'
                                                              .format(lambda1))
                regs += lambda1 * T.mean(abs(Win))

        lambda1 = self.p['lambda1_rec']
        if lambda1 > 0:
            settings['L1 weight regularization (Wrec)'] = ('lambda1_rec = {}'
                                                           .format(lambda1))
            regs += lambda1 * T.mean(abs(Wrec))

        lambda1 = self.p['lambda1_out']
        if lambda1 > 0:
            settings['L1 weight regularization (Wout)'] = ('lambda1_out = {}'
                                                           .format(lambda1))
            regs += lambda1 * T.mean(abs(Wout))

        #---------------------------------------------------------------------------------
        # L2 weight regularization
        #---------------------------------------------------------------------------------

        if Nin > 0:
            lambda2 = self.p['lambda2_in']
            if lambda2 > 0:
                settings['L2 weight regularization (Win)'] = ('lambda2_in = {}'
                                                              .format(lambda2))
                regs += lambda2 * T.mean(Win**2)

        lambda2 = self.p['lambda2_rec']
        if lambda2 > 0:
            settings['L2 weight regularization (Wrec)'] = ('lambda2_rec = {}'
                                                           .format(lambda2))
            regs += lambda2 * T.mean(Wrec**2)

        lambda2 = self.p['lambda2_out']
        if lambda2 > 0:
            settings['L2 weight regularization (Wout)'] = ('lambda2_out = {}'
                                                           .format(lambda2))
            regs += lambda2 * T.mean(Wout**2)

        #---------------------------------------------------------------------------------
        # L2 rate regularization
        #---------------------------------------------------------------------------------

        lambda2 = self.p['lambda2_r']
        if lambda2 > 0:
            settings['L2 rate regularization'] = 'lambda2_r = {}'.format(lambda2)
            regs += lambda2 * T.mean(r**2)

        #---------------------------------------------------------------------------------
        # Final costs
        #---------------------------------------------------------------------------------

        costs = [loss, error]

        #---------------------------------------------------------------------------------
        # Datasets
        #---------------------------------------------------------------------------------

        gradient_data   = Dataset(self.p['n_gradient'], task, self.floatX, self.p,
                                  batch_size=self.p['gradient_batch_size'],
                                  seed=self.p['gradient_seed'],
                                  name='gradient')
        validation_data = Dataset(self.p['n_validation'], task, self.floatX, self.p,
                                  batch_size=self.p['validation_batch_size'],
                                  seed=self.p['validation_seed'],
                                  name='validation')

        # Input noise
        if np.isscalar(self.p['var_in']):
            if Nin > 0:
                settings['sigma_in'] = '{}'.format(np.sqrt(self.p['var_in']))
        else:
            settings['sigma_in'] = 'array'

        # Recurrent noise
        if np.isscalar(self.p['var_rec']):
            settings['sigma_rec'] = '{}'.format(np.sqrt(self.p['var_rec']))
        else:
            settings['sigma_rec'] = 'array'

        # Dataset settings
        settings['rectify inputs']            = self.p['rectify_inputs']
        settings['gradient minibatch size']   = gradient_data.minibatch_size
        settings['validation minibatch size'] = validation_data.minibatch_size

        #---------------------------------------------------------------------------------
        # Other settings
        #---------------------------------------------------------------------------------

        settings['dt'] = '{} ms'.format(self.p['dt'])
        if np.isscalar(self.p['tau']):
            settings['tau'] = '{} ms'.format(self.p['tau'])
        else:
            settings['tau'] = 'custom'
        settings['tau_in']            = '{} ms'.format(self.p['tau_in'])
        settings['learning rate']     = '{}'.format(self.p['learning_rate'])
        settings['lambda_Omega']      = '{}'.format(self.p['lambda_Omega'])
        settings['max gradient norm'] = '{}'.format(self.p['max_gradient_norm'])

        #---------------------------------------------------------------------------------
        # A few important Theano settings
        #---------------------------------------------------------------------------------

        settings['(Theano) floatX']   = self.floatX
        settings['(Theano) allow_gc'] = theano.config.allow_gc

        #---------------------------------------------------------------------------------
        # Train!
        #---------------------------------------------------------------------------------

        print_settings(settings)

        sgd = SGD(trainables, inputs, costs, regs, x, z, self.p, save_values,
                  {'Wrec_': Wrec_, 'd_f_hidden': d_f_hidden})
        sgd.train(gradient_data, validation_data, savefile)
# PARAMETERS
memory_cell_size = 10
memory_cell_count = 2

# Two separately constructed LSTM cells, both built on the same input `x`.
cell1 = LSTMCell(x, input_dim, memory_cell_size)
cell2 = LSTMCell(x, input_dim, memory_cell_size)

# hidden to output: the readout acts on the concatenation of both cells' hidden
# states, hence memory_cell_size*memory_cell_count rows.
wy = theano.shared(value=init_array((memory_cell_size*memory_cell_count, output_dim)), name='wy')
by = theano.shared(value=init_array(output_dim), name='by')

h1, c1 = cell1.forward_pass()
h2, c2 = cell2.forward_pass()
# Readout of the concatenated hidden states, summed over axis 0 before the sigmoid.
# NOTE(review): axis 0 is presumably time -- confirm against LSTMCell.forward_pass.
prediction = T.nnet.sigmoid(T.sum(T.dot(T.concatenate([h1, h2], axis=1), wy) + by, axis=0))

nll = T.mean(T.nnet.binary_crossentropy(prediction, y))

params = cell1.params + cell2.params + [wy, by]

# training
lr = 0.005

# Plain gradient descent: p <- p - lr * d(nll)/dp.
dparams = T.grad(nll, params)
# Build the OrderedDict from (key, value) pairs so the update order follows
# `params`; going through a dict first would lose that order on Python < 3.7.
updates = OrderedDict((p, p - lr*dp) for p, dp in zip(params, dparams))

train = theano.function(inputs=[x, y], outputs=nll, updates=updates)
test = theano.function(inputs=[x, y], outputs=nll)
predict = theano.function(inputs=[x], outputs=prediction)

# number of training epochs, i.e., passes over training set.
epoch_count = 10
Example #59
0
def main(n=6, num_epochs=30, model=None, **kwargs):
    """
    Train a network (or load saved weights) and report test loss/accuracy.

    When ``model`` is None the network is trained for ``num_epochs`` epochs
    with momentum SGD and its weights are saved to an ``.npz`` file.
    Otherwise the weights are loaded from the file ``model`` and only the
    final evaluation is run.

    If ``path`` does not exist a message is printed and the function returns
    early without doing anything else.

    Args:
        n: only used in the name of the saved weight file for datasets other
            than cifar-10.
        num_epochs: number of passes over the training set.
        model: path to an ``.npz`` weight file, or None to train from scratch.
        **kwargs:
        - path: direct path to CIFAR-10 or TinyImageNet
        - data: "cifar-10" or "tiny-image-net"
        - type: 'resnet' or 'resfuse' or 'highway'
        - subsample: forwarded as ``sub_sample`` to load_tiny_imagenet
          (default 0.1)

    Raises:
        ValueError: if ``data`` or ``type`` is not one of the supported values.
    """

    # Unpack keyword arguments
    path = kwargs.pop('path', './cifar-10-batches-py')
    data_name = kwargs.pop('data', 'cifar-10')
    model_type = kwargs.pop('type', 'resfuse')

    # Check if cifar data exists
    if not os.path.exists(path):
        print(
            "CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'."
        )
        print("Or download Tiny-imagenet-A :)")
        return

    # Load the dataset
    print("Loading data...")

    if data_name == 'cifar-10':
        data = load_data()
    elif data_name == 'tiny-image-net':
        sub_sample = kwargs.pop('subsample', 0.1)
        data = load_tiny_imagenet(path,
                                  sub_sample=sub_sample,
                                  subtract_mean=True,
                                  dtype=theano.config.floatX)
        # Expose the validation split under the key names used below.
        data['X_test'] = data['X_val']
        data['Y_test'] = data['y_val']
        data['Y_train'] = data['y_train']
    else:
        # Fail with a clear message instead of a TypeError on `None[...]`.
        raise ValueError(
            "data must be one of: cifar-10, tiny-image-net")

    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    if model_type == 'resnet':
        # network = build_cnn(input_var, n)
        network = build_resfuse_net(input_var, projection=False)
        print("ResNet")
    elif model_type == 'resfuse':
        network = build_resfuse_net(input_var, projection=True)
        print("ResFuse Net")
    elif model_type == 'highway':
        network = build_highway_net(input_var)
        print("Highway Net")
    else:
        raise ValueError(
            "model type must be one of: resnet, resfuse, highway")

    print("number of parameters in model: %d" %
          lasagne.layers.count_params(network, trainable=True))

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(
            prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        lr = 0.1
        # Shared variable so the learning rate can be changed during training
        # without recompiling the training function.
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(loss,
                                           params,
                                           learning_rate=sh_lr,
                                           momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var],
                                   loss,
                                   updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc_expr = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                           dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var],
                             [test_loss, test_acc_expr, test_prediction])

    if model is None:
        # launch the training loop
        print("Starting training...")
        training_start_time = time.time()
        best_val_acc = 0.0

        # We iterate over epochs:
        for epoch in range(num_epochs):
            # shuffle training data
            train_indices = np.arange(X_train.shape[0])
            np.random.shuffle(train_indices)
            X_train = X_train[train_indices, :, :, :]
            Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(X_train,
                                             Y_train,
                                             128,
                                             shuffle=True,
                                             augment=True):
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                train_batches += 1

            # And a full pass over the validation data:
            val_err = 0
            val_acc = 0
            val_batches = 0
            top5accuracy = 0.0
            for batch in iterate_minibatches(X_test,
                                             Y_test,
                                             500,
                                             shuffle=False):
                inputs, targets = batch
                # Use a separate name so the symbolic `test_prediction`
                # expression is not overwritten by a numeric batch result.
                err, acc, batch_prediction = val_fn(inputs, targets)
                top5accuracy += topKAccuracy(batch_prediction, targets)
                val_err += err
                val_acc += acc
                val_batches += 1

            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}m".format(
                epoch + 1, num_epochs, (time.time() - start_time) / 60.0))
            print("  training loss:\t\t{:.6f}".format(train_err /
                                                      train_batches))
            print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
            print("  validation accuracy:\t\t{:.2f} %".format(
                val_acc / val_batches * 100))
            print(" top 5 validation accuracy:\t\t{:.2f} %".format(
                top5accuracy / val_batches * 100))

            # adjust learning rate as in paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 40 or (epoch + 1) == 70:
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

            # decay learning rate when a plateau is hit
            # when overall validation acc becomes negative or increases smaller than 0.01
            # we decay learning rate by 0.8

            # if (val_acc / val_batches) - best_val_acc <= 0.005:
            #     new_lr = sh_lr.get_value() * 0.995
            #     print("New LR:" + str(new_lr))
            #     sh_lr.set_value(lasagne.utils.floatX(new_lr))

            if (val_acc / val_batches) > best_val_acc:
                best_val_acc = val_acc / val_batches

        # print out total training time
        print("Total training time: {:.3f}m".format(
            (time.time() - training_start_time) / 60.0))

        # dump the network weights to a file :
        if data_name == 'cifar-10':
            npz_file_name = 'cifar10_deep_residual_model.npz'
        else:
            npz_file_name = 'tiny_imagen_a_epochs_' + str(num_epochs) + '_n_' + str(n) + "_" \
                            + time_string() + "_model.npz"

        np.savez(npz_file_name, *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    # Calculate validation error of model:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, Y_test, 128, shuffle=False):
        inputs, targets = batch
        err, acc, _ = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
Example #60
0
    extra_dims=1)
# Symbolic placeholders for the actions taken and for the returns (a 1-D vector).
# NOTE(review): extra_dims=1 presumably adds a leading batch axis - confirm
# against the space's new_tensor_variable implementation.
actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
returns_var = TT.vector('returns')

# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and the logarithm of the standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian
dist = policy.distribution

# Surrogate loss: mean of (log-likelihood of the action * return).
# Note that we negate the objective, since most optimizers assume a minimization problem
surr = -TT.mean(
    dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var)

# Get the list of trainable parameters.
params = policy.get_params(trainable=True)
# Symbolic gradient of the surrogate loss with respect to each parameter.
grads = theano.grad(surr, params)

# Compile the training step. outputs=None: the function returns nothing and is
# called only for the parameter updates that adam builds from the gradients.
f_train = theano.function(inputs=[observations_var, actions_var, returns_var],
                          outputs=None,
                          updates=adam(grads,
                                       params,
                                       learning_rate=learning_rate),
                          allow_input_downcast=True)

for _ in xrange(n_itr):

    paths = []