def __call__(self, model, X):
    """Build the symbolic noise-contrastive estimation (NCE) cost for a batch.

    :param model: model object forwarded unchanged to ``self.G``
    :param X: symbolic batch of data examples; its ``name`` (or ``'X'`` when
        unnamed) is used to label the returned expression
    :return: symbolic scalar named ``'NCE(<X name>)'``
    """
    label = 'X' if X.name is None else X.name

    # Draw ``noise_per_clean`` noise rows for every data row.
    num_data = X.shape[0]
    num_noise = num_data * self.noise_per_clean
    Y = self.noise.random_design_matrix(num_noise)

    # -softplus(-g) == log(sigmoid(g)) and -softplus(g) == log(1 - sigmoid(g)).
    log_h_data = -T.nnet.softplus(-self.G(X, model))
    log_one_minus_h_noise = -T.nnet.softplus(self.G(Y, model))

    # Based on equation 3 of the paper; negated because the paper maximises
    # the objective whereas this cost is minimised.
    cost = -T.mean(log_h_data) - T.mean(log_one_minus_h_noise)
    cost.name = 'NCE(' + label + ')'
    return cost
def __init__(self, x, y, in_size, out_size, prefix='lr_'):
    """Build a softmax (logistic-regression) layer on top of ``x``.

    :param x: symbolic input, multiplied by the ``(in_size, out_size)`` weights
    :param y: symbolic integer targets used to index the log-probabilities
    :param in_size: number of input features
    :param out_size: number of output classes
    :param prefix: string prepended to the keys of ``self.params``
    """
    float_type = theano.config.floatX
    # Both W and b are drawn uniformly from [-limit, limit].
    limit = np.sqrt(6. / (in_size + out_size))

    def _uniform_shared(shape, name):
        initial = np.random.uniform(low=-limit, high=limit, size=shape)
        return theano.shared(value=initial.astype(float_type),
                             name=name, borrow=True)

    # W is sampled before b, so the global NumPy RNG is consumed in that order.
    self.W = _uniform_shared((in_size, out_size), 'W')
    self.b = _uniform_shared((out_size,), 'b')

    pre_activation = T.dot(x, self.W) + self.b
    self.y_given_x = T.nnet.softmax(pre_activation)
    self.y_d = T.argmax(self.y_given_x, axis=1)

    # Mean negative log-probability assigned to the target class.
    log_probs = T.log(self.y_given_x)
    self.loss = -T.mean(log_probs[T.arange(y.shape[0]), y])
    # Fraction of examples whose arg-max prediction differs from the target.
    self.error = T.mean(T.neq(self.y_d, y))

    self.params = {prefix + 'W': self.W, prefix + 'b': self.b}
def get_updates(self, v):
    """Assemble the contrastive-divergence training updates for input ``v``.

    :param v: symbolic visible batch that starts the chain
    :return: ``OrderedDict`` holding the updates returned by ``self.CD`` and
        ``self.learning_rate`` plus one gradient step per parameter
    """
    # Contrastive divergence: run the chain from the data.
    chain_end, cd_updates = self.CD(self, chain_start=v, cdk=self.CDk)

    # [Expected] negative log-likelihood, plus the regularisation term.
    positive_phase = T.mean(self.free_energy(v), axis=0)
    negative_phase = T.mean(self.free_energy(chain_end), axis=0)
    cost = positive_phase - negative_phase
    cost = cost + self.regularization

    # The gradient must not flow through the Gibbs sampling, hence
    # ``consider_constant``.
    grads = T.grad(cost, self.parameters, consider_constant=[chain_end])
    gradients = dict(zip(self.parameters, grads))

    # Per-parameter learning rates derived from the gradients.
    lr, lr_updates = self.learning_rate(gradients)

    updates = OrderedDict()
    for extra in (cd_updates, lr_updates):
        updates.update(extra)

    # Plain gradient step for every parameter.
    for param, gparam in gradients.items():
        updates[param] = param - lr[param] * gparam

    return updates
def get_cost_updates(self, contraction_level, learning_rate, cost_measure="cross_entropy"):
    """Compute the cost and the updates for one training step of the cA.

    :param contraction_level: weight of the Jacobian (contraction) penalty
    :param learning_rate: step size of the gradient-descent update
    :param cost_measure: reconstruction loss, ``"cross_entropy"`` or
        ``"euclidean"``
    :return: ``(cost, updates)`` where ``updates`` is a list of
        ``(param, new_value)`` pairs
    :raises ValueError: if ``cost_measure`` is not one of the supported names
    """
    # Reject unknown names before building anything; otherwise ``self.L_rec``
    # would be left unset (or stale from a previous call).
    if cost_measure not in ("cross_entropy", "euclidean"):
        raise ValueError(
            "cost_measure must be 'cross_entropy' or 'euclidean', got %r"
            % (cost_measure,))

    y = self.get_hidden_values(self.x)
    z = self.get_reconstructed_input(y)
    J = self.get_jacobian(y, self.W)

    if cost_measure == "cross_entropy":
        per_example = -T.sum(self.x * T.log(z) +
                             (1 - self.x) * T.log(1 - z), axis=1)
    else:  # "euclidean"
        per_example = T.sum((self.x - z) ** 2, axis=1)
    self.L_rec = T.mean(per_example)

    # Squared Jacobian entries, averaged over the minibatch.
    self.L_jacob = T.mean(T.sum(J ** 2) / self.n_batchsize)

    cost = self.L_rec + contraction_level * self.L_jacob

    # Gradients of the cost with respect to the cA parameters.
    gparams = T.grad(cost, self.params)

    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]

    return (cost, updates)
def get_lossfun(self, l1, l2):
    """Generate the symbolic loss of the network.

    Mean negative log-likelihood is used when the last layer's
    ``activation_name`` is ``'softmax'``; otherwise the mean squared error.

    :param l1: weight of L1 term, None for no L1 term
    :param l2: weight of L2 term, None for no L2 term
    :return: symbolic scalar loss
    """
    if self.ff_net.layers[-1].activation_name == 'softmax':
        # Minimise the negative log-likelihood of the target classes.
        log_probs = T.log(self.ff_net.get_learning_passthrough(self.x))
        q = -T.mean(log_probs[T.arange(self.y.shape[0]), self.y])
    else:
        # Minimise the squared error.
        residual = self.ff_net.get_learning_passthrough(self.x) - self.y
        q = T.mean(residual ** 2)

    # A penalty is added only when a weight was given; a network that does
    # not expose the matching attribute is silently left unregularised.
    for weight, attr in ((l1, 'l1'), (l2, 'l2')):
        if weight is None:
            continue
        try:
            q = q + getattr(self.ff_net, attr) * weight
        except AttributeError:
            pass

    return q
def ml_cost(self, pos_v, neg_v):
    """Wrap the free-energy difference between data and samples in a ``Cost``.

    :param pos_v: positive-phase visible batch
    :param neg_v: negative-phase visible samples; only the first
        ``self.batch_size`` rows enter the cost
    :return: ``costmod.Cost`` built from the cost, ``self.params()`` and
        ``[pos_v, neg_v]``
    """
    data_term = T.mean(self.free_energy_v(pos_v))

    # Only the temperature 1 samples are used to compute the gradient.
    first_chain = neg_v[:self.batch_size]
    sample_term = T.mean(self.free_energy_v(first_chain))

    return costmod.Cost(data_term - sample_term, self.params(), [pos_v, neg_v])
def _test_layer_stats(self, layer_output):
    """
    DESCRIPTION:
        Called on every test/validation batch; the value reported for an
        epoch is the mean of the per-batch results.
    PARAM:
        layer_output: the output from the layer
    RETURN:
        A list of tuples [('name_a', var_a), ('name_b', var_b)] where each
        var is a scalar
    """
    # Euclidean length of every column of W.
    col_norms = T.sqrt((self.W ** 2).sum(axis=0))
    reducers = (('max', T.max), ('mean', T.mean), ('min', T.min))

    stats = [(label + '_col_length', reduce_fn(col_norms))
             for label, reduce_fn in reducers]
    stats += [('output_' + label, reduce_fn(layer_output))
              for label, reduce_fn in reducers]
    stats += [(label + '_W', reduce_fn(self.W))
              for label, reduce_fn in reducers]
    stats += [(label + '_b', reduce_fn(self.b))
              for label, reduce_fn in reducers]
    return stats
def get_cost_updates(self, learning_rate, lam=0.0001, beta=3, rho=0.1):
    """Compute the cost and updates for one training step of the autoencoder.

    :type learning_rate: scalar
    :param learning_rate: rate which weighs the gradient step

    :type lam: scalar
    :param lam: weight of the L2 penalty on ``Wvis`` and ``Whid``

    :param beta: weight of the sparsity penalty; accepted for interface
        compatibility but unused, because the penalty is not part of the cost
    :param rho: target mean hidden activation; unused for the same reason

    :rtype: pair (cost, updates)
    :return: symbolic cost and the list of ``(param, new_value)`` updates
    """
    # y holds all the minibatch-processed vectors.
    h = self.get_hidden_values(self.X)
    y = self.get_output(h)

    # Half mean squared reconstruction error plus half the weighted L2 norm.
    # NOTE: a sparsity term ``beta * abs(rho - mean(h))`` was drafted here
    # but was never added to the cost; it stays disabled.
    l2_squared = (self.Wvis ** 2).sum() + (self.Whid ** 2).sum()
    cost = 0.5 * T.mean((y - self.X) ** 2) + 0.5 * lam * l2_squared

    # Plain gradient-descent updates.
    gparams = T.grad(cost, self.params)
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]

    return cost, updates
def batchnorm(X, rescale=None, reshift=None, u=None, s=None, e=1e-8):
    """Batch-normalise ``X`` (4-D convolutional or 2-D fully connected).

    :param X: symbolic input with ``ndim`` 4 (statistics over axes 0, 2, 3)
        or 2 (statistics over axis 0)
    :param rescale: optional scale parameter ``g``; applied as
        ``exp(0.2 * g)`` and only when ``reshift`` is given as well
    :param reshift: optional shift parameter ``b``
    :param u: optional precomputed mean (inference value)
    :param s: optional precomputed variance (inference value)
    :param e: small constant added to the variance before the square root
    :return: the normalised (and optionally rescaled/shifted) tensor
    :raises NotImplementedError: if ``X.ndim`` is neither 4 nor 2

    The precomputed statistics are used only when BOTH ``u`` and ``s`` are
    supplied; otherwise both are computed from ``X``.
    """
    g = rescale
    b = reshift
    use_given_stats = u is not None and s is not None

    if X.ndim == 4:
        if use_given_stats:
            # Use normalisation params given a priori.
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            # Compute normalisation params from the input.
            b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        X = (X - b_u) / T.sqrt(b_s + e)
        if g is not None and b is not None:
            # Apply rescale and reshift.
            X = X * T.exp(0.2 * g.dimshuffle('x', 0, 'x', 'x')) + b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        if not use_given_stats:
            # Same rule as the 4-D branch: a lone ``u`` or ``s`` is ignored
            # instead of crashing on ``None + e``.
            u = T.mean(X, axis=0)
            s = T.mean(T.sqr(X - u), axis=0)
        X = (X - u) / T.sqrt(s + e)
        if g is not None and b is not None:
            # Apply rescale and reshift.
            X = X * T.exp(0.2 * g) + b
    else:
        raise NotImplementedError
    return X
def _get_cost_update(self, lr=0.1, persistent=None, k=1):
    """Build the CD-k monitoring cost and parameter updates.

    :param lr: learning rate, cast to ``theano.config.floatX``
    :param persistent: accepted but not used by this method
    :param k: number of Gibbs steps run by ``theano.scan``
    :return: ``(monitoring_cost, updates)``; ``updates`` is the scan update
        dictionary extended with one entry per parameter
    """
    # Positive phase: the chain starts at the data.
    chain_start = self.x

    scan_outputs, updates = theano.scan(
        self._gibbs_vhv,
        outputs_info=[None, None, chain_start],
        n_steps=k)
    pre_sigmoid_nvs, nv_means, nv_samples = scan_outputs

    # Only the sample at the end of the chain is needed for the gradient.
    chain_end = nv_samples[-1]

    # Contrastive loss (unlike the cross-entropy loss of an autoencoder).
    cost = (T.mean(self.free_energy(chain_start)) -
            T.mean(self.free_energy(chain_end)))

    # The gradient must not flow through the Gibbs sampling.
    gparams = T.grad(cost, self.params, consider_constant=[chain_end])

    for param, gparam in zip(self.params, gparams):
        # Cast so that the learning rate has the right dtype.
        step = gparam * T.cast(lr, dtype=theano.config.floatX)
        updates[param] = param - step

    monitoring_cost = self._get_reconstruction_cost(pre_sigmoid_nvs[-1])
    return monitoring_cost, updates
def get_monitoring_channels(self, V):
    """Return a dict of monitoring expressions for the model parameters.

    ``self.compile_mode()`` is entered first and ``self.deploy_mode()`` is
    always called on the way out. Every key is prefixed with
    ``self.monitoring_channel_prefix``.

    :param V: not used by this method
    """
    try:
        self.compile_mode()

        channels = {}
        if self.monitor_params:
            for param in self.get_params():
                base = param.name
                channels[base + '_min'] = full_min(param)
                channels[base + '_mean'] = T.mean(param)
                channels[base + '_max'] = full_max(param)

                # Parameters whose name contains 'W' also report their norms.
                if 'W' in base:
                    norms = theano_norms(param)
                    channels[base + '_norms_min'] = T.min(norms)
                    channels[base + '_norms_mean'] = T.mean(norms)
                    channels[base + '_norms_max'] = T.max(norms)

        return {self.monitoring_channel_prefix + key: value
                for key, value in channels.items()}
    finally:
        self.deploy_mode()
def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None):
    """Build the Theano train/test/predict functions of the model.

    :param optimizer: optimizer identifier resolved by ``optimizers.get``
    :param loss: objective identifier resolved by ``objectives.get``
    :param class_mode: ``"categorical"`` (arg-max accuracy) or ``"binary"``
        (rounded-output accuracy)
    :param theano_mode: forwarded as ``mode`` to every ``theano.function``
    :raises ValueError: if ``class_mode`` is not a supported value (a
        subclass of ``Exception``, which is what was raised before)

    Sets ``self._train``, ``self._train_with_acc``, ``self._predict``,
    ``self._test`` and ``self._test_with_acc``.
    """
    # Fail fast, before any attribute of the model is modified.
    if class_mode not in ("categorical", "binary"):
        raise ValueError("Invalid class mode:" + str(class_mode))

    self.optimizer = optimizers.get(optimizer)

    self.loss = objectives.get(loss)
    weighted_loss = weighted_objective(objectives.get(loss))

    # Input of the model.
    self.X_train = self.get_input(train=True)
    self.X_test = self.get_input(train=False)

    self.y_train = self.get_output(train=True)
    self.y_test = self.get_output(train=False)

    # Target of the model and per-element weights.
    self.y = T.zeros_like(self.y_train)
    self.weights = T.ones_like(self.y_train)

    train_loss = weighted_loss(self.y, self.y_train, self.weights)
    test_loss = weighted_loss(self.y, self.y_test, self.weights)

    train_loss.name = 'train_loss'
    test_loss.name = 'test_loss'
    self.y.name = 'y'

    if class_mode == "categorical":
        train_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                     T.argmax(self.y_train, axis=-1)))
        test_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                    T.argmax(self.y_test, axis=-1)))
    else:  # "binary"
        train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
        test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)))

    self.class_mode = class_mode
    self.theano_mode = theano_mode

    # Regularizers wrap the training loss only.
    for r in self.regularizers:
        train_loss = r(train_loss)
    updates = self.optimizer.get_updates(self.params, self.constraints, train_loss)

    if isinstance(self.X_train, list):
        train_ins = self.X_train + [self.y, self.weights]
        test_ins = self.X_test + [self.y, self.weights]
        predict_ins = self.X_test
    else:
        train_ins = [self.X_train, self.y, self.weights]
        test_ins = [self.X_test, self.y, self.weights]
        predict_ins = [self.X_test]

    self._train = theano.function(train_ins, train_loss, updates=updates,
                                  allow_input_downcast=True, mode=theano_mode)
    self._train_with_acc = theano.function(train_ins, [train_loss, train_accuracy], updates=updates,
                                           allow_input_downcast=True, mode=theano_mode)
    self._predict = theano.function(predict_ins, self.y_test,
                                    allow_input_downcast=True, mode=theano_mode)
    self._test = theano.function(test_ins, test_loss,
                                 allow_input_downcast=True, mode=theano_mode)
    self._test_with_acc = theano.function(test_ins, [test_loss, test_accuracy],
                                          allow_input_downcast=True, mode=theano_mode)
def cost_updates(self, lr, data, k=1):
    """Build the CD-k monitoring cost and momentum updates.

    :param lr: learning rate applied to every gradient
    :param data: symbolic visible batch
    :param k: number of Gibbs steps run by ``theano.scan``
    :return: ``(monitoring_cost, updates)`` where ``updates`` is a list of
        ``(shared_variable, new_value)`` pairs: parameter steps, momentum
        buffers and the updates produced by ``theano.scan``
    """
    ph_activation_scores = T.dot(data, self.W) + self.h_bias
    # NOTE(review): ``ph_updates`` is still not returned, as before; confirm
    # whether the sampler needs it passed to ``theano.function``.
    ph_activation_probs, ph_samples, ph_updates = self.h.sample(ph_activation_scores)
    chain_start = ph_samples

    [nv_activation_scores, nv_activation_probs, nv_samples,
     nh_activation_scores, nh_activation_probs, nh_samples], scan_updates = \
        theano.scan(
            self.gibbs_hvh,
            outputs_info=[None, None, None, None, None, chain_start],
            n_steps=k
        )
    chain_end = nv_samples[-1]

    cost = T.mean(self.free_energy(data)) \
        - T.mean(self.free_energy(chain_end)) \
        + self.regularisation()

    # The gradient must not flow through the Gibbs sampling.
    gparams = T.grad(cost, self.tunables, consider_constant=[chain_end])

    alpha = T.cast(self.momentum, dtype=theano.config.floatX)
    # Momentum step, built once and used for both the parameter and its buffer.
    steps = [alpha * prev_chg + gparam * lr
             for prev_chg, gparam in zip(self.deltas, gparams)]

    updates = [(param, param - step)
               for param, step in zip(self.tunables, steps)]
    updates += [(prev_chg, step)
                for prev_chg, step in zip(self.deltas, steps)]
    # Previously the scan update dictionary was overwritten and lost; any
    # random-stream state advanced inside the scan is carried there.
    updates += list(scan_updates.items())

    monitoring_cost = self.reconstruction_cost(updates, nv_activation_scores[-1], data)

    return monitoring_cost, updates
def error_classification(self, target):
    """Return the mean misclassification rate against one-hot ``target``.

    Softmax is applied to every slice of ``self.output`` along its leading
    axis, the results are averaged over that axis, and the arg-max is stored
    in ``self.y_pred``.
    """
    def _softmax_step(scores):
        return T.nnet.softmax(scores)

    step_probs, _ = theano.scan(fn=_softmax_step, sequences=[self.output])
    mean_probs = T.mean(step_probs, 0)

    self.y_pred = T.argmax(mean_probs, axis=1)
    true_label = T.argmax(target, axis=1)
    return T.mean(T.neq(self.y_pred, true_label))
def add_regularization(self, layer):
    """Return the L1 reconstruction penalty of ``layer`` for both views.

    With ``self._recon_strategy == 'forward'`` the reconstructions come from
    ``layer.reconstruct_x()`` / ``layer.reconstruct_y()``; with ``'backward'``
    they are the forward outputs multiplied by the transposed weights. Any
    other strategy yields ``0``.
    """
    strategy = self._recon_strategy

    if strategy == 'forward':
        input_x = layer.x
        recon_x = layer.reconstruct_x()
        input_y = layer.y
        recon_y = layer.reconstruct_y()
    elif strategy == 'backward':
        input_x = layer.x
        recon_x = Tensor.dot(layer.output_forward_x, layer.Wx.T)
        input_y = layer.y
        recon_y = Tensor.dot(layer.output_forward_y, layer.Wy.T)
    else:
        return 0

    def _mean_abs_error(original, reconstruction):
        # Sum of absolute differences per row, averaged over the rows.
        row_error = abs(original - reconstruction).sum(
            axis=1, dtype=Tensor.config.floatX)
        return Tensor.mean(row_error)

    return 0 + _mean_abs_error(input_x, recon_x) + _mean_abs_error(input_y, recon_y)
def unet_crossentropy_loss_sampled(y_true, y_pred):
    """Class-balanced cross-entropy on a random subsample of the pixels.

    Both tensors are flattened. Equal numbers of positive (non-zero label)
    and negative positions are drawn at random, limited by the rarer class,
    and the mean log-losses of the two groups are added.

    :param y_true: symbolic labels; assumed to be 0/1 -- TODO confirm
    :param y_pred: symbolic predictions, clipped to ``[1e-4, 1 - 1e-4]``
    :return: symbolic scalar loss
    """
    # Single-argument print calls behave identically on Python 2 and 3.
    print('unet_crossentropy_loss_sampled')
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0 - epsilon))
    y_true = T.flatten(y_true)

    # Indices of each class; ``T.nonzero`` returns a tuple, one entry per
    # dimension, hence the ``[0]``.
    indPos = T.nonzero(y_true)[0]
    indNeg = T.nonzero(1 - y_true)[0]

    # Shuffle each index set with the module-level random stream.
    indPos = indPos[srng.permutation(n=indPos.shape[0])]
    indNeg = indNeg[srng.permutation(n=indNeg.shape[0])]

    # Take an equal number of samples, limited by the smaller class.
    n_samples = T.cast(T.min([T.sum(y_true), T.sum(1 - y_true)]), dtype='int64')
    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]

    loss_vector = (-T.mean(T.log(y_pred_clipped[indPos]))
                   - T.mean(T.log(1 - y_pred_clipped[indNeg])))
    average_loss = T.mean(loss_vector)
    print('average_loss: %s' % (average_loss,))
    return average_loss
def stddev_bias(x, eps, axis=0):
    """Return a standard deviation of ``x`` biased by ``eps``.

    The mean of ``x + eps`` is taken along ``axis``; the variance is the mean
    of ``(x - mean) ** 2 + eps`` over ALL elements (no axis is passed), and
    its square root is returned.
    """
    centre = T.mean(x + eps, axis=axis)
    centre.name = "std_mean"

    spread = T.mean((x - centre) ** 2 + eps)
    spread.name = "std_variance"

    return T.sqrt(spread)
def image_categorical_crossentropy(output, target, from_logits=False):
    """Mean categorical cross-entropy over rows of 256 values.

    ``output`` is clipped to ``[_EPSILON, 1 - _EPSILON]``; both tensors are
    reshaped to ``(-1, 256)`` for the cross-entropy, which is then regrouped
    by the leading dimension of ``output`` and averaged.

    :param from_logits: accepted but not used by this function
    """
    clipped = T.clip(output, _EPSILON, 1.0 - _EPSILON)

    flat_pred = K.reshape(clipped, (-1, 256))
    flat_true = K.reshape(target, (-1, 256))
    losses = T.nnet.categorical_crossentropy(flat_pred, flat_true)

    # One row of losses per leading-dimension entry.
    per_entry = K.reshape(losses, (K.shape(clipped)[0], -1))
    return T.mean(T.mean(per_entry, axis=1))
def unet_crossentropy_loss(y_true, y_pred):
    """Binary cross-entropy, averaged along axis 1 and then over the rest.

    Predictions are clipped to ``[1e-4, 1 - 1e-4]`` before the logarithm.
    The positive-class weight is fixed at 1.
    """
    weight_class_1 = 1.
    epsilon = 1.0e-4
    clipped = T.clip(y_pred, epsilon, 1.0 - epsilon)

    positive_term = weight_class_1 * y_true * T.log(clipped)
    negative_term = (1 - y_true) * T.log(1 - clipped)
    loss_vector = -T.mean(positive_term + negative_term, axis=1)

    return T.mean(loss_vector)
def finetune_cost_updates(self, center, mu, learning_rate):
    """Compute the fine-tuning costs and the momentum updates.

    :param center: target the network output is pulled towards
    :param mu: momentum coefficient
    :param learning_rate: gradient step size
    :return: ``((cost1, cost2, cost3, grad_, param_), updates)`` where
        ``cost1`` is the total cost, ``cost2`` the weighted reconstruction
        part, ``cost3`` their difference, and ``grad_`` / ``param_`` stack the
        L2 norms of the gradients / parameters
    """
    # Squared distance to the centre, summed over the size of a datapoint:
    # one entry per example of the minibatch.
    network_output = self.get_output()
    center_err = T.sum(T.pow(center - network_output, 2), axis=1)

    # Network reconstruction error, also per example.
    z = self.get_network_reconst()
    reconst_err = T.sum(T.pow(self.x - z, 2), axis=1)

    per_example = self.beta * center_err + self.lbd * reconst_err

    cost1 = T.mean(per_example)
    cost2 = self.lbd * T.mean(reconst_err)
    cost3 = cost1 - cost2

    # Gradients of the total cost with respect to the parameters.
    gparams = T.grad(cost1, self.params)

    triples = list(zip(self.params, self.delta, gparams))

    updates = []
    for param, delta, gparam in triples:
        updates.extend([
            (delta, mu * delta - learning_rate * gparam),
            (param, param + mu * mu * delta - (1 + mu) * learning_rate * gparam),
        ])

    grad_values = [gparam.norm(L=2) for _, _, gparam in triples]
    param_norm = [param.norm(L=2) for param, _, _ in triples]

    grad_ = T.stack(*grad_values)
    param_ = T.stack(*param_norm)
    return ((cost1, cost2, cost3, grad_, param_), updates)
def get_output_for(self, input, init=False, **kwargs):
    """Append pairwise-distance features to the (flattened) input.

    :param input: symbolic batch; flattened to 2-D when it has more dims
    :param init: when True, also fills ``self.init_updates`` with
        data-dependent updates for ``self.log_weight_scale`` and ``self.b``,
        and centres the features with the batch mean instead of adding
        ``self.b``
    :return: ``input`` concatenated with the new features along axis 1
    """
    if input.ndim > 2:
        # Flatten extra dimensions into a batch of feature vectors.
        input = input.flatten(2)

    # The dimshuffles below treat ``activation`` as 3-D with the batch on
    # axis 0.
    activation = T.tensordot(input, self.W, [[1], [0]])

    # L1 distance (summed over axis 2) between every pair of batch entries.
    pairwise = abs(activation.dimshuffle(0, 1, 2, 'x')
                   - activation.dimshuffle('x', 1, 2, 0))
    # A large constant on the diagonal, i.e. on each sample's distance to
    # itself.
    diagonal_offset = 1e6 * T.eye(input.shape[0]).dimshuffle(0, 'x', 1)
    abs_dif = T.sum(pairwise, axis=2) + diagonal_offset

    if init:
        # Rescale so that the mean smallest distance becomes 2.
        mean_min_abs_dif = 0.5 * T.mean(T.min(abs_dif, axis=2), axis=0)
        abs_dif = abs_dif / mean_min_abs_dif.dimshuffle('x', 0, 'x')
        scale_update = (self.log_weight_scale
                        - T.log(mean_min_abs_dif).dimshuffle(0, 'x'))
        self.init_updates = [(self.log_weight_scale, scale_update)]

    f = T.sum(T.exp(-abs_dif), axis=2)

    if init:
        mf = T.mean(f, axis=0)
        f = f - mf.dimshuffle('x', 0)
        self.init_updates.append((self.b, -mf))
    else:
        f = f + self.b.dimshuffle('x', 0)

    return T.concatenate([input, f], axis=1)
def get_cost_updates(self, contraction_level, learning_rate):
    """Compute the cost and the updates for one training step of the cA.

    :param contraction_level: weight of the Jacobian (contraction) penalty
    :param learning_rate: step size of the gradient-descent update
    :return: ``(cost, updates)`` where ``updates`` is a list of
        ``(param, new_value)`` pairs
    """
    y = self.get_hidden_values(self.x)
    z = self.get_reconstructed_input(y)
    J = self.get_jacobian(y, self.W)

    # Summed over the size of a datapoint: with minibatches, L_rec is a
    # vector with one cross-entropy entry per example.
    self.L_rec = - T.sum(self.x * T.log(z) +
                         (1 - self.x) * T.log(1 - z),
                         axis=1)

    # Squared Jacobian entries averaged over the minibatch. This must be a
    # true division: ``//`` floors the penalty to a whole number.
    self.L_jacob = T.sum(J ** 2) / self.n_batchsize

    # Average the per-example reconstruction costs to get the minibatch cost.
    cost = T.mean(self.L_rec) + contraction_level * T.mean(self.L_jacob)

    # Gradients of the cost with respect to the cA parameters.
    gparams = T.grad(cost, self.params)

    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]

    return (cost, updates)
def __init__(self, fin, f1, nin1, f2, nin2, f3, nin3, expand, h1, outputs, lr, C, pDropConv=0.2, pDropHidden=0.5):
    """Create the parameters and symbolic costs of the network.

    :param fin: number of input feature maps
    :param f1, f2, f3: feature maps of the three NIN layers
    :param nin1, nin2, nin3: third entry of each NIN layer's shape tuple
    :param expand: expansion factor passed to ``layerNINParams``
    :param h1: feature maps of the first 1x1 convolution
    :param outputs: feature maps of the last 1x1 convolution
    :param lr: learning rate (stored only)
    :param C: weight of the regularisation term
    :param pDropConv: dropout probability passed to ``model`` for training
    :param pDropHidden: second dropout probability passed to ``model``
    """
    # Hyper-parameters.
    self.lr = lr
    self.C = C
    self.pDropConv = pDropConv
    self.pDropHidden = pDropHidden

    # All parameters to optimise (connection weights and biases) are kept
    # in lists.
    # Convolutional layers: w = (feature maps of this layer, feature maps of
    # the previous layer, kernel rows, kernel columns), b = (feature maps of
    # this layer).
    self.paramsNIN = [
        layerNINParams((f1, fin, nin1, 3, 3), expand),
        layerNINParams((f2, f1 * expand, nin2, 3, 3), expand),
        layerNINParams((f3, f2 * expand, nin3, 3, 3), expand),
    ]
    # Global average pooling layer.
    self.paramsConv = [
        layerConvParams((h1, f3 * expand, 1, 1)),
        layerConvParams((outputs, h1, 1, 1)),
    ]
    self.params = self.paramsNIN + self.paramsConv

    # Theano symbolic variables and expressions.
    self.X = T.tensor4('X')
    self.Y = T.matrix('Y')

    # Training cost: dropout active.
    train_prob = model(self.X, self.params, pDropConv, pDropHidden)
    self.trNeqs = basicUtils.neqs(train_prob, self.Y)
    train_xent = categorical_crossentropy(train_prob, self.Y)
    self.trCost = T.mean(train_xent) + C * basicUtils.regularizer(flatten(self.params))

    # Validation/test cost: dropout disabled.
    eval_prob = model(self.X, self.params, 0., 0.)
    self.vateNeqs = basicUtils.neqs(eval_prob, self.Y)
    self.YPred = T.argmax(eval_prob, axis=1)
    eval_xent = categorical_crossentropy(eval_prob, self.Y)
    self.vateCost = T.mean(eval_xent) + C * basicUtils.regularizer(flatten(self.params))
def test_minres_with_jacobi():
    """MINRES with ``L_diag + 0.1`` passed as ``Ms`` matches the reference."""
    vv = theano.shared(v, name='v')
    gg = theano.shared(g, name='g')
    hh = theano.shared(h, name='h')

    # Right-hand side of the linear system.
    rhs = [
        T.dot(v.T, g) / M,
        T.dot(g.T, h) / M,
        T.mean(v, axis=0),
        T.mean(g, axis=0),
        T.mean(h, axis=0),
    ]

    # Diagonal terms plus 0.1, passed as ``Ms``.
    Ms = [term + 0.1 for term in natural.generic_compute_L_diag([vv, gg, hh])]

    def apply_L(xw, xv, xa, xb, xc):
        return natural.compute_Lx(vv, gg, hh, xw, xv, xa, xb, xc)

    newgrads = minres.minres(
        apply_L,
        rhs,
        rtol=1e-5,
        damp=0.,
        maxiter=10000,
        Ms=Ms,
        profile=0)[0]

    f = theano.function([], newgrads)
    new_dw, new_dv, new_da, new_db, new_dc = f()

    comparisons = (
        (Linv_x_w, new_dw),
        (Linv_x_v, new_dv),
        (Linv_x_a, new_da),
        (Linv_x_b, new_db),
        (Linv_x_c, new_dc),
    )
    for expected, actual in comparisons:
        numpy.testing.assert_almost_equal(expected, actual, decimal=1)
def test_linearcg():
    """Linear conjugate gradient matches the reference solution."""
    vv = theano.shared(v, name='v')
    gg = theano.shared(g, name='g')
    hh = theano.shared(h, name='h')

    # Right-hand side of the linear system.
    rhs = [
        T.dot(v.T, g) / M,
        T.dot(g.T, h) / M,
        T.mean(v, axis=0),
        T.mean(g, axis=0),
        T.mean(h, axis=0),
    ]

    def apply_L(xw, xv, xa, xb, xc):
        return natural.compute_Lx(vv, gg, hh, xw, xv, xa, xb, xc)

    newgrads = lincg.linear_cg(
        apply_L,
        rhs,
        rtol=1e-5,
        maxiter=30,
        damp=0.,
        floatX=floatX,
        profile=0)

    f = theano.function([], newgrads)
    new_dw, new_dv, new_da, new_db, new_dc = f()

    comparisons = (
        (Linv_x_w, new_dw),
        (Linv_x_v, new_dv),
        (Linv_x_a, new_da),
        (Linv_x_b, new_db),
        (Linv_x_c, new_dc),
    )
    for expected, actual in comparisons:
        numpy.testing.assert_almost_equal(expected, actual, decimal=1)
def plotUpdate(self, updates):
    '''
    >>>get update info of each layer
    >>>type updates: dict
    >>>para updates: update dictionary
    >>>return: [max, min, mean] vectors of length 2*deep+1; slot 2*i holds
       the weight change of layer i, slot 2*i+1 its bias change, and the last
       slot the weight change of the classifier
    '''
    n_slots = self.deep * 2 + 1
    # One vector per statistic, in the order they are returned.
    stats = [T.zeros(shape=(n_slots,)) for _ in range(3)]
    reducers = (T.max, T.min, T.mean)

    def _record(slot, delta):
        # Write max/min/mean of ``delta`` into position ``slot``.
        for k, reduce_fn in enumerate(reducers):
            stats[k] = T.set_subtensor(stats[k][slot], reduce_fn(delta))

    for i in range(self.deep):
        current = self.layers[i]
        _record(2 * i, updates[current.w] - current.w)
        _record(2 * i + 1, updates[current.b] - current.b)

    _record(self.deep * 2, updates[self.classifier.w] - self.classifier.w)

    return stats
def test_minres_with_xinit():
    """MINRES started from a random ``xinit`` matches the reference."""
    rng = numpy.random.RandomState(123412)

    vv = theano.shared(v, name='v')
    gg = theano.shared(g, name='g')
    hh = theano.shared(h, name='h')

    # Right-hand side of the linear system.
    rhs = [
        T.dot(v.T, g) / M,
        T.dot(g.T, h) / M,
        T.mean(v, axis=0),
        T.mean(g, axis=0),
        T.mean(h, axis=0),
    ]

    # Random starting point, one array per unknown, drawn in this order.
    start_shapes = ((N0, N1), (N1, N2), (N0,), (N1,), (N2,))
    xinit = [rng.rand(*shape).astype(floatX) for shape in start_shapes]

    def apply_L(xw, xv, xa, xb, xc):
        return natural.compute_Lx(vv, gg, hh, xw, xv, xa, xb, xc)

    newgrads = minres.minres(
        apply_L,
        rhs,
        rtol=1e-5,
        damp=0.,
        maxiter=10000,
        xinit=xinit,
        profile=0)[0]

    f = theano.function([], newgrads)
    new_dw, new_dv, new_da, new_db, new_dc = f()

    comparisons = (
        (Linv_x_w, new_dw),
        (Linv_x_v, new_dv),
        (Linv_x_a, new_da),
        (Linv_x_b, new_db),
        (Linv_x_c, new_dc),
    )
    for expected, actual in comparisons:
        numpy.testing.assert_almost_equal(expected, actual, decimal=1)
def forward(self, input_org, train=True, update_batch_stat=True, finetune=False):
    """Apply batch normalisation to ``input_org``.

    :param input_org: symbolic input; reshaped to the 3-D shape returned by
        ``self._internal_shape`` and restored to its own shape on return
    :param train: normalise with batch statistics when True, with the stored
        estimates ``est_mean`` / ``est_var`` otherwise
    :param update_batch_stat: when training, also attach default updates for
        the running estimates
    :param finetune: when True the averaging ratio is ``1 - 1/(N + 1)`` with
        ``N`` the finetune counter; otherwise ``self.moving_avg_ratio``
    :return: normalised tensor with the shape of ``input_org``
    """
    # Single-argument print call: same output on Python 2 and 3.
    print("Layer/BatchNormalization")
    ldim, cdim, rdim = self._internal_shape(input_org)
    x3d = input_org.reshape((ldim, cdim, rdim))
    if train:
        # Statistics over the first and last axes, one value per channel.
        mean = T.mean(x3d, axis=(0, 2), keepdims=True)
        var = T.mean((x3d - mean) ** 2, axis=(0, 2), keepdims=True)
        if update_batch_stat:
            finetune_N = theano.clone(self.finetune_N, share_inputs=False)
            if finetune:
                finetune_N.default_update = finetune_N + 1
                ratio = T.cast(1 - 1.0 / (finetune_N + 1), theano.config.floatX)
            else:
                finetune_N.default_update = 0
                ratio = self.moving_avg_ratio
            m = ldim * rdim
            # m / (m - 1) rescales the batch variance for the running estimate.
            scale = T.cast(m / (m - 1.0), theano.config.floatX)
            est_mean = theano.clone(self.est_mean, share_inputs=False)
            est_var = theano.clone(self.est_var, share_inputs=False)
            est_mean.default_update = T.cast(ratio * self.est_mean + (1 - ratio) * mean, theano.config.floatX)
            est_var.default_update = T.cast(ratio * self.est_var + (1 - ratio) * scale * var, theano.config.floatX)
            # Adds zero; presumably this ties the clones into the graph so
            # their default updates are picked up -- TODO confirm.
            mean += 0 * est_mean
            var += 0 * est_var
        output = self._pbc(self.gamma) * (x3d - self._pbc(mean)) \
            / T.sqrt(1e-6 + self._pbc(var)) + self._pbc(self.beta)
    else:
        output = self._pbc(self.gamma) * (x3d - self._pbc(self.est_mean)) \
            / T.sqrt(1e-6 + self._pbc(self.est_var)) + self._pbc(self.beta)
    return output.reshape(input_org.shape)
def negative_log_likelihood(self, y):
    r"""Return the mean of the negative log-likelihood of the prediction
    of this model under a given target distribution.

    .. math::

        \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
        \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
            \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
        \ell (\theta=\{W,b\}, \mathcal{D})

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label

    :return: symbolic scalar. When ``self.is_binary`` is true it is the
             negative mean of ``log(p_y_given_x)`` over all entries;
             otherwise the negative mean log-probability of the correct
             label of each example.

    Note: we use the mean instead of the sum so that
          the learning rate is less dependent on the batch size
    """
    if self.is_binary:
        # This value used to be computed and then discarded (no ``return``),
        # so the binary case fell through to the indexing below.
        return -T.mean(T.log(self.p_y_given_x))

    # T.log(self.p_y_given_x) is a matrix LP with one row per example and one
    # column per class. LP[T.arange(n), y] picks LP[i, y[i]] for each of the
    # n examples; its mean is the mean log-likelihood of the minibatch.
    log_probs = T.log(self.p_y_given_x)
    return -T.mean(log_probs[T.arange(y.shape[0]), y])
def create_learn_function(self):
    """Compile ``self.learn_function`` and store the optimiser in ``self.rp``.

    The compiled function takes ``[lr, alpha, do_update]`` and returns the
    mean loss. The optimised quantity is
    ``do_update * (mean loss + alpha * mean of the mean squared weights)``.
    """
    # Losses of every clause, concatenated in clause order.
    losses = [loss
              for clause in self.clauses
              for loss in clause.get_lin_losses()]

    # Weights subject to regularisation.
    arities = [p.arity for p in self.predicates.values()]
    ws = []
    if 1 in arities:
        ws += [l.sum_layer.w for l in cortex.hidden_x.layers]
        ws += [l.sum_layer.b for l in cortex.hidden_x.layers]
    if 2 in arities:
        ws += [l.sum_layer.w for l in cortex.hidden_xy.layers]
        ws += [l.sum_layer.b for l in cortex.hidden_xy.layers]
    for p in self.predicates.values():
        ws.extend([p.out_layer.w, p.out_layer.b])

    alpha = theano.tensor.fscalar()
    regularisation = alpha * tensor.mean([tensor.mean(w ** 2) for w in ws])

    # The constant representations are trained but not regularised, so they
    # are appended after the penalty has been built.
    ws += [self.constant_representations]

    do_update = theano.tensor.bscalar()
    mean_loss = tensor.mean(losses)
    rp = net3.Momentum(ws, do_update * (mean_loss + regularisation))
    updates, lr = rp.get_updates()
    self.rp = rp

    self.learn_function = function(
        [lr, alpha, do_update],
        tensor.mean(losses),
        updates=updates,
        on_unused_input="ignore")
def cross_entropy(self):
    """Return the mean categorical cross-entropy of ``softmax(self.x)``
    against ``self.y``; the softmax output is stored in
    ``self.prob_of_y_given_x``.
    """
    self.prob_of_y_given_x = T.nnet.softmax(self.x)
    per_example = T.nnet.categorical_crossentropy(self.prob_of_y_given_x, self.y)
    return T.mean(per_example)
def create_objectives(self, deterministic=False):
    """Build the symbolic training objectives of the model.

    :param deterministic: forwarded to ``lasagne.layers.get_output``
    :return: ``(-elbo, mean(qz_logsigma))``, i.e. the negative evidence lower
        bound and the mean of ``qz_logsigma``
    :raises ValueError: if ``self.model`` is neither ``'bernoulli'`` nor
        ``'gaussian'``
    """
    # Reject unsupported likelihoods up front; previously they reached an
    # unbound ``px_mu`` further down.
    if self.model not in ('bernoulli', 'gaussian'):
        raise ValueError(
            "self.model must be 'bernoulli' or 'gaussian', got %r"
            % (self.model,))
    is_bernoulli = self.model == 'bernoulli'

    # Load network input.
    X = self.inputs[0]
    x = X.flatten(2)

    # Duplicate entries to take into account multiple MC samples.
    n_sam = self.n_sample
    n_out = x.shape[1]
    x = x.dimshuffle(0, 'x', 1).repeat(n_sam, axis=1).reshape((-1, n_out))

    # Load network.
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \
        l_qa, l_qz = self.network
    l_qa_in, l_px_in = self.input_layers

    # Load network output.
    qz_mu, qz_logsigma, qa_mu, qa_logsigma, a, z \
        = lasagne.layers.get_output(
            [l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, l_qa, l_qz],
            deterministic=deterministic,
        )
    pa_mu, pa_logsigma = lasagne.layers.get_output(
        [l_pa_mu, l_pa_logsigma],
        {l_px_in: z},
        deterministic=deterministic,
    )

    if is_bernoulli:
        px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z},
                                          deterministic=deterministic)
    else:  # gaussian
        px_mu, px_logsigma = lasagne.layers.get_output(
            [l_px_mu, l_px_logsigma],
            {l_px_in: z},
            deterministic=deterministic,
        )

    # Entropy term.
    log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
    log_qz_given_ax = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1)
    log_qza_given_x = log_qz_given_ax + log_qa_given_x

    # Log-probability term.
    z_prior_sigma = T.cast(T.ones_like(qz_logsigma), dtype=theano.config.floatX)
    z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=theano.config.floatX)
    log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
    log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)

    if is_bernoulli:
        log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
    else:  # gaussian
        log_px_given_z = log_normal2(x, px_mu, px_logsigma).sum(axis=1)

    log_paxz = log_pa_given_z + log_px_given_z + log_pz

    # Evidence lower bound.
    elbo = T.mean(log_paxz - log_qza_given_x)

    # No separate accuracy metric yet; the second output is mean(qz_logsigma).
    return -elbo, T.mean(qz_logsigma)
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-2, mu=0.9, decay=0.9, epochs=10, batch_sz=100, show_fig=False):
    """Train the network with momentum updates, validating as it goes.

    :param X, Y: training inputs (cast to float32) and labels (cast to int32)
    :param Xvalid, Yvalid: validation inputs and labels, cast the same way
    :param learning_rate: step size passed to ``momentum_updates``
    :param mu: momentum coefficient passed to ``momentum_updates``
    :param decay: accepted but not used by this method
    :param epochs: number of passes over the training data
    :param batch_sz: minibatch size; a trailing partial batch is dropped
    :param show_fig: plot the recorded validation costs at the end
    """
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    Xvalid = Xvalid.astype(np.float32)
    Yvalid = Yvalid.astype(np.int32)

    self.rng = RandomStreams()

    # Hidden layers: each one's input size is the previous one's output size.
    N, D = X.shape
    K = len(set(Y))
    self.hidden_layers = []
    fan_in = D
    for layer_id, fan_out in enumerate(self.hidden_layer_sizes):
        self.hidden_layers.append(HiddenLayer(fan_in, fan_out, layer_id))
        fan_in = fan_out

    # Output (logistic-regression) layer.
    W = np.random.randn(fan_in, K) * np.sqrt(2.0 / fan_in)
    b = np.zeros(K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # Collect params for later use.
    self.params = [self.W, self.b]
    for layer in self.hidden_layers:
        self.params += layer.params

    # Symbolic inputs and the training graph.
    thX = T.matrix('X')
    thY = T.ivector('Y')
    pY_train = self.forward_train(thX)

    # Training cost: mean negative log-probability of the targets.
    cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))
    updates = momentum_updates(cost, self.params, learning_rate, mu)

    train_op = theano.function(
        inputs=[thX, thY],
        updates=updates
    )

    # Evaluation and prediction graph.
    pY_predict = self.forward_predict(thX)
    cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))
    prediction = self.predict(thX)
    cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction])

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            start = j * batch_sz
            stop = start + batch_sz
            train_op(X[start:stop], Y[start:stop])

            # Validate every 50 batches.
            if j % 50 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
import theano.tensor as T X = T.dmatrix() y = T.ivector() prepare_data = lambda x: (theano.shared(x[0].astype('float64')), theano.shared(x[1].astype('int32'))) (training_x, training_y), (test_x, test_y), (validation_x, validation_y) = map( prepare_data, [train_set, test_set, valid_set]) W = theano.shared(numpy.zeros([dims, n_classes])) b = theano.shared(numpy.zeros(n_classes)) y_hat = T.nnet.softmax(T.dot(X, W) + b) y_pred = T.argmax(y_hat, axis=1) test_error = T.mean(T.neq(y_pred, y)) training_error = -T.mean(T.log(y_hat)[T.arange(y.shape[0]), y]) learning_rate = 0.2 params = [W, b] beta = .9 updates = [] for p in params: ms = theano.shared(1. + 0. * p.get_value()) updates += [ (p, p - learning_rate * T.grad(training_error, p) / T.sqrt(ms)), (ms, beta * ms + (1 - beta) * T.sqr(T.grad(training_error, p))) ] idx = T.ivector() training_function = theano.function(inputs=[idx],
def negative_log_likelihood(self, y):
    """Return the mean negative log-likelihood of the labels ``y``.

    :param y: integer vector with the correct label of each example
    :return: symbolic scalar, the negated mean of ``log(p_y_given_x[i, y[i]])``

    NOTE(review): the logarithm was missing, so the negative mean
    *probability* was returned. This assumes ``self.p_y_given_x`` holds
    probabilities rather than log-probabilities -- confirm against where it
    is built.
    """
    log_probs = T.log(self.p_y_given_x)
    return -T.mean(log_probs[T.arange(y.shape[0]), y])
def downsample2d_nearest_neighbour(x, scale=2):
    """Downsample the last two axes of a 4-D tensor by ``scale``.

    Despite the name, every ``scale`` x ``scale`` block is replaced by its
    mean. The sizes of axes 2 and 3 are assumed to be divisible by ``scale``
    -- TODO confirm.

    :param x: 4-D symbolic tensor
    :param scale: integer downsampling factor
    :return: tensor of shape ``(d0, d1, d2 // scale, d3 // scale)``
    """
    # Floor division keeps the target shape integral; ``/`` is true division
    # on Python 3 and would give a float shape.
    blocks = x.reshape((x.shape[0], x.shape[1],
                        x.shape[2] // scale, scale,
                        x.shape[3] // scale, scale))
    # Average inside each block: first over its columns, then over its rows.
    blocks = T.mean(blocks, axis=5)
    return T.mean(blocks, axis=3)
def mean_squared_error(self):
    """Return the mean of the squared difference between ``self.x`` and ``self.y``."""
    residual = self.x - self.y
    return T.mean(residual ** 2)
def accuracy(self, y):
    """Return the accuracy for the mini-batch: the fraction of entries of
    ``y`` equal to ``self.y_out``."""
    matches = T.eq(y, self.y_out)
    return T.mean(matches)
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-4, mu=0.9, decay=0.9,
        epochs=8, batch_sz=100, show_fig=False):
    """Train with RMSprop-scaled momentum updates, validating along the way.

    Training and validation data are passed together. ``learning_rate``,
    ``mu`` (momentum), ``decay`` (RMSprop cache decay), ``epochs`` and
    ``batch_sz`` are the tunable hyper-parameters; the number of update
    steps is ``N // batch_sz * epochs``. Every 50th mini-batch the
    validation cost and error rate are printed. When ``show_fig`` is true
    the recorded validation costs are plotted at the end.
    """
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    Xvalid = Xvalid.astype(np.float32)
    Yvalid = Yvalid.astype(np.int32)

    self.rng = RandomStreams()

    N, D = X.shape
    K = len(set(Y))

    # Hidden layers: the first one consumes the D input features, each
    # following one consumes the previous layer's output. The layer id is
    # simply its position.
    self.hidden_layers = []
    fan_in = D
    for layer_id, fan_out in enumerate(self.hidden_layer_sizes):
        self.hidden_layers.append(HiddenLayer(fan_in, fan_out, layer_id))
        fan_in = fan_out

    # Output layer: last hidden layer -> K classes.
    out_W = np.random.randn(fan_in, K) * np.sqrt(2.0 / fan_in)
    out_b = np.zeros(K)
    self.W = theano.shared(out_W, 'W_logreg')
    self.b = theano.shared(out_b, 'b_logreg')

    # All trainable parameters, output layer first.
    self.params = [self.W, self.b]
    for layer in self.hidden_layers:
        self.params += layer.params

    # Symbolic inputs and training cost.
    thX = T.matrix('X')
    thY = T.ivector('Y')
    pY_train = self.forward_train(thX)
    cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))

    grads = T.grad(cost, self.params)

    # Momentum buffers start at zero, RMSprop caches start at one; both
    # mirror the shape and dtype of their parameter.
    dparams = [
        theano.shared(np.zeros_like(p.get_value())) for p in self.params
    ]
    cache = [
        theano.shared(np.ones_like(p.get_value())) for p in self.params
    ]

    # Build the three groups of updates in lock-step over the parameters.
    cache_updates = []
    momentum_step_updates = []
    param_updates = []
    for p, c, dp, g in zip(self.params, cache, dparams, grads):
        new_c = decay * c + (1 - decay) * g * g
        new_dp = mu * dp - learning_rate * g / T.sqrt(new_c + 1e-10)
        cache_updates.append((c, new_c))
        momentum_step_updates.append((dp, new_dp))
        param_updates.append((p, p + new_dp))
    updates = cache_updates + momentum_step_updates + param_updates

    train_op = theano.function(inputs=[thX, thY], updates=updates)

    # Evaluation graph (still purely symbolic until the functions are called).
    pY_predict = self.forward_predict(thX)
    cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))
    prediction = self.predict(thX)
    cost_predict_op = theano.function(inputs=[thX, thY],
                                      outputs=[cost_predict, prediction])

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            start = j * batch_sz
            stop = start + batch_sz
            # One compiled call applies every update listed above.
            train_op(X[start:stop], Y[start:stop])

            if j % 50 != 0:
                continue
            c, p = cost_predict_op(Xvalid, Yvalid)
            costs.append(c)
            e = error_rate(Yvalid, p)
            print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                  "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
def negative_log_likelihood(self):
    """Return the mean negative log-probability of the labels ``self.y``.

    The softmax of ``self.x`` is stored on ``self.prob_of_y_given_x`` as a
    side effect.
    """
    probabilities = T.nnet.softmax(self.x)
    self.prob_of_y_given_x = probabilities
    rows = T.arange(self.y.shape[0])
    label_log_probs = T.log(probabilities)[rows, self.y]
    return -T.mean(label_log_probs)
def __init__(self, n_in, n_out, state_bounds, action_bounds, reward_bound,
             settings_):
    """
    Build two actor/critic pairs (A and B), a second set of "target"
    networks, and the compiled Theano training and query functions.

    In order to get this to work we need to be careful not to update the
    actor parameters when updating the critic. This can be an issue when
    concatenating networks together: the first network becomes a part of
    the second. You can still access the first network by itself, but any
    update on the second network will affect the first network. Care
    needs to be taken to make sure only the parameters of the second
    network are updated.

    Fixes relative to the previous version:
    * ``self._q_funcActB`` referenced ``self._q_valsActB2``, which is never
      assigned in this method; it now uses ``self._q_valsActB``.
    * ``self._diffB`` was computed from ``self._target`` instead of
      ``self._targetB`` (which was otherwise unused).
    """
    super(DeepDPGDQ, self).__init__(n_in, n_out, state_bounds,
                                    action_bounds, reward_bound, settings_)

    batch_size = self.getSettings()['batch_size']

    # Symbolic inputs, each with a random test value of the batch shape.
    State = T.dmatrix("State")
    State.tag.test_value = np.random.rand(batch_size, self._state_length)
    ResultState = T.dmatrix("ResultState")
    ResultState.tag.test_value = np.random.rand(batch_size,
                                                self._state_length)
    Reward = T.col("Reward")
    Reward.tag.test_value = np.random.rand(batch_size, 1)
    Action = T.dmatrix("Action")
    Action.tag.test_value = np.random.rand(batch_size, self._action_length)

    def dense_stack(incoming, out_units):
        """Stack 128/64/32 leaky-rectify dense layers plus a linear output."""
        hidden = incoming
        for width in (128, 64, 32):
            hidden = lasagne.layers.DenseLayer(
                hidden, num_units=width,
                nonlinearity=lasagne.nonlinearities.leaky_rectify)
        return lasagne.layers.DenseLayer(
            hidden, num_units=out_units,
            nonlinearity=lasagne.nonlinearities.linear)

    def build_actor():
        """Build a state -> action network and return its output layer."""
        state_in = lasagne.layers.InputLayer((None, self._state_length),
                                             State)
        return dense_stack(state_in, self._action_length)

    def build_critic(actor_out):
        """Build a (state, actor output) -> scalar network; return its output layer."""
        state_in = lasagne.layers.InputLayer((None, self._state_length),
                                             State)
        joined = lasagne.layers.ConcatLayer([state_in, actor_out])
        return dense_stack(joined, 1)

    # Small fully-connected networks. The creation order below matches the
    # original code so parameter initialisation is drawn in the same order.
    self._l_outActA = build_actor()
    self._l_outA = build_critic(self._l_outActA)
    self._l_outActB = build_actor()
    self._l_outB = build_critic(self._l_outActB)

    # Target networks.
    # NOTE(review): as in the original code, the target critics are
    # concatenated with the *online* actors (_l_outActA / _l_outActB), not
    # with the target actors built just before them. Confirm this is
    # intended before changing it.
    self._l_outActATarget = build_actor()
    self._l_outATarget = build_critic(self._l_outActA)
    self._l_outActBTarget = build_actor()
    self._l_outBTarget = build_critic(self._l_outActB)

    # Hyper-parameters from the settings dictionary.
    self._learning_rate = self.getSettings()['learning_rate']
    self._discount_factor = self.getSettings()['discount_factor']
    self._rho = self.getSettings()['rho']
    self._rms_epsilon = self.getSettings()['rms_epsilon']
    self._weight_update_steps = self.getSettings(
    )['steps_until_target_network_update']
    self._updates = 0
    self._decay_weight = self.getSettings()['regularization_weight']

    # Shared buffers that hold the current training batch.
    self._states_shared = theano.shared(
        np.zeros((batch_size, self._state_length),
                 dtype=theano.config.floatX))
    self._next_states_shared = theano.shared(
        np.zeros((batch_size, self._state_length),
                 dtype=theano.config.floatX))
    self._rewards_shared = theano.shared(
        np.zeros((batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self._actions_shared = theano.shared(
        np.zeros((batch_size, self._action_length),
                 dtype=theano.config.floatX),
    )

    # Actor outputs.
    self._q_valsActA = lasagne.layers.get_output(self._l_outActA, State)
    self._q_valsActB = lasagne.layers.get_output(self._l_outActB, State)

    # Critic outputs.
    inputs_ = {
        State: self._states_shared,
        Action: self._q_valsActA,
    }
    self._q_valsA = lasagne.layers.get_output(self._l_outA, inputs_)
    inputs_ = {
        ResultState: self._next_states_shared,
        Action: self._q_valsActB,
    }
    self._q_valsA_B = lasagne.layers.get_output(self._l_outBTarget, inputs_)
    inputs_ = {
        State: self._states_shared,
        Action: self._q_valsActB,
    }
    self._q_valsB = lasagne.layers.get_output(self._l_outB, inputs_)
    inputs_ = {
        State: self._next_states_shared,
        Action: self._q_valsActA,
    }
    self._q_valsB_A = lasagne.layers.get_output(self._l_outATarget, inputs_)

    self._q_func = self._q_valsA
    self._q_funcAct = self._q_valsActA
    self._q_funcB = self._q_valsB
    # Fixed: previously read the never-assigned attribute _q_valsActB2.
    self._q_funcActB = self._q_valsActB

    # Each critic is regressed towards a target built from the *other*
    # pair's target critic.
    self._target = (Reward + self._discount_factor * self._q_valsA_B)
    self._diff = self._target - self._q_valsA
    self._targetB = (Reward + self._discount_factor * self._q_valsB_A)
    # Fixed: previously used self._target, leaving _targetB unused.
    self._diffB = self._targetB - self._q_valsB

    self._loss = 0.5 * self._diff**2 + (
        self._decay_weight *
        lasagne.regularization.regularize_network_params(
            self._l_outA, lasagne.regularization.l2))
    self._loss = T.mean(self._loss)

    self._lossB = 0.5 * self._diffB**2 + (
        self._decay_weight *
        lasagne.regularization.regularize_network_params(
            self._l_outB, lasagne.regularization.l2))
    self._lossB = T.mean(self._lossB)

    # Need to remove the action layers from these params: keep only the
    # trailing entries of the combined parameter list, as many as the
    # corresponding actor has.
    self._params = lasagne.layers.helper.get_all_params(
        self._l_outA
    )[-len(lasagne.layers.helper.get_all_params(self._l_outActA)):]
    self._paramsB = lasagne.layers.helper.get_all_params(
        self._l_outB
    )[-len(lasagne.layers.helper.get_all_params(self._l_outActB)):]
    print("******Number of Layers is: " +
          str(len(lasagne.layers.helper.get_all_params(self._l_outA))))
    print("******Number of Action Layers is: " +
          str(len(lasagne.layers.helper.get_all_params(self._l_outActA))))
    self._actionParams = lasagne.layers.helper.get_all_params(
        self._l_outActA)
    self._actionParamsB = lasagne.layers.helper.get_all_params(
        self._l_outActB)

    self._givens_ = {
        State: self._states_shared,
        Reward: self._rewards_shared,
    }
    self._actGivens = {
        State: self._states_shared,
    }

    # TD update: minimise the value-function error. The step size is the
    # base learning rate scaled by the negated mean TD difference.
    self._updates_ = rmsprop(
        T.mean(self._q_func) +
        (self._decay_weight *
         lasagne.regularization.regularize_network_params(
             self._l_outA, lasagne.regularization.l2)),
        self._params,
        self._learning_rate * -T.mean(self._diff), self._rho,
        self._rms_epsilon)
    self._updates_B = rmsprop(
        T.mean(self._q_funcB) +
        (self._decay_weight *
         lasagne.regularization.regularize_network_params(
             self._l_outB, lasagne.regularization.l2)),
        self._paramsB,
        self._learning_rate * -T.mean(self._diffB), self._rho,
        self._rms_epsilon)

    # Actor updates with respect to the Q function.
    # TODO: consider theano.gradient.grad_clip(x, lower_bound, upper_bound).
    self._actionUpdates = rmsprop(
        T.mean(self._q_func) +
        (self._decay_weight *
         lasagne.regularization.regularize_network_params(
             self._l_outActA, lasagne.regularization.l2)),
        self._actionParams,
        self._learning_rate * 0.1, self._rho, self._rms_epsilon)
    self._actionUpdatesB = rmsprop(
        T.mean(self._q_funcB) +
        (self._decay_weight *
         lasagne.regularization.regularize_network_params(
             self._l_outActB, lasagne.regularization.l2)),
        self._actionParamsB,
        self._learning_rate * 0.1, self._rho, self._rms_epsilon)

    # Compiled training functions (inputs come from the shared buffers).
    self._train = theano.function([], [self._loss, self._q_valsA],
                                  updates=self._updates_,
                                  givens=self._givens_)
    self._trainB = theano.function([], [self._lossB, self._q_valsB],
                                   updates=self._updates_B,
                                   givens=self._givens_)
    self._trainActor = theano.function([], [self._q_valsA],
                                       updates=self._actionUpdates,
                                       givens=self._actGivens)
    self._trainActorB = theano.function([], [self._q_valsB],
                                        updates=self._actionUpdatesB,
                                        givens=self._actGivens)

    # Compiled query functions.
    self._q_val = theano.function([], self._q_valsA,
                                  givens={State: self._states_shared})
    self._q_valB = theano.function([], self._q_valsB,
                                   givens={State: self._states_shared})
    self._q_action = theano.function([], self._q_valsActA,
                                     givens={State: self._states_shared})
    self._q_actionB = theano.function([], self._q_valsActB,
                                      givens={State: self._states_shared})

    inputs_ = [
        State,
        Reward,
    ]
    self._bellman_error = theano.function(inputs=inputs_,
                                          outputs=self._diff,
                                          allow_input_downcast=True)
    self._bellman_errorB = theano.function(inputs=inputs_,
                                           outputs=self._diffB,
                                           allow_input_downcast=True)
def cost(self, net):
    """Return the log-likelihood cost.

    This is the negated mean of ``log(self.output_dropout)`` taken at the
    labels ``net.y``.
    """
    log_probs = T.log(self.output_dropout)
    label_log_probs = log_probs[T.arange(net.y.shape[0]), net.y]
    return -T.mean(label_log_probs)
def train(self, X, Y, **kwargs):
    """
    Fit the network on (X, Y), then fit Bayesian linear regression models
    on the output of its second-to-last layer.

    Parameters
    ----------
    X: np.ndarray (N, D)
        Input data points, N points with D features each.
    Y: np.ndarray (N, T)
        Target values; N has to match the number of points in X and T is
        the number of objectives.
    """
    # Normalize inputs (a single mean / std over the whole array).
    self.X = X
    self.X_mean = np.mean(X)
    self.X_std = np.std(X)
    self.norm_X = (X - self.X_mean) / self.X_std

    # Never use a batch larger than the data set.
    batch_size = min(self.X.shape[0], self.batch_size)

    # Normalize outputs.
    self.Y_mean = np.mean(Y)
    self.Y_std = np.std(Y)
    self.Y = (Y - self.Y_mean) / self.Y_std

    start_time = time.time()

    # Build the network and its training objective.
    features = X.shape[1]
    self.learning_rate = theano.shared(
        np.array(self.init_learning_rate, dtype=theano.config.floatX))
    self.network = self._build_net(self.input_var, features)

    prediction = lasagne.layers.get_output(self.network)

    # Scaled mean squared error plus an L2 penalty on the weights.
    loss = T.mean(T.square(prediction - self.target_var)) / 0.001
    loss += self.l2 * lasagne.regularization.regularize_network_params(
        self.network, lasagne.regularization.l2)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(self.network, trainable=True)
    updates = lasagne.updates.adam(loss, params,
                                   learning_rate=self.learning_rate)

    logging.debug("... compiling theano functions")
    self.train_fn = theano.function([self.input_var, self.target_var],
                                    loss,
                                    updates=updates,
                                    allow_input_downcast=True)

    # Training loop; lc holds the mean batch loss of every epoch.
    lc = np.zeros([self.num_epochs])
    for epoch in range(self.num_epochs):
        epoch_start_time = time.time()

        # Full pass over the training data.
        train_err = 0
        train_batches = 0
        for inputs, targets in self.iterate_minibatches(self.norm_X,
                                                        self.Y,
                                                        batch_size,
                                                        shuffle=True):
            train_err += self.train_fn(inputs, targets)
            train_batches += 1

        lc[epoch] = train_err / train_batches
        logging.debug("Epoch {} of {}".format(epoch + 1, self.num_epochs))

        curtime = time.time()
        epoch_time = curtime - epoch_start_time
        total_time = curtime - start_time
        logging.debug("Epoch time {:.3f}s, "
                      "total time {:.3f}s".format(epoch_time, total_time))
        logging.debug("Training loss:\t\t{:.5g}".format(train_err /
                                                        train_batches))

        # Adapt the learning rate.
        if epoch % self.adapt_epoch == 0:
            self.learning_rate.set_value(
                np.float32(self.init_learning_rate * 0.1))

    # Design matrix: output of the last layer before the network output.
    layers = lasagne.layers.get_all_layers(self.network)
    self.Theta = lasagne.layers.get_output(layers[:-1],
                                           self.norm_X)[-1].eval()

    if not self.do_optimize:
        # Use the fixed hyperparameters as they are.
        self.hypers = [[self.alpha, self.beta]]
    elif self.do_mcmc:
        self.sampler = emcee.EnsembleSampler(self.n_hypers, 2,
                                             self.marginal_log_likelihood)

        # Do a burn-in in the first iteration.
        if not self.burned:
            # Initialize the walkers by sampling from the prior.
            self.p0 = self.prior.sample_from_prior(self.n_hypers)
            # Run MCMC sampling.
            self.p0, _, _ = self.sampler.run_mcmc(self.p0,
                                                  self.burnin_steps)
            self.burned = True

        # Start sampling.
        pos, _, _ = self.sampler.run_mcmc(self.p0, self.chain_length)

        # Save the current position; it is the start point of the next
        # iteration.
        self.p0 = pos

        # Take the last samples from each walker.
        self.hypers = np.exp(self.sampler.chain[:, -1])
    else:
        # Optimize hyperparameters of the Bayesian linear regression.
        res = optimize.fmin(self.nll, np.random.rand(2))
        self.hypers = [[np.exp(res[0]), np.exp(res[1])]]

    logging.info("Hypers: %s" % self.hypers)

    # One Bayesian linear regression model per hyperparameter configuration.
    self.models = []
    for sample in self.hypers:
        model = BayesianLinearRegression(alpha=sample[0],
                                         beta=sample[1],
                                         basis_func=None)
        model.train(self.Theta, self.Y, do_optimize=False)
        self.models.append(model)
def build_objective(model, deterministic=False, epsilon=1e-12):
    """Return the mean binary cross-entropy between ``model.l_out`` and ``model.l_target``.

    Predictions are clipped to ``[epsilon, 1 - epsilon]`` before the
    cross-entropy is taken; the targets are flattened.
    """
    predictions = nn.layers.get_output(model.l_out,
                                       deterministic=deterministic)
    targets = T.flatten(nn.layers.get_output(model.l_target))
    clipped = T.clip(predictions, epsilon, 1. - epsilon)
    return T.mean(T.nnet.binary_crossentropy(clipped, targets))
def __init__(self, rng, input, model_params, self_norm_coeff, activation,
             dropout, is_test):
    """Assemble the embedding, hidden and softmax layers from ``model_params``.

    ``model_params`` is the tuple ``(ngram_size, linear_W_emb, hidden_Ws,
    hidden_bs, softmax_W, softmax_b)``. Layer sizes are read from the
    shapes of the supplied weights and checked for consistency with
    asserts. Progress information is written to stderr.

    Fixes relative to the previous version:
    * the softmax assert message supplied three arguments to a
      two-placeholder format string, so a failing assert raised a
      formatting error instead of showing the message;
    * the softmax layer is fed from the last layer actually built, so a
      model without hidden layers no longer fails with an IndexError;
    * ``xrange`` is replaced by ``range``.
    """
    (self.ngram_size, linear_W_emb, hidden_Ws, hidden_bs, softmax_W,
     softmax_b) = model_params
    (in_vocab_size, emb_dim) = linear_W_emb.shape
    (softmax_in, softmax_out) = softmax_W.shape
    context_size = self.ngram_size - 1

    self.emb_dim = emb_dim
    self.in_vocab_size = in_vocab_size

    # linear embedding layer
    sys.stderr.write(
        '# linear layer: in_vocab_size=%d, emb_dim=%d, context_size=%d\n' %
        (in_vocab_size, emb_dim, context_size))
    self.linearLayer = LinearLayer(rng, input, emb_dim, context_size,
                                   in_vocab_size, linear_W_emb)

    # hidden layers
    self.hidden_layers = []
    cur_hidden_in = emb_dim * context_size
    self.num_hidden_layers = len(hidden_Ws)
    sys.stderr.write('# hidden layers=%d\n' % self.num_hidden_layers)
    hidden_params = []
    prev_layer = self.linearLayer
    for ii in range(self.num_hidden_layers):
        hidden_W = hidden_Ws[ii]
        hidden_b = hidden_bs[ii]
        (hidden_in, hidden_out) = hidden_W.shape
        assert cur_hidden_in == hidden_in, \
            '! hidden layer %d: cur_hidden_in %d != hidden_in %d\n' % (
                ii + 1, cur_hidden_in, hidden_in)
        sys.stderr.write(
            ' hidden layer %d: hidden_in=%d, hidden_out=%d\n' %
            (ii + 1, hidden_in, hidden_out))
        hidden_layer = HiddenLayer(rng, prev_layer.output, hidden_in,
                                   hidden_out, activation, hidden_W,
                                   hidden_b, dropout)
        self.hidden_layers.append(hidden_layer)
        hidden_params = hidden_params + hidden_layer.params

        cur_hidden_in = hidden_out
        prev_layer = hidden_layer

    # softmax
    assert cur_hidden_in == softmax_in, \
        '! softmax layer: cur_hidden_in %d != softmax_in %d\n' % (
            cur_hidden_in, softmax_in)
    sys.stderr.write('# softmax layer: softmax_in=%d, softmax_out=%d\n' %
                     (softmax_in, softmax_out))
    # prev_layer is the last hidden layer, or the embedding layer when
    # there are no hidden layers.
    self.softmaxLayer = SoftmaxLayer(prev_layer.output, softmax_W,
                                     softmax_b, self_norm_coeff, is_test)

    # nll
    self.nll = self.softmaxLayer.nll

    # sum_ll
    self.sum_ll = self.softmaxLayer.sum_ll

    # ind_ll (only exposed in test mode)
    if is_test == 1:
        self.ind_ll = self.softmaxLayer.ind_ll

    if is_test == 0 and self_norm_coeff > 0:
        # to observe how much we compressed log |Z(x)|
        self.mean_abs_log_norm = T.mean(T.abs_(self.softmaxLayer.log_norm))
        # for cost function (log Z(x))^2
        self.mean_square_log_norm = T.mean(self.softmaxLayer.log_norm**2)

    # params
    self.params = (self.linearLayer.params + hidden_params +
                   self.softmaxLayer.params)
def main(train_file, val_file, savename, modelFile, num_epochs=500,
         alpha=0.1, margin=25, base=0.01, mb_size=50, momentum=0.9,
         synsets=None):
    """Train the CNN, validate after every epoch, and save the model.

    Training and validation examples are loaded from ``train_file`` and
    ``val_file``. Running training errors and accuracies are flushed to
    ``savename`` every 257 mini-batches, validation accuracy once per
    epoch, and the network is saved to ``modelFile`` after every epoch.
    The process exits as soon as a training loss is NaN.

    Fixes relative to the previous version:
    * bare ``print`` statements (a no-op expression under Python 3) were
      used to terminate the ``\\r`` progress line; an explicit newline is
      written instead, which behaves the same under Python 2;
    * the stray trailing commas after ``sys.stdout.write(...)`` are gone;
    * the "Epoch ... took" message now reports the duration of that epoch
      rather than the time since training started.
    """
    print("Loading data...")
    print('Alpha: %f' % (alpha, ))
    print('Save name: %s' % (savename, ))
    tr_addresses, tr_labels = get_traindata(train_file, synsets)
    vl_addresses, vl_labels = get_valdata(val_file)

    N = len(tr_addresses)
    print('Num training examples: %i' % (N, ))
    print('Alpha/N: %e' % (alpha / N, ))

    # Variables
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    learning_rate = T.fscalar('learning_rate')
    im_shape = (227, 227)
    max_grad = 1.

    print("Building model and compiling functions...")
    network = build_cnn(im_shape, input_var=input_var)

    # Losses and updates
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction,
                                                       target_var)
    loss = loss.mean() + regularization(prediction, alpha / N).mean()
    # NOTE(review): ``deterministic`` is passed here as a parameter tag;
    # confirm whether ``trainable=True`` was intended.
    params = lasagne.layers.get_all_params(network, deterministic=False)
    updates = clipped_nesterov_momentum(loss, params, learning_rate,
                                        max_grad, momentum=momentum)

    # Validation and testing
    test_prediction = lasagne.layers.get_output(network,
                                                deterministic=True)
    train_acc = T.mean(T.eq(T.argmax(prediction, axis=1), target_var),
                       dtype=theano.config.floatX)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Theano functions
    train_fn = theano.function([input_var, target_var, learning_rate],
                               [loss, train_acc], updates=updates)
    val_fn = theano.function([input_var, target_var], test_acc)

    print("Starting training...")
    start_time = time.time()
    for epoch in range(num_epochs):
        epoch_start = time.time()

        # In each epoch, we do a full pass over the training data.
        epoch_lr = get_learning_rate(epoch, margin, base)
        train_err = 0
        train_batches = 0
        running_error = []
        running_acc = []
        acc = 0.

        trdlg = data_and_label_generator(tr_addresses, tr_labels, im_shape,
                                         mb_size, shuffle=True)
        for batch in threaded_gen(trdlg, num_cached=500):
            inputs, targets = batch
            local_train_err, local_train_acc = train_fn(
                inputs, targets, epoch_lr)
            train_err += local_train_err
            acc += local_train_acc
            train_batches += 1
            # Abort the whole run as soon as the loss diverges.
            if np.isnan(local_train_err):
                sys.exit()

            running_error.append(local_train_err)
            running_acc.append(local_train_acc)
            if train_batches % 257 == 0:
                save_errors(savename, running_error, err_type='error')
                save_errors(savename, running_acc, err_type='acc')
                running_error = []
                running_acc = []

            # Progress line; '\r' keeps rewriting the same console line.
            h, m, s = theTime(start_time)
            sys.stdout.write(
                'Time: %d:%02d:%02d Minibatch: %i Training Error: %f\r' %
                (h, m, s, train_batches, train_err / train_batches))
            sys.stdout.flush()
        # Terminate the progress line.
        sys.stdout.write('\n')

        val_acc = 0
        val_batches = 0
        running_val_acc = []
        vldlg = data_and_label_generator(vl_addresses, vl_labels, im_shape,
                                         mb_size)
        for batch in threaded_gen(vldlg, num_cached=50):
            inputs, targets = batch
            val_acc += val_fn(inputs, targets)
            val_batches += 1
            sys.stdout.write('Minibatch: %i Validation Accuracy: %f\r' %
                             (val_batches, val_acc / val_batches * 100))
            sys.stdout.flush()

        running_val_acc.append(val_acc / val_batches)
        save_errors(savename, running_val_acc, err_type='val_acc')

        # Terminate the progress line.
        sys.stdout.write('\n')
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - epoch_start))
        print(" train loss:\t\t{:.6f}".format(train_err / train_batches))
        print(" valid acc:\t\t{:.6f}".format(val_acc / val_batches * 100.))

        save_model(network, modelFile)
def L2_cost(self, y):
    """Return the mean over rows of the summed squared error ``(self.output - y)**2``."""
    squared_error = (self.output - y) ** 2
    per_row = T.sum(squared_error, axis=1)
    return T.mean(per_row)
def nll(self, y):
    """Return the mean negative log-likelihood of the labels ``y``.

    Reads the entries ``self.log_p_y_given_x[i, y[i]]`` and returns their
    negated mean.
    """
    rows = T.arange(y.shape[0])
    label_log_probs = self.log_p_y_given_x[rows, y]
    return -T.mean(label_log_probs)
def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False): N = len(X) D = self.D M = self.M V = self.V self.f = activation # initial weights We = init_weight(V, D) Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, V) bo = np.zeros(V) # make them theano shared self.We = theano.shared(We) self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [ self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo ] thX = T.ivector('X') Ei = self.We[thX] # will be a TxD matrix thY = T.ivector('Y') # sentence input: # [START, w1, w2, ..., wn] # sentence target: # [w1, w2, w3, ..., END] def recurrence(x_t, h_t1): # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( fn=recurrence, outputs_info=[self.h0, None], sequences=Ei, n_steps=Ei.shape[0], ) py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction], updates=updates) costs = [] n_total = sum((len(sentence) + 1) for sentence in X) for i in xrange(epochs): X = shuffle(X) n_correct = 0 cost = 0 for j in xrange(N): # problem! many words --> END token are overrepresented # result: generated lines will be very short # we will try to fix in a later iteration # BAD! magic numbers 0 and 1... 
input_sequence = [0] + X[j] output_sequence = X[j] + [1] # we set 0 to start and 1 to end c, p = self.train_op(input_sequence, output_sequence) # print "p:", p cost += c # print "j:", j, "c:", c/len(X[j]+1) for pj, xj in zip(p, output_sequence): if pj == xj: n_correct += 1 print "i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def softmax_cost(self, y):
    """Return the mean over rows of the summed element-wise cross-entropy.

    Each element contributes ``y*log(output) + (1-y)*log(1-output)``; the
    terms are summed along axis 1, negated, and averaged.
    """
    positive_term = y * T.log(self.output)
    negative_term = (1 - y) * T.log(1 - self.output)
    L = -T.sum(positive_term + negative_term, axis=1)
    return T.mean(L)
# Debug: evaluate the last hidden layer on the test batch and print the
# shape of its output.
print l_last_hid.name, lasagne.layers.get_output(
    l_last_hid, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(
        {x_sym: Tdata, xmask_sym: Tmask}).shape

# Softmax classifier on top of the last hidden layer.
l_softmax = lasagne.layers.DenseLayer(
    l_last_hid, num_units=NUM_CLASS,
    nonlinearity=lasagne.nonlinearities.softmax, name='SoftmaxOutput')
print l_softmax.name, lasagne.layers.get_output(
    l_softmax, inputs={l_in: x_sym, l_mask_enc: xmask_sym}).eval(
        {x_sym: Tdata, xmask_sym: Tmask}).shape

# Total number of parameters feeding the softmax output.
print lasagne.layers.count_params(l_softmax)

# Training-mode (non-deterministic) network output.
output_train = lasagne.layers.get_output(
    l_softmax, inputs={l_in: x_sym, l_mask_enc: xmask_sym},
    deterministic=False)

# Cost function: mean categorical cross-entropy against the flattened labels.
total_cost = T.nnet.categorical_crossentropy(output_train, y_sym.flatten())
mean_cost = T.mean(total_cost)

# Accuracy function.
# NOTE(review): the cost flattens y_sym but this comparison does not;
# confirm the shape of y_sym so the comparison does not broadcast.
argmax = T.argmax(output_train, axis=-1)
eq = T.eq(argmax,y_sym)
acc = T.mean(eq)

# List every trainable parameter with its shape.
all_parameters = lasagne.layers.get_all_params([l_softmax], trainable=True)
print "Trainable Model Parameters"
print "-"*40
for param in all_parameters:
    print param, param.get_value().shape
print "-"*40

# Symbolic gradients of the mean cost with respect to each parameter.
all_grads = T.grad(mean_cost, all_parameters)
# Held-out split selected by test_ix.
X_te = X[test_ix]
y_te = y[test_ix]

# New softmax output layer on top of the 'fc7' layer of the existing
# network, with one unit per entry of CLASSES.
output_layer = DenseLayer(net['fc7'], num_units=len(CLASSES),
                          nonlinearity=softmax)

# Symbolic inputs: a 4-D input tensor and an int32 label vector.
X_sym = T.tensor4()
y_sym = T.ivector()

prediction = lasagne.layers.get_output(output_layer, X_sym)
# Mean categorical cross-entropy over the batch.
loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym)
loss = loss.mean()

# Fraction of rows whose arg-max prediction equals the label.
acc = T.mean(T.eq(T.argmax(prediction, axis=1), y_sym),
             dtype=theano.config.floatX)

params = lasagne.layers.get_all_params(output_layer, trainable=True)
updates = lasagne.updates.nesterov_momentum(loss, params,
                                            learning_rate=0.0001,
                                            momentum=0.9)

# Compiled functions: one training step, loss/accuracy evaluation, and
# raw predictions.
train_fn = theano.function([X_sym, y_sym], loss, updates=updates)
val_fn = theano.function([X_sym, y_sym], [loss, acc])
pred_fn = theano.function([X_sym], prediction)


# NOTE(review): the definition below is cut off in this view.
def batches(iterable, N):
    chunk = []
    for item in iterable:
# Softmax output layer on top of ``convpool``, one unit per class.
network = lasagne.layers.DenseLayer(
    convpool, num_units=nb_classes,
    nonlinearity=lasagne.nonlinearities.softmax)

# Training objective: mean categorical cross-entropy.
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
loss = loss.mean()

params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adadelta(loss, params, learning_rate=1.0)

# Evaluation uses the deterministic forward pass.
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(
    test_prediction, target_var)
test_loss = test_loss.mean()
# Fraction of rows whose arg-max prediction equals the target.
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                  dtype=theano.config.floatX)

# Compilation: one training step, loss/accuracy evaluation, and raw
# deterministic predictions.
train_fn = theano.function([l_in.input_var, target_var], loss,
                           updates=updates)
val_fn = theano.function([l_in.input_var, target_var],
                         [test_loss, test_acc])
pred_fn = theano.function([l_in.input_var], test_prediction)

# Bookkeeping initialised here for the training code that follows.
# NOTE(review): presumably used for early stopping; the consumer is
# outside this view.
patience = 30
best_valid = 0
best_valid_epoch = 0
best_weights = None
train_history = {}
def train_and_eval_gauss_logit_SB_VAE( dataset, hidden_layer_sizes, hidden_layer_types, latent_size, activations, prior_mu, prior_sigma, n_epochs, batch_size, lookahead, adam_lr, experiment_dir, output_file_base_name, random_seed): rng = np.random.RandomState(random_seed) # LOAD DATA if "mnist_plus_rot" in dataset: datasets = load_mnist_w_rotations(dataset, target_as_one_hot=True, flatten=False, split=(70000, 10000, 20000)) input_layer_size = 28*28 layer_sizes = [input_layer_size] + hidden_layer_sizes out_activation = Sigmoid neg_log_likelihood_fn = calc_binaryVal_negative_log_likelihood print "Dataset: MNIST+rot" elif "mnist" in dataset: # We follow the approach used in [2] to split the MNIST dataset. datasets = load_mnist(dataset, target_as_one_hot=True, flatten=True, split=(45000, 5000, 10000)) input_layer_size = 28*28 layer_sizes = [input_layer_size] + hidden_layer_sizes out_activation = Sigmoid neg_log_likelihood_fn = calc_binaryVal_negative_log_likelihood print "Dataset: MNIST" elif "svhn_pca" in dataset: datasets = load_svhn_pca(dataset, target_as_one_hot=True, train_valid_split=(65000, 8257)) input_layer_size = 500 layer_sizes = [input_layer_size] + hidden_layer_sizes out_activation = Identity neg_log_likelihood_fn = calc_realVal_negative_log_likelihood print "Dataset: SVHN (PCA reduced)" else: print "no data found..." exit() train_set_x, _ = datasets[0] valid_set_x, _ = datasets[1] test_set_x, _ = datasets[2] train_set_size = int(train_set_x.shape[0].eval()) valid_set_size = int(valid_set_x.shape[0].eval()) test_set_size = int(test_set_x.shape[0].eval()) print 'Datasets loaded ({:,} train | {:,} valid | {:,} test)'.format(train_set_size, valid_set_size, test_set_size) # compute number of minibatches for training, validation and testing n_train_batches = train_set_size / batch_size n_test_batches = test_set_size / batch_size n_valid_batches = valid_set_size / batch_size # BUILD MODEL print '... 
building the model' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # construct the Gaussian Variational Autoencoder model = Gauss_Logit_SB_VAE(rng=rng, input=x, batch_size=batch_size, layer_sizes=layer_sizes, layer_types=hidden_layer_types, activations=activations, latent_size=latent_size, out_activation=out_activation) # Build the expresson for the cost function. data_ll_term = neg_log_likelihood_fn(x, model.x_recon) kl = model.calc_kl_divergence(prior_mu=prior_mu, prior_sigma=prior_sigma) # Compose into final costs cost = T.mean( data_ll_term + kl ) updates = get_adam_updates(cost=cost, params=model.params, lr=adam_lr) # Compile theano function for testing. test_model = theano.function( inputs = [index], outputs = T.mean(neg_log_likelihood_fn(x, model.x_recon)), givens = {x: test_set_x[index * batch_size:(index + 1) * batch_size]}) # Compile theano function for validation. valid_model = theano.function( inputs = [index], outputs = T.mean(neg_log_likelihood_fn(x, model.x_recon)), givens = {x: valid_set_x[index * batch_size:(index + 1) * batch_size]}) # Compile theano function for training. train_model = theano.function( inputs = [index], outputs = [data_ll_term.mean(), kl.mean()], updates = updates, givens = {x: train_set_x[index * batch_size:(index + 1) * batch_size]}) # TRAIN MODEL # print 'Training for {} epochs ...'.format(n_epochs) best_params = None best_valid_error = np.inf best_iter = 0 start_time = time.clock() results_file_name = pjoin(experiment_dir, "gauss_logit_SB_VAE_results_.txt") results_file = open(results_file_name, 'w') stop_training = False for epoch_counter in range(n_epochs): if stop_training: break # Train this epoch epoch_start_time = time.time() avg_training_nll_tracker = 0. avg_training_kl_tracker = 0. 
for minibatch_index in xrange(n_train_batches): avg_training_nll, avg_training_kl = train_model(minibatch_index) # check for NaN, test model anyway even if one is detected if (np.isnan(avg_training_nll) or np.isnan(avg_training_kl)): print "found NaN...aborting training..." results_file.write("found NaN...aborting training... \n\n") if epoch_counter > 0: for param, best_param in zip(model.params, best_params): param.set_value(best_param) test_error = sum([test_model(i) for i in xrange(n_test_batches)]) / n_test_batches results = "Ended due to NaN! best epoch {}, best valid error {:.4f}, test error {:.4f}, training time {:.2f}m" results = results.format(best_iter, best_valid_error, test_error, (end_time-start_time)/60) print results results_file.write(results + "\n") results_file.close() exit() avg_training_nll_tracker += avg_training_nll avg_training_kl_tracker += avg_training_kl epoch_end_time = time.time() # Compute some infos about training. avg_training_nll_tracker /= (minibatch_index+1) avg_training_kl_tracker /= (minibatch_index+1) # Compute validation error valid_error = sum([valid_model(i) for i in xrange(n_valid_batches)])/n_valid_batches results = "epoch {}, training loss (NLL) {:.4f}, training kl divergence {:.4f}, valid error {:.4f}, time {:.2f} " if valid_error < best_valid_error: best_iter = epoch_counter best_valid_error = valid_error results += " ***" # Save progression best_params = [param.get_value().copy() for param in model.params] #cPickle.dump(best_params, open(pjoin(experiment_dir, 'gauss_vae_params_'+output_file_base_name+'.pkl'), 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) cPickle.dump(best_params, open(pjoin(experiment_dir, 'gauss_logit_SB_VAE_params_.pkl'), 'wb'), protocol=cPickle.HIGHEST_PROTOCOL) elif epoch_counter-best_iter > lookahead: stop_training = True # Report and save progress. 
results = results.format(epoch_counter, avg_training_nll_tracker, avg_training_kl_tracker, valid_error, (epoch_end_time-epoch_start_time)/60) print results results_file.write(results + "\n") results_file.flush() end_time = time.clock() # Reload best model. for param, best_param in zip(model.params, best_params): param.set_value(best_param) # Compute test error on best epoch test_error = sum([test_model(i) for i in xrange(n_test_batches)])/n_test_batches results = "Done! best epoch {}, best valid error {:.4f}, test error {:.4f}, training time {:.2f}m" results = results.format(best_iter, best_valid_error, test_error, (end_time-start_time)/60) print results results_file.write(results + "\n") results_file.close() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def get_cost(self, y):
    """Return the mean binary cross-entropy of ``self.p_y_given_x`` against ``y``."""
    prob = self.p_y_given_x
    # Log-likelihood contribution where the target is 1 ...
    positive_term = y * T.log(prob)
    # ... and where the target is 0.
    negative_term = (1 - y) * T.log(1 - prob)
    return -T.mean(positive_term + negative_term)
def build_functions(self, train=False, debug=False, logger=logger_RNNtools):
    """Compile the Theano functions used to evaluate and train the network.

    Always sets ``self.valid_targets_fn``, ``self.valid_predictions_fn``,
    ``self.top1_acc_fn``, ``self.top3_acc_fn``, ``self.validate_fn`` and
    ``self.cost_pointwise_fn``.

    Parameters
    ----------
    train : bool, optional
        If true, additionally build the Adam updates (``self.updates``) and
        ``self.train_fn``, which takes the learning rate as its last input.
    debug : bool, optional
        If true, compile extra inspection functions (``self.predictions_fn``,
        ``self.output_fn``, ``self.valid_network_output_fn``), run every
        function once on ``self.X`` / ``self.masks`` / ``self.Y`` and log the
        results.  A failure in these trial runs prints the traceback and
        drops into ``pdb``.
    logger : logging.Logger, optional
        Logger receiving the debug output.
    """
    # LSTM in lasagne: see https://github.com/craffel/Lasagne-tutorial/blob/master/examples/recurrent.py
    # and also http://colinraffel.com/talks/hammer2015recurrent.pdf
    target_var = self.audio_targets_var  # T.imatrix('audio_targets')

    if debug:
        import pdb
        self.print_network_structure()

    network_output = L.get_output(self.network_lout_batch)
    # (batch_size * batch_max_seq_length, nb_phonemes)
    network_output_flattened = L.get_output(self.network_lout)

    # Predicted class = arg-max over the third axis (index 2) of the batch
    # output, whose axes are batch_size (input files), time_seq (frames),
    # n_features (phonemes).
    predictions = T.argmax(network_output, axis=2)

    if debug:
        self.predictions_fn = theano.function([self.audio_inputs_var, self.audio_masks_var], predictions,
                                              name='predictions_fn')
        predicted = self.predictions_fn(self.X, self.masks)
        logger.debug('predictions_fn(X).shape: %s', predicted.shape)

        self.output_fn = theano.function([self.audio_inputs_var, self.audio_masks_var], network_output,
                                         name='output_fn')
        n_out = self.output_fn(self.X, self.masks)
        logger.debug('network_output.shape: \t%s', n_out.shape)

    # Indices (which example, which frame) of all non-zero mask entries.
    valid_indices_example, valid_indices_seqNr = self.audio_masks_var.nonzero()
    valid_indices_fn = theano.function([self.audio_masks_var], [valid_indices_example, valid_indices_seqNr],
                                       name='valid_indices_fn')

    # Indexing with these arrays yields a FLATTENED array of all valid
    # entries of the batch (not one row per example).  To recover the valid
    # predictions per example, split it using the number of valid frames per
    # input; this is trivial for batch_size_audio = 1.
    valid_predictions = predictions[valid_indices_example, valid_indices_seqNr]
    valid_targets = target_var[valid_indices_example, valid_indices_seqNr]
    self.valid_targets_fn = theano.function([self.audio_masks_var, target_var], valid_targets,
                                            name='valid_targets_fn')
    self.valid_predictions_fn = theano.function([self.audio_inputs_var, self.audio_masks_var],
                                                valid_predictions, name='valid_predictions_fn')

    # Network output restricted to the valid frames.
    valid_network_output = network_output[valid_indices_example, valid_indices_seqNr]
    if debug:
        self.valid_network_output_fn = theano.function([self.audio_inputs_var, self.audio_masks_var],
                                                       valid_network_output)

    # Top-1 / top-3 accuracy over the valid frames only.
    top1_acc = T.mean(lasagne.objectives.categorical_accuracy(valid_network_output, valid_targets, top_k=1))
    self.top1_acc_fn = theano.function(
        [self.audio_inputs_var, self.audio_masks_var, self.audio_targets_var], top1_acc)
    top3_acc = T.mean(lasagne.objectives.categorical_accuracy(valid_network_output, valid_targets, top_k=3))
    self.top3_acc_fn = theano.function(
        [self.audio_inputs_var, self.audio_masks_var, self.audio_targets_var], top3_acc)

    # (An earlier variant selected the valid frames with a lasagne SliceLayer;
    # it only worked with batch_size == 1 and has been removed.)

    if debug:
        try:
            valid_example, valid_seqNr = valid_indices_fn(self.masks)
            logger.debug('valid_inds(masks).shape: %s', valid_example.shape)

            valid_output = self.valid_network_output_fn(self.X, self.masks)
            logger.debug("all valid outputs of this batch: ")
            logger.debug('valid_output.shape: %s', valid_output.shape)

            valid_preds = self.valid_predictions_fn(self.X, self.masks)
            logger.debug("all valid predictions of this batch: ")
            logger.debug('valid_preds.shape: %s', valid_preds.shape)
            logger.debug('valid_preds, value: \n%s', valid_preds)

            valid_targs = self.valid_targets_fn(self.masks, self.Y)
            logger.debug('valid_targets.shape: %s', valid_targs.shape)
            logger.debug('valid_targets, value: \n%s', valid_targs)

            top1 = self.top1_acc_fn(self.X, self.masks, self.Y)
            logger.debug("top 1 accuracy: %s", top1*100.0)

            top3 = self.top3_acc_fn(self.X, self.masks, self.Y)
            logger.debug("top 3 accuracy: %s", top3*100.0)
        except Exception:
            print('caught this error: ' + traceback.format_exc())
            pdb.set_trace()

    ## from https://groups.google.com/forum/#!topic/lasagne-users/os0j3f_Th5Q
    # Pad the label vectors and then mask the cost.  The padding must be a
    # valid label (such as zero), since padded positions still have to give
    # valid costs that can be multiplied by the mask.
    # Predictions, targets and mask must match in shape: predictions as
    # (batch_size*max_seq_len, n_features), the other two as
    # (batch_size*max_seq_len,) -- hence the flattened network output.
    cost_pointwise = lasagne.objectives.categorical_crossentropy(network_output_flattened, target_var.flatten())
    # NOTE(review): aggregate() is called with its default mode; confirm
    # whether normalising by the number of valid frames was intended instead.
    cost = lasagne.objectives.aggregate(cost_pointwise, self.audio_masks_var.flatten())

    # L2 weight decay over all network parameters.
    weight_decay = 1e-5
    weightsl2 = lasagne.regularization.regularize_network_params(self.network_lout, lasagne.regularization.l2)
    cost += weight_decay * weightsl2

    self.validate_fn = theano.function([self.audio_inputs_var, self.audio_masks_var, self.audio_targets_var],
                                       [cost, top1_acc, top3_acc], name='validate_fn')
    self.cost_pointwise_fn = theano.function([self.audio_inputs_var, self.audio_masks_var, target_var],
                                             cost_pointwise, name='cost_pointwise_fn')

    if debug:
        logger.debug('cost pointwise: %s', self.cost_pointwise_fn(self.X, self.masks, self.Y))
        try:
            evaluate_cost = self.validate_fn(self.X, self.masks, self.Y)
        except Exception:
            print('caught this error: ' + traceback.format_exc())
            pdb.set_trace()
        else:
            # Only log the results when the call succeeded; after a failure
            # evaluate_cost is unbound.
            logger.debug('cost:     {:.3f}'.format(float(evaluate_cost[0])))
            logger.debug('accuracy: {:.3f}'.format(float(evaluate_cost[1]*100.0)))
            logger.debug('top 3 accuracy: {:.3f}'.format(float(evaluate_cost[2]*100.0)))

    if train:
        # The learning rate is a symbolic input so it can change per call.
        LR = T.scalar('LR', dtype=theano.config.floatX)
        # Retrieve all trainable parameters from the network
        all_params = L.get_all_params(self.network_lout, trainable=True)
        self.updates = lasagne.updates.adam(loss_or_grads=cost, params=all_params, learning_rate=LR)
        self.train_fn = theano.function([self.audio_inputs_var, self.audio_masks_var, target_var, LR],
                                        [cost, top1_acc, top3_acc], updates=self.updates, name='train_fn')
def train(self, savefile, task, recover=True):
    """
    Train the RNN.

    Builds the initial weights, the symbolic Theano graph of the network,
    the loss and regularization terms and the gradient/validation datasets,
    prints the collected settings and hands everything to ``SGD.train``.

    Parameters
    ----------

    savefile : str
               Passed to ``SGD.train``. If `recover` is `False` and the file
               exists, it is removed first.

    task : function
           Task specification passed to the ``Dataset`` constructors.

    recover : bool, optional
              If `True`, will attempt to recover from a previously saved run.

    Notes
    -----
    This method modifies ``self.p`` in place: in continuous mode it sets
    ``train_x0`` to `False` and ``n_gradient`` to 1, and when ``Crec`` is
    given its ``mask_fixed`` is rescaled together with the recurrent weights.

    """
    N     = self.p['N']
    Nin   = self.p['Nin']
    Nout  = self.p['Nout']
    alpha = self.p['dt']/self.p['tau']

    # Initialize settings
    settings = OrderedDict()

    # Check if file already exists
    if not recover:
        if os.path.isfile(savefile):
            os.remove(savefile)

    #---------------------------------------------------------------------------------
    # Are we using GPUs?
    #---------------------------------------------------------------------------------

    if theanotools.get_processor_type() == 'gpu':
        settings['GPU'] = 'enabled'
    else:
        settings['GPU'] = 'no'

    #---------------------------------------------------------------------------------
    # Random number generator
    #---------------------------------------------------------------------------------

    settings['init seed'] = self.p['seed']
    rng = np.random.RandomState(self.p['seed'])

    #---------------------------------------------------------------------------------
    # Weight initialization
    #---------------------------------------------------------------------------------

    settings['distribution (Win)']  = self.p['distribution_in']
    settings['distribution (Wrec)'] = self.p['distribution_rec']
    settings['distribution (Wout)'] = self.p['distribution_out']

    if Nin > 0:
        Win_0 = self.init_weights(rng, self.p['Cin'],
                                  N, Nin, self.p['distribution_in'])
    Wrec_0 = self.init_weights(rng, self.p['Crec'],
                               N, N, self.p['distribution_rec'])
    Wout_0 = self.init_weights(rng, self.p['Cout'],
                               Nout, N, self.p['distribution_out'])

    #---------------------------------------------------------------------------------
    # Enforce Dale's law on the initial weights
    #---------------------------------------------------------------------------------

    settings['Nin/N/Nout'] = '{}/{}/{}'.format(Nin, N, Nout)

    if self.p['ei'] is not None:
        Nexc = len(np.where(self.p['ei'] > 0)[0])
        Ninh = len(np.where(self.p['ei'] < 0)[0])
        settings['Dale\'s law'] = 'E/I = {}/{}'.format(Nexc, Ninh)

        if Nin > 0:
            Win_0 = abs(Win_0) # If Dale, assume inputs are excitatory
        Wrec_0 = abs(Wrec_0)
        Wout_0 = abs(Wout_0)
    else:
        settings['Dale\'s law'] = 'no'

    #---------------------------------------------------------------------------------
    # Fix spectral radius
    #---------------------------------------------------------------------------------

    # Compute spectral radius of the effective recurrent matrix
    # (plastic + fixed parts, with E/I signs applied if present)
    C = self.p['Crec']
    if C is not None:
        Wrec_0_full = C.mask_plastic*Wrec_0 + C.mask_fixed
    else:
        Wrec_0_full = Wrec_0
    if self.p['ei'] is not None:
        Wrec_0_full = Wrec_0_full*self.p['ei']
    rho = RNN.spectral_radius(Wrec_0_full)

    # Scale Wrec to have fixed spectral radius
    if self.p['ei'] is not None:
        R = self.p['rho0']/rho
    else:
        R = 1.1/rho
    Wrec_0 *= R
    if C is not None:
        # In-place: this rescales the mask stored in self.p['Crec'] as well
        C.mask_fixed *= R

    # Check spectral radius
    if C is not None:
        Wrec_0_full = C.mask_plastic*Wrec_0 + C.mask_fixed
    else:
        Wrec_0_full = Wrec_0
    if self.p['ei'] is not None:
        Wrec_0_full = Wrec_0_full*self.p['ei']
    rho = RNN.spectral_radius(Wrec_0_full)
    settings['initial spectral radius'] = '{:.2f}'.format(rho)

    #---------------------------------------------------------------------------------
    # Others
    #---------------------------------------------------------------------------------

    brec_0 = self.p['brec']*np.ones(N)
    bout_0 = self.p['bout']*np.ones(Nout)
    x0_0   = self.p['x0']*np.ones(N)

    #---------------------------------------------------------------------------------
    # RNN parameters
    #---------------------------------------------------------------------------------

    if Nin > 0:
        Win = theanotools.shared(Win_0, name='Win')
    else:
        Win = None
    Wrec = theanotools.shared(Wrec_0, name='Wrec')
    Wout = theanotools.shared(Wout_0, name='Wout')
    brec = theanotools.shared(brec_0, name='brec')
    bout = theanotools.shared(bout_0, name='bout')
    x0   = theanotools.shared(x0_0,   name='x0')

    #---------------------------------------------------------------------------------
    # Parameters to train
    #---------------------------------------------------------------------------------

    trainables = []
    if Win is not None:
        trainables += [Win]
    trainables += [Wrec]
    if Wout is not None:
        trainables += [Wout]

    if self.p['train_brec']:
        settings['train recurrent bias'] = 'yes'
        trainables += [brec]
    else:
        settings['train recurrent bias'] = 'no'

    if self.p['train_bout']:
        settings['train output bias'] = 'yes'
        trainables += [bout]
    else:
        settings['train output bias'] = 'no'

    # In continuous mode it doesn't make sense to train x0, which is forgotten
    if self.p['mode'] == 'continuous':
        self.p['train_x0'] = False

    if self.p['train_x0']:
        settings['train initial conditions'] = 'yes'
        trainables += [x0]
    else:
        settings['train initial conditions'] = 'no'

    #---------------------------------------------------------------------------------
    # Weight matrices
    #---------------------------------------------------------------------------------

    # Input
    if Nin > 0:
        if self.p['Cin'] is not None:
            C = self.p['Cin']
            settings['sparseness (Win)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                            .format(C.p, C.p_plastic))

            Cin_mask_plastic = theanotools.shared(C.mask_plastic)
            Cin_mask_fixed   = theanotools.shared(C.mask_fixed)

            Win_ = Cin_mask_plastic*Win + Cin_mask_fixed
            Win_.name = 'Win_'
        else:
            Win_ = Win

    # Recurrent
    if self.p['Crec'] is not None:
        C = self.p['Crec']
        settings['sparseness (Wrec)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                         .format(C.p, C.p_plastic))

        Crec_mask_plastic = theanotools.shared(C.mask_plastic)
        Crec_mask_fixed   = theanotools.shared(C.mask_fixed)

        Wrec_ = Crec_mask_plastic*Wrec + Crec_mask_fixed
        Wrec_.name = 'Wrec_'
    else:
        Wrec_ = Wrec

    # Output
    if self.p['Cout'] is not None:
        C = self.p['Cout']
        settings['sparseness (Wout)'] = ('p = {:.2f}, p_plastic = {:.2f}'
                                         .format(C.p, C.p_plastic))

        Cout_mask_plastic = theanotools.shared(C.mask_plastic)
        Cout_mask_fixed   = theanotools.shared(C.mask_fixed)

        Wout_ = Cout_mask_plastic*Wout + Cout_mask_fixed
        Wout_.name = 'Wout_'
    else:
        Wout_ = Wout

    #---------------------------------------------------------------------------------
    # Dale's law
    #---------------------------------------------------------------------------------

    if self.p['ei'] is not None:
        # Function to keep matrix elements positive
        if self.p['ei_positive_func'] == 'abs':
            settings['E/I positivity function'] = 'absolute value'
            make_positive = abs
        elif self.p['ei_positive_func'] == 'rectify':
            settings['E/I positivity function'] = 'rectify'
            make_positive = theanotools.rectify
        else:
            raise ValueError("Unknown ei_positive_func.")

        # Assume inputs are excitatory
        if Nin > 0:
            Win_ = make_positive(Win_)

        # E/I
        ei    = theanotools.shared(self.p['ei'], name='ei')
        Wrec_ = make_positive(Wrec_)*ei
        Wout_ = make_positive(Wout_)*ei

    #---------------------------------------------------------------------------------
    # Variables to save
    #---------------------------------------------------------------------------------

    if Nin > 0:
        save_values = [Win_]
    else:
        save_values = [None]
    save_values += [Wrec_, Wout_, brec, bout, x0]

    #---------------------------------------------------------------------------------
    # Activation functions
    #---------------------------------------------------------------------------------

    f_hidden, d_f_hidden = theanotools.hidden_activations[self.p['hidden_activation']]
    settings['hidden activation'] = self.p['hidden_activation']

    # The loss is chosen to match the output activation
    act = self.p['output_activation']
    f_output = theanotools.output_activations[act]

    if act == 'sigmoid':
        settings['output activation/loss'] = 'sigmoid/binary cross entropy'
        f_loss = theanotools.binary_crossentropy
    elif act == 'softmax':
        settings['output activation/loss'] = 'softmax/categorical cross entropy'
        f_loss = theanotools.categorical_crossentropy
    else:
        settings['output activation/loss'] = act + '/squared'
        f_loss = theanotools.L2

    #---------------------------------------------------------------------------------
    # RNN
    #---------------------------------------------------------------------------------

    # Dims: time, trials, units
    # u[:,:,:Nin] contains the inputs (including baseline and noise),
    # u[:,:,Nin:] contains the recurrent noise
    u   = T.tensor3('u')
    x0_ = T.alloc(x0, u.shape[1], x0.shape[0])

    if Nin > 0:
        def rnn(u_t, x_tm1, r_tm1, WinT, WrecT):
            x_t = ((1 - alpha)*x_tm1
                   + alpha*(T.dot(r_tm1, WrecT)        # Recurrent
                            + brec                     # Bias
                            + T.dot(u_t[:,:Nin], WinT) # Input
                            + u_t[:,Nin:])             # Recurrent noise
                   )
            r_t = f_hidden(x_t)

            return [x_t, r_t]

        [x, r], _ = theano.scan(fn=rnn,
                                outputs_info=[x0_, f_hidden(x0_)],
                                sequences=u,
                                non_sequences=[Win_.T, Wrec_.T])
    else:
        def rnn(u_t, x_tm1, r_tm1, WrecT):
            x_t = ((1 - alpha)*x_tm1
                   + alpha*(T.dot(r_tm1, WrecT) # Recurrent
                            + brec              # Bias
                            + u_t[:,Nin:])      # Recurrent noise
                   )
            r_t = f_hidden(x_t)

            return [x_t, r_t]

        [x, r], _ = theano.scan(fn=rnn,
                                outputs_info=[x0_, f_hidden(x0_)],
                                sequences=u,
                                non_sequences=[Wrec_.T])

    #---------------------------------------------------------------------------------
    # Running mode
    #---------------------------------------------------------------------------------

    if self.p['mode'] == 'continuous':
        settings['mode'] = 'continuous'

        if self.p['n_gradient'] != 1:
            print("[ Trainer.train ] In continuous mode,"
                  " so we're setting n_gradient to 1.")
            self.p['n_gradient'] = 1

        # NOTE(review): x0_ is not read again within this method after this
        # assignment -- confirm whether it is still needed.
        x0_ = x[-1]
    else:
        settings['mode'] = 'batch'

    #---------------------------------------------------------------------------------
    # Readout
    #---------------------------------------------------------------------------------

    z = f_output(T.dot(r, Wout_.T) + bout)

    #---------------------------------------------------------------------------------
    # Deduce whether the task specification contains an output mask -- use a
    # temporary dataset so it doesn't affect the training.
    #---------------------------------------------------------------------------------

    dataset = Dataset(1, task, self.floatX, self.p, name='gradient')
    if dataset.has_output_mask():
        settings['output mask'] = 'yes'
    else:
        settings['output mask'] = 'no'

    #---------------------------------------------------------------------------------
    # Loss
    #---------------------------------------------------------------------------------

    # (time, trials, outputs)
    target = T.tensor3('target')

    # Set mask
    mask     = target[:,:,Nout:]
    masknorm = T.sum(mask)

    # Input-output pairs
    inputs = [u, target]
    # target[:,:,:Nout] contains the target outputs, &
    # target[:,:,Nout:] contains the mask.

    # Loss, not including the regularization terms
    loss = T.sum(f_loss(z, target[:,:,:Nout])*mask)/masknorm

    # Root-mean-squared error
    error = T.sqrt(T.sum(theanotools.L2(z, target[:,:,:Nout])*mask)/masknorm)

    #---------------------------------------------------------------------------------
    # Regularization terms
    #---------------------------------------------------------------------------------

    regs = 0

    #---------------------------------------------------------------------------------
    # L1 weight regularization
    #---------------------------------------------------------------------------------

    # Win only exists when the network has inputs; same guard as the L2
    # term below (without it, abs(None) would raise for Nin == 0).
    if Nin > 0:
        lambda1 = self.p['lambda1_in']
        if lambda1 > 0:
            settings['L1 weight regularization (Win)'] = ('lambda1_in = {}'
                                                          .format(lambda1))
            regs += lambda1 * T.mean(abs(Win))

    lambda1 = self.p['lambda1_rec']
    if lambda1 > 0:
        settings['L1 weight regularization (Wrec)'] = ('lambda1_rec = {}'
                                                       .format(lambda1))
        regs += lambda1 * T.mean(abs(Wrec))

    lambda1 = self.p['lambda1_out']
    if lambda1 > 0:
        settings['L1 weight regularization (Wout)'] = ('lambda1_out = {}'
                                                       .format(lambda1))
        regs += lambda1 * T.mean(abs(Wout))

    #---------------------------------------------------------------------------------
    # L2 weight regularization
    #---------------------------------------------------------------------------------

    if Nin > 0:
        lambda2 = self.p['lambda2_in']
        if lambda2 > 0:
            settings['L2 weight regularization (Win)'] = ('lambda2_in = {}'
                                                          .format(lambda2))
            regs += lambda2 * T.mean(Win**2)

    lambda2 = self.p['lambda2_rec']
    if lambda2 > 0:
        settings['L2 weight regularization (Wrec)'] = ('lambda2_rec = {}'
                                                       .format(lambda2))
        regs += lambda2 * T.mean(Wrec**2)

    lambda2 = self.p['lambda2_out']
    if lambda2 > 0:
        settings['L2 weight regularization (Wout)'] = ('lambda2_out = {}'
                                                       .format(lambda2))
        regs += lambda2 * T.mean(Wout**2)

    #---------------------------------------------------------------------------------
    # L2 rate regularization
    #---------------------------------------------------------------------------------

    lambda2 = self.p['lambda2_r']
    if lambda2 > 0:
        settings['L2 rate regularization'] = 'lambda2_r = {}'.format(lambda2)
        regs += lambda2 * T.mean(r**2)

    #---------------------------------------------------------------------------------
    # Final costs
    #---------------------------------------------------------------------------------

    costs = [loss, error]

    #---------------------------------------------------------------------------------
    # Datasets
    #---------------------------------------------------------------------------------

    gradient_data   = Dataset(self.p['n_gradient'], task, self.floatX, self.p,
                              batch_size=self.p['gradient_batch_size'],
                              seed=self.p['gradient_seed'],
                              name='gradient')
    validation_data = Dataset(self.p['n_validation'], task, self.floatX, self.p,
                              batch_size=self.p['validation_batch_size'],
                              seed=self.p['validation_seed'],
                              name='validation')

    # Input noise
    if np.isscalar(self.p['var_in']):
        if Nin > 0:
            settings['sigma_in'] = '{}'.format(np.sqrt(self.p['var_in']))
    else:
        settings['sigma_in'] = 'array'

    # Recurrent noise
    if np.isscalar(self.p['var_rec']):
        settings['sigma_rec'] = '{}'.format(np.sqrt(self.p['var_rec']))
    else:
        settings['sigma_rec'] = 'array'

    # Dataset settings
    settings['rectify inputs']            = self.p['rectify_inputs']
    settings['gradient minibatch size']   = gradient_data.minibatch_size
    settings['validation minibatch size'] = validation_data.minibatch_size

    #---------------------------------------------------------------------------------
    # Other settings
    #---------------------------------------------------------------------------------

    settings['dt'] = '{} ms'.format(self.p['dt'])
    if np.isscalar(self.p['tau']):
        settings['tau'] = '{} ms'.format(self.p['tau'])
    else:
        settings['tau'] = 'custom'
    settings['tau_in']            = '{} ms'.format(self.p['tau_in'])
    settings['learning rate']     = '{}'.format(self.p['learning_rate'])
    settings['lambda_Omega']      = '{}'.format(self.p['lambda_Omega'])
    settings['max gradient norm'] = '{}'.format(self.p['max_gradient_norm'])

    #---------------------------------------------------------------------------------
    # A few important Theano settings
    #---------------------------------------------------------------------------------

    settings['(Theano) floatX']   = self.floatX
    settings['(Theano) allow_gc'] = theano.config.allow_gc

    #---------------------------------------------------------------------------------
    # Train!
    #---------------------------------------------------------------------------------

    print_settings(settings)

    sgd = SGD(trainables, inputs, costs, regs, x, z, self.p, save_values,
              {'Wrec_': Wrec_, 'd_f_hidden': d_f_hidden})
    sgd.train(gradient_data, validation_data, savefile)
# PARAMETERS
memory_cell_size = 10
memory_cell_count = 2

# Two LSTM cells reading the same input x; their hidden states are
# concatenated before the output layer.
cell1 = LSTMCell(x, input_dim, memory_cell_size)
cell2 = LSTMCell(x, input_dim, memory_cell_size)

# hidden to output
wy = theano.shared(value=init_array((memory_cell_size*memory_cell_count, output_dim)), name='wy')
by = theano.shared(value=init_array(output_dim), name='by')

h1, c1 = cell1.forward_pass()
h2, c2 = cell2.forward_pass()

# Sigmoid of the affine read-out, summed over axis 0.
prediction = T.nnet.sigmoid(T.sum(T.dot(T.concatenate([h1, h2], axis=1), wy) + by, axis=0))
nll = T.mean(T.nnet.binary_crossentropy(prediction, y))

params = cell1.params + cell2.params + [wy, by]

# training: plain gradient descent with a fixed learning rate
lr = 0.005
dparams = T.grad(nll, params)
# Build the OrderedDict from (key, value) pairs so the update order follows
# `params`; going through a plain dict first would make the order arbitrary.
updates = OrderedDict((p, p - lr*dp) for p, dp in zip(params, dparams))

train = theano.function(inputs=[x, y], outputs=nll, updates=updates)
test = theano.function(inputs=[x, y], outputs=nll)
predict = theano.function(inputs=[x], outputs=prediction)

# number of training epochs, i.e., passes over training set.
epoch_count = 10
def main(n=6, num_epochs=30, model=None, **kwargs):
    """Train a network (or load saved weights) and report test metrics.

    When ``model`` is None the network is trained for ``num_epochs`` epochs
    and its weights are written to an ``.npz`` file; otherwise ``model`` is
    used as the path of an ``.npz`` weight file to load. In both cases the
    loss and accuracy on the test split are printed at the end.

    Args:
        n: only used in the name of the saved weight file for the
            Tiny ImageNet data set.
        num_epochs: number of passes over the training set.
        model: path of a saved ``.npz`` weight file, or None to train.
        **kwargs:
            - path: direct path to CIFAR-10 or TinyImageNet
              (default './cifar-10-batches-py')
            - data: "cifar-10" or "tiny-image-net" (default "cifar-10")
            - type: 'resnet', 'resfuse' or 'highway' (default 'resfuse')
            - subsample: sub-sampling ratio passed to the Tiny ImageNet
              loader (default 0.1); ignored for CIFAR-10

    Returns:
        None. If ``path`` does not exist a hint is printed and the function
        returns early without doing anything else.

    Raises:
        ValueError: if ``data`` or ``type`` is not one of the supported
            values.
    """
    # Unpack keyword arguments
    path = kwargs.pop('path', './cifar-10-batches-py')
    data_name = kwargs.pop('data', 'cifar-10')
    model_type = kwargs.pop('type', 'resfuse')

    # Check if the data set directory exists
    if not os.path.exists(path):
        print(
            "CIFAR-10 dataset can not be found. Please download the dataset "
            "from 'https://www.cs.toronto.edu/~kriz/cifar.html'."
        )
        print("Or download Tiny-imagenet-A :)")
        return

    # Load the dataset
    print("Loading data...")
    if data_name == 'cifar-10':
        data = load_data()
    elif data_name == 'tiny-image-net':
        sub_sample = kwargs.pop('subsample', 0.1)
        data = load_tiny_imagenet(path,
                                  sub_sample=sub_sample,
                                  subtract_mean=True,
                                  dtype=theano.config.floatX)
        # Expose the loader's keys under the names used below; the
        # validation split serves as the test split.
        data['X_test'] = data['X_val']
        data['Y_test'] = data['y_val']
        data['Y_train'] = data['y_train']
    else:
        # Fail with a clear message instead of a later TypeError on
        # subscripting None.
        raise ValueError(
            "data must be 'cifar-10' or 'tiny-image-net', got %r"
            % (data_name,))

    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    if model_type == 'resnet':
        # The plain ResNet is built by the resfuse builder with projection
        # switched off.
        network = build_resfuse_net(input_var, projection=False)
        print("ResNet")
    elif model_type == 'resfuse':
        network = build_resfuse_net(input_var, projection=True)
        print("ResFuse Net")
    elif model_type == 'highway':
        network = build_highway_net(input_var)
        print("Highway Net")
    else:
        raise ValueError(
            "model type must be from resnet, resfuse, highway")
    print("number of parameters in model: %d" %
          lasagne.layers.count_params(network, trainable=True))

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we
        # want to minimize (for our multi-class problem, it is the
        # cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(
            prediction, target_var)
        loss = loss.mean()

        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum. The learning rate
        # lives in a shared variable so it can be changed between epochs
        # without recompiling.
        params = lasagne.layers.get_all_params(network, trainable=True)
        lr = 0.1
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(loss,
                                           params,
                                           learning_rate=sh_lr,
                                           momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by
        # giving the updates dictionary) and returning the corresponding
        # training loss:
        train_fn = theano.function([input_var, target_var],
                                   loss,
                                   updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss, accuracy and
    # the raw predictions:
    val_fn = theano.function([input_var, target_var],
                             [test_loss, test_acc, test_prediction])

    if model is None:
        # launch the training loop
        print("Starting training...")
        training_start_time = time.time()
        best_val_acc = 0.0
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # shuffle training data
            train_indices = np.arange(X_train.shape[0])
            np.random.shuffle(train_indices)
            X_train = X_train[train_indices, :, :, :]
            Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(X_train,
                                             Y_train,
                                             128,
                                             shuffle=True,
                                             augment=True):
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                train_batches += 1

            # And a full pass over the validation data:
            val_err = 0
            val_acc = 0
            val_batches = 0
            top5accuracy = 0.0
            for batch in iterate_minibatches(X_test,
                                             Y_test,
                                             500,
                                             shuffle=False):
                inputs, targets = batch
                # Use a separate local name so the symbolic
                # ``test_prediction`` above is not overwritten by a batch
                # result.
                err, acc, batch_prediction = val_fn(inputs, targets)
                top5accuracy += topKAccuracy(batch_prediction, targets)
                val_err += err
                val_acc += acc
                val_batches += 1

            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}m".format(
                epoch + 1, num_epochs, (time.time() - start_time) / 60.0))
            print(" training loss:\t\t{:.6f}".format(train_err /
                                                     train_batches))
            print(" validation loss:\t\t{:.6f}".format(val_err /
                                                       val_batches))
            print(" validation accuracy:\t\t{:.2f} %".format(
                val_acc / val_batches * 100))
            print(" top 5 validation accuracy:\t\t{:.2f} %".format(
                top5accuracy / val_batches * 100))

            # Step schedule: divide the learning rate by 10 after epochs 40
            # and 70.
            if (epoch + 1) == 40 or (epoch + 1) == 70:
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

            # Track the best mean validation accuracy seen so far.
            if (val_acc / val_batches) > best_val_acc:
                best_val_acc = val_acc / val_batches

        # print out total training time
        print("Total training time: {:.3f}m".format(
            (time.time() - training_start_time) / 60.0))

        # dump the network weights to a file :
        if data_name == 'cifar-10':
            npz_file_name = 'cifar10_deep_residual_model.npz'
        else:
            npz_file_name = 'tiny_imagen_a_epochs_' + str(num_epochs) \
                + '_n_' + str(n) + "_" + time_string() + "_model.npz"
        np.savez(npz_file_name, *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file; np.savez stores positional
        # arrays under the keys 'arr_0', 'arr_1', ...
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    # Calculate validation error of model:
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, Y_test, 128, shuffle=False):
        inputs, targets = batch
        err, acc, _ = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print(" test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
extra_dims=1) actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1) returns_var = TT.vector('returns') # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the # distribution of the actions. For a Gaussian policy, it contains the mean and the logarithm of the standard deviation. dist_info_vars = policy.dist_info_sym(observations_var) # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class # rllab.distributions.DiagonalGaussian dist = policy.distribution # Note that we negate the objective, since most optimizers assume a minimization problem surr = -TT.mean( dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var) # Get the list of trainable parameters. params = policy.get_params(trainable=True) grads = theano.grad(surr, params) f_train = theano.function(inputs=[observations_var, actions_var, returns_var], outputs=None, updates=adam(grads, params, learning_rate=learning_rate), allow_input_downcast=True) for _ in xrange(n_itr): paths = []