def propagate(f, l, R, mu, eps):
    """Propagate label scores from labeled to unlabeled graph nodes.

    Computes ``f* = (L_UU + eps * I)^-1 W_UL f_L`` where ``W`` is the
    similarity matrix ``tensordot(R, mu)`` and ``L`` its graph Laplacian.

    Parameters
    ----------
    f : scores; only the entries selected by the labeled indices are read
    l : indicator; entries equal to 1 are labeled, entries equal to 0 unlabeled
    R : stack of similarity slices combined linearly by ``mu``
    mu : mixing weights for the slices in ``R``
    eps : regularisation added to the diagonal of ``L_UU`` before inversion

    Returns
    -------
    Symbolic expression for the propagated scores of the unlabeled examples.
    """
    similarity = T.tensordot(R, mu, axes=1)

    labeled_idx = T.eq(l, 1).nonzero()
    unlabeled_idx = T.eq(l, 0).nonzero()

    # Graph Laplacian: degree matrix minus similarity matrix.
    laplacian = T.diag(similarity.sum(axis=0)) - similarity

    # NOTE(review): indexing columns with the nonzero() tuple presumably adds
    # an extra axis, which the trailing [:, 0, :] strips again -- confirm.
    lap_uu = laplacian[unlabeled_idx][:, unlabeled_idx][:, 0, :]
    sim_ul = similarity[unlabeled_idx][:, labeled_idx][:, 0, :]

    # Regularise before inverting so the system stays well conditioned.
    regularised = lap_uu + eps * T.eye(lap_uu.shape[0])
    inverse = nlinalg.matrix_inverse(regularised)

    return inverse.dot(sim_ul.dot(f[labeled_idx]))
def ber(y, pred):
    """Return the balanced error rate of ``pred`` against ``y``.

    Label ``1`` is the positive class; every other value counts as negative.
    The result is the mean of the false-positive rate and the false-negative
    rate, built as a symbolic expression.
    """
    float_x = theano.config.floatX

    def count(mask):
        # Sum the 0/1 mask first, then cast the count to floatX.
        return tensor.cast(mask.sum(), dtype=float_x)

    real_pos, real_neg = tensor.eq(y, 1), tensor.neq(y, 1)
    said_pos, said_neg = tensor.eq(pred, 1), tensor.neq(pred, 1)

    true_neg = count(real_neg * said_neg)
    false_pos = count(real_neg * said_pos)
    false_neg = count(real_pos * said_neg)
    true_pos = count(real_pos * said_pos)

    false_pos_rate = false_pos / (true_neg + false_pos)
    false_neg_rate = false_neg / (false_neg + true_pos)
    return (false_pos_rate + false_neg_rate) / numpy.float32(2)
def errors(self, y, print_output=False):
    """Build symbolic error rate, precision, recall and F1 for label ``1``.

    Parameters
    ----------
    y : symbolic integer tensor of target labels, same ``ndim`` as ``self.y_pred``
    print_output : when true, print the symbolic expressions (not their
        values) at graph-construction time

    Returns
    -------
    list ``[error_rate, P, R, F1]`` of symbolic expressions.

    Raises
    ------
    TypeError : if ``y.ndim`` differs from ``self.y_pred.ndim``
    NotImplementedError : if ``y`` does not have an ``int*`` dtype
    """
    if y.ndim != self.y_pred.ndim:
        raise TypeError('y should have the same shape as self.y_pred',
                        ('y', y.type, 'y_pred', self.y_pred.type))
    if not y.dtype.startswith('int'):
        raise NotImplementedError()

    def count(mask):
        return T.cast(T.sum(mask), 'float64')

    num_positive = count(T.eq(y, 1))
    num_predicted_positive = count(T.eq(self.y_pred, 1))
    num_correctly_predicted = count(T.eq(self.y_pred * y, 1))

    zero = T.cast(0.0, 'float64')

    # A Python ``if`` on a symbolic comparison is decided when the graph is
    # built (and is always truthy), so the zero-denominator guards must be
    # expressed symbolically with T.switch to take effect at run time.
    # precision = true positive / (true positive + false positive)
    P = T.switch(T.gt(num_predicted_positive, 0.0),
                 T.cast(num_correctly_predicted / num_predicted_positive,
                        'float64'),
                 zero)
    # recall = true positive / (true positive + false negative)
    R = T.switch(T.gt(num_positive, 0.0),
                 T.cast(num_correctly_predicted / num_positive, 'float64'),
                 zero)
    # F1 score
    F1 = T.switch(T.gt(P + R, 0.0), 2.0 * P * R / (P + R), zero)

    if print_output:
        print(" num positive = {0}".format( num_positive ) )
        print(" num predicted positive = {0}".format( num_predicted_positive ) )
        print(" num correctly predicted = {0}".format( num_correctly_predicted ) )
        print(" precision = {0}".format(P))
        print(" recall = {0}".format(R))
        print(" F1 score = {0}".format(F1))

    return [T.mean(T.neq(self.y_pred, y)), P, R, F1]
def compile(self, optimizer, loss, class_mode='categorical'):
    """Build the symbolic training graph and compile the training functions.

    Stores the optimizer, the resolved loss, the symbolic input/output/target
    variables and the gradients on ``self``, then compiles ``self._train``
    (loss only) and ``self._train_with_acc`` (loss and accuracy).

    ``class_mode`` must be ``'categorical'`` or ``'binary'``; anything else
    raises ``Exception``.
    """
    self.optimizer = optimizer
    self.loss = objectives.get(loss)

    # Symbolic input, model output and target placeholder.
    self.X_train = self.get_input()
    self.y_train = self.get_output()
    self.y = T.zeros_like(self.y_train)

    train_loss = self.loss(self.y, self.y_train)

    if class_mode == 'categorical':
        matches = T.eq(T.argmax(self.y, axis=-1),
                       T.argmax(self.y_train, axis=-1))
    elif class_mode == 'binary':
        matches = T.eq(self.y, T.round(self.y_train))
    else:
        raise Exception("Invalid class mode: " + str(class_mode))
    train_accuracy = T.mean(matches)
    self.class_mode = class_mode

    self.grad = T.grad(cost=train_loss, wrt=self.params,
                       disconnected_inputs='raise')

    # NOTE(review): each parameter is moved by a random constant drawn once
    # at compile time; the gradient is computed but not used in the update,
    # and the optimizer's own get_updates is bypassed. Confirm this is
    # intentional.
    updates = [(param, param - random.uniform(-0.3, 1))
               for param, _ in zip(self.params, self.grad)]

    if type(self.X_train) == list:
        train_ins = self.X_train + [self.y]
    else:
        train_ins = [self.X_train, self.y]

    self._train = theano.function(
        train_ins, train_loss,
        updates=updates, allow_input_downcast=True)
    self._train_with_acc = theano.function(
        train_ins, [train_loss, train_accuracy],
        updates=updates, allow_input_downcast=True)
def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None):
    """Build train/test graphs with per-output weights and compile them.

    Sets ``self._train``, ``self._train_with_acc``, ``self._predict``,
    ``self._test`` and ``self._test_with_acc``. Regularizers are applied to
    the training loss only. ``class_mode`` must be ``"categorical"`` or
    ``"binary"``; anything else raises ``Exception``.
    """
    self.optimizer = optimizers.get(optimizer)
    self.loss = objectives.get(loss)
    weighted_loss = weighted_objective(objectives.get(loss))

    # Model inputs and outputs in training and test configuration.
    self.X_train = self.get_input(train=True)
    self.X_test = self.get_input(train=False)
    self.y_train = self.get_output(train=True)
    self.y_test = self.get_output(train=False)

    # Target and weight placeholders shaped like the training output.
    self.y = T.zeros_like(self.y_train)
    self.weights = T.ones_like(self.y_train)

    train_loss = weighted_loss(self.y, self.y_train, self.weights)
    test_loss = weighted_loss(self.y, self.y_test, self.weights)

    train_loss.name = 'train_loss'
    test_loss.name = 'test_loss'
    self.y.name = 'y'

    if class_mode == "categorical":
        target_class = T.argmax(self.y, axis=-1)
        train_accuracy = T.mean(
            T.eq(target_class, T.argmax(self.y_train, axis=-1)))
        test_accuracy = T.mean(
            T.eq(T.argmax(self.y, axis=-1), T.argmax(self.y_test, axis=-1)))
    elif class_mode == "binary":
        train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
        test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)))
    else:
        raise Exception("Invalid class mode:" + str(class_mode))
    self.class_mode = class_mode
    self.theano_mode = theano_mode

    for regularizer in self.regularizers:
        train_loss = regularizer(train_loss)
    updates = self.optimizer.get_updates(
        self.params, self.constraints, train_loss)

    extra_ins = [self.y, self.weights]
    if type(self.X_train) == list:
        train_ins = self.X_train + extra_ins
        test_ins = self.X_test + extra_ins
        predict_ins = self.X_test
    else:
        train_ins = [self.X_train] + extra_ins
        test_ins = [self.X_test] + extra_ins
        predict_ins = [self.X_test]

    self._train = theano.function(
        train_ins, train_loss, updates=updates,
        allow_input_downcast=True, mode=theano_mode)
    self._train_with_acc = theano.function(
        train_ins, [train_loss, train_accuracy], updates=updates,
        allow_input_downcast=True, mode=theano_mode)
    self._predict = theano.function(
        predict_ins, self.y_test,
        allow_input_downcast=True, mode=theano_mode)
    self._test = theano.function(
        test_ins, test_loss,
        allow_input_downcast=True, mode=theano_mode)
    self._test_with_acc = theano.function(
        test_ins, [test_loss, test_accuracy],
        allow_input_downcast=True, mode=theano_mode)
def pass_edges(input_idx_t, edge_t, edge_mask_t, counter_t, h_tm1, c_tm1, x):
    """Process one edge: attend over stored states and write a new LSTM state.

    Reads the attention and gate parameters from the enclosing ``self``.
    Returns copies of ``h_tm1`` and ``c_tm1`` with column ``counter_t``
    replaced by the newly computed hidden and cell vectors.
    """
    # True when no entry of the mask is set for this step.
    mask_is_empty = T.eq(T.sum(edge_mask_t), 0)

    # Source input vector; zeroed unless the mask is empty (leaf node).
    source_vec = x[input_idx_t, :]
    leaf_input = T.switch(mask_is_empty, source_vec, source_vec * 0)
    step_input = T.concatenate([leaf_input, edge_t])

    # Manual masked softmax over the attention scores.
    scores = T.dot(self.v_a, T.tanh(T.dot(self.W_h_a, h_tm1)))
    # Maximum over the unmasked scores only.
    top_score = T.max(scores + edge_mask_t * 10000.0) - 10000.0
    # Mask inside the exp to avoid inf, and outside to drop masked entries.
    unnormalised = T.exp((scores - top_score) * edge_mask_t) * edge_mask_t
    # Add one to the denominator when the mask is empty to avoid dividing
    # by zero.
    normaliser = T.sum(unnormalised) + T.switch(mask_is_empty, 1.0, 0.0)
    weighted_mask = unnormalised / normaliser

    def preactivation(W_x, W_h, b):
        # Input contribution + attention-weighted recurrent contribution + bias.
        recurrent = T.sum(T.dot(W_h.T, (weighted_mask * h_tm1)).T, axis=0)
        return T.dot(step_input, W_x) + recurrent + b

    i_t = T.nnet.sigmoid(preactivation(self.W_x_i, self.W_h_i, self.b_h_i))
    f_t = T.nnet.sigmoid(preactivation(self.W_x_f, self.W_h_f, self.b_h_f))
    o_t = T.nnet.sigmoid(preactivation(self.W_x_o, self.W_h_o, self.b_h_o))
    u_t = T.tanh(preactivation(self.W_x_u, self.W_h_u, self.b_h_u))

    c_new = i_t * u_t + f_t * T.sum((weighted_mask * c_tm1).T, axis=0)
    h_new = o_t * T.tanh(c_new)

    h_t = T.set_subtensor(h_tm1[:, counter_t], h_new)
    c_t = T.set_subtensor(c_tm1[:, counter_t], c_new)
    return h_t, c_t
def AdaMaxAvg2(ws, objective, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, n_accum=1):
    """AdaMax with parameter averaging and gradient accumulation.

    Gradients are summed over ``n_accum`` consecutive calls; the moment,
    max-norm, parameter and running-average updates are applied only on the
    last call of each group. With ``n_accum == 1`` this delegates to
    ``AdaMaxAvg``.

    Returns
    -------
    (new, ws_avg) : the update dictionary and a list of dicts of shared
        variables holding the averaged parameters, parallel to ``ws``.
    """
    if n_accum == 1:
        return AdaMaxAvg(ws, objective, alpha, beta1, beta2, beta3)

    banner = ['AdaMax_Avg2', 'alpha:', alpha, 'beta1:', beta1,
              'beta2:', beta2, 'beta3:', beta3, 'n_accum:', n_accum]
    print(' '.join(str(item) for item in banner))

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='raise')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1

    # First / last call of each accumulation group.
    reset = T.eq(T.mod(it, n_accum), 0)
    update = T.eq(T.mod(it, n_accum), n_accum - 1)

    ws_avg = []
    for j, w_group in enumerate(ws):
        avg_group = {}
        for key in w_group:
            param = w_group[key]
            grad = gs[j][key]

            def fresh_zeros():
                # A separate zero buffer per shared variable.
                return G.sharedf(param.get_value() * 0.)

            mom1 = fresh_zeros()
            max_norm = fresh_zeros()
            avg_group[key] = G.sharedf(param.get_value())
            g_sum = fresh_zeros()

            new[g_sum] = ifelse(reset, grad, g_sum + grad)
            new[mom1] = ifelse(
                update, (1 - beta1) * mom1 + beta1 * new[g_sum], mom1)
            new[max_norm] = ifelse(
                update,
                T.maximum((1 - beta2) * max_norm, abs(new[g_sum]) + 1e-8),
                max_norm)
            # The step is added, i.e. the objective is ascended.
            new[param] = ifelse(
                update, param + alpha * new[mom1] / new[max_norm], param)
            new[avg_group[key]] = ifelse(
                update,
                beta3 * new[param] + (1. - beta3) * avg_group[key],
                avg_group[key])

        ws_avg += [avg_group]
    return new, ws_avg
def more_complex_test():
    """Check a nested ifelse graph against an op that must not be evaluated.

    When ``theano.config.vm.lazy`` is ``False`` the call is expected to raise
    ``NotImplementedOp.E``; otherwise the result must equal ``20.5``.
    """
    notimpl = NotImplementedOp()
    ifelseifelseif = IfElseIfElseIf()

    x1 = T.scalar('x1')
    x2 = T.scalar('x2')
    c1 = T.scalar('c1')
    c2 = T.scalar('c2')

    t1 = ifelse(c1, x1, notimpl(x2))
    t1.name = 't1'
    t2 = t1 * 10
    t2.name = 't2'
    t3 = ifelse(c2, t2, x1 + t1)
    t3.name = 't3'
    t4 = ifelseifelseif(T.eq(x1, x2), x1, T.eq(x1, 5), x2, c2, t3, t3 + 0.5)
    t4.name = 't4'

    f = function([c1, c2, x1, x2], t4,
                 mode=Mode(linker='vm', optimizer='fast_run'))

    def run():
        # Fresh argument array for every invocation.
        return f(1, 0, numpy.array(10, dtype=x1.dtype), 0)

    if theano.config.vm.lazy is False:
        try:
            run()
            assert False
        except NotImplementedOp.E:
            pass
    else:
        print(run())
        assert run() == 20.5
    print('... passed')
def getRpRnTpTnForTrain0OrVal1(self, y, training0OrValidation1):
    """Build per-class counts of real/true-predicted positives and negatives.

    For every class ``0 .. self._numberOfOutputClasses - 1`` (in that order)
    four symbolic sums are appended to the result: real positives, real
    negatives, true predicted positives, true predicted negatives.

    ``training0OrValidation1 == 0`` uses ``self.y_pred_train``; any other
    value uses ``self.y_pred_val``. The original comment documents ``y`` as
    ``T.itensor4('y')`` with dimensions [batchSize, r, c, z].
    """
    is_training = training0OrValidation1 == 0
    yPredToUse = self.y_pred_train if is_training else self.y_pred_val
    checkDimsOfYpredAndYEqual(
        y, yPredToUse, "training" if is_training else "validation")

    counts = []
    for class_i in range(0, self._numberOfOutputClasses):
        real_pos = T.eq(y, class_i)
        real_neg = T.neq(y, class_i)
        pred_pos = T.eq(yPredToUse, class_i)
        pred_neg = T.neq(yPredToUse, class_i)

        true_pos = T.and_(real_pos, pred_pos)
        true_neg = T.and_(real_neg, pred_neg)

        # Order matters: RP, RN, TP, TN for each class.
        for mask in (real_pos, real_neg, true_pos, true_neg):
            counts.append(T.sum(mask))

    return counts
def prepare():
    """Build the symbolic network, loss, prediction and accuracy.

    Reads the module-level ``args`` mapping: the key ``"regression"`` selects
    the squared-error network, otherwise the logistic network with the hybrid
    loss parameterised by ``args["a"]`` and ``args["b"]`` is used.

    Returns a ``Container`` wrapping the symbolic variables and expressions.
    """
    X = T.fmatrix('X')
    y = T.ivector('y')

    use_regression = "regression" in args
    assert not (use_regression and "logistic" in args)

    output_layer = squared_error_net_adaptive() if use_regression else logistic()
    all_params = lasagne.layers.get_all_params(output_layer)

    if use_regression:
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = squared_error(prob_vector, y).mean()
        # Round to the nearest class index and clamp into the valid range.
        highest_class = args["num_classes"] - 1
        pred = T.maximum(0, T.minimum(T.round(prob_vector), highest_class))
    else:
        loss_fn = get_hybrid_loss(args["a"], args["b"])
        prob_vector = lasagne.layers.get_output(output_layer, X)
        loss = loss_fn(prob_vector, y).mean()
        pred = T.argmax(prob_vector, axis=1)
    accuracy = T.mean(T.eq(pred, y))

    return Container({
        "X": X,
        "y": y,
        "output_layer": output_layer,
        "all_params": all_params,
        "loss": loss,
        "pred": pred,
        "accuracy": accuracy,
        "prob_vector": prob_vector,
    })
def get_action_results(self, last_states, actions, time_i):
    """Apply one action per batch row and build the resulting observation.

    The state is a per-row indicator of which actions were already taken.
    Rows whose ``self.end_action_id`` flag is already set are left unchanged.

    Returns ``(new_state, observation)`` where the observation concatenates,
    along axis 1: the ``self.joint_data`` entry for the chosen action, a flag
    telling whether the session has ended by now, and a one-hot encoding of
    the action.
    """
    prev_state = check_list(last_states)[0]
    chosen = check_list(actions)[0]

    rows = T.arange(chosen.shape[0])

    # Sessions that have not yet triggered the end action.
    still_active = T.eq(prev_state[:, self.end_action_id], 0)
    marked = T.set_subtensor(prev_state[rows, chosen], 1)
    new_state = T.switch(still_active.reshape([-1, 1]), marked, prev_state)

    ended = T.eq(new_state[:, self.end_action_id], 1)

    pieces = [
        self.joint_data[rows, chosen, None],
        ended.reshape([-1, 1]),
        T.extra_ops.to_one_hot(chosen, self.joint_data.shape[1]),
    ]
    observation = T.concatenate(pieces, axis=1)

    return new_state, observation
def unet_crossentropy_loss_sampled(y_true, y_pred):
    """Class-balanced sampled loss over positive and negative targets.

    Predictions are clipped to ``[epsilon, 1 - epsilon]`` and flattened.
    Equal numbers of positions with target ``1`` and target ``0`` are drawn
    (after a random shuffle via ``UNET.srng``), limited by the smaller class.

    Returns a symbolic scalar: the loss, or, when the loss is NaN at run
    time, the mean clipped prediction over the sampled positive positions.
    """
    epsilon = 1.0e-4
    y_pred_clipped = T.flatten(T.clip(y_pred, epsilon, 1.0 - epsilon))
    y_true = T.flatten(y_true)

    classPos = 1
    classNeg = 0
    indPos = T.eq(y_true, classPos).nonzero()[0]
    indNeg = T.eq(y_true, classNeg).nonzero()[0]

    # Shuffle each index set so truncation below yields a random sample.
    indPos = indPos[UNET.srng.permutation(n=indPos.shape[0])]
    indNeg = indNeg[UNET.srng.permutation(n=indNeg.shape[0])]

    # Take as many samples per class as the smaller class provides.
    n_samples = T.cast(T.min([indPos.shape[0], indNeg.shape[0]]),
                       dtype='int64')
    indPos = indPos[:n_samples]
    indNeg = indNeg[:n_samples]

    # NOTE(review): the negative term uses log(p) rather than log(1 - p),
    # so this is not a standard binary cross-entropy; kept as found --
    # confirm the intent.
    loss_vector = (-T.mean(T.log(y_pred_clipped[indPos]))
                   - T.mean(T.log(y_pred_clipped[indNeg])))
    # NOTE(review): clipping the loss to [epsilon, 1 - epsilon] saturates it
    # (zero gradient) once it exceeds the upper bound; kept as found.
    loss_vector = T.clip(loss_vector, epsilon, 1.0 - epsilon)
    average_loss = T.mean(loss_vector)

    # The NaN check has to be symbolic: a Python ``if`` on T.isnan(...) is
    # evaluated while building the graph and is always truthy, which made
    # the fallback replace the loss unconditionally.
    fallback = T.mean(y_pred_clipped[indPos])
    return T.switch(T.isnan(average_loss), fallback, average_loss)
def test_tt(self):
    """Draw 100 rejection samples conditioned on exactly five matches.

    The acceptance condition is that the number of positions where
    ``self.coin`` equals ``self.data`` is 5.
    """
    match_count = tensor.sum(tensor.eq(self.coin, self.data))
    sample, updates = rejection_sample(
        [self.fair_coin, ], tensor.eq(match_count, 5))
    sampler = theano.function([], sample, updates=updates)

    # TODO: this is super-slow, how can bher do this fast?
    for _ in range(100):
        print(sampler())
def get_monitoring_channels(self, model, data, **kwargs):
    """Return an ordered mapping of monitoring channels for the model.

    Channels cover the discriminator's behaviour on the raw inputs ("pure")
    and on the compressor's reconstructions ("fake"), plus the two
    objectives returned by ``self.get_objectives``. Predicted class index
    ``10`` is counted as "conviction".
    """
    X_pure, Y_pure = data
    # Test values for Theano's compute_test_value machinery.
    X_pure.tag.test_value = numpy.random.random(size=[5,784]).astype('float32')
    Y_pure.tag.test_value = numpy.random.randint(10,size=[5,1]).astype('int64')

    rval = OrderedDict()

    g = model.compressor
    d = model.discriminator

    def predicted_class(inputs):
        # Arg-max class as a column so it lines up with Y_pure.
        return T.argmax(d.fprop(inputs), axis=1).dimshuffle(0, 'x')

    def fraction(mask):
        return T.cast(mask.mean(), 'float32')

    yhat_pure = predicted_class(X_pure)
    yhat_reconstructed = predicted_class(g.reconstruct(X_pure))

    rval['conviction_pure'] = fraction(T.eq(yhat_pure, 10))
    rval['accuracy_pure'] = fraction(T.eq(yhat_pure, Y_pure))
    rval['inaccuracy_pure'] = 1 - rval['conviction_pure']-rval['accuracy_pure']

    rval['conviction_fake'] = fraction(T.eq(yhat_reconstructed, 10))
    rval['accuracy_fake'] = fraction(T.eq(yhat_reconstructed, Y_pure))
    rval['inaccuracy_fake'] = 1 - rval['conviction_fake']-rval['accuracy_fake']

    rval['discernment_pure'] = rval['accuracy_pure']+rval['inaccuracy_pure']
    rval['discernment_fake'] = rval['conviction_fake']
    rval['discernment'] = 0.5*(rval['discernment_pure']+rval['discernment_fake'])

    d_obj, g_obj = self.get_objectives(model, data)
    rval['objective_d'] = d_obj
    rval['objective_g'] = g_obj

    return rval
def __call__(self, input_):
    """Centre and scale ``input_`` using exponentially averaged statistics.

    Returns ``(outs, updates)``: ``outs`` is an ``OrderedDict`` with keys
    ``x``, ``x_centered``, ``m`` and ``var``; ``updates`` assigns the new
    running statistics to ``self.m`` and ``self.var``.
    """
    keep = np.float32(1.) - self.rate

    batch_mean = input_.mean()
    # NOTE(review): this is the standard deviation, yet it is stored in
    # ``self.var`` and square-rooted again below -- confirm whether
    # ``input_.var()`` was intended.
    batch_spread = input_.std()

    # A stored value of exactly 0 is treated as "not initialised yet".
    new_m = T.switch(T.eq(self.m, 0.),
                     batch_mean,
                     keep * self.m + self.rate * batch_mean)
    new_var = T.switch(T.eq(self.var, 0.),
                       batch_spread,
                       keep * self.var + self.rate * batch_spread)

    updates = [(self.m, new_m), (self.var, new_var)]

    # The divisor is floored at 1, so inputs are never scaled up.
    input_centered = (input_ - new_m) / T.maximum(1., T.sqrt(new_var))

    input_ = T.zeros_like(input_) + input_

    outs = OrderedDict(
        x=input_,
        x_centered=input_centered,
        m=new_m,
        var=new_var
    )
    return outs, updates
def beta_div(X, W, H, beta):
    """Compute beta divergence D(X|WH)

    Parameters
    ----------
    X : Theano tensor
        data
    W : Theano tensor
        Bases
    H : Theano tensor
        activation matrix
    beta : Theano scalar

    Returns
    -------
    div : Theano scalar
        beta divergence D(X|WH)

    Notes
    -----
    ``beta == 2`` gives half the squared Euclidean distance, ``beta == 0``
    the Itakura-Saito divergence and ``beta == 1`` the generalised
    Kullback-Leibler divergence. Every other value uses the general form
    ``(x^b + (b - 1) * y^b - b * x * y^(b - 1)) / (b * (b - 1))``.
    """
    # The approximation is computed as dot(H, W), as in the original code.
    approx = T.dot(H, W)

    euclidean = T.sum(1. / 2 * T.power(X - approx, 2))
    itakura_saito = T.sum(X / approx - T.log(X / approx) - 1)
    kullback_leibler = T.sum(
        T.mul(X, (T.log(X) - T.log(approx))) + approx - X)
    # The cross term is x * y^(beta - 1); raising the product (x * y) to
    # (beta - 1) only coincides with it for beta == 2.
    generic = T.sum(
        1. / (beta * (beta - 1.)) *
        (T.power(X, beta) +
         (beta - 1.) * T.power(approx, beta) -
         beta * T.mul(X, T.power(approx, (beta - 1)))))

    div = ifelse(
        T.eq(beta, 2),
        euclidean,
        ifelse(
            T.eq(beta, 0),
            itakura_saito,
            ifelse(
                T.eq(beta, 1),
                kullback_leibler,
                generic)))
    return div
def pp_errors(self, y, prob, ioi):
    """Count confident predictions of class ``ioi`` and how many are correct.

    An example is counted as a confident prediction when its probability for
    class ``ioi`` (column ``ioi`` of ``self.p_y_given_x``) exceeds ``prob``
    and ``self.y_pred`` equals ``ioi``.

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label
    :param prob: probability threshold applied to ``self.p_y_given_x``
    :param ioi: the class index of interest

    :returns: ``(ppn, predn)`` -- symbolic counts of confident predictions
              that are also correct, and of all confident predictions
    :raises TypeError: if ``y.ndim`` differs from ``self.y_pred.ndim``
    :raises NotImplementedError: if ``y`` does not have an ``int*`` dtype
    """
    # check if y has same dimension of y_pred
    if y.ndim != self.y_pred.ndim:
        # Report ``y.type``: the previous code referenced an undefined name
        # here, so this path raised NameError instead of TypeError.
        raise TypeError('y should have the same shape as self.y_pred',
                        ('y', y.type, 'y_pred', self.y_pred.type))
    # check if y is of the correct datatype
    if not y.dtype.startswith('int'):
        raise NotImplementedError()

    inprob = self.p_y_given_x[:, ioi]
    confident = T.gt(inprob, prob)
    predicted_ioi = T.eq(self.y_pred, ioi)
    actual_ioi = T.eq(y, ioi)

    ppn = T.sum(confident & predicted_ioi & actual_ioi)
    predn = T.sum(confident & predicted_ioi)
    return (ppn, predn)
def accuracy_metric(y_pred, y_true, void_labels, one_hot=False):
    """Symbolic accuracy that ignores positions labelled with a void class.

    Parameters
    ----------
    y_pred : 1-D class indices, or 2-D scores reduced with argmax over axis 1
    y_true : target class indices (or one-hot rows when ``one_hot`` is true)
    void_labels : iterable of label values excluded from the metric
    one_hot : whether ``y_true`` must be reduced with argmax over axis 1

    Returns
    -------
    Symbolic scalar: correct non-void predictions divided by the number of
    non-void positions.
    """
    assert (y_pred.ndim == 2) or (y_pred.ndim == 1)

    # y_pred to indices
    if y_pred.ndim == 2:
        y_pred = T.argmax(y_pred, axis=1)
    if one_hot:
        y_true = T.argmax(y_true, axis=1)

    # Compute accuracy
    acc = T.eq(y_pred, y_true).astype(_FLOATX)

    # Create mask. The mask is built elementwise: the earlier
    # ``if any(indices)`` test ran on symbolic variables, which are always
    # truthy in Python, so it never skipped anything.
    mask = T.ones_like(y_true, dtype=_FLOATX)
    for el in void_labels:
        mask = mask * T.neq(y_true, el)

    # Apply mask
    acc *= mask
    # NOTE(review): if every position is void the denominator is zero.
    acc = T.sum(acc) / T.sum(mask)

    return acc
def relevance_conv_a_b_abs(inputs, weights, out_relevances, a, b, bias=None):
    """Redistribute output relevance to the inputs of a convolution.

    Positive and negative weights are handled separately on the absolute
    inputs and combined as ``a * positive_part - b * negative_part``.
    ``a`` and ``b`` must satisfy ``a - b == 1``. ``bias`` is accepted but
    not used.
    """
    assert a is not None
    assert b is not None
    assert a - b == 1

    eps = 1e-4  # added to exact zeros to prevent division by zero
    abs_inputs = T.abs_(inputs)

    def flipped(w):
        # Swap the first two axes and flip the two trailing axes.
        return w.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1]

    # Contribution through the positive weights.
    weights_plus = weights * T.gt(weights, 0)
    plus_norm = conv2d(abs_inputs, weights_plus)
    plus_norm = plus_norm + T.eq(plus_norm, 0) * eps
    in_rel_plus = conv2d(out_relevances / plus_norm, flipped(weights_plus),
                         border_mode="full")
    in_rel_plus = in_rel_plus * abs_inputs

    # Contribution through the negative weights; signs are flipped so both
    # the normaliser and the result are positive before the subtraction.
    weights_neg = weights * T.lt(weights, 0)
    neg_norm = -conv2d(abs_inputs, weights_neg)
    neg_norm = neg_norm + T.eq(neg_norm, 0) * eps
    in_rel_neg = -conv2d(out_relevances / neg_norm, flipped(weights_neg),
                         border_mode="full")
    in_rel_neg = in_rel_neg * abs_inputs

    return a * in_rel_plus - b * in_rel_neg
def NLL(self, y, class_weights=None, example_weights=None, label_prop_thresh=None):
    """
    Returns the symbolic mean and instance-wise negative log-likelihood of
    the prediction of this model under a given target distribution.

    y: theano.tensor.TensorType
      corresponds to a vector that gives for each example the correct label.
      Labels < 0 are ignored (e.g. can be used for label propagation)
    class_weights: theano.tensor.TensorType
      weight vector of float32 of length ``n_lab``. Values: ``1.0``
      (default), ``w < 1.0`` (less important), ``w > 1.0`` (more important
      class)
    example_weights:
      accepted but not used by this implementation
    label_prop_thresh: float (0.5,1)
      This threshold allows unsupervised label propagation (only for examples
      with negative/ignore labels). If the predictive probability of the most
      likely class exceeds the threshold, this class is assumed to be the
      correct label and the training is pushed in this direction. Should only
      be used with pre-trained networks. The range is not validated here;
      only ``None`` disables the propagation block.

    Returns ``(nll, nll_inst)``: the normalised scalar loss and the
    per-example, per-class loss terms.
    """
    # NOTE: NaNs arise for pred values close to 0 or 1 because the log of the
    # whole pred tensor is taken before a mask selects the entries of
    # interest. Selecting by advanced indexing fails when the mask is all
    # zero, and multiplying by a 0-1 mask leaves NaN * 0 = NaN. Replacing the
    # NaNs with T.switch breaks the gradient. The practical remedy is to
    # stabilise the log: T.log(pred) --> T.log(pred + eps); the introduced
    # error is negligible.
    eps = 1e-6
    pred = self.class_probabilities  # predictive (bs, cl)
    y = y.dimshuffle(0, 'x')  # the labels (bs, 1)
    cls = T.arange(self.class_probabilities.shape[1]).dimshuffle('x', 0)  # available classes
    label_selection = T.eq(cls, y)  # selects correct labels
    if class_weights is None:
        class_weights = T.ones_like(pred)
    else:
        class_weights = class_weights.dimshuffle('x', 0)

    # Up vote block
    nll_inst_up = -T.log(pred + eps) * label_selection * class_weights
    N_up = T.sum(label_selection)  # number of labelled examples

    if label_prop_thresh is not None:  # Label propagation block
        above_thresh = pred > label_prop_thresh  # one for the confident class
        # One per example without a training label. keepdims keeps this a
        # (bs, 1) column so it broadcasts over classes; without it the (bs,)
        # vector is aligned with the class axis instead.
        has_no_label = 1 - label_selection.sum(axis=1, keepdims=True)
        prop_mask = above_thresh * has_no_label
        # Same eps stabilisation as the up vote block.
        nll_inst_up_prop = -T.log(pred + eps) * prop_mask * class_weights
        N_up_prop = prop_mask.sum()

        nll_inst_up += nll_inst_up_prop
        N_up += N_up_prop

    nll_inst = nll_inst_up
    # patch N to be not 0; when this is the case the sum is 0 anyway
    N_up = T.switch(T.eq(N_up, 0), 1, N_up)
    nll = nll_inst.sum() / N_up

    return nll, nll_inst
def jaccard_metric(y_pred, y_true, n_classes, one_hot=False):
    """Return per-class Jaccard numerators and denominators, stacked.

    Builds an ``n_classes x n_classes`` confusion matrix (rows: predicted
    class, columns: true class) and returns ``T.stack([TP, TP + FP + FN])``
    so that callers can accumulate both parts before dividing.
    """
    assert (y_pred.ndim == 2) or (y_pred.ndim == 1)

    # Reduce scores / one-hot rows to class indices.
    if y_pred.ndim == 2:
        y_pred = T.argmax(y_pred, axis=1)
    if one_hot:
        y_true = T.argmax(y_true, axis=1)

    # Confusion matrix filled one cell at a time.
    cm = T.zeros((n_classes, n_classes))
    for pred_class in range(n_classes):
        for true_class in range(n_classes):
            cell_count = T.sum(
                T.eq(y_pred, pred_class) * T.eq(y_true, true_class))
            cm = T.set_subtensor(cm[pred_class, true_class], cell_count)

    true_pos = T.cast(cm.diagonal(), _FLOATX)
    false_pos = cm.sum(1) - true_pos
    false_neg = cm.sum(0) - true_pos

    intersection = true_pos
    union = true_pos + false_pos + false_neg

    return T.stack([intersection, union], axis=0)
def build_model(self):
    """Compile and return ``(train_model, test_model)`` for the recurrent net.

    Both functions return ``[loss, last-step probabilities, error]``. The
    ``reset`` input selects what is stored as the next initial hidden state:
    a scan state when ``reset == 0``, otherwise ones.
    """
    print('\n... building the model with unroll=%d, backroll=%d'
          % (self.source.unroll, self.source.backroll))

    x = T.imatrix('x')
    y = T.imatrix('y')
    reset = T.scalar('reset')

    initial_hiddens = [h['init'] for h in self.hiddens.values()]
    outputs_info = [None] * 3 + initial_hiddens

    # The updates returned by scan are not used; they are replaced below.
    [losses, probs, errors, hids], _scan_updates = theano.scan(
        self.step, sequences=[x, y], outputs_info=outputs_info)

    loss = losses.sum()
    # Targets equal to 255 are excluded from the error normaliser.
    error = errors.sum() / T.cast((T.neq(y, 255).sum()), floatX)

    hidden_updates_train = []
    hidden_updates_test = []
    for h in self.hiddens.values():
        fresh_state = T.ones_like(h['init'])
        h_train = ifelse(T.eq(reset, 0),
                         hids[-1 - self.source.backroll, :],
                         fresh_state)
        h_test = ifelse(T.eq(reset, 0),
                        hids[-1, :],
                        T.ones_like(h['init']))
        hidden_updates_train.append((h['init'], h_train))
        hidden_updates_test.append((h['init'], h_test))

    updates = self.source.get_updates(loss, self.sgd_params)
    updates += hidden_updates_train

    rets = [loss, probs[-1, :], error]
    mode = theano.Mode(linker='cvm')

    train_model = theano.function(
        [x, y, reset, self.lr], rets, updates=updates, mode=mode)
    test_model = theano.function(
        [x, y, reset], rets, updates=hidden_updates_test, mode=mode)

    return train_model, test_model
def functions(network):
    """Compile training, validation and test functions for ``network``.

    Returns ``(training_function, validation_function, test_function)``.
    Training and testing use the non-deterministic forward pass; validation
    uses the deterministic one. Training applies Nesterov momentum with
    learning rate 0.001 and momentum 0.9.
    """
    # Symbolic variables
    X = T.tensor4()
    Y = T.ivector()

    def mean_crossentropy(probabilities):
        return T.mean(nn.objectives.categorical_crossentropy(
            predictions=probabilities, targets=Y))

    # Non-deterministic training
    parameters = nn.layers.get_all_params(layer=network, trainable=True)
    output = nn.layers.get_output(
        layer_or_layers=network, inputs=X, deterministic=False)
    prediction = output.argmax(-1)
    loss = mean_crossentropy(output)
    accuracy = T.mean(T.eq(prediction, Y))
    gradient = T.grad(cost=loss, wrt=parameters)
    update = nn.updates.nesterov_momentum(
        loss_or_grads=gradient, params=parameters,
        learning_rate=0.001, momentum=0.9)
    training_function = theano.function(
        inputs=[X, Y], outputs=[loss, accuracy], updates=update)

    # Non-deterministic testing
    test_function = theano.function(inputs=[X], outputs=prediction)

    # Deterministic validation
    det_output = nn.layers.get_output(
        layer_or_layers=network, inputs=X, deterministic=True)
    det_prediction = det_output.argmax(-1)
    det_loss = mean_crossentropy(det_output)
    det_accuracy = T.mean(T.eq(det_prediction, Y))
    validation_function = theano.function(
        inputs=[X, Y], outputs=[det_loss, det_accuracy])

    return training_function, validation_function, test_function
def multiclassRealPosAndNegAndTruePredPosNegTraining0OrValidation1(self, y, training0OrValidation1):
    """
    Build, for each class, the symbolic counts of real positives, real
    negatives, true predicted positives and true predicted negatives.

    The returned list holds four entries per class, in natural class order
    (class-0 RP,RN,TPP,TPN, class-1 RP,RN,TPP,TPN, ...).
    ``training0OrValidation1 == 0`` uses ``self.y_pred``; any other value
    uses ``self.y_pred_inference``.
    """
    counts = []

    for class_i in range(0, self.numberOfOutputClasses):
        if training0OrValidation1 == 0:
            yPredToUse = self.y_pred  # training
        else:
            yPredToUse = self.y_pred_inference  # validation

        real_pos = T.eq(y, class_i)
        real_neg = T.neq(y, class_i)
        pred_pos = T.eq(yPredToUse, class_i)
        pred_neg = T.neq(yPredToUse, class_i)

        true_pred_pos = T.and_(real_pos, pred_pos)
        true_pred_neg = T.and_(real_neg, pred_neg)

        counts.extend([
            T.sum(real_pos),
            T.sum(real_neg),
            T.sum(true_pred_pos),
            T.sum(true_pred_neg),
        ])

    return counts
def each_loss(outpt, inpt):
    """Loss of one sequence from a forward pass over a blank-padded label.

    ``inpt`` is the answer after being padded with blanks (blank index 26);
    ``outpt`` is indexed as ``outpt[:, labels]`` to pick the scores of the
    label symbols at every step.
    """
    blank = 26

    non_blank = T.neq(inpt, blank)
    n = T.dot(non_blank, non_blank)  # actual number of characters
    N = 2 * n + 1  # padded length, with redundant trailing padding removed
    labels = inpt[:N]

    padded = T.concatenate((labels, [blank, blank]))
    # A two-step skip is allowed where the symbols two apart differ and the
    # symbol in between is a blank.
    skip_allowed = T.neq(padded[:-2], padded[2:]) * T.eq(padded[1:-1], blank)
    transition = (T.eye(N) +
                  T.eye(N, k=1) +
                  T.eye(N, k=2) * skip_allowed.dimshuffle((0, 'x')))

    pred_y = outpt[:, labels]

    def forward_step(curr, accum):
        moved = T.dot(accum, transition)
        scaled = curr * moved
        # Where the scaled value is exactly zero, keep the unscaled one.
        return T.switch(T.eq(scaled, 0.0), moved, scaled)

    fwd_pbblts, _ = theano.scan(
        forward_step,
        sequences=[pred_y],
        outputs_info=[T.eye(N)[0]]
    )

    liklihood = fwd_pbblts[-1, -1] + fwd_pbblts[-1, -2]

    # Quadratic penalty when the value is <= 1, negative log otherwise.
    quadratic = 10 * (liklihood - 1) * (liklihood - 100)
    neg_log = -T.log(T.cast(liklihood, "float32"))
    loss = (T.le(liklihood, 1.0) * quadratic) + (T.gt(liklihood, 1.0) * neg_log)
    return loss
def compute_cost_log_in_parallel(original_rnn_outputs, labels, func, x_ends, y_ends):
    """Batched forward-pass cost computed in the log domain.

    ``func`` combines log-probabilities (it is called with the stacked
    candidates and axis ``0``, and once more on the final slice). Returns
    one cost per batch element, as produced by ``theano.map``.
    """
    # log(0) = -inf where the two-step transition is forbidden: the label is
    # zero or equals the label two positions away.
    label_is_zero = T.eq(labels, T.zeros_like(labels))
    label_repeats = T.eq(labels, shift_matrix(labels, 2))
    mask = T.log(1 - T.or_(label_is_zero, label_repeats))

    # All mass starts in the first position: log(1) = 0 there, log(0)
    # elsewhere.
    initial_state = T.log(T.zeros_like(labels))
    initial_state = T.set_subtensor(initial_state[:, 0], 0)

    def select_probabilities(rnn_outputs, label):
        return rnn_outputs[:, label]

    selected, _ = theano.map(select_probabilities,
                             [original_rnn_outputs, labels])
    log_outputs = T.log(selected.dimshuffle((1, 0, 2)))

    def forward_step(probabilities, last_probabilities):
        stay = last_probabilities + probabilities
        advance_one = log_shift_matrix(last_probabilities, 1) + probabilities
        advance_two = (log_shift_matrix(last_probabilities, 2)
                       + probabilities + mask)
        all_forward_probabilities = T.stack(stay, advance_one, advance_two)
        return func(all_forward_probabilities, 0)

    forward_probabilities, _ = theano.scan(
        fn=forward_step, sequences=log_outputs, outputs_info=initial_state)
    forward_probabilities = forward_probabilities.dimshuffle((1, 0, 2))

    def compute_cost(forward_probabilities, x_end, y_end):
        # Combine the last two positions at the final step.
        return -func(forward_probabilities[x_end-1, y_end-2:y_end])

    return theano.map(compute_cost,
                      [forward_probabilities, x_ends, y_ends])[0]
def chi2_test_statistic(M, Obs, K, num_M, num_Obs):
    """Return the G statistic for every (model row, observation row) pair.

    ``M`` holds model probabilities and ``Obs`` observed counts over ``K``
    categories; the result is reshaped to ``(num_M, num_Obs)``. Zeros are
    handled by treating ``log(0)`` as ``-INF`` (module-level constant).
    """
    # Frequencies from the observations.
    Ns = T.dot(Obs, T.ones((K, 1)))
    p = Obs / Ns

    def safe_log(values):
        # Replace log(0) by -INF without producing a NaN on the way.
        zeros = T.eq(values, 0)
        return zeros, T.log(values + zeros) - INF * zeros

    pZEROs, lnp = safe_log(p)
    mZEROs, lnM = safe_log(M)

    # Kronecker products pair every row of M with every row of p.
    O_ones = T.ones((num_Obs, 1))
    M_ones = T.ones((num_M, 1))
    klnM = kron(lnM, O_ones)
    klnP = kron(M_ones, lnp)
    log_ratio = klnP - klnM
    kObs = kron(M_ones, Obs)

    G = 2.0 * T.dot(log_ratio, kObs.T)

    # Keep only the diagonal (matching pairs), then sum each row.
    G = G * T.identity_like(G)
    G = T.dot(G, T.ones((num_M * num_Obs, 1)))
    G = T.reshape(G, (num_M, num_Obs))

    return G
def custom_svrg1(loss, params, m=100, learning_rate=0.01):
    """Build SVRG-style updates with a snapshot every ``m`` iterations.

    Parameters
    ----------
    loss : symbolic scalar to minimise
    params : list of shared variables
    m : number of iterations between snapshots
    learning_rate : step size

    Returns
    -------
    OrderedDict mapping shared variables to their update expressions.
    """
    grads = theano.grad(loss, params)
    updates = OrderedDict()

    # Iteration counter; ``it`` is the 1-based index of the current step.
    it_num = theano.shared(np.cast['int16'](0.))
    it = it_num + 1

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)

        def zeros_like_param():
            return theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)

        mu = zeros_like_param()
        grad_w_tilde = zeros_like_param()
        new_grad_w_tilde = theano.ifelse.ifelse(T.eq(it, m),
                                                grad,
                                                grad_w_tilde)
        mu_acc = zeros_like_param()

        updates[param] = param - learning_rate * (grad - grad_w_tilde + mu)
        updates[grad_w_tilde] = new_grad_w_tilde

        updates[mu] = theano.ifelse.ifelse(T.eq(T.mod(it, m), 0),
                                           mu_acc,
                                           mu)
        updates[mu_acc] = theano.ifelse.ifelse(T.eq(T.mod(it, m), 0),
                                               0*mu_acc,
                                               mu_acc + grad)

    # Advance the counter and wrap it at m. Storing the constant ``m`` here
    # (as before) pinned ``it`` at m + 1, so neither snapshot condition
    # above could ever hold again.
    updates[it_num] = theano.ifelse.ifelse(T.eq(it, m),
                                           np.cast['int16'](1),
                                           it)

    return updates
def form_dataset(doc, n_in):
    """
    Given a document and the number of input units, return the vector form
    of the document segmented into units of length (n_in + 1)

    :param doc: String : Location of doc.
    :param n_in: Number of input units of the TreeLSTM
    :return: ``(shared_x_, shared_y_)`` -- the input vectors and the target
             vectors, each wrapped by ``assert_op`` with its expected shape
    """
    print('Calling form_dataset()..')

    # Context manager so the file is closed even if tokenising fails.
    with open(doc) as doc_obj:
        data = tokenize(doc_obj.read().lower())

    # Drop the trailing tokens that do not fill a whole unit. Floor division
    # keeps the counts integral on both Python 2 and Python 3.
    unit_len = n_in + 1
    n_sen = len(data) // unit_len
    data = data[:n_sen * unit_len]

    units = np.asarray(data).reshape((n_sen, unit_len))
    data_x, data_y = units[:, :n_in], units[:, -1]

    data_x_vec = np.asarray(
        [sentence_vec(sentence, word_vecs) for sentence in data_x],
        dtype=theano.config.floatX)

    shared_x = theano.shared(np.concatenate(data_x_vec, axis=1),
                             name='vec_data_x', borrow=True)
    shared_x_ = assert_op(
        shared_x,
        T.eq(shared_x.get_value().shape[0], vec_dims),
        T.eq(shared_x.get_value().shape[1], n_sen*n_in))

    shared_y = theano.shared(
        np.asarray(sentence_vec(data_y, word_vecs),
                   dtype=theano.config.floatX),
        name='vec_data_y', borrow=True)
    shared_y_ = assert_op(
        shared_y,
        T.eq(shared_y.get_value().shape[0], vec_dims),
        T.eq(shared_y.get_value().shape[1], n_sen))

    return shared_x_, shared_y_
def build_model(shared_params, options, other_params):
    """
    Build the complete neural network model and return the symbolic variables.

    :return: ``(x, y1, y2, f_pred_prob, f_pred, cost, f_model_cell_output)``
             where the ``f_*`` entries are compiled Theano functions of ``x``.
    """
    # symbolic variables
    x = tensor.matrix(name="x", dtype=floatX)
    y1 = tensor.iscalar(name="y1")
    y2 = tensor.iscalar(name="y2")

    # lstm cell: hidden state and cell state
    (ht, ct) = lstm_cell(x, shared_params, options, other_params)

    # softmax 1 i.e. frame type prediction
    logits = (tensor.dot(shared_params['softmax1_W'], ht).transpose()
              + shared_params['softmax1_b'])
    frame_pred = tensor.nnet.softmax(logits)

    # predicted probability for frame type
    f_pred_prob = theano.function([x], frame_pred, name="f_pred_prob")
    # predicted frame type
    f_pred = theano.function([x], frame_pred.argmax(), name="f_pred")

    def _neg_log_prob(column):
        # Negative log of one predicted probability, offset before the log.
        return -tensor.log(frame_pred[0, column] + options['log_offset'])

    # cost: selected by the value of y1, built from the innermost branch out
    fallback_cost = tensor.abs_(tensor.log(y1))
    cost_if_3 = ifelse(tensor.eq(y1, 3), _neg_log_prob(2), fallback_cost)
    cost_if_2 = ifelse(tensor.eq(y1, 2),
                       _neg_log_prob(1) * other_params['end_cost_factor'],
                       cost_if_3)
    cost = ifelse(tensor.eq(y1, 1),
                  _neg_log_prob(0) * other_params['begin_cost_factor'],
                  cost_if_2,
                  name='ifelse_cost')

    # function for output of the current lstm cell and softmax prediction
    f_model_cell_output = theano.function([x], (ht, ct, frame_pred),
                                          name="f_model_cell_output")

    # return the model symbolic variables and theano functions
    return x, y1, y2, f_pred_prob, f_pred, cost, f_model_cell_output
def normalize(x, axis=-1):
    """Return ``x`` divided by its norm along ``axis``.

    Where the norm is exactly zero the result is zero instead of the
    quotient.
    """
    lengths = norm(x, axis=axis, keepdims=True)
    has_zero_norm = T.eq(lengths, 0)
    scaled = x / lengths
    return T.switch(has_zero_norm, 0, scaled)
def __call__(self, x, t):
    """Return ``max(0, 1 - sign * x - self.threshold)`` elementwise.

    ``sign`` is -1 where ``t`` equals 0 and +1 everywhere else.
    """
    signed_targets = T.switch(T.eq(t, 0), -1, 1)
    margin = 1 - (signed_targets * x)
    return T.maximum(0, margin - self.threshold)
def train(self, Xs, Ys, Xv, Yv, mdl, data_folder='data/', out_folder='out/'):
    """Train ``mdl`` with accumulated-gradient Adam steps.

    Gradients of the mean categorical cross-entropy are summed over
    ``self.iter_size`` mini-batches, averaged, and applied through
    ``step_adam``.  After every epoch the validation set is evaluated, a
    snapshot is written to ``out_folder`` and the best-so-far parameters
    (lowest validation loss) are saved to ``best_model.npz``.  The best
    parameters are restored before returning.

    :param Xs, Ys: training samples / labels handed to ``FileSystemData``.
    :param Xv, Yv: validation samples / labels handed to ``FileSystemData``.
    :param mdl: model exposing ``input_var``, ``get_output_layer()`` and
        ``preprocess(x)``.
    :param data_folder: root folder; ``imgs/train/`` is appended to it.
    :param out_folder: where snapshots and the best model are written
        (created if missing).
    :return: compiled Theano function mapping inputs to the deterministic
        network output.
    """
    data_folder = os.path.join(data_folder, 'imgs/', 'train/')
    input_var = mdl.input_var
    net = mdl.get_output_layer()
    target_var = T.ivector('targets')

    prediction = lasagne.layers.get_output(net)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(net, trainable=True)
    grads = T.grad(loss, params)

    test_prediction = lasagne.layers.get_output(net, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    logger.info("Compiling network functions...")
    # NOTE(review): loss and gradients are compiled as two functions, so each
    # batch runs two forward passes; a single function returning both would
    # halve the work -- left as is pending confirmation.
    grads_fn = theano.function([input_var, target_var], grads)
    train_fn = theano.function([input_var, target_var], loss)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    predict_proba = theano.function([input_var], test_prediction)

    logger.info("Training...")
    logger.info('GPU Free Mem: %.3f' % gpu_free_mem('gb'))

    # TODO change to steps
    # Floor division + int(): range() needs an integer on Python 3 as well.
    epochs = int(self.max_iter // len(Xs))
    best_val_loss, best_epoch = None, None
    best_mdl_path = os.path.join(out_folder, 'best_model.npz')
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)

    steps = 0
    for epoch in range(epochs):
        start_time = time.time()
        train_err, train_batches = 0, 0
        data_s = FileSystemData(Xs, Ys, data_folder, self.batch_size,
                                infinite=False, augment=True, shuffle=True)
        step_g = None
        for batch in tqdm(data_s, total=data_s.steps, leave=False):
            inputs, targets = batch
            inputs = floatX(np.array([mdl.preprocess(x) for x in inputs]))
            batch_err = train_fn(inputs, targets)
            batch_g = grads_fn(inputs, targets)
            # Accumulate gradients until iter_size batches have been seen.
            if step_g is None:
                step_g = batch_g
            else:
                step_g = [s_g + b_g for s_g, b_g in zip(step_g, batch_g)]
            train_err += batch_err
            train_batches += 1
            if train_batches % self.iter_size == 0:
                step_g = [g / np.array(self.iter_size) for g in step_g]
                if steps == 0:
                    # Adam state is created lazily on the first step.
                    t_prev, m_prev, u_prev = \
                        init_adam(batch_g, params)
                updates = step_adam(step_g, params, t_prev, m_prev, u_prev,
                                    learning_rate=self.base_lr)
                for p, new_val in updates.items():
                    p.set_value(new_val)
                steps += 1
                step_g = None

        data_v = FileSystemData(Xv, Yv, data_folder, self.batch_size,
                                infinite=False, augment=False, shuffle=False)
        val_err, val_acc, val_batches = 0, 0, 0
        for batch in tqdm(data_v, total=data_v.steps, leave=False):
            inputs, targets = batch
            inputs = floatX(np.array([mdl.preprocess(x) for x in inputs]))
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        train_loss = train_err / train_batches
        val_loss = val_err / val_batches
        val_acc = val_acc / val_batches * 100
        end_time = time.time() - start_time

        # `is None` (not truthiness): a validation loss of exactly 0.0 must
        # not be mistaken for "no best loss recorded yet".
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            np.savez(best_mdl_path,
                     *lasagne.layers.get_all_param_values(net))
        snapshot_path = os.path.join(out_folder,
                                     'snapshot_epoch_%d.npz' % epoch)
        np.savez(snapshot_path, *lasagne.layers.get_all_param_values(net))
        logger.info("epoch[%d] -- Ls: %.3f | Lv: %.3f | ACCv: %.3f | Ts: %.3f"
                    % (epoch, train_loss, val_loss, val_acc, end_time))

    if best_epoch is None:
        # No epoch ran, so no checkpoint exists to restore.
        logger.info("no epoch was run; keeping current parameters")
        return predict_proba

    logger.info("loading best model: epoch[%d]" % best_epoch)
    with np.load(best_mdl_path) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(net, param_values)
    return predict_proba
# Training expressions: mean squared error between the network output and
# the target, optimised with Nesterov momentum.
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.squared_error(prediction, target_var)
loss = loss.mean()
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)
# Evaluation expressions use a deterministic forward pass.
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.squared_error(test_prediction, target_var)
#test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
#                                                        target_var)
test_loss = test_loss.mean()
# NOTE(review): accuracy compares argmax over axis 1 with the target while
# the loss is a squared error -- confirm this metric is meaningful here.
test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
train_fn = theano.function([input_var, target_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
num = 20
# Full-batch training: every iteration feeds the whole training set once.
for epoch in range(num):
    train_err = train_fn(X_train, y_train)
    print(" iter: " + str(epoch) + " training loss: " + str(train_err))
# NOTE(review): original indentation was lost; the separator prints are
# assumed to follow the loop rather than run inside it.
print(" ")
print(" ")
print(" ")
# Evaluate loss and accuracy on a single hand-written sample.
e = val_fn([[10, 10]], [[40]])
def compile(self, optimizer, loss, class_mode="categorical", theano_mode=None):
    """Build the loss/accuracy graph and compile the Theano functions.

    Sets ``_train``, ``_train_with_acc``, ``_predict``, ``_test`` and
    ``_test_with_acc`` on the instance.

    :param optimizer: identifier resolved through ``optimizers.get``.
    :param loss: identifier resolved through ``objectives.get``.
    :param class_mode: ``"categorical"`` (argmax comparison) or ``"binary"``
        (rounded-output comparison); anything else raises ``Exception``.
    :param theano_mode: forwarded as ``mode`` to every ``theano.function``.
    """
    self.optimizer = optimizers.get(optimizer)
    self.loss = objectives.get(loss)
    weighted_loss = weighted_objective(objectives.get(loss))

    # input of model
    self.X_train = self.get_input(train=True)
    self.X_test = self.get_input(train=False)
    self.y_train = self.get_output(train=True)
    self.y_test = self.get_output(train=False)

    # target of model
    self.y = T.zeros_like(self.y_train)
    self.weights = T.ones_like(self.y_train)

    last_layer = self.layers[-1]
    mask = (last_layer.get_output_mask()
            if hasattr(last_layer, "get_output_mask") else None)

    train_loss = weighted_loss(self.y, self.y_train, self.weights, mask)
    test_loss = weighted_loss(self.y, self.y_test, self.weights, mask)
    train_loss.name = 'train_loss'
    test_loss.name = 'test_loss'
    self.y.name = 'y'

    # Pick the accuracy expression once, then apply it to both outputs.
    if class_mode == "categorical":
        def accuracy_of(model_output):
            return T.mean(T.eq(T.argmax(self.y, axis=-1),
                               T.argmax(model_output, axis=-1)))
    elif class_mode == "binary":
        def accuracy_of(model_output):
            return T.mean(T.eq(self.y, T.round(model_output)),
                          dtype='float32')
    else:
        raise Exception("Invalid class mode:" + str(class_mode))
    train_accuracy = accuracy_of(self.y_train)
    test_accuracy = accuracy_of(self.y_test)

    self.class_mode = class_mode
    self.theano_mode = theano_mode

    for regularizer in self.regularizers:
        train_loss = regularizer(train_loss)
    updates = self.optimizer.get_updates(self.params, self.constraints,
                                         train_loss)
    updates += self.updates

    # Multi-input models carry a list of inputs, single-input ones a tensor.
    if type(self.X_train) is list:
        train_ins = self.X_train + [self.y, self.weights]
        test_ins = self.X_test + [self.y, self.weights]
        predict_ins = self.X_test
    else:
        train_ins = [self.X_train, self.y, self.weights]
        test_ins = [self.X_test, self.y, self.weights]
        predict_ins = [self.X_test]

    def _compile(inputs, outputs, **extra):
        # Every function shares the downcast flag and the compilation mode.
        return theano.function(inputs, outputs, allow_input_downcast=True,
                               mode=theano_mode, **extra)

    self._train = _compile(train_ins, train_loss, updates=updates)
    self._train_with_acc = _compile(train_ins, [train_loss, train_accuracy],
                                    updates=updates)
    self._predict = _compile(predict_ins, self.y_test)
    self._test = _compile(test_ins, test_loss)
    self._test_with_acc = _compile(test_ins, [test_loss, test_accuracy])
def __init__(self, glimpse_shape, glimpse_times, dim_hidden, dim_fc, dim_out,
             reward_base, rng_std=1.0, activation=T.tanh, bptt_truncate=-1,
             lmbd=0.1,  # gdupdate + lmbd*rlupdate
             DEBUG=False,
             ):
    """Build the attention network and compile its Theano functions.

    The network is an ``AttentionUnit`` whose last hidden state feeds a stack
    of ``FullConnectLayer`` s and a ``SoftmaxLayer``.  Two objectives are
    optimised together in ``self.step``: the mean log-likelihood of the
    label and a REINFORCE-style term on the glimpse locations scaled by
    ``lmbd``.

    :param glimpse_shape, glimpse_times, dim_hidden, rng_std, activation,
        bptt_truncate: forwarded to ``AttentionUnit``.
    :param dim_fc: list of hidden sizes of the fully connected stack.
    :param dim_out: number of output classes.
    :param reward_base: per-glimpse reward vector; ``None`` selects a reward
        of 1 on the last glimpse only.
    :param lmbd: weight of the reinforcement update relative to the gradient
        update.
    :param DEBUG: when true, also compile the diagnostic functions.
    """
    # super(AttentionUnit, self).__init__()
    # `is None`, not `== None`: the latter compares elementwise when an
    # array is passed and cannot be used as a plain truth value.
    if reward_base is None:
        reward_base = np.zeros((glimpse_times)).astype('float32')
        reward_base[-1] = 1.0

    x = T.ftensor3('x')  # N * W * H
    y = T.ivector('y')  # label
    lr = T.fscalar('lr')
    reward_base = theano.shared(
        name='reward_base',
        value=np.array(reward_base).astype(theano.config.floatX),
        borrow=True)  # Time (vector)
    reward_bias = T.fvector('reward_bias')
    # rng = T.shared_randomstreams.RandomStreams(123)
    rng = MRG_RandomStreams(np.random.randint(9999999))

    input_layer = InputLayer(x)
    au = AttentionUnit(x, glimpse_shape, glimpse_times, dim_hidden, rng,
                       rng_std, activation, bptt_truncate)
    # All hidden states are put into decoder
    # layers = [i, au, InputLayer(au.output[:,:,:].flatten(2))]
    # dim_fc = [glimpse_times*dim_hidden] + dim_fc + [dim_out]
    # Only the last hidden states
    layers = [input_layer, au, InputLayer(au.output[:, -1, :])]
    dim_fc = [dim_hidden] + dim_fc + [dim_out]
    for Idim, Odim in zip(dim_fc[:-1], dim_fc[1:]):
        fc = FullConnectLayer(layers[-1].output, Idim, Odim, activation, 'FC')
        layers.append(fc)
    sm = SoftmaxLayer(layers[-1].output)
    layers.append(sm)

    output = sm.output  # N * classes
    location = au.location  # N * T * dim_hidden
    prediction = output.argmax(1)  # N

    # calc
    equalvec = T.eq(prediction, y)  # [0, 1, 0, 0, 1 ...]
    correct = T.cast(T.sum(equalvec), 'float32')
    logLoss = T.log(output)[T.arange(y.shape[0]), y]
    # (R_t - b_t), where b = E[R]
    reward_biased = (T.outer(equalvec, reward_base)
                     - reward_bias.dimshuffle('x', 0))  # N * Time

    # gradient descent (updates ascend the mean log-likelihood)
    gdobjective = logLoss.sum() / x.shape[0]
    gdparams = [p for layer in layers for p in layer.params]
    gdupdates = [(p, p + lr * T.grad(gdobjective, p)) for p in gdparams]

    # reinforce learning
    # without maximum, then -log(p) will decrease the p
    # NOTE(review): divides by `correct`, which is 0 when no sample in the
    # batch is predicted correctly -- confirm callers tolerate inf/nan.
    rlobjective = (T.maximum(reward_biased.dimshuffle(0, 1, 'x'), 0)
                   * T.log(au.location_p)).sum() / correct
    # location_p: N * Time * 2
    rlparams = au.reinforceParams
    rlupdates = [(p, p + lr * lmbd * T.grad(rlobjective, p))
                 for p in rlparams]

    # Squared change of the batch-mean hidden state between glimpses.
    deltas = T.stack(*[
        ((au.output[:, t, :].mean(0) - au.output[:, t + 1, :].mean(0)) ** 2).sum()
        for t in range(glimpse_times - 1)])

    print('compile step()')
    self.step = theano.function(
        [x, y, lr, reward_bias],
        [gdobjective, rlobjective, correct, T.outer(equalvec, reward_base)],
        updates=gdupdates + rlupdates)
    print('compile predict()')
    self.predict = theano.function([x], prediction)

    # NOTE(review): the original indentation was lost; all diagnostic
    # functions below are assumed to be guarded by DEBUG -- confirm.
    if DEBUG:
        print('compile glimpse()')
        self.glimpse = theano.function([x], au.glimpse)
        print('compile innerstate()')
        self.getinnerstate = theano.function([x], au.innerstate)
        print('compile forward()')
        self.forward = theano.function(
            [x], [layer.output for layer in layers])
        print('compile error()')
        self.error = theano.function([x, y, reward_bias],
                                     [gdobjective, rlobjective])
        print('compile locate()')
        self.locate = theano.function([x], [au.location_mean, location])
        print('compile debug()')
        self.debug = theano.function([x, y, lr, reward_bias],
                                     [deltas, au.location_p],
                                     on_unused_input='warn')

    # self.xxx
    self.layers = layers
    self.params = gdparams + rlparams
    self.glimpse_times = glimpse_times
def in1d(arr, in_arr):
    """Elementwise membership test of ``arr`` in ``in_arr``.

    Every element of ``arr`` is compared against every element of ``in_arr``
    via broadcasting; the result is truthy where at least one comparison
    matches.  The output has the shape of ``arr``; ``in_arr`` must be 1d.
    """
    as_row = arr.reshape([1, -1])
    as_column = in_arr.reshape([-1, 1])
    found = T.eq(as_row, as_column).any(axis=0)
    return found.reshape(arr.shape)
def logp(self, value):
    """Return ``bound(0, value == self.c)``.

    Presumably ``bound`` yields the first argument where the condition holds
    and a log-probability of -inf elsewhere -- verify against its definition.
    """
    return bound(0, tt.eq(value, self.c))
def fit(self, X, y):
    # Intended to train self.network on (X, y) with Nesterov-momentum SGD
    # and print per-epoch statistics.
    #
    # NOTE(review): this method cannot run as written:
    #   * `T.vector` (a constructor, not a variable) is passed where symbolic
    #     input/target variables are required;
    #   * `X_train`, `y_train`, `X_val`, `y_val`, `num_epochs` and `time` are
    #     not defined in this scope, and the `X` / `y` arguments are unused;
    #   * the training loss is binary cross-entropy while the validation loss
    #     is categorical cross-entropy.
    # The right fix depends on the network's output shape and on where
    # validation data should come from -- confirm before changing.
    import lasagne
    import theano
    import theano.tensor as T
    # This string follows the imports, so it is not picked up as a docstring.
    """Fit model."""
    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(self.network)
    loss = lasagne.objectives.binary_crossentropy(prediction, T.vector)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(self.network, trainable=1)
    updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(self.network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, T.vector)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.vector), dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([T.vector, T.vector], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([T.vector, T.vector], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    for epoch in range(self.model_params['num_epochs']):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
        print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print(" validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))
    pass
lasagne.objectives.squared_error(prediction, train_prediction_b)) # loss=loss+pi_loss elif model.network_type == "tempens": # Tempens model loss: loss = T.mean(loss * mask_train, dtype=theano.config.floatX) loss += unsup_weight_var * T.mean( lasagne.objectives.squared_error(prediction, z_target_var)) else: loss = T.mean(loss, dtype=theano.config.floatX) # regularization:L1,L2 l2_penalty = lasagne.regularization.regularize_network_params( gru_network, lasagne.regularization.l2) * model.l2_loss loss = loss + l2_penalty train_acc = T.mean(T.eq(T.argmax(prediction, axis=1), T.argmax(target_var, axis=1)), dtype=theano.config.floatX) # We could add some weight decay as well here, see lasagne.regularization. # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more. params = lasagne.layers.get_all_params(gru_network, trainable=True) updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate_var, beta1=adam_beta1_var) """ 3.test loss and accuracy """ # Create a loss expression for validation/testing. The crucial difference
# Build the network and collect its parameters.
output = architecture.buildDCNN()
dcnnParams = lasagne.layers.get_all_params(output)

# SYMBOLIC INPUTS
x = T.imatrix()
y = T.ivector()

# Without L2 Regularization
# Mean categorical cross-entropy of the (non-deterministic) network output.
loss = lasagne.objectives.aggregate(
    lasagne.objectives.categorical_crossentropy(
        lasagne.layers.get_output(output, x), y),
    mode = 'mean')
updates = lasagne.updates.adagrad(loss, dcnnParams, learning_rate = 0.1)

# ACCURACY FOR PREDICTIONS
# Predictions come from a deterministic forward pass; score is the fraction
# of predictions equal to the label.
prediction = T.argmax(lasagne.layers.get_output(output, x, deterministic=True), axis=1)
score = T.eq(prediction, y).mean()

# SYMBOLIC FUNCTIONS
# Validation and test share the same expression; only training applies updates.
trainDCNN = theano.function([x,y], outputs = loss, updates = updates)
validateDCNN = theano.function([x,y], outputs = score)
testDCNN = theano.function([x,y], outputs = score)

# LOAD THE DATA
trainingSentences = loader.loadData('myDataset/train.txt')
trainingLabels = loader.loadData('myDataset/train_label.txt')
validationSentences = loader.loadData('myDataset/dev.txt')
validationLabels = loader.loadData('myDataset/dev_label.txt')
testSentences = loader.loadData('myDataset/test.txt')
testLabels = loader.loadData('myDataset/test_label.txt')

# TRAIN THE MODEL
def get_output_mask(self, train=None):
    """Return a mask that is 0 where the input equals 0 and 1 elsewhere.

    Returns ``None`` when ``self.mask_zero`` is falsy.
    """
    layer_input = self.get_input(train)
    if self.mask_zero:
        nonzero_flags = 1 - T.eq(layer_input, 0)
        return T.ones_like(layer_input) * nonzero_flags
    return None
def unk_ratio(words, mask, unk):
    """Return the mask-weighted count of ``words == unk`` divided by the
    sum of ``mask``."""
    is_unknown = tensor.eq(words, unk)
    unknown_total = (is_unknown * mask).sum()
    mask_total = mask.sum()
    return unknown_total / mask_total
big_frame_level_outputs, new_big_h0 = big_frame_level_rnn(big_input_sequences, big_h0, reset)#tier3->tier2 frame_level_outputs, new_h0 = frame_level_rnn(input_sequences, big_frame_level_outputs, h0, reset)#tier2->tier1 prev_samples = sequences_8k[:,0:-(OVERLAP-FRAME_SIZE+1)] prev_samples = prev_samples.reshape((1, batch_size, 1, -1)) prev_samples = T.nnet.neighbours.images2neibs(prev_samples, (1, FRAME_SIZE), neib_step=(1, 1), mode='valid') #2-dim:([[x7,x8],[x8,x9],[x9,x10],...]) prev_samples = prev_samples.reshape((batch_size * SEQ_LEN, FRAME_SIZE)) sample_level_outputs = sample_level_predictor( frame_level_outputs.reshape((batch_size * SEQ_LEN, DIM)), prev_samples ) #sample_level_outputs dim:(BATCH_SIZE * SEQ_LEN, Q_LEVELS) -> [[x9pre],[x10pre],...] accuracy=T.eq(lib.ops.softmax_and_no_sample(sample_level_outputs.reshape((batch_size,SEQ_LEN,Q_LEVELS))),target_sequences) accuracy=accuracy*target_mask accuracy=T.sum(accuracy,axis=1) mask_sum=T.sum(target_mask,axis=1) cost = T.nnet.categorical_crossentropy( T.nnet.softmax(sample_level_outputs), #Every row represents a distribution(256 propability) target_sequences.flatten() #A list, represent the groundtruth of every row ) cost = cost.reshape(target_sequences.shape) cost = cost * target_mask #dim: batch*num # Don't use these lines; could end up with NaN # Specially at the end of audio files where mask is # all zero for some of the shorter files in mini-batch. #cost = cost.sum(axis=1) / target_mask.sum(axis=1) #cost = cost.mean(axis=0)
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0):
    """Build the Q-network(s), the loss, and compile the Theano functions.

    Compiles ``self._train`` (one update step on the minibatch held in the
    shared variables, returning ``[loss]``) and ``self._q_vals`` (Q-values
    of the single state held in ``self.state_shared``).

    :param freeze_interval: when > 0 a second network ``next_l_out`` is
        built and used for the next-state Q-values.
    :param clip_delta: when > 0 the loss is quadratic up to ``clip_delta``
        and linear beyond it; otherwise ``0.5 * diff**2``.
    :param update_rule: ``'deepmind_rmsprop'``, ``'rmsprop'`` or ``'sgd'``.
    :param batch_accumulator: ``'sum'`` or ``'mean'``.
    :param momentum: when > 0 momentum is applied on top of the updates.
    :param input_scale: states are divided by this value before entering
        the network.
    :raises ValueError: on an unknown ``batch_accumulator`` or
        ``update_rule``.
    """
    self.input_width = input_width
    self.input_height = input_height
    self.num_actions = num_actions
    self.num_frames = num_frames
    self.batch_size = batch_size
    self.discount = discount
    self.rho = rho
    self.lr = learning_rate
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self.rng = rng

    # The RNG is installed before any network is built.
    lasagne.random.set_rng(self.rng)

    self.update_counter = 0

    self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size)
    if self.freeze_interval > 0:
        self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size)
        self.reset_q_hat()

    states = T.tensor4('states')
    next_states = T.tensor4('next_states')
    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')

    # Shared variables for training from a minibatch of replayed
    # state transitions, each consisting of num_frames + 1 (due to
    # overlap) images, along with the chosen action and resulting
    # reward and terminal status.
    self.imgs_shared = theano.shared(
        np.zeros((batch_size, num_frames + 1, input_height, input_width), dtype=theano.config.floatX))
    self.rewards_shared = theano.shared(np.zeros(
        (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True))
    self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True))
    self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True))

    # Shared variable for a single state, to calculate q_vals.
    self.state_shared = theano.shared(
        np.zeros((num_frames, input_height, input_width), dtype=theano.config.floatX))

    q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)

    if self.freeze_interval > 0:
        next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale)
    else:
        next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale)
        # Same network on both sides: block gradients through the target.
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

    terminalsX = terminals.astype(theano.config.floatX)
    # One-hot mask over actions, one row per sample.
    actionmask = T.eq(
        T.arange(num_actions).reshape((1, -1)), actions.reshape(
            (-1, 1))).astype(theano.config.floatX)

    # The discounted next-state maximum is dropped for terminal transitions.
    target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.max(next_q_vals, axis=1, keepdims=True))
    # Q-value of the action actually taken.
    output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
    diff = target - output

    if self.clip_delta > 0:
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        #
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
        # there, which is what the DeepMind implementation does.
        quadratic_part = T.minimum(abs(diff), self.clip_delta)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part
    else:
        loss = 0.5 * diff**2

    if batch_accumulator == 'sum':
        loss = T.sum(loss)
    elif batch_accumulator == 'mean':
        loss = T.mean(loss)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))

    params = lasagne.layers.helper.get_all_params(self.l_out)
    # Consecutive frame windows of the same image stack serve as state and
    # next state.
    train_givens = {
        states: self.imgs_shared[:, :-1],
        next_states: self.imgs_shared[:, 1:],
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }
    if update_rule == 'deepmind_rmsprop':
        updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)
    elif update_rule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)
    elif update_rule == 'sgd':
        updates = lasagne.updates.sgd(loss, params, self.lr)
    else:
        raise ValueError("Unrecognized update: {}".format(update_rule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

    self._train = theano.function([], [loss], updates=updates, givens=train_givens)
    # A single state is reshaped to a batch of one before evaluation.
    q_givens = {
        states: self.state_shared.reshape(
            (1, self.num_frames, self.input_height, self.input_width))
    }
    self._q_vals = theano.function([], q_vals[0], givens=q_givens)
def main():
    """Train the CNN, evaluating on validation and test data every epoch.

    The model is written to disk whenever the test accuracy beats the best
    value seen so far by more than 0.5 points.

    NOTE(review): the original indentation was lost; the test pass and the
    model saving are assumed to sit inside the epoch loop (the running
    ``best_acc`` only makes sense there) -- confirm against the source.
    """
    print("Loading Data")
    X_train, y_train, X_valid, y_valid, X_test, y_test = load_data.load_data_feautre_train(feautre = u"\uBC18\uD314",root_path= "/home/prosurpa/Image/image/",image_size=(28,28))

    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    print("Bulding Model")
    batch_size = 20
    network = build_f_cnn(batch_size ,input_var)

    # Training loss and Nesterov-momentum updates.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=0.01, momentum=0.9
    )

    # Evaluation expressions use a deterministic forward pass.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)

    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    #model_rw.read_model_data(network, "75.0000009934model")

    print("Starting training")
    num_epochs = 1000
    # Accuracy (in percent) a model must exceed before it is saved.
    best_acc = 75
    for epoch in range(num_epochs):
        train_err = 0
        train_batches = 0
        start_time = time.time()
        print((len(X_train)/batch_size))
        for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1
            if train_batches%20 == 0:
                print(train_batches)

        val_err = 0
        val_acc = 0
        val_batches = 0
        print((len(X_valid) / batch_size))
        for batch in iterate_minibatches(X_valid, y_valid, batch_size, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1
            # NOTE(review): the condition tests train_batches, not
            # val_batches -- looks like a copy/paste leftover.
            if train_batches % 20 == 0:
                print(val_batches)

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
        print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print(" validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100))

        test_err = 0
        # From here on `test_acc` is a running sum, no longer the symbolic
        # expression compiled into val_fn above.
        test_acc = 0
        test_batches = 0
        print((len(X_test) / batch_size))
        for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
            if train_batches % 20 == 0:
                print(test_batches)
        print("Final results:")
        print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print(" test accuracy:\t\t{:.2f} %".format(
            test_acc / test_batches * 100))

        re_acc = test_acc / test_batches * 100
        # Save only on an improvement of more than 0.5 points.
        if re_acc > best_acc + 0.5:
            best_acc = re_acc
            model_rw.write_model_data(network, str(best_acc) + "model")
def get_reward(self, session_states, session_actions, batch_id):
    """
    WARNING! this runs on a single session, not on a batch
    reward given for taking the action in current environment state

    arguments:
        session_states float[time, memory_id]: environment state before taking action
        session_actions int[time]: agent action at this tick
    returns:
        reward float[time]: reward for taking action from the given state
    """
    # unpack states and actions
    states = check_list(session_states)[0]
    actions = check_list(session_actions)[0]

    ticks = T.arange(actions.shape[0])
    already_polled = states[ticks, actions]
    still_active = T.eq(states[:, self.end_action_id], 0)

    # the last tick is always marked as finishing the session
    ends_here = T.eq(actions, self.end_action_id)
    ends_here = T.set_subtensor(ends_here[-1], 1)
    first_end_tick = ends_here.nonzero()[0][0]

    is_category_action = in1d(actions, self.category_action_ids)
    answers = self.joint_data[batch_id, actions].ravel()
    guessed_a_category = T.any(
        is_category_action[:first_end_tick] & (answers[:first_end_tick] > 0))

    reward_table = self.rw

    def _interpolate(positive_key, negative_key):
        # Negative reward plus answer-weighted gap to the positive reward.
        gap = reward_table[positive_key] - reward_table[negative_key]
        return answers * gap + reward_table[negative_key]

    # categorical and attributes
    intermediate_reward = T.switch(
        is_category_action,
        _interpolate("category_positive", "category_negative"),
        _interpolate("attribute_positive", "attribute_negative"))
    intermediate_reward_first_time = T.switch(
        already_polled,
        reward_table["repeated_poll"],
        intermediate_reward)

    # ending session: do not penalize if at least 1 category was chosen
    end_reward = T.switch(
        guessed_a_category,
        reward_table["end_action"],
        reward_table["end_action_if_no_category_predicted"])

    # include end action
    action_reward = T.switch(ends_here, end_reward,
                             intermediate_reward_first_time)
    masked_reward = T.switch(still_active, action_reward, 0)
    return masked_reward.astype(theano.config.floatX)
def train_cnn_for_el(train_data_file_name, val_data_file_name, num_val_candidates, test_data_file_name, num_test_candidates, img_h, img_w, all_words, # first row of all_words should be a non-existing word wid_idx_dict, entity_vecs, gold_as_first_candidate=False, skip_width_loading=40, # skip width while loading samples n_epochs=25, batch_size=50, filter_hs=def_filter_hs, num_feature_maps=100, conv_non_linear="relu", lr_decay=0.9, sqr_norm_lim=9, hidden_out_len=50,): rng = np.random.RandomState(3435) x = T.imatrix('x') # es = T.imatrix('es') # es_test = T.imatrix('es_test') entities = T.imatrix('entities') print 'making entity_vecs...', len(entity_vecs) shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX), name='entity_vecs', borrow=True) # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=np.float32), # name='entity_vecs', borrow=True) print 'making shared_words...', len(all_words) shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX), name='shared_words', borrow=True) print 'done' # test_contexts, test_indices = get_data_set_full(test_data_file_name, wid_idx_dict, skip_width_loading) # num_test_batches = test_indices.shape[0] / batch_size # num_val_contexts, val_contexts, val_indices = get_data_set_full(val_data_file_name, # wid_idx_dict, skip_width_loading) val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len, sentence_pad_len, skip_width=skip_width_loading, num_candidates=num_val_candidates) num_val_batches = len(val_contexts) / batch_size print num_val_batches, 'validation batches' print len(val_indices[0]), 'candidates per mention' if gold_as_first_candidate: gold_labels = theano.shared(value=np.zeros(batch_size, dtype='int32'), borrow=True) else: gold_labels = theano.shared(value=np.ones(batch_size, dtype='int32'), borrow=True) val_contexts = T.cast(to_theano_shared(val_contexts), 'int32') val_indices = 
T.cast(to_theano_shared(val_indices), 'int32') filter_shapes = [] pool_sizes = [] filter_w = img_w for filter_h in filter_hs: filter_shapes.append((num_feature_maps, 1, filter_h, filter_w)) pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1)) layer0_input = shared_words[x.flatten()].reshape((x.shape[0], 1, x.shape[1], shared_words.shape[1])) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_h, img_w), filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear) layer1_input = conv_layer.output.flatten(2) conv_layers.append(conv_layer) layer1_inputs.append(layer1_input) layer1_input = T.concatenate(layer1_inputs, 1) matcher0 = HiddenLayer(rng, layer1_input, num_feature_maps * len(filter_hs), hidden_out_len, relu) mc = matcher0.output # mention contexts unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x') batch_entity_vecs = shared_entity_vecs[entities] matcher1 = HiddenLayer(rng, batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, relu) entity_reps = matcher1.output # entity_reps = batch_entity_vecs unit_entity_reps = entity_reps / T.sqrt(T.maximum(T.sum(T.sqr(entity_reps), 2), 0.0001)).dimshuffle(0, 1, 'x') similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities, axis=1))) loss = T.maximum(0, 1 - similarities[:, 0] + similarities[:, 1]).sum() # similarities = (mc.dimshuffle(0, 'x', 1) * batch_entity_vecs).sum(axis=2) # / mc_norm params = matcher0.params + matcher1.params # params = matcher0.params for conv_layer in conv_layers: params += conv_layer.params grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim) index = T.lscalar() # test_model = theano.function( # [index], # error_rate, # givens={x: test_contexts[index * batch_size: (index + 
1) * batch_size], # es: test_indices[index * batch_size: (index + 1) * batch_size]} # ) val_model = theano.function( [index], correct_rate, givens={x: val_contexts[index * batch_size: (index + 1) * batch_size], entities: val_indices[index * batch_size: (index + 1) * batch_size]} ) train_contexts = theano.shared( value=np.zeros((3, 2)), borrow=True) int_train_contexts = T.cast(train_contexts, 'int32') train_indices = theano.shared( value=np.zeros((3, 2)), borrow=True) int_train_indices = T.cast(train_indices, 'int32') train_model = theano.function( [index], loss, updates=grad_updates, givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} ) fdebug = theano.function( [index], similarities, givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} ) # print fdebug(0) val_perfs = [val_model(i) for i in xrange(num_val_batches)] print('init val perf %f' % np.mean(val_perfs)) print 'training ...' 
f_train = open(train_data_file_name, 'rb') epoch = 0 while epoch < n_epochs: epoch += 1 train_part_cnt = 0 # num_train_contexts, cur_train_contexts, cur_train_indices = get_data_set_part( # f_train, wid_idx_dict, 50000) cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, training_part_size, wid_idx_dict, sentence_len, sentence_pad_len) while not len(cur_train_contexts) == 0: train_contexts.set_value(cur_train_contexts, borrow=True) train_indices.set_value(cur_train_indices, borrow=True) # print fdebug(0) train_part_cnt += 1 num_train_batches = len(cur_train_contexts) / batch_size # print 'num_train_batches', num_train_batches mean_loss = 0 for minibatch_index in xrange(num_train_batches): cur_loss = train_model(minibatch_index) mean_loss += cur_loss # if (minibatch_index + 1) % (num_train_batches / 3) == 0: # show some progress # print minibatch_index, num_train_batches print 'loss:', mean_loss / num_train_batches # print fdebug(0) val_perfs = [val_model(i) for i in xrange(num_val_batches)] val_perf = np.mean(val_perfs) print('epoch %i, training part %i, val perf %f' % (epoch, train_part_cnt, val_perf)) cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, training_part_size, wid_idx_dict, sentence_len, sentence_pad_len) # num_train_contexts, cur_train_contexts, cur_train_indices = get_data_set_part( # f_train, wid_idx_dict, 50000) f_train.close()
def __init__(self,
             embedding_dim=100,
             num_hidden_layers=2,
             hidden_dim=200,
             in_dropout_p=0.2,
             hidden_dropout_p=0.5,
             hidden2out_dropout_p=0.5,
             update_hyperparams=None):
    """Build and compile the discriminator network.

    Architecture (as constructed below): tanh on the raw input, input
    dropout, a batch-normalised dense layer, then ``num_hidden_layers``
    blocks of dropout + batch-normalised dense layer whose output is summed
    with the block input, followed by dropout, a batch-normalised
    single-unit dense layer and a sigmoid.

    Arguments:
        embedding_dim: width of the input vectors.
        num_hidden_layers: number of residual hidden blocks.
        hidden_dim: number of units in every hidden dense layer.
        in_dropout_p: dropout probability applied to the input.
        hidden_dropout_p: dropout probability before each hidden block.
        hidden2out_dropout_p: dropout probability before the output layer.
        update_hyperparams: keyword arguments for ``lasagne.updates.adam``;
            defaults to ``{'learning_rate': 0.01}`` when None. The dict is
            copied, so later changes by the caller do not affect this
            instance.

    Side effects:
        Prints progress to stderr and compiles ``self.train_fn`` and
        ``self.eval_fn``; both take (input, target) and return
        [loss, accuracy].
    """
    self.embedding_dim = embedding_dim
    self.num_hidden_layers = num_hidden_layers
    self.hidden_dim = hidden_dim
    self.in_dropout_p = in_dropout_p
    self.hidden_dropout_p = hidden_dropout_p
    self.hidden2out_dropout_p = hidden2out_dropout_p
    # Copy so that neither a shared default nor the caller's dict is aliased.
    if update_hyperparams is None:
        update_hyperparams = {'learning_rate': 0.01}
    self.update_hyperparameters = dict(update_hyperparams)

    print >> sys.stderr, 'Building computation graph for discriminator...'
    self.input_var = T.matrix('input')
    self.target_var = T.matrix('target')

    self.l_in = lasagne.layers.InputLayer(
        shape=(None, self.embedding_dim),
        input_var=T.tanh(self.input_var),
        name='l_in')
    self.l_in_dr = lasagne.layers.DropoutLayer(self.l_in, self.in_dropout_p)
    self.l_prehid = lasagne.layers.batch_norm(
        lasagne.layers.DenseLayer(
            self.l_in_dr,
            num_units=self.hidden_dim,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform(gain=leaky_relu_gain),
            name='l_prehid'))
    self.layers = [self.l_in, self.l_in_dr, self.l_prehid]

    for i in xrange(self.num_hidden_layers):
        block_input = self.layers[-1]
        l_hid_predr = lasagne.layers.DropoutLayer(block_input, self.hidden_dropout_p)
        l_hid = lasagne.layers.batch_norm(
            lasagne.layers.DenseLayer(
                l_hid_predr,
                num_units=self.hidden_dim,
                nonlinearity=lasagne.nonlinearities.leaky_rectify,
                W=lasagne.init.GlorotUniform(gain=leaky_relu_gain),
                name=('l_hid_%s' % i)))
        # Skip connection: block output = block input + dense output.
        l_hid_sum = lasagne.layers.ElemwiseSumLayer([block_input, l_hid])
        self.layers.extend([l_hid_predr, l_hid, l_hid_sum])

    self.l_preout_predr = lasagne.layers.DropoutLayer(
        self.layers[-1], self.hidden2out_dropout_p)
    self.l_preout = lasagne.layers.batch_norm(
        lasagne.layers.DenseLayer(
            self.l_preout_predr,
            num_units=1,
            nonlinearity=None,
            name='l_preout'))
    self.l_out = lasagne.layers.NonlinearityLayer(
        self.l_preout,
        nonlinearity=lasagne.nonlinearities.sigmoid,
        name='l_out')

    self.prediction = lasagne.layers.get_output(self.l_out)
    self.loss = lasagne.objectives.binary_crossentropy(
        self.prediction, self.target_var).mean()
    # A prediction counts as correct when thresholding at 0.5 gives the target.
    self.accuracy = T.eq(T.ge(self.prediction, 0.5), self.target_var).mean()

    self.params = lasagne.layers.get_all_params(self.l_out, trainable=True)
    self.updates = lasagne.updates.adam(
        self.loss, self.params, **self.update_hyperparameters)

    print >> sys.stderr, 'Compiling discriminator...'
    self.train_fn = theano.function(
        [self.input_var, self.target_var],
        [self.loss, self.accuracy],
        updates=self.updates)
    # NOTE(review): eval_fn reuses the non-deterministic graph, so dropout
    # and batch statistics stay in training mode during evaluation - confirm
    # this is intended before switching to get_output(deterministic=True).
    self.eval_fn = theano.function(
        [self.input_var, self.target_var],
        [self.loss, self.accuracy])
def create_structure(self):
    """Creates the symbolic graph of this layer.

    The outputs of all input layers are concatenated along the third axis
    and passed through ``self._tensor_preact``. The result is indexed as
    (time step, sequence, output class); softmax is applied over the last
    axis independently for every (time step, sequence) location.

    Sets ``self.output_probs`` to the 3-dimensional tensor of class
    probabilities. Unless the network mode is "distribution", also sets
    ``self.target_probs`` to a (time steps, sequences) matrix holding the
    probability assigned to each target class.
    """

    stacked_input = tensor.concatenate(
        [layer.output for layer in self.input_layers], axis=2)
    logits = self._tensor_preact(stacked_input, 'input')

    # Softmax works on matrices, so fold time steps and sequences into one
    # axis, normalize over the classes, and unfold again.
    n_steps = logits.shape[0]
    n_sequences = logits.shape[1]
    n_outputs = logits.shape[2]
    flat_logits = logits.reshape([n_steps * n_sequences, n_outputs])
    flat_probs = tensor.nnet.softmax(flat_logits)
    self.output_probs = flat_probs.reshape([n_steps, n_sequences, n_outputs])

    if self.network.mode.is_distribution():
        return

    # Select the targets: either given explicitly by the network, or the
    # input classes shifted by one time step (the last output then has no
    # target and is dropped).
    if self.network.mode.is_target_words():
        probs = self.output_probs
        targets = self.network.target_class_ids
        n_target_steps = n_steps
    else:
        probs = self.output_probs[:-1]
        targets = self.network.class_input[1:]
        n_target_steps = n_steps - 1

    check = tensor.opt.Assert(
        "Mismatch in mini-batch and target classes shape.")
    targets = check(targets, tensor.eq(targets.shape[0], probs.shape[0]))
    targets = check(targets, tensor.eq(targets.shape[1], probs.shape[1]))

    # In the flattened probability vector, location k starts at offset
    # k * num_classes; adding the class ID picks the probability of that
    # class at that location.
    flat_target_probs = probs.flatten()
    flat_targets = targets.flatten()
    n_locations = flat_targets.shape[0]
    n_classes = self.network.vocabulary.num_classes()
    flat_target_probs = check(
        flat_target_probs,
        tensor.eq(flat_target_probs.shape[0], n_locations * n_classes))
    offsets = tensor.arange(n_locations) * n_classes
    offsets = offsets + flat_targets
    picked = flat_target_probs[offsets]
    self.target_probs = picked.reshape([n_target_steps, n_sequences])
def accuracy(self, y):
    "Return the mean of the element-wise match between y and self.y_out."
    matches = T.eq(y, self.y_out)
    return T.mean(matches)
# Load previously obtained parameter values into the network.
# NOTE: `params` here is whatever was bound earlier in the file; the name is
# rebound to the trainable parameter list further down.
lasagne.layers.set_all_param_values(net['prob'], params)

# Number of full batches per pass (np.floor returns a float, not an int).
n_batches_per_epoch = np.floor(n_training_samples/float(BATCH_SIZE))
n_test_batches = np.floor(n_val_samples/float(BATCH_SIZE))

# Symbolic inputs: a 4-D input tensor and an integer label vector.
x_sym = T.tensor4()
y_sym = T.ivector()

# L2 penalty over the network parameters, scaled by 5e-4.
l2_loss = lasagne.regularization.regularize_network_params(net['prob'], lasagne.regularization.l2) * 5e-4

# Training graph: non-deterministic forward pass.
prediction_train = lasagne.layers.get_output(net['prob'], x_sym, deterministic=False)
loss = lasagne.objectives.categorical_crossentropy(prediction_train, y_sym)
loss = loss.mean()
loss += l2_loss
acc_train = T.mean(T.eq(T.argmax(prediction_train, axis=1), y_sym), dtype=theano.config.floatX)

# Validation graph: deterministic forward pass; the same L2 term is added
# to the validation loss.
prediction_test = lasagne.layers.get_output(net['prob'], x_sym, deterministic=True)
loss_val = lasagne.objectives.categorical_crossentropy(prediction_test, y_sym)
loss_val = loss_val.mean()
loss_val += l2_loss
acc = T.mean(T.eq(T.argmax(prediction_test, axis=1), y_sym), dtype=theano.config.floatX)

# Adam updates on all trainable parameters; the learning rate is a shared
# variable so it can be changed after compilation.
params = lasagne.layers.get_all_params(net['prob'], trainable=True)
learning_rate = theano.shared(np.float32(0.001))
updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate)

# Compiled functions: one training step, validation metrics, raw predictions.
train_fn = theano.function([x_sym, y_sym], [loss, acc_train], updates=updates)
val_fn = theano.function([x_sym, y_sym], [loss_val, acc])
pred_fn = theano.function([x_sym], prediction_test)
def main(num_epochs=100, batch_size=20):
    """Train the CNN from ``build_cnn`` and report validation/test metrics.

    Loads the train/validation/test splits with ``load_data()``, trains with
    Nesterov-momentum SGD on cross-entropy plus an L2 penalty, prints the
    loss/accuracy after every epoch, evaluates on the test split once
    training is finished, and saves the parameters to ``model2.npz``.

    Arguments:
        num_epochs: number of full passes over the training data.
        batch_size: mini-batch size used for training and evaluation
            (previously hard-coded to 20 in three places).
    """
    # Load the dataset
    print("Loading data...")
    datasets = load_data()
    X_train, y_train = datasets[0]
    X_val, y_val = datasets[1]
    X_test, y_test = datasets[2]

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    learnrate = 0.01

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var)

    # Training loss: mean cross-entropy plus a weighted L2 penalty obtained
    # from regularize_layer_params on the `network` layer.
    l2_penalty = regularize_layer_params(network, l2)
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean() + 0.1 * l2_penalty

    # Update expressions: Stochastic Gradient Descent with Nesterov momentum.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss, params,
                                                learning_rate=learnrate,
                                                momentum=0.9)

    # Validation/testing expressions. The crucial difference is the
    # deterministic forward pass, which disables dropout layers. The L2
    # penalty is not included in this loss.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc_expr = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                           dtype=theano.config.floatX)

    # One training step on a mini-batch, returning the training loss.
    train_fn = theano.function([input_var, target_var], loss, updates=updates)

    # Validation/test loss and accuracy on a mini-batch.
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc_expr])

    def evaluate(X, y):
        # Sum val_fn's loss and accuracy over all mini-batches of (X, y).
        err_sum = 0
        acc_sum = 0
        n_batches = 0
        for inputs, targets in iterate_minibatches(X, y, batch_size, shuffle=False):
            err, acc = val_fn(inputs, targets)
            err_sum += err
            acc_sum += acc
            n_batches += 1
        return err_sum, acc_sum, n_batches

    # Finally, launch the training loop.
    print("Starting training...")
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for inputs, targets in iterate_minibatches(X_train, y_train, batch_size, shuffle=False):
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err, val_acc, val_batches = evaluate(X_val, y_val)

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
        print(" validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print(" validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))

    # After training, compute and print the test error:
    test_err, test_acc, test_batches = evaluate(X_test, y_test)
    print("Final results:")
    print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print(" test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # Dump the network weights to a file:
    np.savez('model2.npz', *lasagne.layers.get_all_param_values(network))
def compile_theano_functions(self, data_type='2D', loss='cross_entropy'):
    """Compile all prediction, training and validation functions of the net.

    Populates ``self.class_predict_smax``, ``self.predict_smax``,
    ``self.predict_one_hot``, ``self.predict``, ``self.loss``,
    ``self.class_loss``, ``self.train_fn`` and ``self.val_fn`` (and, for 2D
    data only, ``self.attention_predict``).

    Arguments:
        data_type: '2D' (4-D input/target tensors) or '3D' (5-D tensors);
            must agree with ``len(self.cf.dim)``.
        loss: 'cross_entropy', 'weighted_cross_entropy' (adds a float
            vector input 'w' that multiplies the per-pixel loss) or 'dice'.

    Raises:
        ValueError: if ``data_type`` is neither '2D' nor '3D'.

    Inputs of the compiled functions, in order:
        train_fn: X, y, c, lr, lw [, w]
        val_fn:   X, y, c [, w]

    Note:
        The '3D' path used to leave the class-prediction graph and the 'lw'
        input undefined although both are required further down; the shared
        set-up is now done for both data types.
    """
    assert self.net is not None

    tensor_types = {'2D': T.tensor4, '3D': T.tensor5}
    if data_type not in tensor_types:
        raise ValueError(
            "data_type must be '2D' or '3D', got {!r}".format(data_type))
    make_tensor = tensor_types[data_type]

    dim = len(self.cf.dim)
    assert dim == (2 if data_type == '2D' else 3)

    ### symbolic theano input
    theano_args = OrderedDict()
    theano_args['X'] = make_tensor()
    theano_args['y'] = make_tensor()
    theano_args['c'] = T.ivector()
    self.logger.info('Net: Working with {} data.'.format(data_type))

    val_args = deepcopy(theano_args)
    train_args = deepcopy(theano_args)
    train_args['lr'] = T.scalar(name='lr')
    train_args['lw'] = T.scalar(name='lw')

    ### class prediction functions
    class_layer = self.net[self.cf.class_layer]
    class_train_prediction = get_output(class_layer, train_args['X'],
                                        deterministic=False)
    class_val_prediction = get_output(class_layer, val_args['X'],
                                      deterministic=True)
    self.class_predict_smax['train'] = theano.function(
        [train_args['X']], class_train_prediction)
    self.class_predict_smax['val'] = theano.function(
        [val_args['X']], class_val_prediction)

    if data_type == '2D':
        # The attention output was only ever compiled for 2D data.
        attention_val_prediction = get_output(
            self.net[self.cf.attention_layer], val_args['X'],
            deterministic=True)
        self.attention_predict = theano.function(
            [val_args['X']], attention_val_prediction)

    ### segmentation prediction functions
    # flattened softmax prediction of shape (pixels, classes)
    seg_layer = self.net[self.cf.seg_out_layer_flat]
    prediction_train_smax_flat = get_output(seg_layer, train_args['X'],
                                            deterministic=False)
    prediction_val_smax_flat = get_output(seg_layer, val_args['X'],
                                          deterministic=True)

    # Axis permutations: (b, spatial..., c) <-> (b, c, spatial...)
    spatial_shape = tuple(self.cf.dim)
    to_channel_first = (0, dim + 1) + tuple(range(1, dim + 1))
    to_channel_last = (0,) + tuple(range(2, dim + 2)) + (1,)

    # reshape softmax prediction: (pixels, c) -> (b, c, spatial...)
    prediction_train_smax = prediction_train_smax_flat.reshape(
        (train_args['X'].shape[0],) + spatial_shape +
        (self.cf.num_classes,)).transpose(to_channel_first)
    prediction_val_smax = prediction_val_smax_flat.reshape(
        (val_args['X'].shape[0],) + spatial_shape +
        (self.cf.num_classes,)).transpose(to_channel_first)
    self.predict_smax['train'] = theano.function([train_args['X']],
                                                 prediction_train_smax)
    self.predict_smax['val'] = theano.function([val_args['X']],
                                               prediction_val_smax)

    # reshape target: (b, c, spatial...) -> (pixels, c)
    flat_target_train = train_args['y'].transpose(to_channel_last).reshape(
        (-1, self.cf.num_classes))
    flat_target_val = val_args['y'].transpose(to_channel_last).reshape(
        (-1, self.cf.num_classes))

    pred_train_one_hot = get_one_hot_prediction(prediction_train_smax,
                                                self.cf.num_classes)
    pred_val_one_hot = get_one_hot_prediction(prediction_val_smax,
                                              self.cf.num_classes)
    self.predict_one_hot['val'] = theano.function([val_args['X']],
                                                  pred_val_one_hot)
    self.predict_one_hot['train'] = theano.function([train_args['X']],
                                                    pred_train_one_hot)

    prediction_val = T.argmax(prediction_val_smax, axis=1)
    prediction_train = T.argmax(prediction_train_smax, axis=1)
    self.predict['val'] = theano.function([val_args['X']], prediction_val)
    self.predict['train'] = theano.function([train_args['X']],
                                            prediction_train)

    ### evaluation metrics
    train_dices_hard = binary_dice_per_instance_and_class(
        pred_train_one_hot, train_args['y'], dim)
    val_dices_hard = binary_dice_per_instance_and_class(
        pred_val_one_hot, val_args['y'], dim)
    train_dices_soft = binary_dice_per_instance_and_class(
        prediction_train_smax, train_args['y'], dim)
    val_dices_soft = binary_dice_per_instance_and_class(
        prediction_val_smax, val_args['y'], dim)

    class_train_acc = T.mean(T.eq(T.argmax(class_train_prediction, axis=1),
                                  train_args['c']),
                             dtype=theano.config.floatX)
    class_val_acc = T.mean(T.eq(T.argmax(class_val_prediction, axis=1),
                                val_args['c']),
                           dtype=theano.config.floatX)

    ### loss types
    if loss == 'cross_entropy':
        self.loss['train'] = categorical_crossentropy(
            prediction_train_smax_flat, flat_target_train).mean()
        self.loss['val'] = categorical_crossentropy(
            prediction_val_smax_flat, flat_target_val).mean()
    elif loss == 'weighted_cross_entropy':
        # Separate weight inputs for the training and validation functions.
        train_args['w'] = T.fvector()
        train_loss = categorical_crossentropy(prediction_train_smax_flat,
                                              flat_target_train)
        train_loss *= train_args['w']
        self.loss['train'] = train_loss.mean()

        val_args['w'] = T.fvector()
        val_loss = categorical_crossentropy(prediction_val_smax_flat,
                                            flat_target_val)
        val_loss *= val_args['w']
        self.loss['val'] = val_loss.mean()
    elif loss == 'dice':
        self.loss['train'] = 1 - train_dices_soft.mean()
        self.loss['val'] = 1 - val_dices_soft.mean()

    self.logger.info('Net: Using {} loss.'.format(loss))

    if self.cf.use_weight_decay:
        training_loss = self.loss['train'] + \
            self.cf.weight_decay * lasagne.regularization.regularize_network_params(
                self.net[self.cf.seg_out_layer_flat],
                lasagne.regularization.l2)
        self.logger.info('Net: Using weight decay of {}.'.format(
            self.cf.weight_decay))
    else:
        training_loss = self.loss['train']

    # Classification loss (plus its own L2 term), weighted by the 'lw' input.
    class_reg = lasagne.regularization.regularize_network_params(
        class_layer, lasagne.regularization.l2, {'trainable': True})
    self.class_loss['train'] = lasagne.objectives.categorical_crossentropy(
        class_train_prediction, train_args['c']).mean()
    self.class_loss['val'] = lasagne.objectives.categorical_crossentropy(
        class_val_prediction, val_args['c']).mean()
    training_loss += (
        self.class_loss['train'] +
        self.cf.class_weight_decay * class_reg) * train_args['lw']

    ### training functions
    # Union of the trainable parameters reachable from both output layers.
    params = set(
        get_all_params(self.net[self.cf.class_layer], trainable=True))
    params = params.union(
        set(
            get_all_params(self.net[self.cf.seg_out_layer_flat],
                           trainable=True)))
    params = list(params)

    grads = theano.grad(training_loss, params)
    updates = adam(grads, params, learning_rate=train_args['lr'])

    # theano.function requires a list of inputs; on Python 3
    # OrderedDict.values() is only a view, hence the explicit list().
    self.train_fn = theano.function(list(train_args.values()), [
        self.loss['train'], train_dices_hard, class_train_acc,
        self.class_loss['train'], training_loss
    ],
                                    updates=updates)
    self.val_fn = theano.function(list(val_args.values()), [
        self.loss['val'], val_dices_hard, class_val_acc,
        self.class_loss['val']
    ])

    self.logger.info('Net: Compiled theano functions.')
def value_single(self, x, y, f):
    """Return a floatX scalar comparing ``y`` with ``f[2]``.

    When ``self.condition_single(x, f)`` equals 1, the result is the mean
    of ``min(1 - y + f[2], 1)`` and ``min(1 - f[2] + y, 1)``; otherwise the
    result is 1.
    """
    float_type = theano.config.floatX
    first_term = T.min([1. - y + f[2], 1.])
    second_term = T.min([1. - f[2] + y, 1.])
    averaged = T.cast(T.mean([first_term, second_term]), dtype=float_type)
    condition_holds = T.eq(self.condition_single(x, f), 1.)
    selected = ifelse(condition_holds, averaged, 1.)
    return T.cast(selected, dtype=float_type)
def main(dd):
    """Load the saved LSTM classifier for run ``dd`` and score a random test set.

    Reads hyperparameters from ``hyp_<dd>`` and weights from ``model_<dd>``,
    evaluates cost and accuracy on ``n_test`` randomly chosen samples of the
    module-level ``data_x``/``data_y``, writes the raw outputs to
    ``outputs_raw.mat``, passes them to ``dump_results`` and prints the
    final cost and accuracy.
    """
    # load hyperparameters
    h, eta, grad_clip, len_sample, n_dir = read_hyp('hyp_{}'.format(dd))

    # load model
    print("Load Network")
    input_layer = lasagne.layers.InputLayer(shape=(None, len_sample, 2))
    lstm_layer = lasagne.layers.LSTMLayer(
        input_layer, h, grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh)
    # keep only the last time step of the LSTM output for classification
    last_step_layer = lasagne.layers.SliceLayer(lstm_layer, -1, 1)
    output_layer = lasagne.layers.DenseLayer(
        last_step_layer, num_units=n_dir, W=lasagne.init.Normal(),
        nonlinearity=lasagne.nonlinearities.softmax)
    read_model_data(output_layer, 'model_{}'.format(dd))

    # symbolic cost / accuracy plus the raw softmax and LSTM activations
    target_values = T.ivector('target_output')
    class_probs = lasagne.layers.get_output(output_layer)
    lstm_activations = lasagne.layers.get_output(lstm_layer)
    cost = T.nnet.categorical_crossentropy(class_probs, target_values).mean()
    hits = T.eq(T.argmax(class_probs, axis=1), target_values)
    acc = T.mean(hits, dtype=theano.config.floatX)
    compute_cost = theano.function(
        [input_layer.input_var, target_values],
        [cost, acc, class_probs, lstm_activations],
        allow_input_downcast=True)

    # draw n_test random samples
    order = np.random.permutation(len(data_x))
    chosen = order[:n_test]
    chosen_x = data_x[chosen]
    chosen_y = data_y[chosen]
    y_test = np.zeros(n_test)
    x_test = np.zeros((n_test, len_sample, 2))
    for k in range(n_test):
        # presumably converts 1-based labels to 0-based class ids - confirm
        y_test[k] = chosen_y[k] - 1
        # truncate every sample to the first len_sample steps
        x_test[k] = chosen_x[k][0:len_sample]

    # test
    cost_test, acc_test, output_test, network_act_test = compute_cost(
        x_test, y_test)

    raw_outputs = {
        'output': output_test,
        'input': x_test,
        'labels': y_test
    }
    savemat('outputs_raw.mat', raw_outputs)
    dump_results((output_test, y_test, x_test), dd)
    print("Final test cost = {}, acc = {}".format(cost_test, acc_test))
# Define loss function and metrics, and get an updates dictionary X_sym = T.tensor4() y_sym = T.ivector() # We'll connect our output classifier to the last fully connected layer of the network net['new_output'] = DenseLayer(net['pool5'], num_units=8, nonlinearity=softmax, W=lasagne.init.Normal(0.01)) prediction = lasagne.layers.get_output(net['new_output'], X_sym) loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym) loss = loss.mean() acc = T.mean(T.eq(T.argmax(prediction, axis=1), y_sym), dtype=theano.config.floatX) learning_rate = theano.shared(np.array(0.001, dtype=theano.config.floatX)) learning_rate_decay = np.array(0.3, dtype=theano.config.floatX) updates = OrderedDict() print("Setting learning rates...") for name, layer in net.items(): print(name) layer_params = layer.get_params(trainable=True) if name in ['new_output', 'fc1000']: layer_lr = learning_rate else: layer_lr = learning_rate / 10 if name != 'fc1000':
def condition_single(self, x, f):
    """Return 1 (as floatX) where ``f[0]`` equals 1, else 0; ``x`` is unused."""
    first_is_one = T.eq(f[0], 1.)
    return T.cast(first_is_one, dtype=theano.config.floatX)
def main(): input_var = T.tensor3(name='inputs', dtype=theano.config.floatX) target_var = T.ivector(name='targets') layer_input = lasagne.layers.InputLayer(shape=(None, LENGTH, 1), input_var=input_var, name='input') layer_rnn = RecurrentLayer(layer_input, NUM_UNITS, nonlinearity=nonlinearities.tanh, only_return_final=True, W_in_to_hid=lasagne.init.Constant(1), W_hid_to_hid=lasagne.init.Constant(2), b=None, name='RNN') W = layer_rnn.W_hid_to_hid U = layer_rnn.W_in_to_hid output = lasagne.layers.get_output(layer_rnn) output = output.mean(axis=1) prediction = T.switch(T.gt(output, 0), 1, -1) acc = T.eq(prediction, target_var) acc = acc.sum() # get the output before activation function tanh epsilon = 1e-6 prob = 0.5 * T.log((1 + output + epsilon) / (1 - output + epsilon)) prob = nonlinearities.sigmoid(prob) loss = -0.5 * ((1 + target_var) * T.log(prob) + (1 - target_var) * T.log(1 - prob)) loss = loss.sum() batch_size = 100 learning_rate = 0.01 steps_per_epoch = 1000 params = lasagne.layers.get_all_params(layer_rnn, trainable=True) updates = lasagne.updates.sgd(loss, params=params, learning_rate=learning_rate) train_fn = theano.function([input_var, target_var], [loss, acc, W, U, output], updates=updates) for epoch in range(10000): print 'Epoch %d (learning rate=%.4f)' % (epoch, learning_rate) loss = 0.0 correct = 0.0 num_back = 0 for step in range(steps_per_epoch): x, y = get_batch(batch_size) err, corr, w, u, pred = train_fn(x, y) # print x # print y # print pred loss += err correct += corr num_inst = (step + 1) * batch_size # update log sys.stdout.write("\b" * num_back) log_info = 'inst: %d loss: %.4f, corr: %d, acc: %.2f%%, W: %.6f, U: %.6f' % ( num_inst, loss / num_inst, correct, correct * 100 / num_inst, w.sum(), u.sum()) sys.stdout.write(log_info) num_back = len(log_info) # raw_input() # update training log after each epoch sys.stdout.write("\b" * num_back) assert num_inst == batch_size * steps_per_epoch print 'inst: %d loss: %.4f, corr: %d, acc: %.2f%%' % ( 
num_inst, loss / num_inst, correct, correct * 100 / num_inst)
def logp(self, value):
    """Return the log-probability of ``value`` for rate ``self.mu``.

    Evaluates ``logpow(mu, value) - factln(value) - mu`` under the bounds
    ``mu >= 0`` and ``value >= 0`` (presumably the Poisson log-pmf, given
    the usual meaning of ``logpow``/``factln`` - confirm), and returns 0
    where ``mu`` and ``value`` are both zero.
    """
    rate = self.mu
    bounded_logp = bound(
        logpow(rate, value) - factln(value) - rate,
        rate >= 0, value >= 0)
    # Return zero when mu and value are both zero
    both_zero = 1 * tt.eq(rate, 0) * tt.eq(value, 0)
    return tt.switch(both_zero, 0, bounded_logp)