def filter_boxes(boxes, min_size):
    """Remove all boxes with any side smaller than min_size."""
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    # keep = np.where((ws >= min_size) & (hs >= min_size))[0]
    keep = (T.ge(ws, min_size) & T.ge(hs, min_size)).nonzero()[0]
    return keep

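# Illustrative usage sketch (not part of the original snippet). It assumes the
# filter_boxes() above is in scope, that `T` is theano.tensor, and that boxes
# are stored as [x1, y1, x2, y2] rows.
import numpy as np
import theano
import theano.tensor as T

boxes_var = T.matrix('boxes')
keep_fn = theano.function([boxes_var], filter_boxes(boxes_var, min_size=16))

sample = np.array([[0, 0, 20, 20],   # 21x21 box -> kept
                   [0, 0, 5, 5]],    # 6x6 box  -> dropped
                  dtype=theano.config.floatX)
print(keep_fn(sample))  # expected: [0]
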
def matrix_noise3d(input_vectors, perm, grad3, vertex_table):
    skew_factors = (input_vectors[:, 0] + input_vectors[:, 1] + input_vectors[:, 2]) * 1.0 / 3.0
    skewed_vectors = T.floor(input_vectors + skew_factors[:, np.newaxis])
    unskew_factors = (skewed_vectors[:, 0] + skewed_vectors[:, 1] + skewed_vectors[:, 2]) * 1.0 / 6.0
    offsets_0 = input_vectors - (skewed_vectors - unskew_factors[:, np.newaxis])
    vertex_table_x_index = T.ge(offsets_0[:, 0], offsets_0[:, 1])
    vertex_table_y_index = T.ge(offsets_0[:, 1], offsets_0[:, 2])
    vertex_table_z_index = T.ge(offsets_0[:, 0], offsets_0[:, 2])
    simplex_vertices = vertex_table[
        vertex_table_x_index,
        vertex_table_y_index,
        vertex_table_z_index].reshape((input_vectors.shape[0], 2, 3))
    offsets_1 = offsets_0 - simplex_vertices[:, 0] + 1.0 / 6.0
    offsets_2 = offsets_0 - simplex_vertices[:, 1] + 1.0 / 3.0
    offsets_3 = offsets_0 - 0.5
    masked_skewed_vectors = T.bitwise_and(skewed_vectors.astype('int32'), 255)
    gi0s = perm[masked_skewed_vectors[:, 0] + perm[
        masked_skewed_vectors[:, 1] + perm[
            masked_skewed_vectors[:, 2]].astype('int32')].astype('int32')] % 12
    gi1s = perm[masked_skewed_vectors[:, 0] + simplex_vertices[:, 0, 0] + perm[
        masked_skewed_vectors[:, 1] + simplex_vertices[:, 0, 1] + perm[
            masked_skewed_vectors[:, 2] + simplex_vertices[:, 0, 2]].astype('int32')].astype('int32')] % 12
    gi2s = perm[masked_skewed_vectors[:, 0] + simplex_vertices[:, 1, 0] + perm[
        masked_skewed_vectors[:, 1] + simplex_vertices[:, 1, 1] + perm[
            masked_skewed_vectors[:, 2] + simplex_vertices[:, 1, 2]].astype('int32')].astype('int32')] % 12
    gi3s = perm[masked_skewed_vectors[:, 0] + 1 + perm[
        masked_skewed_vectors[:, 1] + 1 + perm[
            masked_skewed_vectors[:, 2] + 1].astype('int32')].astype('int32')] % 12
    n0s = calculate_gradient_contribution(offsets_0, gi0s, grad3)
    n1s = calculate_gradient_contribution(offsets_1, gi1s, grad3)
    n2s = calculate_gradient_contribution(offsets_2, gi2s, grad3)
    n3s = calculate_gradient_contribution(offsets_3, gi3s, grad3)
    return 23.0 * (n0s + n1s + n2s + n3s)

def __init__(self, random_state=None, low=0.0, high=1.0):
    super(Uniform, self).__init__(low=low, high=high,
                                  random_state=random_state,
                                  optimizer=None)

    # pdf
    self.pdf_ = T.switch(
        T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
        0.,
        1. / (self.high - self.low)).ravel()
    self.make_(self.pdf_, "pdf")

    # -log pdf
    self.nnlf_ = T.switch(
        T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
        np.inf,
        T.log(self.high - self.low)).ravel()
    self.make_(self.nnlf_, "nnlf")

    # cdf
    self.cdf_ = T.switch(
        T.lt(self.X, self.low),
        0.,
        T.switch(
            T.lt(self.X, self.high),
            (self.X - self.low) / (self.high - self.low),
            1.)).ravel()
    self.make_(self.cdf_, "cdf")

    # ppf
    self.ppf_ = self.p * (self.high - self.low) + self.low
    self.make_(self.ppf_, "ppf", args=[self.p])

def _step_test(self, x_t, xi_t, xf_t, xo_t, xc_t, mask_tm1, pred1_tm1, pred2_tm1, pred3_tm1, pred4_tm1, h_tm1, c_tm1, ctx_tm1, u_i, u_f, u_o, u_c, x_encoder, attention_encoder, x_img, B_W, B_U, B_Wimg, B_Wctx): outer1 = pred1_tm1[:, :, np.newaxis] * pred2_tm1[:, np.newaxis, :] outer1 = outer1.reshape((outer1.shape[0],-1)) outer2 = pred3_tm1[:, :, np.newaxis] * pred4_tm1[:, np.newaxis, :] outer2 = outer2.reshape((outer2.shape[0],-1)) pred = outer1[:, :, np.newaxis] * outer2[:, np.newaxis, :] pred = pred.reshape((pred.shape[0],-1)) x_t = self.W_embedding[T.argmax(pred, axis = 1)] * B_W[4] h_mask_tm1 = mask_tm1 * h_tm1 c_mask_tm1 = mask_tm1 * c_tm1 attention_x = T.dot(x_t, self.W_x2a) attention_total = attention_x[:,None,:] + attention_encoder if self.prev_context: attention_prev = T.dot(ctx_tm1,self.W_ctx2a) attention_total += attention_prev[:,None,:] attention_activation = T.dot( T.tanh(attention_total), self.V) # attention -> scores attention_alpha = T.nnet.softmax(attention_activation[:,:,0]) # scores -> weights ctx_t = (x_encoder * attention_alpha[:,:,None]).sum(axis = 1) # weighted average of context vectors xi_t = T.dot(x_t * B_W[0], self.W_i) + self.b_i + T.dot(x_img * B_Wimg[0], self.Wimg_i) + T.dot(ctx_t * B_Wctx[0], self.Wctx_i) xf_t = T.dot(x_t * B_W[1], self.W_f) + self.b_f + T.dot(x_img * B_Wimg[1], self.Wimg_f) + T.dot(ctx_t * B_Wctx[1], self.Wctx_f) xc_t = T.dot(x_t * B_W[2], self.W_c) + self.b_c + T.dot(x_img * B_Wimg[2], self.Wimg_c) + T.dot(ctx_t * B_Wctx[2], self.Wctx_c) xo_t = T.dot(x_t * B_W[3], self.W_o) + self.b_o + T.dot(x_img * B_Wimg[3], self.Wimg_o) + T.dot(ctx_t * B_Wctx[3], self.Wctx_o) i_t = self.inner_activation(xi_t + T.dot(h_mask_tm1 * B_U[0], u_i)) f_t = self.inner_activation(xf_t + T.dot(h_mask_tm1 * B_U[1], u_f)) c_t = f_t * c_mask_tm1 + i_t * self.activation(xc_t + T.dot(h_mask_tm1 * B_U[2], u_c)) o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1 * B_U[3], u_o)) h_t = o_t * self.activation(c_t) pred1_t = T.dot(h_t, self.U_p1) + self.b_p1 pred1_t = T.nnet.softmax(pred1_t.reshape((-1, pred1_t.shape[-1]))).reshape(pred1_t.shape) pred2_t = T.dot(h_t, self.U_p2) + self.b_p2 pred2_t = T.nnet.softmax(pred2_t.reshape((-1, pred2_t.shape[-1]))).reshape(pred2_t.shape) pred3_t = T.dot(h_t, self.U_p3) + self.b_p3 pred3_t = T.nnet.softmax(pred3_t.reshape((-1, pred3_t.shape[-1]))).reshape(pred3_t.shape) pred4_t = T.dot(h_t, self.U_p4) + self.b_p4 pred4_t = T.nnet.softmax(pred4_t.reshape((-1, pred4_t.shape[-1]))).reshape(pred4_t.shape) pred1_t = T.ge(pred1_t, T.max(pred1_t, axis = 1).reshape((pred1_t.shape[0],1)))*1.0 pred2_t = T.ge(pred2_t, T.max(pred2_t, axis = 1).reshape((pred2_t.shape[0],1)))*1.0 pred3_t = T.ge(pred3_t, T.max(pred3_t, axis = 1).reshape((pred3_t.shape[0],1)))*1.0 pred4_t = T.ge(pred4_t, T.max(pred4_t, axis = 1).reshape((pred4_t.shape[0],1)))*1.0 return pred1_t, pred2_t, pred3_t, pred4_t, h_t, c_t, ctx_t
def innerL_(sS, i):
    Ei = calcEk_(sS, i)
    # use "+" instead of "or" and "*" instead of "and"
    checkUselessAlpha1 = T.ge(sS.labels[i] * Ei, -sS.tol) + T.ge(sS.alphas[i], sS.C)
    checkUselessAlpha2 = T.le(sS.labels[i] * Ei, sS.tol) + T.lt(sS.alphas[i], 0)
    isUselessAlpha = toTheanoBool(checkUselessAlpha1 * checkUselessAlpha2)
    updateL = innerL_alphaInRange_(sS, i, Ei)
    earlyret = sS.retlist(0)
    return ifelse(isUselessAlpha, earlyret, updateL)

def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6): grads = [T.grad(cost = cost, wrt = param) for param in params] sgrads = [T.grad(cost = cost, wrt = sparam) for sparam in sampled_params] updates = OrderedDict() if self.grad_cap>0: norm=T.cast(T.sqrt(T.sum([T.sum([T.sum(g**2) for g in g_list]) for g_list in grads]) + T.sum([T.sum(g**2) for g in sgrads])), theano.config.floatX) grads = [[T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g) for g in g_list] for g_list in grads] sgrads = [T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g) for g in sgrads] for p_list, g_list in zip(params, grads): for p, g in zip(p_list, g_list): if self.adapt: if self.adapt == 'adagrad': g = self.adagrad(p, g, updates) if self.adapt == 'rmsprop': g = self.rmsprop(p, g, updates) if self.adapt == 'adadelta': g = self.adadelta(p, g, updates) if self.adapt == 'adam': g = self.adam(p, g, updates) if self.momentum > 0: velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True) velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p) updates[velocity] = velocity2 updates[p] = p + velocity2 else: updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32(self.learning_rate) * g for i in range(len(sgrads)): g = sgrads[i] fullP = full_params[i] sample_idx = sidxs[i] sparam = sampled_params[i] if self.adapt: if self.adapt == 'adagrad': g = self.adagrad(fullP, g, updates, sample_idx) if self.adapt == 'rmsprop': g = self.rmsprop(fullP, g, updates, sample_idx) if self.adapt == 'adadelta': g = self.adadelta(fullP, g, updates, sample_idx) if self.adapt == 'adam': g = self.adam(fullP, g, updates, sample_idx) if self.lmbd > 0: delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam) else: delta = np.float32(self.learning_rate) * g if self.momentum > 0: velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True) vs = velocity[sample_idx] velocity2 = self.momentum * vs - delta updates[velocity] = T.set_subtensor(vs, velocity2) updates[fullP] = T.inc_subtensor(sparam, velocity2) else: updates[fullP] = T.inc_subtensor(sparam, - delta) return updates
def compute_nonlinearity_derivative(lin, bias):
    n_h = bias.shape[0]
    lin_re = lin[:, :n_h]
    lin_im = lin[:, n_h:]
    mod = T.sqrt(lin_re**2 + lin_im**2)
    ind = T.ge(mod + bias.dimshuffle('x', 0), 0)
    opt1 = 1.
    opt2 = 1. / (1 - mod - bias.dimshuffle('x', 0))**2
    ind = T.ge(mod, 1)
    dnonlindlin = T.tile(ind * opt1 + (1 - ind) * opt2, [1, 2])
    return dnonlindlin

def cubicBSpline(self, L):
    b = T.zeros_like(L)

    idx4 = T.ge(L, 0) * T.lt(L, 1)
    idx3 = T.ge(L, 1) * T.lt(L, 2)
    idx2 = T.ge(L, 2) * T.lt(L, 3)
    idx1 = T.ge(L, 3) * T.le(L, 4)

    b = T.switch(T.eq(idx4, 1), T.pow(L, 3) / 6, b)
    b = T.switch(T.eq(idx3, 1), (-3 * T.pow(L - 1, 3) + 3 * T.pow(L - 1, 2) + 3 * (L - 1) + 1) / 6, b)
    b = T.switch(T.eq(idx2, 1), (3 * T.pow(L - 2, 3) - 6 * T.pow(L - 2, 2) + 4) / 6, b)
    b = T.switch(T.eq(idx1, 1), (-T.pow(L - 3, 3) + 3 * T.pow(L - 3, 2) - 3 * (L - 3) + 1) / 6, b)

    return b.T  # b is K x K' and thus, as we multiply from the right with

def _decode_step(self, seq, regs):
    left, right, target = seq[0], seq[1], seq[2]
    left_is_not_token = T.ge(left, 0)
    right_is_not_token = T.ge(right, 0)
    rep = regs[target]
    left_dec, right_dec = self._decode_computation(rep)
    regs = ifelse(left_is_not_token, T.set_subtensor(regs[left], left_dec), regs)
    regs = ifelse(right_is_not_token, T.set_subtensor(regs[right], right_dec), regs)
    return rep, left_dec, right_dec, regs

def __init__(self, input, nfeatures, C):
    """ Initialize the parameters of the SVM

    input: theano.tensor.TensorType
        symbolic variable that describes the input of the architecture (one minibatch)

    nfeatures: number of input units, the dimension of the space in which the datapoints lie

    C: error penalty
    """
    self.nfeatures = nfeatures
    Wzeros, bzero = self.GetZeroWeights()
    # create a column vector with nfeatures rows
    self.W = theano.shared(value=Wzeros, name='W', borrow=True)
    # initialize bias: a scalar of the same data type as W
    self.b = theano.shared(bzero, name='b')  # , borrow=True)
    # initialize the error penalty C
    self.C = C
    # hyperplane projection used in classification
    # T.dot(input, self.W) creates a vector of shape (rows,) == (# in minibatch,)
    # adding +self.b broadcasts the bias, adding it to each row, so the result is still of shape (rows,)
    self.hplaneproject = T.dot(input, self.W) + self.b
    # symbolic description of how to compute prediction as -1 or 1
    # the function sign() is not in Theano,
    # so I use (x>0)*2-1 using T.ge() which returns 1 when true and 0 when false
    self.y_pred = T.ge(self.hplaneproject, 0) * 2 - 1

def clip_grad(grads, norm, grad_clip):
    # clip the grads when over a threshold
    _grads = []
    for g in grads:
        _grads.append(TT.switch(TT.ge(norm, grad_clip), g * grad_clip / norm, g))
    return _grads

def huber_loss(y_hat, target, delta=1, center=0, std=1):
    l1_diff = abs((target - center - y_hat) / std)
    huber_loss = TT.switch(TT.ge(l1_diff, delta),
                           (2 * l1_diff - 1) * delta,
                           l1_diff**2)
    return huber_loss

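# Illustrative usage sketch (not part of the original snippet). It assumes the
# huber_loss() above is in scope and that `TT` is theano.tensor.
import theano
import theano.tensor as TT

y_hat = TT.vector('y_hat')
target = TT.vector('target')
loss_fn = theano.function([y_hat, target], huber_loss(y_hat, target, delta=1),
                          allow_input_downcast=True)

# |diff| < delta -> quadratic branch, |diff| >= delta -> linear branch
print(loss_fn([0.0, 0.0], [0.5, 3.0]))  # approximately [0.25, 5.0]
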
def _apply_hard_constraint_on_gradients(self, gradients, threshold=5, l_norm=2):
    """
    Function to apply a hard constraint on the parameter's gradients.

    :param gradients: theano.tensor
        Symbolic representation of the parameter's gradients.

    :param threshold: int
        The threshold at which to apply the constraint. Defaults to 5 (i.e., if the
        norm exceeds 5, the constraint is applied).

    :param l_norm: int
        The number of the norm to compute. Defaults to 2 (i.e., L2-norm).

    :return: gradients: theano.tensor
        Symbolic representation of the parameter's gradients with/without the constraint applied.
    """
    constrained = []
    for g in gradients:                       # for all gradients
        g = g / self.batch_size               # divide it by the size of the minibatch
        s = g.norm(l_norm)                    # compute its norm
        # T.ge is symbolic, so the comparison must stay inside the graph
        # (T.switch) rather than in a Python `if`; rescale the gradient
        # whenever its norm exceeds the threshold.
        constrained.append(T.switch(T.ge(s, threshold), (threshold * g) / s, g))
    return constrained

def get_gradients(self, model, data, **kwargs):
    cost = self.expr(model=model, data=data, **kwargs)

    params = list(model.get_params())
    grads = T.grad(cost, params, disconnected_inputs='ignore')
    gradients = OrderedDict(izip(params, grads))

    if self.gradient_clipping:
        norm_gs = 0.
        for grad in gradients.values():
            norm_gs += (grad ** 2).sum()
        not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
        norm_gs = T.sqrt(norm_gs)
        norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                           self.max_magnitude / norm_gs,
                           1.)

        for param, grad in gradients.items():
            gradients[param] = T.switch(not_finite, .1 * param, grad * norm_gs)

    updates = OrderedDict()

    return gradients, updates

def __init__(self, embedding_dim=100, num_hidden_layers=2, hidden_dim=200, in_dropout_p=0.2,
             hidden_dropout_p=0.5, update_hyperparams={'learning_rate': 0.01}):
    self.embedding_dim = embedding_dim
    self.num_hidden_layers = num_hidden_layers
    self.hidden_dim = hidden_dim
    self.in_dropout_p = in_dropout_p
    self.hidden_dropout_p = hidden_dropout_p

    print >> sys.stderr, 'Building computation graph for discriminator...'
    self.input_var = T.matrix('input')
    self.target_var = T.matrix('target')

    self.l_in = lasagne.layers.InputLayer(shape=(None, self.embedding_dim),
                                          input_var=T.tanh(self.input_var), name='l_in')
    self.l_in_dr = lasagne.layers.DropoutLayer(self.l_in, self.in_dropout_p)
    self.layers = [self.l_in, self.l_in_dr]
    for i in xrange(self.num_hidden_layers):
        l_hid = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(
            self.layers[-1], num_units=self.hidden_dim,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform(gain=leaky_relu_gain),
            name=('l_hid_%s' % i)))
        l_hid_dr = lasagne.layers.DropoutLayer(l_hid, self.hidden_dropout_p)
        self.layers.append(l_hid)
        self.layers.append(l_hid_dr)
    self.l_preout = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(
        self.layers[-1], num_units=1, nonlinearity=None, name='l_preout'))
    self.l_out = lasagne.layers.NonlinearityLayer(
        self.l_preout, nonlinearity=lasagne.nonlinearities.sigmoid, name='l_out')

    self.prediction = lasagne.layers.get_output(self.l_out)
    self.loss = lasagne.objectives.binary_crossentropy(self.prediction, self.target_var).mean()
    self.accuracy = T.eq(T.ge(self.prediction, 0.5), self.target_var).mean()

    self.params = lasagne.layers.get_all_params(self.l_out, trainable=True)
    self.updates = lasagne.updates.adam(self.loss, self.params, **update_hyperparams)

    print >> sys.stderr, 'Compiling discriminator...'
    self.train_fn = theano.function([self.input_var, self.target_var],
                                    [self.loss, self.accuracy], updates=self.updates)
    self.eval_fn = theano.function([self.input_var, self.target_var],
                                   [self.loss, self.accuracy])

def Adagrad(tparams, cost, inps, lr, epsilon=1e-6, clip_norm=5):
    """ default: lr=0.01 """

    grads = tensor.grad(cost, tparams.values())
    # gradient clipping: tensor.ge is symbolic, so keep the comparison inside
    # the graph with tensor.switch rather than a Python `if`
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g * clip_norm / norm, g)
             for g in grads]

    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_t = acc + g ** 2
        updates.append((acc, acc_t))
        p_t = p - (lr / tensor.sqrt(acc_t + epsilon)) * g
        updates.append((p, p_t))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update

def pq_theano(y_true, y_pred):
    """
    Theano implementation of Pass Quality function.
    :param y_true:
    :param y_pred:
    :return:
    """
    y_pred = tt.ge(y_pred, tt.mean(y_pred)).T[-1].T
    y_true = y_true.T[-1].T

    tt_diffs = tt.extra_ops.diff(y_true + y_pred)

    tt_r = theano.shared(0., 'r')
    tt_height = theano.shared(0., 'h')
    tt_error = theano.shared(0., 'err')
    tt_current_error = theano.shared(0., 'c_err')
    tt_flag = theano.shared(0., 'flag')

    values, updates = scan(fn=one_step,
                           sequences=[tt_diffs, tt.abs_(tt_diffs)],
                           outputs_info=[tt_error, tt_r, tt_height, tt_current_error, tt_flag])

    epsilon = 0.0000000001
    tt_ret = (1 - (values[1][-1] + epsilon) / (values[1][-1] + values[0][-1] + epsilon))
    return tt_ret

def uniq_with_lengths(seq, time_mask): """ :param seq: (time,batch) -> label :param time_mask: (time,batch) -> 0 or 1 :return: out_seqs, seq_lens. out_seqs is (max_seq_len,batch) -> label, where max_seq_len <= time. seq_lens is (batch,) -> len. """ num_batches = seq.shape[1] diffs = T.ones_like(seq) diffs = T.set_subtensor(diffs[1:], seq[1:] - seq[:-1]) time_range = T.arange(seq.shape[0]).dimshuffle([0] + ['x'] * (seq.ndim - 1)) idx = T.switch(T.neq(diffs, 0) * time_mask, time_range, -1) # (time,batch) -> idx or -1 seq_lens = T.sum(T.ge(idx, 0), axis=0) # (batch,) -> len max_seq_len = T.max(seq_lens) # I don't know any better way without scan. # http://stackoverflow.com/questions/31379971/uniq-for-2d-theano-tensor def step(batch_idx, out_seq_b1): #out_seq = seq[T.ge(idx[:, batch_idx], 0).nonzero(), batch_idx][0] out_seq = seq[:, batch_idx][T.ge(idx[:, batch_idx], 0).nonzero()] return T.concatenate((out_seq, T.zeros((max_seq_len - out_seq.shape[0],), dtype=seq.dtype))) out_seqs, _ = theano.scan( step, sequences=[T.arange(num_batches)], outputs_info=[T.zeros((max_seq_len,), dtype=seq.dtype)] ) # out_seqs is (batch,max_seq_len) return out_seqs.T, seq_lens
def pq_theano_f(y_true, y_pred):
    y_pred = tt.ge(y_pred, tt.mean(y_pred))
    y_true = y_true

    tt_diffs = tt.extra_ops.diff(y_true + y_pred)

    # tt_r = tt.shape_padleft(theano.shared(0., 'r'))
    tt_r = theano.shared(0., 'r')
    # tt_height = tt.shape_padleft(theano.shared(0., 'h'))
    tt_height = theano.shared(0., 'h')
    # tt_error = tt.shape_padleft(theano.shared(0., 'err',))
    tt_error = theano.shared(0., 'err')
    # tt_current_error = tt.shape_padleft(theano.shared(0., 'c_err'))
    tt_current_error = theano.shared(0., 'c_err')
    # tt_ret = theano.tensor.col('ret')
    tt_flag = theano.shared(0., 'flag')

    values, updates = scan(fn=one_step,
                           sequences=[tt_diffs, tt.abs_(tt_diffs)],
                           outputs_info=[tt_error, tt_r, tt_height, tt_current_error, tt_flag])

    # print values[0].type
    epsilon = 0.0000000001
    # print tt.ones_like(values[0]).type
    # print values[1].type
    tt_ret = 1 - (values[1] + epsilon) / (values[1] + values[0] + epsilon)
    return tt_ret

def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin = bins[i]
        if i == 0:
            binned = T.switch(T.lt(x, bin), i, binned)
        else:
            ineq = T.and_(T.ge(x, bins[i - 1]), T.lt(x, bin))
            binned = T.switch(ineq, i, binned)
    binned = T.switch(T.isnan(x), len(bins), binned)
    return binned

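# Illustrative check against numpy.digitize (not part of the original snippet).
# Assumes the theano_digitize() above is in scope and `T` is theano.tensor.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
bins = [0.0, 1.0, 2.5]
digitize_fn = theano.function([x], theano_digitize(x, bins))

data = np.array([-1.0, 0.5, 2.0, 3.0], dtype=theano.config.floatX)
print(digitize_fn(data))         # e.g. [0. 1. 2. 3.]
print(np.digitize(data, bins))   # [0 1 2 3]
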
def logp_loss3(self, x, y, fake_label,neg_label, pos_ratio = 0.5): #adopt maxout for negative # pos_rati0 means pos examples weight (0.5 means equal 1:1) print "adopt positives weight ............. "+str(pos_ratio) y = y.dimshuffle((1,0)) inx = x.dimshuffle((1,0)) fake_mask = T.neq(y, fake_label) y = y*fake_mask pos_mask = T.and_(fake_mask, T.le(y, neg_label-1))*pos_ratio neg_mask = T.ge(y, neg_label)*(1- pos_ratio) pos_score, neg_score = self.structure2(inx,False) maxneg = T.max(neg_score, axis = -1) scores = T.concatenate((pos_score, maxneg.dimshuffle((0,1,'x'))), axis = 2) d3shape = scores.shape #seq*batch , label scores = scores.reshape((d3shape[0]*d3shape[1], d3shape[2])) pro = T.nnet.softmax(scores) _logp = T.nnet.categorical_crossentropy(pro, y.flatten()) _logp = _logp.reshape(fake_mask.shape) loss = (T.sum(_logp*pos_mask)+ T.sum(_logp*neg_mask))/ (T.sum(pos_mask)+T.sum(neg_mask)) pos_loss = T.sum(_logp*pos_mask) neg_loss = T.sum(_logp*neg_mask) return loss, pos_loss, neg_loss
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm

def compute_updates(self, training_cost, params):
    updates = []

    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip stuff
    c = numpy.float32(self.cutoff)
    clip_grads = []

    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    for p, g in grads.items():
        clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))

    grads = OrderedDict(clip_grads)

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")

    return updates

def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8):
    """ default: lr=0.0002 """

    grads = tensor.grad(cost, tparams.values())
    # gradient clipping: rescale when the global norm reaches 5
    # (tensor.ge is symbolic, so use tensor.switch rather than a Python `if`)
    norm = tensor.sqrt(sum([tensor.sum(g ** 2) for g in grads]))
    grads = [tensor.switch(tensor.ge(norm, 5), g * 5 / norm, g) for g in grads]

    gshared = [theano.shared(p.get_value() * 0.0, name="%s_grad" % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    i = theano.shared(numpy_floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - b1 ** (i_t)
    fix2 = 1.0 - b2 ** (i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.0)
        v = theano.shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update

def __init__(self, input, n_in, n_out, discriminant_threshold):
    # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
    self.W = theano.shared(
        value=numpy.zeros(
            (n_in, n_out),
            dtype=theano.config.floatX
        ),
        name='W',
        borrow=True
    )
    # initialize the basis b as a vector of n_out 0s
    self.b = theano.shared(
        value=numpy.zeros(
            (n_out,),
            dtype=theano.config.floatX
        ),
        name='b',
        borrow=True
    )

    self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

    # edited to reject events below a threshold
    # self.y_pred = T.argmax(self.p_y_given_x, axis=1)
    # self.y_pred = T.and_(T.argmax(self.p_y_given_x, axis=1), T.ge(self.p_y_given_x[:, 1], -1))
    self.y_pred = T.ge(self.p_y_given_x[:, 1], discriminant_threshold)

    # parameters of the model
    self.params = [self.W, self.b]

def Adadelta(tparams, cost, inps, lr, rho=0.95, epsilon=1e-6, clip_norm=5):
    """ default: lr=0.5 """

    grads = tensor.grad(cost, tparams.values())
    # gradient clipping: keep the symbolic comparison inside the graph with tensor.switch
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g * clip_norm / norm, g)
             for g in grads]

    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_delta = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        updates.append((acc, acc_new))

        update = g * tensor.sqrt(acc_delta + epsilon) / tensor.sqrt(acc_new + epsilon)
        updated_p = p - lr * update
        updates.append((p, updated_p))

        acc_delta_new = rho * acc_delta + (1 - rho) * update ** 2
        updates.append((acc_delta, acc_delta_new))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update

def RMSprop_v1(tparams, cost, inps, lr, rho=0.9, epsilon=1e-6, clip_norm=5):
    """ default: lr=0.001
        This is the implementation of the RMSprop algorithm used in
        http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf.
    """

    grads = tensor.grad(cost, tparams.values())
    # gradient clipping: keep the symbolic comparison inside the graph with tensor.switch
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g * clip_norm / norm, g)
             for g in grads]

    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        updates.append((acc, acc_new))

        updated_p = p - lr * (g / tensor.sqrt(acc_new + epsilon))
        updates.append((p, updated_p))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update

def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.0)

    i = shared(floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - (1.0 - b1) ** i_t
    fix2 = 1.0 - (1.0 - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.0)
        v = shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        # e_t = shared(p.get_value() * 0.)
        # de_t = (srnd.normal(p.shape, std=0.05, dtype=theano.config.floatX)*p_t - e_t)*0.05  # *p_t
        # p_t = p_t + de_t
        # updates.append((e_t, e_t + de_t))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm

def NAG(tparams, cost, inps, lr, momentum=0.9, clip_norm=5):
    """ default: lr=0.01 """

    grads = tensor.grad(cost, tparams.values())
    # gradient clipping: keep the symbolic comparison inside the graph with tensor.switch
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g * clip_norm / norm, g)
             for g in grads]

    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        m_new = momentum * m - lr * g
        updates.append((m, m_new))

        updated_p = p + momentum * m_new - lr * g
        updates.append((p, updated_p))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update

def rmax(x):
    # subtract the per-row runner-up value, so after the ReLU only the row
    # maximum remains positive (by its margin over the runner-up)
    xmax = T.ge(x, T.max(x, axis=1).reshape((x.shape[0], 1)))
    shift = (T.ones_like(x) - xmax) * x
    max2 = T.max(shift, axis=1).reshape((x.shape[0], 1))
    out = T.nnet.relu(x - max2)
    return out

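# Illustrative check (not part of the original snippet). Assumes the rmax()
# above is in scope, `T` is theano.tensor, and a Theano version with T.nnet.relu.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
rmax_fn = theano.function([x], rmax(x))
# Only the per-row maximum survives, reduced by the runner-up value.
print(rmax_fn(np.array([[1., 5., 3.]], dtype=theano.config.floatX)))  # [[0. 2. 0.]]
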
def dist(value):
    return switch(ge(value, 0) & gt(alpha, 0) & gt(beta, 0) & ge(n, value),
                  gammaln(alpha + beta) - gammaln(alpha) - gammaln(beta)
                  + gammaln(n + 1) - gammaln(value + 1) - gammaln(n - value + 1)
                  + gammaln(alpha + value) + gammaln(n + beta - value)
                  - gammaln(beta + alpha + n),
                  -inf)

def dist(value):
    return switch(ge(value, 0) & gt(alpha, 0) & gt(beta, 0),
                  -gammaln(alpha) + alpha * log(beta) - beta * value
                  + switch(alpha != 1.0, (alpha - 1.0) * log(value), 0),
                  -inf)

def dist(value):
    return switch(ge(p, 0) & le(p, 1),
                  switch(value, log(p), log(1 - p)),
                  -inf)

def __init__(self, model, state, data): """ :type model: groundhog model class :param model: class depicting the model to be optimized :type state: dictionary or jobman DD object :param state: dictionary containing various hyper-parameters. The class will write into this dictionary updates like the current training error and so on :type data: groundhog dataset object :param data: data iterator over which training is done """ ##################################### # Step 0. Constructs shared variables ##################################### bs = state['bs'] self.model = model self.rng = numpy.random.RandomState(state['seed']) srng = RandomStreams(self.rng.randint(213)) self.gs = [ theano.shared(numpy.zeros(p.get_value(borrow=True).shape, dtype=theano.config.floatX), name=p.name) for p in model.params ] self.step = 0 self.bs = bs self.state = state self.data = data self.step_timer = time.time() self.gdata = [ theano.shared(numpy.zeros((2, ) * x.ndim, dtype=x.dtype), name=x.name) for x in model.inputs ] if 'profile' not in self.state: self.state['profile'] = 0 ################################### # Step 1. Compile training function ################################### print 'Constructing grad function' loc_data = self.gdata lr = TT.scalar('lr') self.prop_exprs = [x[1] for x in model.properties] self.prop_names = [x[0] for x in model.properties] self.update_rules = [x[1] for x in model.updates] rval = theano.clone(model.param_grads + self.update_rules + \ self.prop_exprs + [model.train_cost], replace=zip(model.inputs, loc_data)) nparams = len(model.params) nouts = len(self.prop_exprs) nrules = len(self.update_rules) gs = rval[:nparams] rules = rval[nparams:nparams + nrules] outs = rval[nparams + nrules:] # Clip the momentum-applied gradient moment_gs = [s * state['moment'] + g for s, g in zip(self.gs, gs)] norm_gs = TT.sqrt( sum( TT.sum(x**2) for x, p in zip(moment_gs, self.model.params) if p not in self.model.exclude_params_for_norm)) if 'cutoff' in state and state['cutoff'] > 0: c = numpy.float32(state['cutoff']) if state['cutoff_rescale_length']: c = c * TT.cast(loc_data[0].shape[0], 'float32') notfinite = TT.or_(TT.isnan(norm_gs), TT.isinf(norm_gs)) _gs = [] for g, p in zip(moment_gs, self.model.params): if p not in self.model.exclude_params_for_norm: tmpg = TT.switch(TT.ge(norm_gs, c), g * c / norm_gs, g) _gs.append( TT.switch(notfinite, numpy.float32(.1) * p, tmpg)) else: _gs.append(g) gs = _gs store_gs = [(s, g) for s, g in zip(self.gs, gs)] updates = store_gs + [(s[0], r) for s, r in zip(model.updates, rules)] print 'Compiling grad function' st = time.time() self.train_fn = theano.function([], outs, name='train_function', updates=updates, givens=zip(model.inputs, loc_data), profile=self.state['profile']) print 'took', time.time() - st self.lr = numpy.float32(state['lr']) new_params = [ p - s * lr * g for s, p, g in zip(model.params_grad_scale, model.params, self.gs) ] self.update_fn = theano.function([lr], [], name='update_function', allow_input_downcast=True, updates=zip(model.params, new_params), profile=self.state['profile']) self.old_cost = 1e20 self.schedules = model.get_schedules() self.return_names = self.prop_names + \ ['cost', 'time_step', 'whole_time', 'lr']
d = we_it.embedding_dim

input_var = T.matrix('input')
target_var = T.matrix('target')

l_in = lasagne.layers.InputLayer(shape=(None, d), input_var=input_var)
l_hid1 = lasagne.layers.DenseLayer(l_in, num_units=NUM_HIDDEN1,
                                   nonlinearity=lasagne.nonlinearities.rectify,
                                   W=lasagne.init.GlorotUniform())
l_out = lasagne.layers.DenseLayer(l_hid1, num_units=1,
                                  nonlinearity=lasagne.nonlinearities.sigmoid)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean()
accuracy = T.eq(T.ge(prediction, 0.5), target_var).mean()

params = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=0.001)

print >> sys.stderr, 'Compiling...'
train_fn = theano.function([input_var, target_var], [loss, accuracy], updates=updates)

X = np.zeros((2 * HALF_BATCH_SIZE, d), dtype=theano.config.floatX)
target_mat = np.vstack([np.zeros((HALF_BATCH_SIZE, 1)),
                        np.ones((HALF_BATCH_SIZE, 1))]).astype(theano.config.floatX)


def train_batch(batch_id=1, print_every_n=1):

def dist(value):
    return switch(ge(value, 0) & le(value, 1) & gt(alpha, 0) & gt(beta, 0),
                  gammaln(alpha + beta) - gammaln(alpha) - gammaln(beta)
                  + (alpha - 1) * log(value) + (beta - 1) * log(1 - value),
                  -inf)

def elu(X):
    return T.switch(T.ge(X, 0), X, T.exp(X) - 1.)

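# Illustrative check (not part of the original snippet). This variant fixes
# alpha at 1, so elu(x) = x for x >= 0 and exp(x) - 1 otherwise.
# Assumes the elu() above is in scope and `T` is theano.tensor.
import numpy as np
import theano
import theano.tensor as T

X = T.vector('X')
elu_fn = theano.function([X], elu(X))
print(elu_fn(np.array([-1., 0., 2.], dtype=theano.config.floatX)))
# approximately [-0.632  0.  2.]
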
def __init__(self, model, state, data): """ Parameters: :param model: Class describing the model used. It should provide the computational graph to evaluate the model, and have a similar structure to classes on the models folder :param state: Dictionary containing the current state of your job. This includes configuration of the job, specifically the seed, the startign damping factor, batch size, etc. See main.py for details :param data: Class describing the dataset used by the model """ if 'adarho' not in state: state['adarho'] = 0.96 if 'adaeps' not in state: state['adaeps'] = 1e-6 ##################################### # Step 0. Constructs shared variables ##################################### bs = state['bs'] self.model = model self.rng = numpy.random.RandomState(state['seed']) srng = RandomStreams(self.rng.randint(213)) self.gs = [ theano.shared(numpy.zeros(p.get_value(borrow=True).shape, dtype=theano.config.floatX), name=p.name) for p in model.params ] self.gnorm2 = [ theano.shared(numpy.zeros(p.get_value(borrow=True).shape, dtype=theano.config.floatX), name=p.name + '_g2') for p in model.params ] self.dnorm2 = [ theano.shared(numpy.zeros(p.get_value(borrow=True).shape, dtype=theano.config.floatX), name=p.name + '_d2') for p in model.params ] self.step = 0 self.whole_time = 0.0 self.bs = bs self.state = state self.data = data self.step_timer = time.time() self.gdata = [ theano.shared(numpy.zeros((2, ) * x.ndim, dtype=x.dtype), name=x.name) for x in model.inputs ] #training dataset stored in gpu. They are defined as shared variables from the #'inputs' variable in the encoder-decoder model. if 'profile' not in self.state: self.state['profile'] = 0 ################################### # Step 1. Compile training function ################################### logger.debug('Constructing grad function') loc_data = self.gdata self.prop_exprs = [x[1] for x in model.properties] self.prop_names = [x[0] for x in model.properties] self.update_rules = [x[1] for x in model.updates] rval = theano.clone(model.param_grads + self.update_rules + \ self.prop_exprs + [model.train_cost], replace={k:v for k, v in zip(model.inputs, loc_data)}) nparams = len(model.params) nouts = len(self.prop_exprs) nrules = len(self.update_rules) gs = rval[:nparams] rules = rval[nparams:nparams + nrules] outs = rval[nparams + nrules:] norm_gs = TT.sqrt( sum( TT.sum(x**2) for x, p in zip(gs, self.model.params) if p not in self.model.exclude_params_for_norm)) if 'cutoff' in state and state['cutoff'] > 0: c = numpy.float32(state['cutoff']) if state['cutoff_rescale_length']: c = c * TT.cast(loc_data[0].shape[0], 'float32') notfinite = TT.or_(TT.isnan(norm_gs), TT.isinf(norm_gs)) _gs = [] for g, p in zip(gs, self.model.params): if p not in self.model.exclude_params_for_norm: tmpg = TT.switch(TT.ge(norm_gs, c), g * c / norm_gs, g) _gs.append( TT.switch(notfinite, numpy.float32(.1) * p, tmpg)) else: _gs.append(g) gs = _gs store_gs = [(s, g) for s, g in zip(self.gs, gs)] updates = store_gs + [(s[0], r) for s, r in zip(model.updates, rules)] rho = self.state['adarho'] eps = self.state['adaeps'] # grad2 gnorm2_up = [ rho * gn2 + (1. - rho) * (g**2.) for gn2, g in zip(self.gnorm2, gs) ] updates = updates + [(gn1, gn2) for gn1, gn2 in zip(self.gnorm2, gnorm2_up)] logger.debug('Compiling grad function') st = time.time() self.train_fn = theano.function([], outs, name='train_function', updates=updates, givens=zip(model.inputs, loc_data)) logger.debug('took {}'.format(time.time() - st)) self.lr = numpy.float32(1.) 
new_params = [ p - (TT.sqrt(dn2 + eps) / TT.sqrt(gn2 + eps)) * g for p, g, gn2, dn2 in zip(model.params, self.gs, self.gnorm2, self.dnorm2) ] updates = [(a, b) for a, b in zip(model.params, new_params)] # d2 d2_up = [(dn2, rho * dn2 + (1. - rho) * (((TT.sqrt(dn2 + eps) / TT.sqrt(gn2 + eps)) * g)**2.)) for dn2, gn2, g in zip(self.dnorm2, self.gnorm2, self.gs)] updates = updates + d2_up self.update_fn = theano.function([], [], name='update_function', allow_input_downcast=True, updates=updates) self.old_cost = 1e20 self.schedules = model.get_schedules() self.return_names = self.prop_names + \ ['cost', 'error', 'time_step', 'whole_time', 'lr'] self.prev_batch = None
def SignTheano(x):
    return T.cast(2. * T.ge(x, 0) - 1., theano.config.floatX)

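# Illustrative check (not part of the original snippet). Note that zero maps
# to +1 because T.ge(0, 0) is 1. Assumes SignTheano() above is in scope and
# `T` is theano.tensor.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
sign_fn = theano.function([x], SignTheano(x))
print(sign_fn(np.array([-2., 0., 3.], dtype=theano.config.floatX)))  # [-1.  1.  1.]
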
def build(self): # local graph context g_sym = T.imatrix('g') # a pair of node index (an edge) gy_sym = T.vector( 'gy') # label of a pair (indicating whether it is a false edge) l_g_in = lasagne.layers.InputLayer(shape=(None, 2), input_var=g_sym) # l_gy_in = lasagne.layers.InputLayer(shape=(None,), input_var=gy_sym) # embedding of node i (pivot node) l_emb_local_i = lasagne.layers.SliceLayer(l_g_in, indices=0, axis=1) l_emb_local_i = lasagne.layers.EmbeddingLayer( l_emb_local_i, input_size=self.num_nodes, output_size=self.embedding_dim) # embedding of node j (context node) l_emb_local_j = lasagne.layers.SliceLayer(l_g_in, indices=1, axis=1) l_emb_local_j = lasagne.layers.EmbeddingLayer( l_emb_local_j, input_size=self.num_nodes, output_size=self.embedding_dim) l_gy = lasagne.layers.ElemwiseMergeLayer( [l_emb_local_i, l_emb_local_j], T.mul) pgy_sym = lasagne.layers.get_output(l_gy) g_loss = -T.log(T.nnet.sigmoid(T.sum(pgy_sym, axis=1) * gy_sym)).sum() g_params = lasagne.layers.get_all_params(l_gy, trainable=True) g_updates = lasagne.updates.sgd(g_loss, g_params, learning_rate=self.g_learning_rate) self.graph_fn = theano.function([g_sym, gy_sym], g_loss, updates=g_updates, on_unused_input='warn') self.embedding = l_emb_local_i.W # local attributes ind_sym = T.ivector('ind') l_ind_in = lasagne.layers.InputLayer(shape=(None, ), input_var=ind_sym) # embedding of current node l_emb_f = lasagne.layers.EmbeddingLayer(l_ind_in, input_size=self.num_nodes, output_size=self.embedding_dim, W=self.embedding) x_sym = {} y_sym = T.vector('y') l_x_in = {} l_x_hid = {} attr_loss = {} for n in self.schema["nodes"]: x_sym[n] = sparse.csr_matrix(n, dtype='float32') l_x_in[n] = lasagne.layers.InputLayer( shape=(None, self.schema["nodes"][n]), input_var=x_sym[n]) l_x_hid[n] = layers.SparseLayer(l_x_in[n], self.embedding_dim) l_ay = lasagne.layers.ElemwiseMergeLayer([l_x_hid[n], l_emb_f], T.mul) pay_sym = lasagne.layers.get_output(l_ay) attr_loss[n] = -T.log( T.nnet.sigmoid(T.sum(pay_sym, axis=1) * y_sym)).sum() attr_params = lasagne.layers.get_all_params(l_ay, trainable=True) attr_updates = lasagne.updates.sgd( attr_loss[n], attr_params, learning_rate=self.g_learning_rate) self.attr_fn[n] = theano.function([x_sym[n], y_sym, ind_sym], attr_loss[n], updates=attr_updates, on_unused_input='warn') # alignment anchor_sym = T.imatrix('anchor') anchor_y_sym = T.vector('anchor_y') l_a_in = lasagne.layers.InputLayer(shape=(None, 2), input_var=anchor_sym) l_emb_anchor_i = lasagne.layers.SliceLayer(l_a_in, indices=0, axis=1) l_emb_anchor_i = lasagne.layers.EmbeddingLayer( l_emb_anchor_i, input_size=self.num_nodes, output_size=self.embedding_dim, W=self.embedding) l_emb_anchor_j = lasagne.layers.SliceLayer(l_a_in, indices=1, axis=1) l_emb_anchor_j = lasagne.layers.EmbeddingLayer( l_emb_anchor_j, input_size=self.num_nodes, output_size=self.embedding_dim, W=self.embedding) l_anchor_y = lasagne.layers.ElemwiseMergeLayer( [l_emb_anchor_i, l_emb_anchor_j], T.mul) p_anchor_y_sym = lasagne.layers.get_output(l_anchor_y) anchor_loss = -T.log( T.nnet.sigmoid( T.sum(p_anchor_y_sym, axis=1) * anchor_y_sym)).sum() anchor_params = lasagne.layers.get_all_params(l_anchor_y, trainable=True) anchor_updates = lasagne.updates.sgd( anchor_loss, anchor_params, learning_rate=self.g_learning_rate) self.anchor_fn = theano.function([anchor_sym, anchor_y_sym], anchor_loss, updates=anchor_updates, on_unused_input='warn') tp_anchor_y_sym = lasagne.layers.get_output(l_anchor_y, deterministic=True) tp_anchor_y_sym = T.sum(tp_anchor_y_sym, axis=1) acc = 
T.mean(T.eq(T.ge(tp_anchor_y_sym, 0), anchor_y_sym)) self.test_fn = theano.function([anchor_sym, anchor_y_sym], acc, on_unused_input='warn')
def __call__(self, p):
    p = theano.shared(p)
    p *= T.ge(p, 0.)
    return p

def SGMGNHT_2(tparams, cost, inps, ntrain, lr, iterations, rho=0.9, epsilon=1e-6, resamp = 50, clip_norm=1): """ Additional parameters """ mom_tparams = OrderedDict() xi_tparams = OrderedDict() #rng = np.random.RandomState(3435) #+ rng.normal(0,1,p0.shape()) for k, p0 in tparams.iteritems(): mom_tparams[k] = theano.shared(p0.get_value() * 0. +1e-1, name='%s_mom'%k) xi_tparams[k] = theano.shared(p0.get_value() * 0. + 10.0, name='%s_xi'%k) #a = theano.shared(numpy_floatX(2.)) # m = theano.shared(numpy_floatX(1.)) # c = theano.shared(numpy_floatX(1.)) # sigma_p = theano.shared(numpy_floatX(10.)) # sigma_xi = theano.shared(numpy_floatX(0.01)) # sigma_theta = theano.shared(numpy_floatX(0.1)) # gamma = theano.shared(numpy_floatX(1.)) m = theano.shared(numpy_floatX(1.)) c = theano.shared(numpy_floatX(3.)) sigma_p = theano.shared(numpy_floatX(0.01)) sigma_mom = theano.shared(numpy_floatX(10.)) sigma_xi = theano.shared(numpy_floatX(0.01)) gamma = theano.shared(numpy_floatX(1.0)) logger = logging.getLogger('eval_ptb_sgmgnht') logger.setLevel(logging.INFO) fh = logging.FileHandler('eval_ptb_sgmgnht.log') logger.info('a = 1, m {} c {} s_p{} s_mom{} s_xi{} g_xi{}'.format( m.get_value(), c.get_value(), sigma_p.get_value(), sigma_mom.get_value(), sigma_xi.get_value(), gamma.get_value())) p = tensor.vector('p', dtype='float32') """ default: lr=0.001 """ trng = RandomStreams(123) grads = tensor.grad(cost, tparams.values()) # clip norm norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, clip_norm): grads = [g*clip_norm/norm for g in grads] gshared = [theano.shared(p0.get_value() * 0., name='%s_grad'%k) for k, p0 in tparams.iteritems()] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] for p, mom, xi, g in zip(tparams.values(),mom_tparams.values(),xi_tparams.values(), gshared): g_f = (tensor.sqrt(tensor.abs_(mom+1e-100)))/m K_f = g_f + 4/c/(1 + tensor.exp(c*g_f)) psi_f_1 = -1 + 2/( 1 + tensor.exp(-c*g_f)) f1_f_1 = 1/2.0/m**2 *psi_f_1**2 /g_f*tensor.sgn(mom) #f1_f_1 = 1/2.0/m*psi_f_1**2* tensor.abs_(mom+1e-100)**(-1/2) *tensor.sgn(mom) psi_grad_f_1 = 2*c*tensor.exp(- c*g_f)/(1 + tensor.exp(-c*g_f))**2 f3_f_1 = f1_f_1**2 - 1/2.0/m**2 * psi_f_1 * psi_grad_f_1 / tensor.abs_(mom) + 1/4.0/m * psi_f_1**2 * (tensor.abs_(mom+1e-100)**(-1.5)) # psi_f = (tensor.exp(c*g_f) - 1)/(tensor.exp(c*g_f) + 1) # f1_f = 1/2/m*psi_f**2 * (tensor.abs_(mom+1e-100)**(-1/2))*tensor.sgn(mom) # psi_grad_f = 2*c*tensor.exp(c*g_f)/(tensor.exp(c*g_f) + 1)**2 # f3_f = f1_f**2 - c/2/m**2 * psi_f * psi_grad_f / tensor.abs_(mom) + 1/4/m * psi_f**2 * (tensor.abs_(mom+1e-100)**(-3/2)) # temp_f1 = tensor.switch(tensor.ge(g_f,0), f1_f_1, f1_f) # temp_f3 = tensor.switch(tensor.ge(g_f,0), f3_f_1, f3_f) temp_f1 = f1_f_1 temp_f3 = f3_f_1 noise_p = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32') noise_mom = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32') noise_xi = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32') # generata gamma(a,2): N(0,1)^2 = gamma(1/2,2) noise_temp = tensor.zeros(p.get_value().shape) for aa in xrange(4): this_noise = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32') noise_temp = tensor.inc_subtensor(noise_temp[:], this_noise**2) randmg = (noise_temp*m/2)**2*tensor.sgn(trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32')) updated_p = p + temp_f1 * lr - g * lr * ntrain * sigma_p + tensor.sqrt(2*sigma_p*lr) * noise_p updated_mom = 
(mom - temp_f1* xi *lr - g * lr * ntrain + tensor.sqrt(2*sigma_mom*lr) * noise_mom)* (1-tensor.eq(tensor.mod(iterations,resamp),0)) + randmg * tensor.eq(tensor.mod(iterations,resamp),0) #updated_mom = mom - temp_f1* xi *lr - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p temp_xi = trng.normal(p.get_value().shape, avg = sigma_mom, std = tensor.sqrt(sigma_xi/2) , dtype='float32') updated_xi = (xi + temp_f3* gamma * lr - (xi - sigma_mom)*sigma_xi/(gamma+1e-10)*lr + tensor.sqrt(2*sigma_xi*lr) * noise_xi) * (1-tensor.eq(tensor.mod(iterations,resamp),resamp/2)) + temp_xi * tensor.eq(tensor.mod(iterations,resamp),resamp/2) updates.append((p, updated_p)) updates.append((mom, updated_mom)) updates.append((xi, updated_xi)) f_update = theano.function([lr,ntrain,iterations], [p,mom,xi], updates=updates) #f_params = theano.function([], [a, m, c, mom.shape]) return f_grad_shared, f_update
def clip_norm(g, c, n):
    # rescale the gradient `g` to norm `c` when the current norm `n` exceeds it
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g

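# Hypothetical usage sketch (not part of the original snippet; names below are
# illustrative). Compute one global gradient norm and clip each gradient
# against it. Assumes clip_norm() above is in scope and `T` is theano.tensor.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.ones(3, dtype=theano.config.floatX), name='w')
cost = T.sum(w ** 2)
grads = T.grad(cost, [w])

norm = T.sqrt(sum(T.sum(g ** 2) for g in grads))
clipped = [clip_norm(g, 1.0, norm) for g in grads]  # rescaled only when norm >= 1.0
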
def error(self, y, threshold=0.5):
    return tensor.mean(tensor.eq(tensor.ge(self.prediction(), threshold), y))

def dist(value):
    return switch(ge(value, 0) & ge(n, value) & ge(p, 0) & le(p, 1),
                  switch(ne(value, 0), value * log(p), 0)
                  + (n - value) * log(1 - p)
                  + factln(n) - factln(value) - factln(n - value),
                  -inf)

def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)): """Return the shape of the output from this op, for input of given shape and flags. Parameters ---------- imgshape : tuple of integers or scalar Theano variables the shape of a tensor of images. The last two elements are interpreted as the number of rows, and the number of cols. ds : tuple of two ints downsample factor over rows and columns this parameter indicates the size of the pooling region st : tuple of two ints the stride size. This is the distance between the pooling regions. If it's set to None, in which case it equlas ds. ignore_border : bool if ds doesn't divide imgshape, do we include an extra row/col of partial downsampling (False) or ignore it (True). padding : tuple of two ints (pad_h, pad_w), pad zeros to extend beyond four borders of the images, pad_h is the size of the top and bottom margins, and pad_w is the size of the left and right margins. Returns ------- list : the shape of the output from this op, for input of given shape. This will have the same length as imgshape, but with last two elements reduced as per the downsampling & ignore_border flags. """ if len(imgshape) < 2: raise TypeError('imgshape must have at least two elements ' '(rows, cols)') if st is None: st = ds r, c = imgshape[-2:] r += padding[0] * 2 c += padding[1] * 2 if ignore_border: out_r = (r - ds[0]) // st[0] + 1 out_c = (c - ds[1]) // st[1] + 1 if isinstance(r, theano.Variable): nr = tensor.maximum(out_r, 0) else: nr = numpy.maximum(out_r, 0) if isinstance(c, theano.Variable): nc = tensor.maximum(out_c, 0) else: nc = numpy.maximum(out_c, 0) else: if isinstance(r, theano.Variable): nr = tensor.switch( tensor.ge(st[0], ds[0]), (r - 1) // st[0] + 1, tensor.maximum(0, (r - 1 - ds[0]) // st[0] + 1) + 1) elif st[0] >= ds[0]: nr = (r - 1) // st[0] + 1 else: nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1 if isinstance(c, theano.Variable): nc = tensor.switch( tensor.ge(st[1], ds[1]), (c - 1) // st[1] + 1, tensor.maximum(0, (c - 1 - ds[1]) // st[1] + 1) + 1) elif st[1] >= ds[1]: nc = (c - 1) // st[1] + 1 else: nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1 rval = list(imgshape[:-2]) + [nr, nc] return rval
def SGMGHMC_old(tparams, cost, inps, ntrain, lr, iterations, rho=0.9, epsilon=1e-6, a_i = 2, clip_norm=5): """ Additional parameters """ mom_tparams = OrderedDict() xi_tparams = OrderedDict() for k, p0 in tparams.iteritems(): mom_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_mom'%k) xi_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_xi'%k) a = theano.shared(numpy_floatX(1.)) m = theano.shared(numpy_floatX(1.)) c = theano.shared(numpy_floatX(5.)) sigma_p = theano.shared(numpy_floatX(10.)) sigma_xi = theano.shared(numpy_floatX(1.)) gamma_xi = theano.shared(numpy_floatX(0.001)) logger = logging.getLogger('eval_ptb_sgmgnht') logger.setLevel(logging.INFO) fh = logging.FileHandler('eval_ptb_sgmgnht.log') logger.info('a {} m {} c {} s_p{} s_xi{} g_xi{}'.format(a.get_value(), m.get_value(), c.get_value(), sigma_p.get_value(), sigma_xi.get_value(), gamma_xi.get_value())) p = tensor.vector('p', dtype='float32') """ default: lr=0.001 """ trng = RandomStreams(123) grads = tensor.grad(cost, tparams.values()) norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads])) if tensor.ge(norm, clip_norm): grads = [g*clip_norm/norm for g in grads] gshared = [theano.shared(p0.get_value() * 0., name='%s_grad'%k) for k, p0 in tparams.iteritems()] gsup = [(gs, g) for gs, g in zip(gshared, grads)] f_grad_shared = theano.function(inps, cost, updates=gsup) updates = [] for p, mom, xi, g in zip(tparams.values(),mom_tparams.values(),xi_tparams.values(), gshared): g_f = tensor.sgn(mom)/m*(tensor.abs_(mom)**(1/a)) K_f = -g_f + 2/c*(c*g_f + tensor.log(1+tensor.exp(-c*g_f))) psi_f_1 = (1- tensor.exp(-c*g_f) )/( 1 + tensor.exp(-c*g_f) ) f1_f_1 = 1/m/a*psi_f_1*(tensor.abs_(mom+1e-100)**(1/a-1)) psi_grad_f_1 = 2*c*tensor.exp(- c*g_f)/(1 + tensor.exp(-c*g_f))**2 f3_f_1 = 1/m**2/a**2*(psi_f_1**2-psi_grad_f_1)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f_1*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2) psi_f = (tensor.exp(c*g_f) - 1)/(tensor.exp(c*g_f) + 1) f1_f = 1/m/a*psi_f*(tensor.abs_(mom+1e-100)**(1/a-1)) psi_grad_f = 2*c*tensor.exp(c*g_f)/(tensor.exp(c*g_f) + 1)**2 f3_f = 1/m**2/a**2*(psi_f**2-psi_grad_f)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2) temp_f1 = tensor.switch(tensor.ge(g_f,0), f1_f_1, f1_f) temp_f3 = tensor.switch(tensor.ge(g_f,0), f3_f_1, f3_f) noise_p = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32') noise_xi = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32') # generata gamma(a,2): N(0,1)^2 = gamma(1/2,2) noise_temp = tensor.zeros(p.get_value().shape) for aa in xrange(a_i*2): this_noise = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32') noise_temp = tensor.inc_subtensor(noise_temp[:], this_noise**2) randmg = (noise_temp*m/2)**a*tensor.sgn(trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32')) updated_p = p + temp_f1 * lr updated_mom = (mom - temp_f1* xi *lr - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p)* (1-tensor.eq(tensor.mod(iterations,100),0)) + randmg * tensor.eq(tensor.mod(iterations,100),0) #updated_mom = mom - temp_f1* xi *lr - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p temp_xi = trng.normal(p.get_value().shape, avg = sigma_p, std = tensor.sqrt(sigma_xi/2) , dtype='float32') updated_xi = (xi + temp_f3* sigma_xi * lr - (xi - sigma_p)*gamma_xi*lr + tensor.sqrt(2*sigma_xi*gamma_xi*lr) * noise_xi) * (1-tensor.eq(tensor.mod(iterations,100),50)) + temp_xi * tensor.eq(tensor.mod(iterations,100),50) 
updates.append((p, updated_p)) updates.append((mom, updated_mom)) updates.append((xi, updated_xi)) f_update = theano.function([lr,ntrain,iterations], [p,mom,xi], updates=updates) #f_params = theano.function([], [a, m, c, mom.shape]) return f_grad_shared, f_update
def huber_loss(y_hat, target, delta=1, center=0, std=1):
    l1_diff = abs((target - center - y_hat) / std)
    huber_loss = TT.switch(TT.ge(l1_diff, delta),
                           (2 * l1_diff - 1) * delta,
                           l1_diff**2)
    return huber_loss

def snelu(X):
    scale = 1.0507009873554804934193349852946
    alpha = 1.6732632423543772848170429916717
    return scale * T.switch(T.ge(X, 0), X, alpha * T.exp(X) - alpha)

def apply(self, application_call, words, mask): """Compute the log-likelihood for a batch of sequences. words An integer matrix of shape (B, T), where T is the number of time step, B is the batch size. Note that this order of the axis is different from what all RNN bricks consume, hence and the axis should be transposed at some point. mask A float32 matrix of shape (B, T). Zeros indicate the padding. """ word_ids = self._word_to_id(words) # shortlisting input_word_ids = (tensor.lt(word_ids, self._num_input_words) * word_ids + tensor.ge(word_ids, self._num_input_words) * self._vocab.unk) output_word_ids = (tensor.lt(word_ids, self._num_output_words) * word_ids + tensor.ge(word_ids, self._num_output_words) * self._vocab.unk) application_call.add_auxiliary_variable( unk_ratio(input_word_ids, mask, self._vocab.unk), name='unk_ratio') # Run the main rnn with combined inputs rnn_inputs = self._main_lookup.apply(input_word_ids) encoder_rnn_states = self._encoder_rnn.apply( tensor.transpose(self._encoder_fork.apply(rnn_inputs), (1, 0, 2)), mask=mask.T)[0] # The first token is not predicted logits = self._pre_softmax.apply(main_rnn_states[:-1]) targets = output_word_ids.T[1:] out_softmax = self._softmax.apply(logits, extra_ndim=1) application_call.add_auxiliary_variable( out_softmax.copy(), name="proba_out") minus_logs = self._softmax.categorical_cross_entropy( targets, logits, extra_ndim=1) targets_mask = mask.T[1:] costs = self.add_perplexity_measure(application_call, minus_logs, targets_mask, "perplexity") missing_embs = tensor.eq(input_word_ids, self._vocab.unk).astype('int32') # (bs, L) self.add_perplexity_measure(application_call, minus_logs, targets_mask * missing_embs.T[:-1], "perplexity_after_mis_word_embs") self.add_perplexity_measure(application_call, minus_logs, targets_mask * (1-missing_embs.T[:-1]), "perplexity_after_word_embs") word_counts = self._word_to_count(words) very_rare_masks = [] for threshold in self._very_rare_threshold: very_rare_mask = tensor.lt(word_counts, threshold).astype('int32') very_rare_mask = targets_mask * (very_rare_mask.T[:-1]) very_rare_masks.append(very_rare_mask) self.add_perplexity_measure(application_call, minus_logs, very_rare_mask, "perplexity_after_very_rare_" + str(threshold)) if self._retrieval: has_def = tensor.zeros_like(output_word_ids) has_def = tensor.inc_subtensor(has_def[def_map[:,0], def_map[:,1]], 1) mask_targets_has_def = has_def.T[:-1] * targets_mask # (L-1, bs) self.add_perplexity_measure(application_call, minus_logs, mask_targets_has_def, "perplexity_after_def_embs") for thresh, very_rare_mask in zip(self._very_rare_threshold, very_rare_masks): self.add_perplexity_measure(application_call, minus_logs, very_rare_mask * mask_targets_has_def, "perplexity_after_def_very_rare_" + str(thresh)) application_call.add_auxiliary_variable( mask_targets_has_def.T, name='mask_def_emb') return costs, updates
def leaky(self, X):
    return T.switch(T.ge(X, 0), X, self.leak * X)

gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))) for param in net.params]

cost = net.cost(y) + L2_REG * net.L2_sqr

gparams = T.grad(cost, net.params)

updates = OrderedDict()

# Compute norm of gradients
norm = T.sqrt(T.sum([T.sum(gparam**2) for gparam in gparams]))

# Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011)
for gparam, param, gsum in zip(gparams, net.params, gsums):
    gparam = T.switch(T.ge(norm, CLIPPING_THRESHOLD),
                      gparam / norm * CLIPPING_THRESHOLD,
                      gparam)  # Clipping of gradients
    updates[gsum] = gsum + (gparam**2)
    updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6)))

train_model = theano.function(inputs=[x, p, y, lr], outputs=cost, updates=updates)
validate_model = theano.function(inputs=[x, p, y], outputs=net.cost(y))

print("Training...")
for epoch in range(starting_epoch, MAX_EPOCHS):
    t0 = time()
    total_neg_log_likelihood = 0

def SGMGHMC_p(tparams, cost, inps, ntrain, lr, rho=0.9, epsilon=1e-6, clip_norm=0.1):
    """ Additional parameters: auxiliary momentum and thermostat variables for the
    SGMGHMC sampler; returns the compiled f_grad_shared and f_update functions. """
    mom_tparams = OrderedDict()
    xi_tparams = OrderedDict()
    for k, p0 in tparams.iteritems():
        mom_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_mom' % k)
        xi_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_xi' % k)

    a = theano.shared(numpy_floatX(2.))
    m_org = theano.shared(numpy_floatX(5.))
    c = theano.shared(numpy_floatX(5.))
    sigma_p = theano.shared(numpy_floatX(10.))
    sigma_xi = theano.shared(numpy_floatX(0.001))
    gamma_xi = theano.shared(numpy_floatX(1))

    logger = logging.getLogger('eval_ptb_sgmgnht')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('eval_ptb_sgmgnht.log')
    logger.addHandler(fh)  # attach the handler so the line below actually reaches the log file
    logger.info('a {} m {} c {} s_p{} s_xi{} g_xi{}'.format(
        a.get_value(), m_org.get_value(), c.get_value(),
        sigma_p.get_value(), sigma_xi.get_value(), gamma_xi.get_value()))

    p = tensor.vector('p', dtype='float32')

    # default: lr=0.001
    trng = RandomStreams(123)

    grads = tensor.grad(cost, tparams.values())
    # Rescale gradients when their global norm exceeds clip_norm. The test must stay
    # symbolic: a plain Python `if` cannot branch on the result of tensor.ge, so use
    # tensor.switch instead.
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g * clip_norm / norm, g)
             for g in grads]

    gshared = [theano.shared(p0.get_value() * 0., name='%s_grad' % k)
               for k, p0 in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []
    # reset mom
    # counter = theano.shared(numpy_floatX(0.))
    # updates.append((counter, counter+1))
    for p, mom, xi, g in zip(tparams.values(), mom_tparams.values(),
                             xi_tparams.values(), gshared):
        # RMSprop-style preconditioner
        t = theano.shared(p.get_value() * 0.)
        t_new = rho * t + (1 - rho) * g**2
        updates.append((t, t_new))

        m = tensor.sqrt(t_new) + 1e-10
        m = m / tensor.max(m) * m_org
        # m = tensor.switch(tensor.ge(m, 1*m_org), 1*m_org, m)
        m = tensor.switch(tensor.le(m, m_org * 0.01), m_org * 0.01, m)

        g_f = tensor.sgn(mom) / m * (tensor.abs_(mom)**(1 / a))
        K_f = -g_f + 2 / c * (c * g_f + tensor.log(1 + tensor.exp(-c * g_f)))

        psi_f_1 = (1 - tensor.exp(-c * g_f)) / (1 + tensor.exp(-c * g_f))
        f1_f_1 = 1 / m / a * psi_f_1 * (tensor.abs_(mom + 1e-100)**(1 / a - 1))
        psi_grad_f_1 = 2 * c * tensor.exp(-c * g_f) / (1 + tensor.exp(-c * g_f))**2
        f3_f_1 = 1 / m**2 / a**2 * (psi_f_1**2 - psi_grad_f_1) * tensor.abs_(mom + 1e-100)**(2 / a - 2) \
            - (1 / a - 1) / m / a * psi_f_1 * tensor.sgn(mom) * tensor.abs_(mom + 1e-100)**(1 / a - 2)

        psi_f = (tensor.exp(c * g_f) - 1) / (tensor.exp(c * g_f) + 1)
        f1_f = 1 / m / a * psi_f * (tensor.abs_(mom + 1e-100)**(1 / a - 1))
        psi_grad_f = 2 * c * tensor.exp(c * g_f) / (tensor.exp(c * g_f) + 1)**2
        f3_f = 1 / m**2 / a**2 * (psi_f**2 - psi_grad_f) * tensor.abs_(mom + 1e-100)**(2 / a - 2) \
            - (1 / a - 1) / m / a * psi_f * tensor.sgn(mom) * tensor.abs_(mom + 1e-100)**(1 / a - 2)

        # Pick the numerically stable branch depending on the sign of g_f.
        temp_f1 = tensor.switch(tensor.ge(g_f, 0), f1_f_1, f1_f)
        temp_f3 = tensor.switch(tensor.ge(g_f, 0), f3_f_1, f3_f)

        noise_p = trng.normal(p.get_value().shape, avg=0.0, std=1., dtype='float32')
        noise_xi = trng.normal(p.get_value().shape, avg=0.0, std=1., dtype='float32')

        # lr_new = 1 / tensor.sqrt(tensor.abs_(temp_f1)) * lr
        lr_new = lr
        updated_p = p + temp_f1 * lr_new
        # updated_mom = (mom - temp_f1*xi*lr - g*lr*ntrain + tensor.sqrt(2*sigma_p*lr)*noise_p) * (1-tensor.eq(tensor.mod(iterations,100),0)) + randmg * tensor.eq(tensor.mod(iterations,100),0)
        updated_mom = mom - 1.2 * temp_f1 * xi * lr_new - g * lr_new * ntrain \
            + tensor.sqrt(2 * sigma_p * lr_new) * noise_p
        updated_xi = xi + temp_f3 * sigma_xi * lr_new - (xi - sigma_p) * gamma_xi * lr_new \
            + tensor.sqrt(2 * sigma_xi * gamma_xi * lr_new) * noise_xi

        updates.append((p, updated_p))
        updates.append((mom, updated_mom))
        updates.append((xi, updated_xi))

    f_update = theano.function([lr, ntrain], [p, mom, m], updates=updates)

    return f_grad_shared, f_update
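# A minimal usage sketch for the two compiled functions above (assumed driver code,
# not part of the original snippet): the names `x`, `y`, `minibatches`, and
# `n_train_examples` are hypothetical placeholders for the caller's data pipeline.
lr_sym = tensor.scalar(name='lr')          # symbolic step size fed to f_update
ntrain_sym = tensor.scalar(name='ntrain')  # symbolic training-set size (gradient rescale)
f_grad_shared, f_update = SGMGHMC_p(tparams, cost, [x, y], ntrain_sym, lr_sym)

for x_batch, y_batch in minibatches:
    batch_cost = f_grad_shared(x_batch, y_batch)  # forward/backward; caches clipped grads
    f_update(1e-3, n_train_examples)              # one sampler step with lr = 1e-3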
def elu(self, X):
    """Exponential linear unit: X where X >= 0, else elu_param * (exp(X) - 1)."""
    return T.switch(T.ge(X, 0), X, self.elu_param * (T.exp(X) - 1))
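# Quick numeric check of the same piecewise rule with NumPy (a sketch assuming
# elu_param = 1.0): positive inputs pass through, negatives saturate at -elu_param.
import numpy as np

def elu_np(x, elu_param=1.0):
    return np.where(x >= 0, x, elu_param * (np.exp(x) - 1))

print(elu_np(np.array([-2.0, 0.0, 3.0])))  # approx. [-0.8647, 0.0, 3.0]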
def ge(x, y):
    """Elementwise comparison x >= y, returned as a symbolic 0/1 tensor."""
    return T.ge(x, y)
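# T.ge builds a symbolic elementwise comparison; compiling it shows the 0/1 (int8)
# output convention that the other snippets rely on. A small standalone sketch:
import theano
import theano.tensor as T

x = T.vector('x')
y = T.vector('y')
f_ge = theano.function([x, y], T.ge(x, y))
print(f_ge([1.0, 2.0, 3.0], [2.0, 2.0, 2.0]))  # -> [0 1 1]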
def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6):
    grads = [T.grad(cost=cost, wrt=param) for param in params]
    sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
    updates = OrderedDict()
    if self.grad_cap > 0:
        # Global norm over all gradients; rescale each gradient when the norm
        # exceeds grad_cap (soft clipping).
        norm = T.cast(
            T.sqrt(
                T.sum([T.sum([T.sum(g**2) for g in g_list]) for g_list in grads])
                + T.sum([T.sum(g**2) for g in sgrads])),
            theano.config.floatX)
        grads = [[T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g)
                  for g in g_list] for g_list in grads]
        sgrads = [T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g)
                  for g in sgrads]
    for p_list, g_list in zip(params, grads):
        for p, g in zip(p_list, g_list):
            if self.adapt:
                if self.adapt == 'adagrad':
                    g = self.adagrad(p, g, updates)
                if self.adapt == 'rmsprop':
                    g = self.rmsprop(p, g, updates)
                if self.adapt == 'adadelta':
                    g = self.adadelta(p, g, updates)
                if self.adapt == 'adam':
                    g = self.adam(p, g, updates)
            if self.momentum > 0:
                velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True)
                velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p)
                updates[velocity] = velocity2
                updates[p] = p + velocity2
            else:
                updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) \
                    - np.float32(self.learning_rate) * g
    fgrads = [T.grad(cost=cost, wrt=full_param) for full_param in full_params]
    for p, g in zip(full_params, fgrads):
        if self.adapt:
            if self.adapt == 'adagrad':
                g = self.adagrad(p, g, updates)
            if self.adapt == 'rmsprop':
                g = self.rmsprop(p, g, updates)
            if self.adapt == 'adadelta':
                g = self.adadelta(p, g, updates)
            if self.adapt == 'adam':
                g = self.adam(p, g, updates)
        if self.momentum > 0:
            velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True)
            velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p)
            updates[velocity] = velocity2
            updates[p] = p + velocity2
        else:
            updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) \
                - np.float32(self.learning_rate) * g
    # Per-sample subtensor updates for the sampled parameters (left disabled in the original):
    '''
    for i in range(len(sgrads)):
        g = sgrads[i]
        fullP = full_params[i]
        sample_idx = sidxs[i]
        sparam = sampled_params[i]
        if self.adapt:
            if self.adapt == 'adagrad':
                g = self.adagrad(fullP, g, updates, sample_idx)
            if self.adapt == 'rmsprop':
                g = self.rmsprop(fullP, g, updates, sample_idx)
            if self.adapt == 'adadelta':
                g = self.adadelta(fullP, g, updates, sample_idx)
            if self.adapt == 'adam':
                g = self.adam(fullP, g, updates, sample_idx)
        if self.lmbd > 0:
            delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
        else:
            delta = np.float32(self.learning_rate) * g
        if self.momentum > 0:
            velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
            vs = velocity[sample_idx]
            velocity2 = self.momentum * vs - delta
            updates[velocity] = T.set_subtensor(vs, velocity2)
            updates[fullP] = T.inc_subtensor(sparam, velocity2)
        else:
            updates[fullP] = T.inc_subtensor(sparam, - delta)
    '''
    return updates
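# A sketch of how an updates dictionary like the one returned above is usually
# consumed (the names `model`, `X`, `Y`, and the parameter lists are hypothetical,
# not taken from the original class):
updates = model.RMSprop(cost, params, full_params, sampled_params, sidxs)
train_fn = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
                           allow_input_downcast=True)
# Each call train_fn(x_batch, y_batch) then applies one RMSprop/momentum step.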
n_steps = tt.iscalar("generator/n_steps")
tau = tt.fscalar("generator/gumbel/tau")

# Generator's input variables for the Encoder
v_gen_input = tt.itensor3(name="generator/input")
# Generator's embedding subnetwork readout for the Encoder
v_gen_embed = lasagne.layers.get_output(l_embed_char, v_gen_input)

# Freeze the hidden inputs of the decoder layers, which do not tap into the encoder.
for layer in dec_rnn_layers:
    GRULayer_freeze(layer, v_gen_input)

# Read out the last state from the encoder.
inputs = {l_encoder_embed: v_gen_embed, l_encoder_mask: tt.ge(v_gen_input, 0)}
inputs[l_stack_aug_mask] = tt.gt(tt.sum(inputs[l_encoder_mask], axis=-1), 0)
outputs = [l.hid_init for l in dec_rnn_layers]
dec_hid_inits = lasagne.layers.get_output(outputs, inputs, deterministic=True)

# Prepare the initial values fed into the scan loop of the Generator
h_0 = tt.concatenate(dec_hid_inits, axis=-1)
x_0 = tt.fill(tt.zeros((v_gen_input.shape[0],), dtype="int32"), vocab.index("\x02"))
x_0 = lasagne.layers.get_output(l_embed_char, x_0)
m_0 = tt.ones((v_gen_input.shape[0],), 'bool')
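# The tt.ge(v_gen_input, 0) mask above presumes that padded positions carry negative
# indices; a small NumPy sketch of that convention (illustrative only, hypothetical data):
import numpy as np

batch = np.array([[5, 9, 2, -1, -1],
                  [7, 1, -1, -1, -1]], dtype=np.int32)
mask = (batch >= 0).astype(np.int8)   # 1 for real characters, 0 for padding
print(mask.sum(axis=-1))              # per-sequence lengths: [3 2]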
def __init__(self, shape, input_var=None, name=None, binary=True,
             deterministic=False, threshold=0.5, batch_size=100, n_bits=-1,
             **kwargs):
    self.rng_mrg = RandomStreams(lasagne.random.get_rng().randint(1, 2394349593))
    if binary == False:
        if n_bits == -1:
            # no quantization at all
            super(InputLayer, self).__init__(shape=shape, input_var=input_var,
                                             name=name, **kwargs)
        else:
            # Normalize to [0 ~ 1 - 2^(-n_bits)]
            input_var_normed = input_var * (1 - 2**(-n_bits))
            if deterministic == False:
                shape_rand = list(shape)
                if shape_rand[0] is None:
                    shape_rand[0] = batch_size
                shape_rand = tuple(shape_rand)
                input_var_ceil = T.ceil(input_var_normed * 2**n_bits) / 2**n_bits
                input_var_floor = T.floor(input_var_normed * 2**n_bits) / 2**n_bits
                input_var_above_floor = input_var - input_var_floor
                input_var_stochastic_quantized = T.cast(
                    T.switch(
                        T.ge(input_var_above_floor,
                             self.rng_mrg.uniform(shape_rand, low=0.0,
                                                  high=2**(-n_bits),
                                                  dtype=theano.config.floatX)),
                        input_var_ceil, input_var_floor),
                    theano.config.floatX)
                super(InputLayer, self).__init__(
                    shape=shape, input_var=input_var_stochastic_quantized,
                    name=name, **kwargs)
            else:
                input_var_deterministic_quantized = T.cast(
                    T.round(input_var_normed * 2**n_bits) / 2**n_bits,
                    theano.config.floatX)
                super(InputLayer, self).__init__(
                    shape=shape, input_var=input_var_deterministic_quantized,
                    name=name, **kwargs)
    else:
        if deterministic == False:
            shape_rand = list(shape)
            if shape_rand[0] is None:
                shape_rand[0] = batch_size
            shape_rand = tuple(shape_rand)
            # Bernoulli spikes
            input_var_stochastic_binarized = T.cast(
                T.gt(input_var,
                     self.rng_mrg.uniform(shape_rand, low=0.0, high=1.0,
                                          dtype=theano.config.floatX)),
                theano.config.floatX)
            super(InputLayer, self).__init__(
                shape=shape, input_var=input_var_stochastic_binarized,
                name=name, **kwargs)
        else:
            input_var_deterministic_binarized = T.cast(
                T.switch(T.ge(input_var, threshold), 1.0, 0.),
                theano.config.floatX)
            super(InputLayer, self).__init__(
                shape=shape, input_var=input_var_deterministic_binarized,
                name=name, **kwargs)
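# NumPy sketch of the stochastic-quantization idea used in the non-deterministic
# branch above (an illustration, not the layer itself): a value is rounded up with
# probability equal to its fractional distance above the floor, which keeps the
# quantization unbiased on average.
import numpy as np

def stochastic_quantize(x, n_bits, rng=np.random):
    x = x * (1 - 2.0**(-n_bits))                 # normalize to [0, 1 - 2^-n_bits]
    scaled = x * 2**n_bits
    floor = np.floor(scaled)
    ceil = np.ceil(scaled)
    go_up = (scaled - floor) >= rng.uniform(size=x.shape)
    return np.where(go_up, ceil, floor) / 2**n_bits

print(stochastic_quantize(np.array([0.1, 0.5, 0.9]), n_bits=4))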
def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn, Tnorm,
         rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm, relrnorm,
         relArnorml, Anorm, flag, *args):
    #-----------------------------------------------------------------
    ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
    # The general iteration is similar to the case k = 1 with v0 = 0:
    #
    #   p1      = Operator * v1 - beta1 * v0,
    #   alpha1  = v1'p1,
    #   q2      = p2 - alpha1 * v1,
    #   beta2^2 = q2'q2,
    #   v2      = (1/beta2) q2.
    #
    # Again, p = betak P vk, where P = C**(-1).
    # .... more description needed.
    #-----------------------------------------------------------------
    xs = args[0 * n_params:1 * n_params]
    r1s = args[1 * n_params:2 * n_params]
    r2s = args[2 * n_params:3 * n_params]
    r3s = args[3 * n_params:4 * n_params]
    dls = args[4 * n_params:5 * n_params]
    ds = args[5 * n_params:6 * n_params]
    betal = beta
    beta = betan
    vs = [r3 / beta for r3 in r3s]
    r3s, upds = compute_Av(*vs)
    r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
    r3s = [TT.switch(TT.ge(niter, constantX(1.)), r3 - (beta / betal) * r1, r3)
           for r3, r1 in zip(r3s, r1s)]
    alpha = inner_product(r3s, vs)
    r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
    r1s = [r2 for r2 in r2s]
    r2s = [r3 for r3 in r3s]
    if Ms is not None:
        r3s = [r3 / M for r3, M in zip(r3s, Ms)]
        betan = sqrt_inner_product(r2s, r3s)
    else:
        betan = sqrt_inner_product(r3s)
    pnorml = pnorm
    pnorm = TT.switch(TT.eq(niter, constantX(0.)),
                      TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                      TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta)))
    #-----------------------------------------------------------------
    ## Apply previous rotation Qk-1 to get
    #   [dlta_k  epln_{k+1}]   [cs  sn] [dbar_k   0         ]
    #   [gbar_k  dbar_{k+1}] = [sn -cs] [alpha_k  beta_{k+1}].
    #-----------------------------------------------------------------
    dbar = dbarn
    epln = eplnn
    dlta = cs * dbar + sn * alpha
    gbar = sn * dbar - cs * alpha
    eplnn = sn * betan
    dbarn = -cs * betan
    ## Compute the current plane rotation Qk
    gammal2 = gammal
    gammal = gamma
    cs, sn, gamma = symGivens2(gbar, betan)
    tau = cs * phi
    phi = sn * phi
    Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
    # Update d
    dl2s = [dl for dl in dls]
    dls = [d for d in ds]
    ds = [TT.switch(TT.neq(gamma, constantX(0.)),
                    (v - epln * dl2 - dlta * dl) / gamma, v)
          for v, dl2, dl in zip(vs, dl2s, dls)]
    d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                       sqrt_inner_product(ds),
                       constantX(numpy.inf))
    # Update x except if it will become too big
    xnorml = xnorm
    dl2s = [x for x in xs]
    xs = [x + tau * d for x, d in zip(xs, ds)]
    xnorm = sqrt_inner_product(xs)
    xs = [TT.switch(TT.ge(xnorm, maxxnorm), dl2, x) for dl2, x in zip(dl2s, xs)]
    flag = TT.switch(TT.ge(xnorm, maxxnorm), constantX(6.), flag)
    # Estimate various norms
    rnorml = rnorm  # ||r_{k-1}||
    Anorml = Anorm
    Acondl = Acond
    relrnorml = relrnorm
    flag_no_6 = TT.neq(flag, constantX(6.))
    Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)), Dnorm)
    xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
    rnorm = TT.switch(flag_no_6, phi, rnorm)
    relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm), relrnorm)
    Tnorm = TT.switch(
        flag_no_6,
        TT.switch(TT.eq(niter, constantX(0.)),
                  TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                  TT.sqrt(TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) + TT.sqr(betan))),
        Tnorm)
    Anorm = TT.maximum(Anorm, pnorm)
    Acond = Anorm * Dnorm
    rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
    Anorml = rnorml * rootl
    relArnorml = rootl / Anorm
    #---------------------------------------------------------------
    # See if any of the stopping criteria are satisfied.
    # In rare cases, flag is already -1 from above (Abar = const*I).
    #---------------------------------------------------------------
    epsx = Anorm * xnorm * eps
    epsr = Anorm * xnorm * rtol
    # Test for singular Hk (hence singular A)
    # or x is already an LS solution (so again A must be singular).
    t1 = constantX(1) + relrnorm
    t2 = constantX(1) + relArnorml
    flag = TT.switch(
        TT.bitwise_or(TT.eq(flag, constantX(0)), TT.eq(flag, constantX(6))),
        multiple_switch(TT.le(t1, constantX(1)), constantX(3),
                        TT.le(t2, constantX(1)), constantX(4),
                        TT.le(relrnorm, rtol), constantX(1),
                        TT.le(Anorm, constantX(1e-20)), constantX(12),
                        TT.le(relArnorml, rtol), constantX(10),
                        TT.ge(epsx, beta1), constantX(5),
                        TT.ge(xnorm, maxxnorm), constantX(6),
                        TT.ge(niter, TT.cast(maxit, theano.config.floatX)), constantX(8),
                        flag),
        flag)
    flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm), constantX(11.), flag)
    return [niter + constantX(1.), beta, betan, phi, Acond, cs, dbarn, eplnn,
            rnorm, sn, Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal,
            Axnorm, relrnorm, relArnorml, Anorm, flag] + \
        xs + r1s + r2s + r3s + dls + ds, upds, \
        theano.scan_module.scan_utils.until(TT.neq(flag, 0))
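# The `until` value returned above is what lets theano.scan terminate the solver
# early once a stopping flag is raised. A minimal, self-contained sketch of the
# same pattern (independent of the solver code above):
import theano
import theano.tensor as TT

def step(x):
    x_next = x / 2.0
    return x_next, theano.scan_module.until(TT.lt(x_next, 1.0))

x0 = TT.scalar('x0')
values, _ = theano.scan(step, outputs_info=x0, n_steps=100)
halve_until_small = theano.function([x0], values)
print(halve_until_small(40.0))  # [20. 10. 5. 2.5 1.25 0.625]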
def clip_gradients(gradients, grad_clip=5., hard_clip=False):
    """
    Returns the gradients clipped according to the given grad_clip value.

    As described here: http://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/
    Code mostly taken from https://github.com/kastnerkyle/minet/blob/master/minet/net.py

    Based on: Pascanu, Razvan, Tomas Mikolov, and Yoshua Bengio.
    "On the difficulty of training recurrent neural networks."
    arXiv preprint arXiv:1211.5063 (2012).

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their gradients.
    grad_clip : float, optional
        How much to clip gradients (if at all).
    hard_clip : bool
        Whether to use hard clipping (keeping gradients at the grad_clip level),
        or soft clipping (rescaling based on grad_clip).

    Returns
    -------
    clipgrads : dict
        A dictionary mapping from the model's parameters to their correctly
        clipped gradients. (If grad_clip is falsy, this just returns the
        original `gradients` input parameter.)
    """
    if grad_clip:
        gradients = gradients.items()
        params = [item[0] for item in gradients]
        grads = [item[1] for item in gradients]

        # Gradient clipping: accumulate the squared L2 norm first, check it for
        # NaN/inf, then take the square root once (the original took sqrt twice,
        # which computes the fourth root of the sum of squares).
        grad_norm = sum([T.sqr(grad).sum() for grad in grads])
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        grad_norm = T.sqrt(grad_norm)
        scaling_num = grad_clip
        scaling_den = T.maximum(grad_clip, grad_norm)
        if hard_clip:
            # do the NaN/inf trick
            grads = [T.switch(not_finite, 0.1 * param, grad)
                     for param, grad in gradients]
            # hard clip gradients above or below grad_clip to be = grad_clip
            grads = [T.switch(T.ge(grad_norm, grad_clip), T.sgn(grad) * grad_clip, grad)
                     for grad in grads]
        else:
            # NaN/inf trick combined with scaling.
            grads = [T.switch(not_finite, 0.1 * param, grad * (scaling_num / scaling_den))
                     for param, grad in gradients]

        clipgrads = OrderedDict(zip(params, grads))
        return clipgrads
    else:
        return gradients
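# Sketch of plugging clip_gradients into a plain SGD update (hypothetical `params`,
# `cost`, and `inputs`; only the helper above is from the original code):
raw_grads = OrderedDict(zip(params, T.grad(cost, params)))
clipped = clip_gradients(raw_grads, grad_clip=5., hard_clip=False)
sgd_updates = [(p, p - 0.01 * g) for p, g in clipped.items()]
train_fn = theano.function(inputs, cost, updates=sgd_updates)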