Code example #1
File: bbox.py  Project: smajida/faster_r_cnn
def filter_boxes(boxes, min_size):
    """Remove all boxes with any side smaller than min_size."""
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    # keep = np.where((ws >= min_size) & (hs >= min_size))[0]
    keep = (T.ge(ws, min_size) & T.ge(hs, min_size)).nonzero()[0]
    return keep
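
A minimal usage sketch, assuming the standard `import theano.tensor as T` alias used above: it wraps filter_boxes in a compiled function and keeps only boxes whose sides are at least min_size.

import numpy as np
import theano
import theano.tensor as T

boxes_var = T.matrix('boxes')              # each row is [x1, y1, x2, y2]
keep = filter_boxes(boxes_var, min_size=16)
f = theano.function([boxes_var], keep)

boxes = np.asarray([[0, 0, 20, 20],        # 21 x 21 box -> kept
                    [0, 0, 4, 4]],         # 5 x 5 box   -> dropped
                   dtype='float32')
print(f(boxes))                            # expected: [0]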
Code example #2
def matrix_noise3d(input_vectors, perm, grad3, vertex_table):
    skew_factors = (input_vectors[:, 0] + input_vectors[:, 1] + input_vectors[:, 2]) * 1.0 / 3.0
    skewed_vectors = T.floor(input_vectors + skew_factors[:, np.newaxis])
    unskew_factors = (skewed_vectors[:, 0] + skewed_vectors[:, 1] + skewed_vectors[:, 2]) * 1.0 / 6.0
    offsets_0 = input_vectors - (skewed_vectors - unskew_factors[:, np.newaxis])
    vertex_table_x_index = T.ge(offsets_0[:, 0], offsets_0[:, 1])
    vertex_table_y_index = T.ge(offsets_0[:, 1], offsets_0[:, 2])
    vertex_table_z_index = T.ge(offsets_0[:, 0], offsets_0[:, 2])
    simplex_vertices = vertex_table[
        vertex_table_x_index,
        vertex_table_y_index,
        vertex_table_z_index].reshape((input_vectors.shape[0], 2, 3))
    offsets_1 = offsets_0 - simplex_vertices[:, 0] + 1.0 / 6.0
    offsets_2 = offsets_0 - simplex_vertices[:, 1] + 1.0 / 3.0
    offsets_3 = offsets_0 - 0.5
    masked_skewed_vectors = T.bitwise_and(skewed_vectors.astype('int32'), 255)
    gi0s = perm[masked_skewed_vectors[:, 0] + perm[
        masked_skewed_vectors[:, 1] + perm[
            masked_skewed_vectors[:, 2]].astype('int32')].astype('int32')] % 12
    gi1s = perm[masked_skewed_vectors[:, 0] + simplex_vertices[:, 0, 0] + perm[
        masked_skewed_vectors[:, 1] + simplex_vertices[:, 0, 1] + perm[
            masked_skewed_vectors[:, 2] + simplex_vertices[:, 0, 2]].astype('int32')].astype('int32')] % 12
    gi2s = perm[masked_skewed_vectors[:, 0] + simplex_vertices[:, 1, 0] + perm[
        masked_skewed_vectors[:, 1] + simplex_vertices[:, 1, 1] + perm[
            masked_skewed_vectors[:, 2] + simplex_vertices[:, 1, 2]].astype('int32')].astype('int32')] % 12
    gi3s = perm[masked_skewed_vectors[:, 0] + 1 + perm[
        masked_skewed_vectors[:, 1] + 1 + perm[
            masked_skewed_vectors[:, 2] + 1].astype('int32')].astype('int32')] % 12
    n0s = calculate_gradient_contribution(offsets_0, gi0s, grad3)
    n1s = calculate_gradient_contribution(offsets_1, gi1s, grad3)
    n2s = calculate_gradient_contribution(offsets_2, gi2s, grad3)
    n3s = calculate_gradient_contribution(offsets_3, gi3s, grad3)
    return 23.0 * (n0s + n1s + n2s + n3s)
Code example #3
File: uniform.py  Project: ibab/carl
    def __init__(self, random_state=None, low=0.0, high=1.0):
        super(Uniform, self).__init__(low=low, high=high,
                                      random_state=random_state,
                                      optimizer=None)

        # pdf
        self.pdf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            0.,
            1. / (self.high - self.low)).ravel()
        self.make_(self.pdf_, "pdf")

        # -log pdf
        self.nnlf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            np.inf,
            T.log(self.high - self.low)).ravel()
        self.make_(self.nnlf_, "nnlf")

        # cdf
        self.cdf_ = T.switch(
            T.lt(self.X, self.low),
            0.,
            T.switch(
                T.lt(self.X, self.high),
                (self.X - self.low) / (self.high - self.low),
                1.)).ravel()
        self.make_(self.cdf_, "cdf")

        # ppf
        self.ppf_ = self.p * (self.high - self.low) + self.low
        self.make_(self.ppf_, "ppf", args=[self.p])
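
For reference, these expressions implement the standard uniform distribution on [low, high) (textbook formulas, not taken from the carl source):

$$ f(x) = \frac{1}{\mathrm{high}-\mathrm{low}}\,\mathbf{1}[\mathrm{low} \le x < \mathrm{high}], \qquad F(x) = \min\!\left(\max\!\left(\tfrac{x-\mathrm{low}}{\mathrm{high}-\mathrm{low}},\,0\right),\,1\right), \qquad F^{-1}(p) = \mathrm{low} + p\,(\mathrm{high}-\mathrm{low}), $$

with -log f(x) = log(high - low) inside the support and +inf outside, matching the nnlf_ expression above.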
Code example #4
File: decoder.py  Project: hongyuanzhu/keras
	def _step_test(self,
			  x_t, xi_t, xf_t, xo_t, xc_t, mask_tm1,
			  pred1_tm1, pred2_tm1, pred3_tm1, pred4_tm1, h_tm1, c_tm1, ctx_tm1, 
			  u_i, u_f, u_o, u_c, x_encoder, attention_encoder, x_img, B_W, B_U, B_Wimg, B_Wctx):

		outer1 = pred1_tm1[:, :, np.newaxis] * pred2_tm1[:, np.newaxis, :]
		outer1 =  outer1.reshape((outer1.shape[0],-1))
		outer2 = pred3_tm1[:, :, np.newaxis] * pred4_tm1[:, np.newaxis, :]
		outer2 =  outer2.reshape((outer2.shape[0],-1))
		pred = outer1[:, :, np.newaxis] * outer2[:, np.newaxis, :]
		pred =	pred.reshape((pred.shape[0],-1))
		x_t = self.W_embedding[T.argmax(pred, axis = 1)] * B_W[4]

		h_mask_tm1 = mask_tm1 * h_tm1
		c_mask_tm1 = mask_tm1 * c_tm1

		attention_x = T.dot(x_t, self.W_x2a)
		attention_total = attention_x[:,None,:] + attention_encoder
		if self.prev_context:
			attention_prev = T.dot(ctx_tm1,self.W_ctx2a)
			attention_total += attention_prev[:,None,:]

		attention_activation = T.dot( T.tanh(attention_total), self.V) # attention -> scores
		attention_alpha = T.nnet.softmax(attention_activation[:,:,0])  # scores -> weights
		ctx_t = (x_encoder * attention_alpha[:,:,None]).sum(axis = 1)  # weighted average of context vectors

		xi_t = T.dot(x_t * B_W[0], self.W_i) + self.b_i + T.dot(x_img * B_Wimg[0], self.Wimg_i) + T.dot(ctx_t * B_Wctx[0], self.Wctx_i)
		xf_t = T.dot(x_t * B_W[1], self.W_f) + self.b_f + T.dot(x_img * B_Wimg[1], self.Wimg_f) + T.dot(ctx_t * B_Wctx[1], self.Wctx_f)
		xc_t = T.dot(x_t * B_W[2], self.W_c) + self.b_c + T.dot(x_img * B_Wimg[2], self.Wimg_c) + T.dot(ctx_t * B_Wctx[2], self.Wctx_c)
		xo_t = T.dot(x_t * B_W[3], self.W_o) + self.b_o + T.dot(x_img * B_Wimg[3], self.Wimg_o) + T.dot(ctx_t * B_Wctx[3], self.Wctx_o)

		i_t = self.inner_activation(xi_t + T.dot(h_mask_tm1 * B_U[0], u_i))
		f_t = self.inner_activation(xf_t + T.dot(h_mask_tm1 * B_U[1], u_f))
		c_t = f_t * c_mask_tm1 + i_t * self.activation(xc_t + T.dot(h_mask_tm1 * B_U[2], u_c))
		o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1 * B_U[3], u_o))
		h_t = o_t * self.activation(c_t)

		pred1_t = T.dot(h_t, self.U_p1) + self.b_p1
		pred1_t = T.nnet.softmax(pred1_t.reshape((-1, pred1_t.shape[-1]))).reshape(pred1_t.shape)

		pred2_t = T.dot(h_t, self.U_p2) + self.b_p2
		pred2_t = T.nnet.softmax(pred2_t.reshape((-1, pred2_t.shape[-1]))).reshape(pred2_t.shape)

		pred3_t = T.dot(h_t, self.U_p3) + self.b_p3
		pred3_t = T.nnet.softmax(pred3_t.reshape((-1, pred3_t.shape[-1]))).reshape(pred3_t.shape)

		pred4_t = T.dot(h_t, self.U_p4) + self.b_p4
		pred4_t = T.nnet.softmax(pred4_t.reshape((-1, pred4_t.shape[-1]))).reshape(pred4_t.shape)

		pred1_t = T.ge(pred1_t, T.max(pred1_t, axis = 1).reshape((pred1_t.shape[0],1)))*1.0
		pred2_t = T.ge(pred2_t, T.max(pred2_t, axis = 1).reshape((pred2_t.shape[0],1)))*1.0
		pred3_t = T.ge(pred3_t, T.max(pred3_t, axis = 1).reshape((pred3_t.shape[0],1)))*1.0
		pred4_t = T.ge(pred4_t, T.max(pred4_t, axis = 1).reshape((pred4_t.shape[0],1)))*1.0

		return pred1_t, pred2_t, pred3_t, pred4_t, h_t, c_t, ctx_t
Code example #5
File: theanoSMO.py  Project: martinmeinke/ipml
def innerL_(sS, i):
    Ei = calcEk_(sS, i)
    
    # use "+" instead of "or" and "*" instead of "and"
    checkUselessAlpha1 = T.ge(sS.labels[i] * Ei, -sS.tol) + T.ge(sS.alphas[i], sS.C)
    checkUselessAlpha2 = T.le(sS.labels[i]*Ei, sS.tol) + T.lt(sS.alphas[i], 0)
    isUselessAlpha = toTheanoBool(checkUselessAlpha1 * checkUselessAlpha2)
    
    updateL = innerL_alphaInRange_(sS, i, Ei)
    earlyret = sS.retlist(0)
    return ifelse(isUselessAlpha, earlyret, updateL)
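
The "+"/"*" trick above works because Theano comparison ops return 0/1 integer tensors, so a product is nonzero only when every condition holds (logical AND) and a sum is nonzero when at least one holds (logical OR). A small standalone sketch of the idiom (hypothetical values, unrelated to the SMO state):

import numpy as np
import theano
import theano.tensor as T

a = T.vector('a')
and_like = T.ge(a, 1) * T.lt(a, 3)        # "1 <= a < 3"
or_like = (T.ge(a, 2) + T.le(a, 0)) > 0   # "a >= 2 or a <= 0"
f = theano.function([a], [and_like, or_like])
print(f(np.array([-1., 0.5, 1.5, 2.5, 3.5], dtype='float32')))
# -> [0 0 1 1 0] and [1 0 0 1 1]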
Code example #6
File: gru4rec.py  Project: marcromeyn/GRU4Rec
 def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6):
     grads =  [T.grad(cost = cost, wrt = param) for param in params]
     sgrads = [T.grad(cost = cost, wrt = sparam) for sparam in sampled_params]
     updates = OrderedDict()
     if self.grad_cap>0:
         norm=T.cast(T.sqrt(T.sum([T.sum([T.sum(g**2) for g in g_list]) for g_list in grads]) + T.sum([T.sum(g**2) for g in sgrads])), theano.config.floatX)
         grads = [[T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g) for g in g_list] for g_list in grads]
         sgrads = [T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g) for g in sgrads]
     for p_list, g_list in zip(params, grads):
         for p, g in zip(p_list, g_list):
             if self.adapt:
                 if self.adapt == 'adagrad':
                     g = self.adagrad(p, g, updates)
                 if self.adapt == 'rmsprop':
                     g = self.rmsprop(p, g, updates)
                 if self.adapt == 'adadelta':
                     g = self.adadelta(p, g, updates)
                 if self.adapt == 'adam':
                     g = self.adam(p, g, updates)
             if self.momentum > 0:
                 velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True)
                 velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p)
                 updates[velocity] = velocity2
                 updates[p] = p + velocity2
             else:
                 updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32(self.learning_rate) * g
     for i in range(len(sgrads)):
         g = sgrads[i]
         fullP = full_params[i]
         sample_idx = sidxs[i]
         sparam = sampled_params[i]
         if self.adapt:
             if self.adapt == 'adagrad':
                 g = self.adagrad(fullP, g, updates, sample_idx)
             if self.adapt == 'rmsprop':
                 g = self.rmsprop(fullP, g, updates, sample_idx)
             if self.adapt == 'adadelta':
                 g = self.adadelta(fullP, g, updates, sample_idx)
             if self.adapt == 'adam':
                 g = self.adam(fullP, g, updates, sample_idx)
         if self.lmbd > 0:
             delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
         else:
             delta = np.float32(self.learning_rate) * g
         if self.momentum > 0:
             velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
             vs = velocity[sample_idx]
             velocity2 = self.momentum * vs - delta
             updates[velocity] = T.set_subtensor(vs, velocity2)
             updates[fullP] = T.inc_subtensor(sparam, velocity2)
         else:
             updates[fullP] = T.inc_subtensor(sparam, - delta)
     return updates
Code example #7
    def compute_nonlinearity_derivative(lin, bias):
        n_h = bias.shape[0]
        lin_re = lin[:, :n_h]
        lin_im = lin[:, n_h:]        
        mod = T.sqrt(lin_re**2 + lin_im**2)

        ind = T.ge(mod + bias.dimshuffle('x', 0), 0)
        opt1 = 1.
        opt2 = 1. / (1 - mod - bias.dimshuffle('x', 0))**2
        ind = T.ge(mod, 1)
        dnonlindlin = T.tile(ind * opt1 + (1-ind) * opt2, [1, 2])         

        return dnonlindlin
Code example #8
  def cubicBSpline(self, L):
    b = T.zeros_like(L)

    idx4 = T.ge(L, 0) * T.lt(L, 1)
    idx3 = T.ge(L, 1) * T.lt(L, 2)
    idx2 = T.ge(L, 2) * T.lt(L, 3)
    idx1 = T.ge(L, 3) * T.le(L, 4)

    b = T.switch(T.eq(idx4, 1), T.pow(L, 3) / 6, b)
    b = T.switch(T.eq(idx3, 1), (-3*T.pow(L-1,3) + 3*T.pow(L-1,2) + 3*(L-1) + 1) / 6, b)
    b = T.switch(T.eq(idx2, 1), ( 3*T.pow(L-2,3) - 6*T.pow(L-2,2)           + 4) / 6, b)
    b = T.switch(T.eq(idx1, 1), (-  T.pow(L-3,3) + 3*T.pow(L-3,2) - 3*(L-3) + 1) / 6, b)
    
    return b.T # b is K x K' and thus, as we multiply from the right with
Code example #9
File: rae.py  Project: zomux/nlpy
    def _decode_step(self, seq, regs):
        left, right, target = seq[0], seq[1], seq[2]

        left_is_not_token = T.ge(left, 0)
        right_is_not_token = T.ge(right, 0)

        rep = regs[target]

        left_dec, right_dec = self._decode_computation(rep)

        regs = ifelse(left_is_not_token, T.set_subtensor(regs[left], left_dec), regs)
        regs = ifelse(right_is_not_token, T.set_subtensor(regs[right], right_dec), regs)

        return  rep, left_dec, right_dec, regs
Code example #10
	def __init__(self, input, nfeatures, C):
		""" Initialize the parameters of the SVM
		
		input: theano.tensor.TensorType
			symbolic variable that describes the input of the architecture (one minibatch)
		
		nfeatures: number of input units, the dimension of the space in which the datapoints lie
		
		C: error penalty
		"""
		self.nfeatures = nfeatures
		Wzeros, bzero = self.GetZeroWeights()
		
		#create a column vector with nfeatures rows
		self.W = theano.shared(value=Wzeros, name='W', borrow=True)
		
		# initialize bias: a scalar of the same data type as W
		self.b = theano.shared(bzero, name='b')#, borrow=True)
		
		# initialize the error penalty C
		self.C = C
		
		# hyperplane projection used in classification
		# T.dot(input,self.W) creates a vector of shape (rows,) == (# in minibatch,)
		# adding +self.b broadcasts the bias, adding it to each row, so the result is still of shape (rows,)
		self.hplaneproject = T.dot(input, self.W) + self.b
		
		# symbolic description of how to compute prediction as -1 or 1
		# the function sign() is not in Theano,
		# so I use (x>0)*2-1 using T.ge() which returns 1 when true and 0 when false
		self.y_pred = T.ge(self.hplaneproject, 0)*2 - 1
Code example #11
def clip_grad(grads, norm, grad_clip):
    # clip the grads, when over a threshold
    _grads = []
    for g in grads:
        _grads.append( TT.switch(TT.ge(norm, grad_clip), g*grad_clip/norm, g) )
    
    return _grads
Code example #12
File: costs.py  Project: caglar/PentominoExps
def huber_loss(y_hat, target, delta=1, center=0, std=1):

    l1_diff = abs((target - center - y_hat) / std)
    huber_loss = TT.switch(TT.ge(l1_diff, delta),
                           (2*l1_diff - 1) * delta,
                           l1_diff**2)
    return huber_loss
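
A quick numeric check of the switch between the quadratic and linear regimes, assuming huber_loss as defined above and the usual `import theano.tensor as TT` alias:

import numpy as np
import theano
import theano.tensor as TT

y_hat = TT.vector('y_hat')
target = TT.vector('target')
f = theano.function([y_hat, target], huber_loss(y_hat, target, delta=1))
print(f(np.array([0., 3.], dtype='float32'),
        np.array([0.5, 0.], dtype='float32')))
# |diff| = 0.5 -> 0.5**2 = 0.25 (quadratic branch); |diff| = 3 -> 2*3 - 1 = 5 (linear branch)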
Code example #13
File: models.py  Project: guxiaodong1987/seq_to_seq
    def _apply_hard_constraint_on_gradients(self, gradients, threshold=5, l_norm=2):
        """
        Function to apply a hard constraint on the parameter's gradients.

        :param gradients: theano.tensor
            Symbolic representation of the  parameter's gradients.

        :param threshold: int
            The threshold at which to apply the constraint. Defaults to 5 (i.e., if the norm
                exceeds 5, the constraint is applied).

        :param l_norm: int
            The number of the norm to compute. Defaults to 2 (i.e., L2-norm).

        :return: gradients: theano.tensor
            Symbolic representation of the parameter's gradients with/without the constraint
                applied.

        """

        for g in gradients:  # for all gradients
            g /= self.batch_size  # divide it by the size of the minibatch
            s = g.norm(l_norm)  # compute its norm
            if T.ge(s, threshold):  # if the norm is greater than the threshold
                g = (threshold * g) / s  # replace gradient

        return gradients
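
Note that `if T.ge(s, threshold):` tests a symbolic variable, which is always truthy in Python, so the condition never becomes part of the compiled graph (and rebinding the loop variable `g` does not modify the list). A hedged sketch of a fully symbolic variant using T.switch, in the spirit of the clipping code in examples #6 and #14 (a hypothetical helper, not the project's actual method); the same caveat applies to the `if tensor.ge(norm, clip_norm):` checks in several of the optimizer examples below.

import theano.tensor as T

def clip_gradients_symbolic(gradients, batch_size, threshold=5, l_norm=2):
    clipped = []
    for g in gradients:
        g = g / batch_size                 # scale by minibatch size
        s = g.norm(l_norm)                 # symbolic norm of this gradient
        # rescale only when the norm exceeds the threshold, inside the graph
        clipped.append(T.switch(T.ge(s, threshold), threshold * g / s, g))
    return clipped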
Code example #14
File: rnn.py  Project: Sandy4321/librnn
    def get_gradients(self, model, data, ** kwargs):

        cost = self.expr(model=model, data=data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        if self.gradient_clipping:
            norm_gs = 0.
            for grad in gradients.values():
                norm_gs += (grad ** 2).sum()
            not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
            norm_gs = T.sqrt(norm_gs)
            norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                               self.max_magnitude / norm_gs,
                               1.)

            for param, grad in gradients.items():
                gradients[param] = T.switch(not_finite,
                                            .1 * param,
                                            grad * norm_gs)

        updates = OrderedDict()

        return gradients, updates
Code example #15
	def __init__(self, embedding_dim=100, num_hidden_layers=2, hidden_dim=200, in_dropout_p=0.2, hidden_dropout_p=0.5, update_hyperparams={'learning_rate': 0.01}):
		self.embedding_dim = embedding_dim
		self.num_hidden_layers = num_hidden_layers
		self.hidden_dim = hidden_dim
		self.in_dropout_p = in_dropout_p
		self.hidden_dropout_p = hidden_dropout_p
	
		print >> sys.stderr, 'Building computation graph for discriminator...'		
		self.input_var = T.matrix('input')
		self.target_var = T.matrix('target')

		self.l_in = lasagne.layers.InputLayer(shape=(None, self.embedding_dim), input_var=T.tanh(self.input_var), name='l_in')
		self.l_in_dr = lasagne.layers.DropoutLayer(self.l_in, 0.2)
		self.layers = [self.l_in, self.l_in_dr]
		for i in xrange(self.num_hidden_layers):
			l_hid = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(self.layers[-1], num_units=self.hidden_dim, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform(gain=leaky_relu_gain), name=('l_hid_%s' % i)))
			l_hid_dr = lasagne.layers.DropoutLayer(l_hid, 0.5)
			self.layers.append(l_hid)
			self.layers.append(l_hid_dr)
		self.l_preout = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(self.layers[-1], num_units=1, nonlinearity=None, name='l_preout'))
		self.l_out = lasagne.layers.NonlinearityLayer(self.l_preout, nonlinearity=lasagne.nonlinearities.sigmoid, name='l_out')

		self.prediction = lasagne.layers.get_output(self.l_out)
		self.loss = lasagne.objectives.binary_crossentropy(self.prediction, self.target_var).mean()
		self.accuracy = T.eq(T.ge(self.prediction, 0.5), self.target_var).mean()

		self.params = lasagne.layers.get_all_params(self.l_out, trainable=True)
		self.updates = lasagne.updates.adam(self.loss, self.params, **update_hyperparams)

		print >> sys.stderr, 'Compiling discriminator...'
		self.train_fn = theano.function([self.input_var, self.target_var], [self.loss, self.accuracy], updates=self.updates)
		self.eval_fn = theano.function([self.input_var, self.target_var], [self.loss, self.accuracy])
Code example #16
def Adagrad(tparams, cost, inps, lr, epsilon=1e-6,clip_norm=5):
    """ default: lr=0.01 """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)    
    
    updates = []
    
    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_t = acc + g ** 2
        updates.append((acc, acc_t))
        p_t = p - (lr / tensor.sqrt(acc_t + epsilon)) * g
        updates.append((p, p_t))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update 
Code example #17
File: theano_pq.py  Project: elejke/sensors
def pq_theano(y_true, y_pred):
    """
    Theano implementation of Pass Quality function.
    :param y_true:
    :param y_pred:
    :return:
    """
    y_pred = tt.ge(y_pred, tt.mean(y_pred)).T[-1].T
    y_true = y_true.T[-1].T

    tt_diffs = tt.extra_ops.diff(y_true + y_pred)

    tt_r = theano.shared(0., 'r')
    tt_height = theano.shared(0., 'h')
    tt_error = theano.shared(0., 'err')
    tt_current_error = theano.shared(0., 'c_err')
    tt_flag = theano.shared(0., 'flag')

    values, updates = scan(fn=one_step,
                           sequences=[tt_diffs, tt.abs_(tt_diffs)],
                           outputs_info=[tt_error,
                                         tt_r,
                                         tt_height,
                                         tt_current_error,
                                         tt_flag])

    epsilon = 0.0000000001

    tt_ret = (1 - (values[1][-1] + epsilon) / (values[1][-1] +
                                               values[0][-1] +
                                               epsilon))
    return tt_ret
Code example #18
File: NetworkCtcLayer.py  Project: atuxhe/returnn
def uniq_with_lengths(seq, time_mask):
  """
  :param seq: (time,batch) -> label
  :param time_mask: (time,batch) -> 0 or 1
  :return: out_seqs, seq_lens.
  out_seqs is (max_seq_len,batch) -> label, where max_seq_len <= time.
  seq_lens is (batch,) -> len.
  """
  num_batches = seq.shape[1]
  diffs = T.ones_like(seq)
  diffs = T.set_subtensor(diffs[1:], seq[1:] - seq[:-1])
  time_range = T.arange(seq.shape[0]).dimshuffle([0] + ['x'] * (seq.ndim - 1))
  idx = T.switch(T.neq(diffs, 0) * time_mask, time_range, -1)  # (time,batch) -> idx or -1
  seq_lens = T.sum(T.ge(idx, 0), axis=0)  # (batch,) -> len
  max_seq_len = T.max(seq_lens)

  # I don't know any better way without scan.
  # http://stackoverflow.com/questions/31379971/uniq-for-2d-theano-tensor
  def step(batch_idx, out_seq_b1):
    #out_seq = seq[T.ge(idx[:, batch_idx], 0).nonzero(), batch_idx][0]
    out_seq = seq[:, batch_idx][T.ge(idx[:, batch_idx], 0).nonzero()]
    return T.concatenate((out_seq, T.zeros((max_seq_len - out_seq.shape[0],), dtype=seq.dtype)))

  out_seqs, _ = theano.scan(
    step,
    sequences=[T.arange(num_batches)],
    outputs_info=[T.zeros((max_seq_len,), dtype=seq.dtype)]
  )
  # out_seqs is (batch,max_seq_len)
  return out_seqs.T, seq_lens
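
A minimal illustration of the collapsing behaviour described in the docstring, assuming uniq_with_lengths as defined above (hypothetical single-batch labels):

import numpy as np
import theano
import theano.tensor as T

seq = T.imatrix('seq')          # (time, batch) labels
time_mask = T.imatrix('mask')   # (time, batch) 0/1 mask
out_seqs, seq_lens = uniq_with_lengths(seq, time_mask)
f = theano.function([seq, time_mask], [out_seqs, seq_lens])

labels = np.array([[1], [1], [2], [2], [3]], dtype='int32')
mask = np.ones_like(labels)
o, l = f(labels, mask)
print(o.ravel(), l)             # runs collapsed to [1 2 3], length [3]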
Code example #19
File: theano_pq.py  Project: elejke/sensors
def pq_theano_f(y_true, y_pred):

    y_pred = tt.ge(y_pred, tt.mean(y_pred))
    y_true = y_true

    tt_diffs = tt.extra_ops.diff(y_true + y_pred)

    # tt_r = tt.shape_padleft(theano.shared(0., 'r'))
    tt_r = theano.shared(0., 'r')
    # tt_height = tt.shape_padleft(theano.shared(0., 'h'))
    tt_height = theano.shared(0., 'h')
    # tt_error = tt.shape_padleft(theano.shared(0., 'err',))
    tt_error = theano.shared(0., 'err')
    # tt_current_error = tt.shape_padleft(theano.shared(0., 'c_err'))
    tt_current_error = theano.shared(0., 'c_err')
    # tt_ret = theano.tensor.col('ret')
    tt_flag = theano.shared(0., 'flag')

    values, updates = scan(fn=one_step,
                           sequences=[tt_diffs, tt.abs_(tt_diffs)],
                           outputs_info=[tt_error,
                                         tt_r,
                                         tt_height,
                                         tt_current_error,
                                         tt_flag])

    # print values[0].type

    epsilon = 0.0000000001
    # print tt.ones_like(values[0]).type
    # print values[1].type
    tt_ret = 1 - (values[1] + epsilon) / (values[1] +
                                          values[0] +
                                          epsilon)
    return tt_ret
Code example #20
File: utils.py  Project: eglxiang/xnn
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin=bins[i]
        if i == 0:
            binned=T.switch(T.lt(x,bin),i,binned)
        else:
            ineq = T.and_(T.ge(x,bins[i-1]),T.lt(x,bin))
            binned=T.switch(ineq,i,binned)
    binned=T.switch(T.isnan(x), len(bins), binned)
    return binned
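
A usage sketch that mirrors numpy.digitize, assuming theano_digitize as defined above:

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
bins = np.array([0.0, 1.0, 2.5], dtype='float32')
f = theano.function([x], theano_digitize(x, bins))

vals = np.array([-1.0, 0.5, 2.0, 3.0], dtype='float32')
print(f(vals))                  # bin indices [0, 1, 2, 3]
print(np.digitize(vals, bins))  # same indices from numpy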
Code example #21
File: token_model.py  Project: mswellhao/active_NER
	def logp_loss3(self, x, y, fake_label, neg_label, pos_ratio=0.5):  # adopt maxout for negatives
		# pos_ratio is the weight on the positive examples (0.5 means an equal 1:1 weighting)


		print "adopt  positives  weight  ............. "+str(pos_ratio)
		y = y.dimshuffle((1,0))
		inx = x.dimshuffle((1,0))
		fake_mask = T.neq(y, fake_label)
		y = y*fake_mask

		pos_mask = T.and_(fake_mask, T.le(y, neg_label-1))*pos_ratio
		neg_mask = T.ge(y, neg_label)*(1- pos_ratio)


		pos_score, neg_score = self.structure2(inx,False)
		maxneg = T.max(neg_score, axis = -1)

		scores = T.concatenate((pos_score, maxneg.dimshuffle((0,1,'x'))), axis = 2)

		d3shape = scores.shape

		#seq*batch , label
		scores = scores.reshape((d3shape[0]*d3shape[1],  d3shape[2]))
		pro = T.nnet.softmax(scores)

		_logp = T.nnet.categorical_crossentropy(pro, y.flatten())

		_logp = _logp.reshape(fake_mask.shape)

		loss = (T.sum(_logp*pos_mask)+ T.sum(_logp*neg_mask))/ (T.sum(pos_mask)+T.sum(neg_mask))
		pos_loss = T.sum(_logp*pos_mask)
		neg_loss = T.sum(_logp*neg_mask)


		return loss, pos_loss, neg_loss
Code example #22
File: toolbox.py  Project: Weichern/Theano-Lights
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)
    
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m) 
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Code example #23
File: dialog_encdec.py  Project: npow/hed-dlg
    def compute_updates(self, training_cost, params):
        updates = []
         
        grads = T.grad(training_cost, params)
        grads = OrderedDict(zip(params, grads))
        
        # Clip stuff
        c = numpy.float32(self.cutoff)
        clip_grads = []
        
        norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
         
        for p, g in grads.items():
            clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))
        
        grads = OrderedDict(clip_grads)

        if self.updater == 'adagrad':
            updates = Adagrad(grads, self.lr)  
        elif self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        elif self.updater == 'adadelta':
            updates = Adadelta(grads)
        elif self.updater == 'rmsprop':
            updates = RMSProp(grads, self.lr)
        elif self.updater == 'adam':
            updates = Adam(grads)
        else:
            raise Exception("Updater not understood!") 
        return updates
Code example #24
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8):
    """ default: lr=0.0002 """

    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g ** 2) for g in grads]))
    if tensor.ge(norm, 5):
        grads = [g * 5 / norm for g in grads]

    gshared = [theano.shared(p.get_value() * 0.0, name="%s_grad" % k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    i = theano.shared(numpy_floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - b1 ** (i_t)
    fix2 = 1.0 - b2 ** (i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.0)
        v = theano.shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update
Code example #25
    def __init__(self, input, n_in, n_out, discriminant_threshold):

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
              
        # initialize the basis b as a vector of n_out 0s
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )
        
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
        # edited to reject events below a threshold
        #self.y_pred = T.argmax(self.p_y_given_x, axis=1) 
        #self.y_pred = T.and_(T.argmax(self.p_y_given_x, axis=1), T.ge(self.p_y_given_x[:,1], -1))#discriminant_threshold))
        self.y_pred = T.ge(self.p_y_given_x[:,1], discriminant_threshold)#discriminant_threshold))

        # parameters of the model
        self.params = [self.W, self.b]
Code example #26
def Adadelta(tparams, cost, inps, lr, rho=0.95, epsilon=1e-6,clip_norm=5):
    """ default: lr=0.5 """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)
    
    updates = []

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_delta = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        updates.append((acc,acc_new)) 
        
        update = g * tensor.sqrt(acc_delta + epsilon) / tensor.sqrt(acc_new + epsilon)
        updated_p = p - lr * update
        updates.append((p, updated_p))
        
        acc_delta_new = rho * acc_delta + (1 - rho) * update ** 2
        updates.append((acc_delta,acc_delta_new))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update 
Code example #27
def RMSprop_v1(tparams, cost, inps, lr, rho=0.9, epsilon=1e-6,clip_norm=5):
    """ default: lr=0.001 
        This is the implementation of the RMSprop algorithm used in
        http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf.
    """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)     
    
    updates = []

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        updates.append((acc, acc_new))
        
        updated_p = p - lr * (g / tensor.sqrt(acc_new + epsilon))
        updates.append((p, updated_p))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update
Code example #28
File: toolbox.py  Project: ronvohra/Theano-Lights
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.0)

    i = shared(floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - (1.0 - b1) ** i_t
    fix2 = 1.0 - (1.0 - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.0)
        v = shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        # e_t = shared(p.get_value() * 0.)
        # de_t = (srnd.normal(p.shape, std = 0.05, dtype=theano.config.floatX)*p_t - e_t)*0.05  #*p_t
        # p_t = p_t + de_t
        # updates.append((e_t, e_t + de_t))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Code example #29
def NAG(tparams, cost, inps, lr, momentum=0.9,clip_norm=5):
    """ default: lr=0.01 """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup) 
    
    updates = []

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        m_new = momentum * m - lr * g
        updates.append((m, m_new))        
        
        updated_p = p + momentum * m_new - lr * g
        updates.append((p, updated_p))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update 
Code example #30
File: activations.py  Project: hongyuanzhu/keras
def rmax(x):

	xmax  = T.ge(x, T.max(x, axis = 1).reshape((x.shape[0],1)))
	shift = (T.ones_like(x) - xmax) * x
	max2  = T.max(shift,axis = 1).reshape((x.shape[0],1))
	out = T.nnet.relu(x - max2)

	return out
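
A quick numeric check of rmax, assuming the definition above: the row maximum is reduced by the second-largest value and every other entry is zeroed by the ReLU.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
f = theano.function([x], rmax(x))
print(f(np.array([[1., 3., 2.]], dtype='float32')))   # -> [[0. 1. 0.]]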
Code example #31
File: distributions.py  Project: ESRogs/mcex
 def dist(value):
     return switch (ge(value , 0) & gt(alpha , 0) & gt(beta , 0) & ge(n , value), 
                gammaln(alpha+beta) - gammaln(alpha) - gammaln(beta)+ gammaln(n+1)- gammaln(value+1)- gammaln(n-value +1) + gammaln(alpha+value)+ gammaln(n+beta-value)- gammaln(beta+alpha+n),
                -inf)
Code example #32
File: distributions.py  Project: ESRogs/mcex
 def dist(value):
     return switch(ge(value , 0) & gt(alpha , 0) & gt(beta , 0),
               -gammaln(alpha) + alpha*log(beta) - beta*value + switch(alpha != 1.0, (alpha - 1.0)*log(value), 0),
               -inf)
Code example #33
File: distributions.py  Project: ESRogs/mcex
 def dist(value):
     return switch(ge(p , 0) & le(p , 1), 
               switch(value, log(p), log(1-p)),
               -inf)
Code example #34
    def __init__(self, model, state, data):
        """
        :type model: groundhog model class
        :param model: class depicting the model to be optimized

        :type state: dictionary or jobman DD object
        :param state: dictionary containing various hyper-parameters. The
            class will write into this dictionary updates like the current
            training error and so on

        :type data: groundhog dataset object
        :param data: data iterator over which training is done
        """

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        bs = state['bs']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])
        srng = RandomStreams(self.rng.randint(213))
        self.gs = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name) for p in model.params
        ]
        self.step = 0
        self.bs = bs
        self.state = state
        self.data = data
        self.step_timer = time.time()
        self.gdata = [
            theano.shared(numpy.zeros((2, ) * x.ndim, dtype=x.dtype),
                          name=x.name) for x in model.inputs
        ]

        if 'profile' not in self.state:
            self.state['profile'] = 0

        ###################################
        # Step 1. Compile training function
        ###################################
        print 'Constructing grad function'
        loc_data = self.gdata
        lr = TT.scalar('lr')
        self.prop_exprs = [x[1] for x in model.properties]
        self.prop_names = [x[0] for x in model.properties]
        self.update_rules = [x[1] for x in model.updates]
        rval = theano.clone(model.param_grads + self.update_rules + \
                            self.prop_exprs + [model.train_cost],
                            replace=zip(model.inputs, loc_data))
        nparams = len(model.params)
        nouts = len(self.prop_exprs)
        nrules = len(self.update_rules)
        gs = rval[:nparams]
        rules = rval[nparams:nparams + nrules]
        outs = rval[nparams + nrules:]

        # Clip the momentum-applied gradient
        moment_gs = [s * state['moment'] + g for s, g in zip(self.gs, gs)]
        norm_gs = TT.sqrt(
            sum(
                TT.sum(x**2) for x, p in zip(moment_gs, self.model.params)
                if p not in self.model.exclude_params_for_norm))
        if 'cutoff' in state and state['cutoff'] > 0:
            c = numpy.float32(state['cutoff'])
            if state['cutoff_rescale_length']:
                c = c * TT.cast(loc_data[0].shape[0], 'float32')

            notfinite = TT.or_(TT.isnan(norm_gs), TT.isinf(norm_gs))
            _gs = []
            for g, p in zip(moment_gs, self.model.params):
                if p not in self.model.exclude_params_for_norm:
                    tmpg = TT.switch(TT.ge(norm_gs, c), g * c / norm_gs, g)
                    _gs.append(
                        TT.switch(notfinite,
                                  numpy.float32(.1) * p, tmpg))
                else:
                    _gs.append(g)
            gs = _gs

        store_gs = [(s, g) for s, g in zip(self.gs, gs)]
        updates = store_gs + [(s[0], r) for s, r in zip(model.updates, rules)]
        print 'Compiling grad function'
        st = time.time()
        self.train_fn = theano.function([],
                                        outs,
                                        name='train_function',
                                        updates=updates,
                                        givens=zip(model.inputs, loc_data),
                                        profile=self.state['profile'])
        print 'took', time.time() - st

        self.lr = numpy.float32(state['lr'])
        new_params = [
            p - s * lr * g
            for s, p, g in zip(model.params_grad_scale, model.params, self.gs)
        ]
        self.update_fn = theano.function([lr], [],
                                         name='update_function',
                                         allow_input_downcast=True,
                                         updates=zip(model.params, new_params),
                                         profile=self.state['profile'])

        self.old_cost = 1e20
        self.schedules = model.get_schedules()
        self.return_names = self.prop_names + \
                ['cost',
                 'time_step',
                 'whole_time',
                  'lr']
Code example #35
d = we_it.embedding_dim
input_var = T.matrix('input')
target_var = T.matrix('target')

l_in = lasagne.layers.InputLayer(shape=(None, d), input_var=input_var)
l_hid1 = lasagne.layers.DenseLayer(l_in,
                                   num_units=NUM_HIDDEN1,
                                   nonlinearity=lasagne.nonlinearities.rectify,
                                   W=lasagne.init.GlorotUniform())
l_out = lasagne.layers.DenseLayer(l_hid1,
                                  num_units=1,
                                  nonlinearity=lasagne.nonlinearities.sigmoid)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean()
accuracy = T.eq(T.ge(prediction, 0.5), target_var).mean()

params = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=0.001)

print >> sys.stderr, 'Compiling...'
train_fn = theano.function([input_var, target_var], [loss, accuracy],
                           updates=updates)

X = np.zeros((2 * HALF_BATCH_SIZE, d), dtype=theano.config.floatX)
target_mat = np.vstack(
    [np.zeros((HALF_BATCH_SIZE, 1)),
     np.ones((HALF_BATCH_SIZE, 1))]).astype(theano.config.floatX)


def train_batch(batch_id=1, print_every_n=1):
Code example #36
File: distributions.py  Project: ESRogs/mcex
 def dist(value):
     return switch(ge(value , 0) & le(value , 1) &
               gt(alpha , 0) & gt(beta , 0),
               gammaln(alpha+beta) - gammaln(alpha) - gammaln(beta) + (alpha- 1)*log(value) + (beta-1)*log(1-value),
               -inf)
Code example #37
File: utils.py  Project: arindam-halder/bio-cnn
def elu(X):
    return T.switch(T.ge(X, 0), X, T.exp(X) - 1.)
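
A small check of the two branches, assuming elu as defined above (this variant uses alpha = 1):

import numpy as np
import theano
import theano.tensor as T

X = T.vector('X')
f = theano.function([X], elu(X))
print(f(np.array([-1.0, 0.0, 2.0], dtype='float32')))
# -> approximately [-0.632, 0.0, 2.0]: exp(x) - 1 below zero, identity at or above zero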
Code example #38
    def __init__(self, model, state, data):
        """
        Parameters:
            :param model:
                Class describing the model used. It should provide the
                 computational graph to evaluate the model, and have a
                 similar structure to classes on the models folder
            :param state:
                Dictionary containing the current state of your job. This
                includes configuration of the job, specifically the seed,
                the starting damping factor, batch size, etc. See main.py
                for details
            :param data:
                Class describing the dataset used by the model
        """

        if 'adarho' not in state:
            state['adarho'] = 0.96
        if 'adaeps' not in state:
            state['adaeps'] = 1e-6

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        bs = state['bs']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])
        srng = RandomStreams(self.rng.randint(213))
        self.gs = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name) for p in model.params
        ]
        self.gnorm2 = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name + '_g2') for p in model.params
        ]
        self.dnorm2 = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name + '_d2') for p in model.params
        ]

        self.step = 0
        self.whole_time = 0.0
        self.bs = bs
        self.state = state
        self.data = data
        self.step_timer = time.time()
        self.gdata = [
            theano.shared(numpy.zeros((2, ) * x.ndim, dtype=x.dtype),
                          name=x.name) for x in model.inputs
        ]
        #training dataset stored in gpu. They are defined as shared variables from the
        #'inputs' variable in the encoder-decoder model.
        if 'profile' not in self.state:
            self.state['profile'] = 0

        ###################################
        # Step 1. Compile training function
        ###################################
        logger.debug('Constructing grad function')
        loc_data = self.gdata
        self.prop_exprs = [x[1] for x in model.properties]
        self.prop_names = [x[0] for x in model.properties]
        self.update_rules = [x[1] for x in model.updates]
        rval = theano.clone(model.param_grads + self.update_rules + \
                            self.prop_exprs + [model.train_cost],
                            replace={k:v for k, v in zip(model.inputs, loc_data)})
        nparams = len(model.params)
        nouts = len(self.prop_exprs)
        nrules = len(self.update_rules)
        gs = rval[:nparams]
        rules = rval[nparams:nparams + nrules]
        outs = rval[nparams + nrules:]

        norm_gs = TT.sqrt(
            sum(
                TT.sum(x**2) for x, p in zip(gs, self.model.params)
                if p not in self.model.exclude_params_for_norm))
        if 'cutoff' in state and state['cutoff'] > 0:
            c = numpy.float32(state['cutoff'])
            if state['cutoff_rescale_length']:
                c = c * TT.cast(loc_data[0].shape[0], 'float32')

            notfinite = TT.or_(TT.isnan(norm_gs), TT.isinf(norm_gs))
            _gs = []
            for g, p in zip(gs, self.model.params):
                if p not in self.model.exclude_params_for_norm:
                    tmpg = TT.switch(TT.ge(norm_gs, c), g * c / norm_gs, g)
                    _gs.append(
                        TT.switch(notfinite,
                                  numpy.float32(.1) * p, tmpg))
                else:
                    _gs.append(g)
            gs = _gs
        store_gs = [(s, g) for s, g in zip(self.gs, gs)]
        updates = store_gs + [(s[0], r) for s, r in zip(model.updates, rules)]

        rho = self.state['adarho']
        eps = self.state['adaeps']

        # grad2
        gnorm2_up = [
            rho * gn2 + (1. - rho) * (g**2.)
            for gn2, g in zip(self.gnorm2, gs)
        ]
        updates = updates + [(gn1, gn2)
                             for gn1, gn2 in zip(self.gnorm2, gnorm2_up)]

        logger.debug('Compiling grad function')
        st = time.time()
        self.train_fn = theano.function([],
                                        outs,
                                        name='train_function',
                                        updates=updates,
                                        givens=zip(model.inputs, loc_data))
        logger.debug('took {}'.format(time.time() - st))

        self.lr = numpy.float32(1.)
        new_params = [
            p - (TT.sqrt(dn2 + eps) / TT.sqrt(gn2 + eps)) * g for p, g, gn2,
            dn2 in zip(model.params, self.gs, self.gnorm2, self.dnorm2)
        ]

        updates = [(a, b) for a, b in zip(model.params, new_params)]
        # d2
        d2_up = [(dn2, rho * dn2 + (1. - rho) *
                  (((TT.sqrt(dn2 + eps) / TT.sqrt(gn2 + eps)) * g)**2.))
                 for dn2, gn2, g in zip(self.dnorm2, self.gnorm2, self.gs)]
        updates = updates + d2_up

        self.update_fn = theano.function([], [],
                                         name='update_function',
                                         allow_input_downcast=True,
                                         updates=updates)

        self.old_cost = 1e20
        self.schedules = model.get_schedules()
        self.return_names = self.prop_names + \
                ['cost',
                'error',
                'time_step',
                'whole_time', 'lr']
        self.prev_batch = None
Code example #39
def SignTheano(x):
    return T.cast(2.*T.ge(x,0)-1., theano.config.floatX)
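
A usage sketch, assuming SignTheano as defined above; unlike numpy.sign, zero maps to +1 because T.ge(0, 0) is 1:

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
f = theano.function([x], SignTheano(x))
print(f(np.array([-2.0, 0.0, 3.0], dtype='float32')))   # -> [-1.  1.  1.]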
Code example #40
    def build(self):
        # local graph context
        g_sym = T.imatrix('g')  # a pair of node index (an edge)
        gy_sym = T.vector(
            'gy')  # label of a pair (indicating whether it is a false edge)
        l_g_in = lasagne.layers.InputLayer(shape=(None, 2), input_var=g_sym)
        # l_gy_in = lasagne.layers.InputLayer(shape=(None,), input_var=gy_sym)
        # embedding of node i (pivot node)
        l_emb_local_i = lasagne.layers.SliceLayer(l_g_in, indices=0, axis=1)
        l_emb_local_i = lasagne.layers.EmbeddingLayer(
            l_emb_local_i,
            input_size=self.num_nodes,
            output_size=self.embedding_dim)
        # embedding of node j (context node)
        l_emb_local_j = lasagne.layers.SliceLayer(l_g_in, indices=1, axis=1)
        l_emb_local_j = lasagne.layers.EmbeddingLayer(
            l_emb_local_j,
            input_size=self.num_nodes,
            output_size=self.embedding_dim)
        l_gy = lasagne.layers.ElemwiseMergeLayer(
            [l_emb_local_i, l_emb_local_j], T.mul)
        pgy_sym = lasagne.layers.get_output(l_gy)
        g_loss = -T.log(T.nnet.sigmoid(T.sum(pgy_sym, axis=1) * gy_sym)).sum()
        g_params = lasagne.layers.get_all_params(l_gy, trainable=True)
        g_updates = lasagne.updates.sgd(g_loss,
                                        g_params,
                                        learning_rate=self.g_learning_rate)
        self.graph_fn = theano.function([g_sym, gy_sym],
                                        g_loss,
                                        updates=g_updates,
                                        on_unused_input='warn')

        self.embedding = l_emb_local_i.W

        # local attributes
        ind_sym = T.ivector('ind')
        l_ind_in = lasagne.layers.InputLayer(shape=(None, ), input_var=ind_sym)
        # embedding of current node
        l_emb_f = lasagne.layers.EmbeddingLayer(l_ind_in,
                                                input_size=self.num_nodes,
                                                output_size=self.embedding_dim,
                                                W=self.embedding)
        x_sym = {}
        y_sym = T.vector('y')
        l_x_in = {}
        l_x_hid = {}
        attr_loss = {}
        for n in self.schema["nodes"]:
            x_sym[n] = sparse.csr_matrix(n, dtype='float32')
            l_x_in[n] = lasagne.layers.InputLayer(
                shape=(None, self.schema["nodes"][n]), input_var=x_sym[n])
            l_x_hid[n] = layers.SparseLayer(l_x_in[n], self.embedding_dim)
            l_ay = lasagne.layers.ElemwiseMergeLayer([l_x_hid[n], l_emb_f],
                                                     T.mul)
            pay_sym = lasagne.layers.get_output(l_ay)
            attr_loss[n] = -T.log(
                T.nnet.sigmoid(T.sum(pay_sym, axis=1) * y_sym)).sum()
            attr_params = lasagne.layers.get_all_params(l_ay, trainable=True)
            attr_updates = lasagne.updates.sgd(
                attr_loss[n], attr_params, learning_rate=self.g_learning_rate)
            self.attr_fn[n] = theano.function([x_sym[n], y_sym, ind_sym],
                                              attr_loss[n],
                                              updates=attr_updates,
                                              on_unused_input='warn')

        # alignment
        anchor_sym = T.imatrix('anchor')
        anchor_y_sym = T.vector('anchor_y')
        l_a_in = lasagne.layers.InputLayer(shape=(None, 2),
                                           input_var=anchor_sym)
        l_emb_anchor_i = lasagne.layers.SliceLayer(l_a_in, indices=0, axis=1)
        l_emb_anchor_i = lasagne.layers.EmbeddingLayer(
            l_emb_anchor_i,
            input_size=self.num_nodes,
            output_size=self.embedding_dim,
            W=self.embedding)
        l_emb_anchor_j = lasagne.layers.SliceLayer(l_a_in, indices=1, axis=1)
        l_emb_anchor_j = lasagne.layers.EmbeddingLayer(
            l_emb_anchor_j,
            input_size=self.num_nodes,
            output_size=self.embedding_dim,
            W=self.embedding)
        l_anchor_y = lasagne.layers.ElemwiseMergeLayer(
            [l_emb_anchor_i, l_emb_anchor_j], T.mul)
        p_anchor_y_sym = lasagne.layers.get_output(l_anchor_y)
        anchor_loss = -T.log(
            T.nnet.sigmoid(
                T.sum(p_anchor_y_sym, axis=1) * anchor_y_sym)).sum()
        anchor_params = lasagne.layers.get_all_params(l_anchor_y,
                                                      trainable=True)
        anchor_updates = lasagne.updates.sgd(
            anchor_loss, anchor_params, learning_rate=self.g_learning_rate)
        self.anchor_fn = theano.function([anchor_sym, anchor_y_sym],
                                         anchor_loss,
                                         updates=anchor_updates,
                                         on_unused_input='warn')

        tp_anchor_y_sym = lasagne.layers.get_output(l_anchor_y,
                                                    deterministic=True)
        tp_anchor_y_sym = T.sum(tp_anchor_y_sym, axis=1)
        acc = T.mean(T.eq(T.ge(tp_anchor_y_sym, 0), anchor_y_sym))
        self.test_fn = theano.function([anchor_sym, anchor_y_sym],
                                       acc,
                                       on_unused_input='warn')
Code example #41
File: constraints.py  Project: zhipengChen/EasyNN
 def __call__(self, p):
     p = theano.shared(p)
     p *= T.ge(p, 0.)
     return p
Code example #42
File: optimizers.py  Project: dreasysnail/SGMGT
def SGMGNHT_2(tparams, cost, inps, ntrain, lr, iterations, rho=0.9, epsilon=1e-6, resamp = 50, clip_norm=1):
    """ Additional parameters """
    mom_tparams = OrderedDict()
    xi_tparams = OrderedDict()
    #rng = np.random.RandomState(3435)
    #+ rng.normal(0,1,p0.shape())
    for k, p0 in tparams.iteritems():
        mom_tparams[k] = theano.shared(p0.get_value() * 0. +1e-1, name='%s_mom'%k) 
        xi_tparams[k] = theano.shared(p0.get_value() * 0. + 10.0, name='%s_xi'%k) 
    
    #a = theano.shared(numpy_floatX(2.))
    # m = theano.shared(numpy_floatX(1.))
    # c = theano.shared(numpy_floatX(1.))
    # sigma_p = theano.shared(numpy_floatX(10.))
    # sigma_xi = theano.shared(numpy_floatX(0.01))
    # sigma_theta = theano.shared(numpy_floatX(0.1))
    # gamma = theano.shared(numpy_floatX(1.))
    
    m = theano.shared(numpy_floatX(1.))
    c = theano.shared(numpy_floatX(3.))
    sigma_p = theano.shared(numpy_floatX(0.01))
    sigma_mom = theano.shared(numpy_floatX(10.))
    sigma_xi = theano.shared(numpy_floatX(0.01))
    gamma = theano.shared(numpy_floatX(1.0))
    
    logger = logging.getLogger('eval_ptb_sgmgnht')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('eval_ptb_sgmgnht.log')
    logger.info('a = 1, m {} c {} s_p{} s_mom{} s_xi{} g_xi{}'.format( m.get_value(), c.get_value(), sigma_p.get_value(), sigma_mom.get_value(), sigma_xi.get_value(), gamma.get_value()))
    
    p = tensor.vector('p', dtype='float32')
    
    """ default: lr=0.001 """
    
    trng = RandomStreams(123)
    
    grads = tensor.grad(cost, tparams.values())
    
    # clip norm
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p0.get_value() * 0., name='%s_grad'%k) 
                for k, p0 in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)     
    
    updates = []
 
    for p, mom, xi, g in zip(tparams.values(),mom_tparams.values(),xi_tparams.values(), gshared):
        
        g_f = (tensor.sqrt(tensor.abs_(mom+1e-100)))/m
        K_f = g_f + 4/c/(1 + tensor.exp(c*g_f))
        
        psi_f_1 = -1 + 2/( 1 + tensor.exp(-c*g_f))
        f1_f_1 = 1/2.0/m**2 *psi_f_1**2 /g_f*tensor.sgn(mom)
        #f1_f_1 = 1/2.0/m*psi_f_1**2* tensor.abs_(mom+1e-100)**(-1/2) *tensor.sgn(mom)
        psi_grad_f_1 = 2*c*tensor.exp(- c*g_f)/(1 + tensor.exp(-c*g_f))**2
        f3_f_1 =  f1_f_1**2 - 1/2.0/m**2 * psi_f_1 * psi_grad_f_1 / tensor.abs_(mom) + 1/4.0/m * psi_f_1**2 * (tensor.abs_(mom+1e-100)**(-1.5))
        
        
#        psi_f = (tensor.exp(c*g_f) - 1)/(tensor.exp(c*g_f) + 1)
#        f1_f = 1/2/m*psi_f**2 * (tensor.abs_(mom+1e-100)**(-1/2))*tensor.sgn(mom)
#        psi_grad_f = 2*c*tensor.exp(c*g_f)/(tensor.exp(c*g_f) + 1)**2
#        f3_f =  f1_f**2 - c/2/m**2 * psi_f * psi_grad_f / tensor.abs_(mom) + 1/4/m * psi_f**2 * (tensor.abs_(mom+1e-100)**(-3/2))
 
#        temp_f1 = tensor.switch(tensor.ge(g_f,0), f1_f_1, f1_f)
#        temp_f3 = tensor.switch(tensor.ge(g_f,0), f3_f_1, f3_f)       

        temp_f1 = f1_f_1
        temp_f3 = f3_f_1
        
        noise_p = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        noise_mom = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        noise_xi = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')

        # generata gamma(a,2): N(0,1)^2 = gamma(1/2,2)
        noise_temp = tensor.zeros(p.get_value().shape)
        for aa in xrange(4):
            this_noise = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32')
            noise_temp = tensor.inc_subtensor(noise_temp[:], this_noise**2)
        randmg = (noise_temp*m/2)**2*tensor.sgn(trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32'))    

        updated_p = p +  temp_f1 * lr - g * lr * ntrain * sigma_p + tensor.sqrt(2*sigma_p*lr) * noise_p
        updated_mom = (mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_mom*lr) * noise_mom)* (1-tensor.eq(tensor.mod(iterations,resamp),0)) + randmg * tensor.eq(tensor.mod(iterations,resamp),0)
        #updated_mom = mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p
        temp_xi = trng.normal(p.get_value().shape, avg = sigma_mom, std = tensor.sqrt(sigma_xi/2) , dtype='float32')
        updated_xi = (xi + temp_f3* gamma * lr - (xi - sigma_mom)*sigma_xi/(gamma+1e-10)*lr + tensor.sqrt(2*sigma_xi*lr) * noise_xi) * (1-tensor.eq(tensor.mod(iterations,resamp),resamp/2)) + temp_xi * tensor.eq(tensor.mod(iterations,resamp),resamp/2)

        updates.append((p, updated_p))
        updates.append((mom, updated_mom))
        updates.append((xi, updated_xi))
    
    f_update = theano.function([lr,ntrain,iterations], [p,mom,xi], updates=updates)
    #f_params = theano.function([], [a, m, c, mom.shape])
    return f_grad_shared, f_update
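
The resampling step above leans on the identity in the comment: each squared standard normal is Gamma(1/2, 2), so summing 2*a of them (the xrange(4) loop corresponds to a = 2) yields a Gamma(a, 2) draw. A quick NumPy check of that identity, independent of the optimizer (the sample size is arbitrary):

import numpy as np

rng = np.random.RandomState(123)
a = 2                                     # shape parameter; the loop above draws 2*a = 4 squared normals
samples = np.sum(rng.randn(100000, 2 * a) ** 2, axis=1)

# Gamma(shape=a, scale=2) has mean 2*a and variance 4*a
print(samples.mean(), 2 * a)              # close to 4
print(samples.var(), 4 * a)               # close to 8
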
Code example #43
def clip_norm(g, c, n):
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g
Code example #44
 def error(self, y, threshold=0.5):
     return tensor.mean(
         tensor.eq(tensor.ge(self.prediction(), threshold), y))
Code example #45
File: distributions.py Project: ESRogs/mcex
 def dist(value):
     return switch (ge(value , 0) & ge(n , value) & ge(p , 0) & le(p , 1),
                switch(ne(value , 0) , value*log(p), 0) + (n-value)*log(1-p) + factln(n)-factln(value)-factln(n-value),
                -inf)
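
The expression above is the binomial log-pmf, value*log(p) + (n - value)*log(1 - p) + log C(n, value), guarded so that out-of-support inputs return -inf. As a sanity check of the formula, a plain NumPy/SciPy sketch (factln here is a stand-in for the module's log-factorial helper):

import numpy as np
from scipy.special import gammaln
from scipy.stats import binom

def factln(x):
    # log(x!) via the log-gamma function
    return gammaln(x + 1)

n, p, value = 10, 0.3, 4
logpmf = (value * np.log(p) + (n - value) * np.log(1 - p)
          + factln(n) - factln(value) - factln(n - value))
print(logpmf, binom.logpmf(value, n, p))   # the two numbers should agree
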
Code example #46
    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
        """Return the shape of the output from this op, for input of given
        shape and flags.

        Parameters
        ----------
        imgshape : tuple of integers or scalar Theano variables
            the shape of a tensor of images. The last two elements are
            interpreted as the number of rows, and the number of cols.
        ds : tuple of two ints
            downsample factor over rows and columns; this parameter
            indicates the size of the pooling region.
        st : tuple of two ints
            the stride size. This is the distance between the pooling
            regions. If it is None, it is considered equal to ds.
        ignore_border : bool
            if ds doesn't divide imgshape, do we include an extra
            row/col of partial downsampling (False) or ignore it
            (True).
        padding : tuple of two ints
            (pad_h, pad_w), pad zeros to extend beyond four borders of
            the images, pad_h is the size of the top and bottom
            margins, and pad_w is the size of the left and right
            margins.

        Returns
        -------
        list :
            the shape of the output from this op, for input of given
            shape.  This will have the same length as imgshape, but
            with last two elements reduced as per the downsampling &
            ignore_border flags.

        """
        if len(imgshape) < 2:
            raise TypeError('imgshape must have at least two elements '
                            '(rows, cols)')

        if st is None:
            st = ds
        r, c = imgshape[-2:]
        r += padding[0] * 2
        c += padding[1] * 2

        if ignore_border:
            out_r = (r - ds[0]) // st[0] + 1
            out_c = (c - ds[1]) // st[1] + 1
            if isinstance(r, theano.Variable):
                nr = tensor.maximum(out_r, 0)
            else:
                nr = numpy.maximum(out_r, 0)
            if isinstance(c, theano.Variable):
                nc = tensor.maximum(out_c, 0)
            else:
                nc = numpy.maximum(out_c, 0)
        else:
            if isinstance(r, theano.Variable):
                nr = tensor.switch(
                    tensor.ge(st[0], ds[0]), (r - 1) // st[0] + 1,
                    tensor.maximum(0, (r - 1 - ds[0]) // st[0] + 1) + 1)
            elif st[0] >= ds[0]:
                nr = (r - 1) // st[0] + 1
            else:
                nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1

            if isinstance(c, theano.Variable):
                nc = tensor.switch(
                    tensor.ge(st[1], ds[1]), (c - 1) // st[1] + 1,
                    tensor.maximum(0, (c - 1 - ds[1]) // st[1] + 1) + 1)
            elif st[1] >= ds[1]:
                nc = (c - 1) // st[1] + 1
            else:
                nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1

        rval = list(imgshape[:-2]) + [nr, nc]
        return rval
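
As a concrete instance of the arithmetic above with ignore_border=True: an input of shape (8, 3, 28, 28) with ds=(2, 2), st=(3, 3) and padding=(1, 1) has padded rows/cols of 30, so each spatial output dimension is (30 - 2) // 3 + 1 = 10. A small sketch of the same formulas on plain integers (the symbolic branches are not needed for concrete shapes):

def out_shape_int(imgshape, ds, st=None, padding=(0, 0), ignore_border=True):
    # same arithmetic as out_shape above, restricted to concrete integer shapes
    if st is None:
        st = ds
    r = imgshape[-2] + 2 * padding[0]
    c = imgshape[-1] + 2 * padding[1]
    if ignore_border:
        nr = max((r - ds[0]) // st[0] + 1, 0)
        nc = max((c - ds[1]) // st[1] + 1, 0)
    else:
        nr = (r - 1) // st[0] + 1 if st[0] >= ds[0] else max(0, (r - 1 - ds[0]) // st[0] + 1) + 1
        nc = (c - 1) // st[1] + 1 if st[1] >= ds[1] else max(0, (c - 1 - ds[1]) // st[1] + 1) + 1
    return list(imgshape[:-2]) + [nr, nc]

print(out_shape_int((8, 3, 28, 28), ds=(2, 2), st=(3, 3), padding=(1, 1)))   # [8, 3, 10, 10]
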
Code example #47
File: optimizers.py Project: dreasysnail/SGMGT
def SGMGHMC_old(tparams, cost, inps, ntrain, lr, iterations, rho=0.9, epsilon=1e-6, a_i = 2, clip_norm=5):
    """ Additional parameters """
    mom_tparams = OrderedDict()
    xi_tparams = OrderedDict()
    for k, p0 in tparams.iteritems():
        mom_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_mom'%k) 
        xi_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_xi'%k) 
    
    a = theano.shared(numpy_floatX(1.))
    m = theano.shared(numpy_floatX(1.))
    c = theano.shared(numpy_floatX(5.))
    sigma_p = theano.shared(numpy_floatX(10.))
    sigma_xi = theano.shared(numpy_floatX(1.))
    gamma_xi = theano.shared(numpy_floatX(0.001))
    logger = logging.getLogger('eval_ptb_sgmgnht')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('eval_ptb_sgmgnht.log')
    logger.info('a {} m {} c {} s_p{} s_xi{} g_xi{}'.format(a.get_value(), m.get_value(), c.get_value(), sigma_p.get_value(), sigma_xi.get_value(), gamma_xi.get_value()))
    
    p = tensor.vector('p', dtype='float32')
    
    """ default: lr=0.001 """
    
    trng = RandomStreams(123)
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    # clip gradients by global norm (symbolic branch via tensor.switch, since a
    # Python `if` cannot branch on a symbolic value)
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g*clip_norm/norm, g)
             for g in grads]
        
    gshared = [theano.shared(p0.get_value() * 0., name='%s_grad'%k) 
                for k, p0 in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)     
    
    updates = []
 
    for p, mom, xi, g in zip(tparams.values(),mom_tparams.values(),xi_tparams.values(), gshared):
        
        g_f = tensor.sgn(mom)/m*(tensor.abs_(mom)**(1/a))
        K_f = -g_f + 2/c*(c*g_f + tensor.log(1+tensor.exp(-c*g_f)))
        
        psi_f_1 = (1- tensor.exp(-c*g_f) )/( 1 + tensor.exp(-c*g_f) )
        f1_f_1 = 1/m/a*psi_f_1*(tensor.abs_(mom+1e-100)**(1/a-1))
        psi_grad_f_1 = 2*c*tensor.exp(- c*g_f)/(1 + tensor.exp(-c*g_f))**2
        f3_f_1 =  1/m**2/a**2*(psi_f_1**2-psi_grad_f_1)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f_1*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2)
        
        psi_f = (tensor.exp(c*g_f) - 1)/(tensor.exp(c*g_f) + 1)
        f1_f = 1/m/a*psi_f*(tensor.abs_(mom+1e-100)**(1/a-1))
        psi_grad_f = 2*c*tensor.exp(c*g_f)/(tensor.exp(c*g_f) + 1)**2
        f3_f =  1/m**2/a**2*(psi_f**2-psi_grad_f)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2)
 
        temp_f1 = tensor.switch(tensor.ge(g_f,0), f1_f_1, f1_f)
        temp_f3 = tensor.switch(tensor.ge(g_f,0), f3_f_1, f3_f)       


        noise_p = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        noise_xi = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        # generate gamma(a,2): N(0,1)^2 = gamma(1/2,2)
        noise_temp = tensor.zeros(p.get_value().shape)
        for aa in xrange(a_i*2):
            this_noise = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32')
            noise_temp = tensor.inc_subtensor(noise_temp[:], this_noise**2)
        randmg = (noise_temp*m/2)**a*tensor.sgn(trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32'))    

        
        updated_p = p +  temp_f1 * lr
        updated_mom = (mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p)* (1-tensor.eq(tensor.mod(iterations,100),0)) + randmg * tensor.eq(tensor.mod(iterations,100),0)
        #updated_mom = mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p
        temp_xi = trng.normal(p.get_value().shape, avg = sigma_p, std = tensor.sqrt(sigma_xi/2) , dtype='float32')
        updated_xi = (xi + temp_f3* sigma_xi * lr - (xi - sigma_p)*gamma_xi*lr + tensor.sqrt(2*sigma_xi*gamma_xi*lr) * noise_xi) * (1-tensor.eq(tensor.mod(iterations,100),50)) + temp_xi * tensor.eq(tensor.mod(iterations,100),50)
        

        updates.append((p, updated_p))
        updates.append((mom, updated_mom))
        updates.append((xi, updated_xi))
    
    f_update = theano.function([lr,ntrain,iterations], [p,mom,xi], updates=updates)
    #f_params = theano.function([], [a, m, c, mom.shape])
    return f_grad_shared, f_update
Code example #48
def huber_loss(y_hat, target, delta=1, center=0, std=1):

    l1_diff = abs((target - center - y_hat) / std)
    # linear branch; (2 * l1_diff - delta) * delta meets the quadratic branch at l1_diff == delta
    huber_loss = TT.switch(TT.ge(l1_diff, delta), (2 * l1_diff - delta) * delta,
                           l1_diff**2)
    return huber_loss
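
A minimal NumPy rendering of the same piecewise rule, useful for eyeballing where the loss switches from quadratic to linear (the inputs are made-up values, not from any dataset):

import numpy as np

def huber_loss_np(y_hat, target, delta=1.0, center=0.0, std=1.0):
    # same piecewise rule as above, on NumPy arrays
    l1_diff = np.abs((target - center - y_hat) / std)
    return np.where(l1_diff >= delta, (2 * l1_diff - delta) * delta, l1_diff ** 2)

residuals = np.array([0.0, 0.5, 1.0, 2.0, 5.0])
print(huber_loss_np(np.zeros_like(residuals), residuals, delta=1.0))
# [0.   0.25 1.   3.   9.  ]  -- quadratic below delta, linear above
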
Code example #49
File: utils.py Project: arindam-halder/bio-cnn
def snelu(X):
    scale = 1.0507009873554804934193349852946
    alpha = 1.6732632423543772848170429916717
    return scale * T.switch(T.ge(X, 0), X, alpha * T.exp(X) - alpha)
Code example #50
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            steps and B is the batch size. Note that this axis order is
            different from what the RNN bricks consume, hence the axes
            should be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.

        """
        word_ids = self._word_to_id(words)

        # shortlisting
        input_word_ids = (tensor.lt(word_ids, self._num_input_words) * word_ids
                          + tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (tensor.lt(word_ids, self._num_output_words) * word_ids
                          + tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(
            unk_ratio(input_word_ids, mask, self._vocab.unk),
            name='unk_ratio')

        # Run the main rnn with combined inputs
        rnn_inputs = self._main_lookup.apply(input_word_ids)

        encoder_rnn_states = self._encoder_rnn.apply(
            tensor.transpose(self._encoder_fork.apply(rnn_inputs), (1, 0, 2)),
            mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(encoder_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(
                out_softmax.copy(), name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(
            targets, logits, extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                               targets_mask,
                               "perplexity")

        missing_embs = tensor.eq(input_word_ids, self._vocab.unk).astype('int32') # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                               targets_mask * missing_embs.T[:-1],
                               "perplexity_after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                               targets_mask * (1-missing_embs.T[:-1]),
                               "perplexity_after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            very_rare_mask = tensor.lt(word_counts, threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                   very_rare_mask,
                                   "perplexity_after_very_rare_" + str(threshold))

        if self._retrieval:
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(has_def[def_map[:,0], def_map[:,1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                   mask_targets_has_def,
                                   "perplexity_after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold, very_rare_masks):
                self.add_perplexity_measure(application_call, minus_logs,
                                   very_rare_mask * mask_targets_has_def,
                                   "perplexity_after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(
                    mask_targets_has_def.T, name='mask_def_emb')

        return costs, updates
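
The shortlisting lines above map any word id at or beyond the cut-off to the UNK id by combining lt/ge masks, avoiding a per-element conditional. A NumPy sketch of the same trick (num_input_words and unk_id are illustrative values, not taken from the model):

import numpy as np

word_ids = np.array([[3, 17, 250, 9],
                     [999, 1, 42, 500]])
num_input_words = 100          # hypothetical shortlist size
unk_id = 0                     # hypothetical UNK index

input_word_ids = ((word_ids < num_input_words) * word_ids
                  + (word_ids >= num_input_words) * unk_id)
print(input_word_ids)
# [[ 3 17  0  9]
#  [ 0  1 42  0]]
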
Code example #51
 def leaky(self, X):
     return T.switch(T.ge(X, 0), X, self.leak * X)
Code example #52
        gsums = [
            theano.shared(np.zeros_like(param.get_value(borrow=True)))
            for param in net.params
        ]

    cost = net.cost(y) + L2_REG * net.L2_sqr

    gparams = T.grad(cost, net.params)
    updates = OrderedDict()

    # Compute norm of gradients
    norm = T.sqrt(T.sum([T.sum(gparam**2) for gparam in gparams]))

    # Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011)
    for gparam, param, gsum in zip(gparams, net.params, gsums):
        gparam = T.switch(T.ge(norm, CLIPPING_THRESHOLD),
                          gparam / norm * CLIPPING_THRESHOLD,
                          gparam)  # Clipping of gradients
        updates[gsum] = gsum + (gparam**2)
        updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6)))

    train_model = theano.function(inputs=[x, p, y, lr],
                                  outputs=cost,
                                  updates=updates)

    validate_model = theano.function(inputs=[x, p, y], outputs=net.cost(y))

    print("Training...")
    for epoch in range(starting_epoch, MAX_EPOCHS):
        t0 = time()
        total_neg_log_likelihood = 0
Code example #53
File: optimizers.py Project: dreasysnail/SGMGT
def SGMGHMC_p(tparams, cost, inps, ntrain, lr, rho=0.9, epsilon=1e-6, clip_norm=0.1):
    """ Additional parameters """
    mom_tparams = OrderedDict()
    xi_tparams = OrderedDict()
    for k, p0 in tparams.iteritems():
        mom_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_mom'%k) 
        xi_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_xi'%k) 
    
    a = theano.shared(numpy_floatX(2.))
    m_org = theano.shared(numpy_floatX(5.))
    c = theano.shared(numpy_floatX(5.))
    sigma_p = theano.shared(numpy_floatX(10.))
    sigma_xi = theano.shared(numpy_floatX(0.001))
    gamma_xi = theano.shared(numpy_floatX(1))
    logger = logging.getLogger('eval_ptb_sgmgnht')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('eval_ptb_sgmgnht.log')
    logger.info('a {} m {} c {} s_p{} s_xi{} g_xi{}'.format(a.get_value(), m_org.get_value(), c.get_value(), sigma_p.get_value(), sigma_xi.get_value(), gamma_xi.get_value()))
    
    p = tensor.vector('p', dtype='float32')
    
    """ default: lr=0.001 """
    
    trng = RandomStreams(123)
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    # clip gradients by global norm (symbolic branch via tensor.switch, since a
    # Python `if` cannot branch on a symbolic value)
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g*clip_norm/norm, g)
             for g in grads]
        
    gshared = [theano.shared(p0.get_value() * 0., name='%s_grad'%k) 
                for k, p0 in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)     
    
    updates = []
    # reset mom
    # counter = theano.shared(numpy_floatX(0.))
    # updates.append((counter,counter+1))
 
    for p, mom, xi, g in zip(tparams.values(),mom_tparams.values(),xi_tparams.values(), gshared):
        # RMSprop-style running average of squared gradients, used below as the mass m
        t = theano.shared(p.get_value() * 0.)
        t_new = rho * t + (1-rho) * g**2
        updates.append((t, t_new))

        m = (tensor.sqrt(t_new) + 1e-10)
        m = m/tensor.max(m)*m_org
        #m = tensor.switch(tensor.ge(m,1*m_org), 1*m_org, m)
        m = tensor.switch(tensor.le(m,m_org*0.01), m_org*0.01, m)
        
        g_f = tensor.sgn(mom)/m*(tensor.abs_(mom)**(1/a))
        K_f = -g_f + 2/c*(c*g_f + tensor.log(1+tensor.exp(-c*g_f)))
        
        psi_f_1 = (1- tensor.exp(-c*g_f) )/( 1 + tensor.exp(-c*g_f) )
        f1_f_1 = 1/m/a*psi_f_1*(tensor.abs_(mom+1e-100)**(1/a-1))
        psi_grad_f_1 = 2*c*tensor.exp(- c*g_f)/(1 + tensor.exp(-c*g_f))**2
        f3_f_1 =  1/m**2/a**2*(psi_f_1**2-psi_grad_f_1)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f_1*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2)
        
        psi_f = (tensor.exp(c*g_f) - 1)/(tensor.exp(c*g_f) + 1)
        f1_f = 1/m/a*psi_f*(tensor.abs_(mom+1e-100)**(1/a-1))
        psi_grad_f = 2*c*tensor.exp(c*g_f)/(tensor.exp(c*g_f) + 1)**2
        f3_f =  1/m**2/a**2*(psi_f**2-psi_grad_f)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2)
 
        temp_f1 = tensor.switch(tensor.ge(g_f,0), f1_f_1, f1_f)
        temp_f3 = tensor.switch(tensor.ge(g_f,0), f3_f_1, f3_f)       


        noise_p = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        noise_xi = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        
        #lr_new = 1 / tensor.sqrt(tensor.abs_(temp_f1)) * lr 
        lr_new = lr
        updated_p = p +  temp_f1 * lr_new
        #updated_mom = (mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p)* (1-tensor.eq(tensor.mod(iterations,100),0)) + randmg * tensor.eq(tensor.mod(iterations,100),0)
        updated_mom = mom - 1.2*temp_f1* xi *lr_new  - g * lr_new * ntrain + tensor.sqrt(2*sigma_p*lr_new) * noise_p
        updated_xi = xi + temp_f3* sigma_xi * lr_new - (xi - sigma_p)*gamma_xi*lr_new + tensor.sqrt(2*sigma_xi*gamma_xi*lr_new) * noise_xi 
        

        updates.append((p, updated_p))
            
        updates.append((mom, updated_mom))
        updates.append((xi, updated_xi))
    
    f_update = theano.function([lr,ntrain], [p,mom,m], updates=updates)
    
    return f_grad_shared, f_update
Code example #54
 def elu(self, X):
     return T.switch(T.ge(X, 0), X, self.elu_param * (T.exp(X) - 1))
Code example #55
File: theano_backend.py Project: ajohi/keras
def ge(x, y):
    return T.ge(x, y)
Code example #56
    def RMSprop(self,
                cost,
                params,
                full_params,
                sampled_params,
                sidxs,
                epsilon=1e-6):
        grads = [T.grad(cost=cost, wrt=param) for param in params]
        sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
        updates = OrderedDict()
        if self.grad_cap > 0:
            norm = T.cast(
                T.sqrt(
                    T.sum([
                        T.sum([T.sum(g**2) for g in g_list])
                        for g_list in grads
                    ]) + T.sum([T.sum(g**2) for g in sgrads])),
                theano.config.floatX)
            grads = [[
                T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                         g) for g in g_list
            ] for g_list in grads]
            sgrads = [
                T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                         g) for g in sgrads
            ]
        for p_list, g_list in zip(params, grads):
            for p, g in zip(p_list, g_list):
                if self.adapt:
                    if self.adapt == 'adagrad':
                        g = self.adagrad(p, g, updates)
                    if self.adapt == 'rmsprop':
                        g = self.rmsprop(p, g, updates)
                    if self.adapt == 'adadelta':
                        g = self.adadelta(p, g, updates)
                    if self.adapt == 'adam':
                        g = self.adam(p, g, updates)
                if self.momentum > 0:
                    velocity = theano.shared(p.get_value(borrow=False) * 0.,
                                             borrow=True)
                    velocity2 = self.momentum * velocity - np.float32(
                        self.learning_rate) * (g + self.lmbd * p)
                    updates[velocity] = velocity2
                    updates[p] = p + velocity2
                else:
                    updates[p] = p * np.float32(1.0 - self.learning_rate *
                                                self.lmbd) - np.float32(
                                                    self.learning_rate) * g

        fgrads = [
            T.grad(cost=cost, wrt=full_param) for full_param in full_params
        ]
        for p, g in zip(full_params, fgrads):
            if self.adapt:
                if self.adapt == 'adagrad':
                    g = self.adagrad(p, g, updates)
                if self.adapt == 'rmsprop':
                    g = self.rmsprop(p, g, updates)
                if self.adapt == 'adadelta':
                    g = self.adadelta(p, g, updates)
                if self.adapt == 'adam':
                    g = self.adam(p, g, updates)
            if self.momentum > 0:
                velocity = theano.shared(p.get_value(borrow=False) * 0.,
                                         borrow=True)
                velocity2 = self.momentum * velocity - np.float32(
                    self.learning_rate) * (g + self.lmbd * p)
                updates[velocity] = velocity2
                updates[p] = p + velocity2
            else:
                updates[p] = p * np.float32(1.0 - self.learning_rate *
                                            self.lmbd) - np.float32(
                                                self.learning_rate) * g
        '''
        for i in range(len(sgrads)):
            g = sgrads[i]
            fullP = full_params[i]
            sample_idx = sidxs[i]
            sparam = sampled_params[i]
            if self.adapt:
                if self.adapt == 'adagrad':
                    g = self.adagrad(fullP, g, updates, sample_idx)
                if self.adapt == 'rmsprop':
                    g = self.rmsprop(fullP, g, updates, sample_idx)
                if self.adapt == 'adadelta':
                    g = self.adadelta(fullP, g, updates, sample_idx)
                if self.adapt == 'adam':
                    g = self.adam(fullP, g, updates, sample_idx)
            if self.lmbd > 0:
                delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
            else:
                delta = np.float32(self.learning_rate) * g
            if self.momentum > 0:
                velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
                vs = velocity[sample_idx]
                velocity2 = self.momentum * vs - delta
                updates[velocity] = T.set_subtensor(vs, velocity2)
                updates[fullP] = T.inc_subtensor(sparam, velocity2)
            else:
                updates[fullP] = T.inc_subtensor(sparam, - delta)
	'''
        return updates
Code example #57
n_steps = tt.iscalar("generator/n_steps")
tau = tt.fscalar("generator/gumbel/tau")

# Generator's input variables for the Encoder
v_gen_input = tt.itensor3(name="generator/input")

# Generator's embedding subnetwork readout for the Encoder
v_gen_embed = lasagne.layers.get_output(l_embed_char, v_gen_input)

# Freeze the hidden inputs of the decoder layers, which do not tap into the encoder.
for layer in dec_rnn_layers:
    GRULayer_freeze(layer, v_gen_input)

# Readout the last state from the encoder.
inputs = {l_encoder_embed: v_gen_embed,
          l_encoder_mask: tt.ge(v_gen_input, 0)}
inputs[l_stack_aug_mask] = tt.gt(tt.sum(inputs[l_encoder_mask], axis=-1), 0)

outputs = [l.hid_init for l in dec_rnn_layers]

dec_hid_inits = lasagne.layers.get_output(outputs, inputs,
                                          deterministic=True)

# Prepare the initial values fed into the scan loop of the Generator
h_0 = tt.concatenate(dec_hid_inits, axis=-1)

x_0 = tt.fill(tt.zeros((v_gen_input.shape[0],), dtype="int32"),
              vocab.index("\x02"))
x_0 = lasagne.layers.get_output(l_embed_char, x_0)

m_0 = tt.ones((v_gen_input.shape[0],), 'bool')
Code example #58
 def __init__(self,
              shape,
              input_var=None,
              name=None,
              binary=True,
              deterministic=False,
              threshold=0.5,
              batch_size=100,
              n_bits=-1,
              **kwargs):
     self.rng_mrg = RandomStreams(lasagne.random.get_rng().randint(
         1, 2394349593))
     if binary == False:
         if n_bits == -1:  # no quantization at all
             super(InputLayer, self).__init__(shape=shape,
                                              input_var=input_var,
                                              name=name,
                                              **kwargs)
         else:
             # Normalize to [0 ~ 1 - 2^(-n_bits)]
             input_var_normed = input_var * (1 - 2**(-n_bits))
             if deterministic == False:
                 shape_rand = list(shape)
                 if shape_rand[0] is None:
                     shape_rand[0] = batch_size
                 shape_rand = tuple(shape_rand)
                 input_var_ceil = T.ceil(
                     input_var_normed * 2**n_bits) / 2**n_bits
                 input_var_floor = T.floor(
                     input_var_normed * 2**n_bits) / 2**n_bits
                 input_var_above_floor = input_var - input_var_floor
                 input_var_stochastic_quantized = T.cast(
                     T.switch(
                         T.ge(
                             input_var_above_floor,
                             self.rng_mrg.uniform(
                                 shape_rand,
                                 low=0.0,
                                 high=2**(-n_bits),
                                 dtype=theano.config.floatX)),
                         input_var_ceil, input_var_floor),
                     theano.config.floatX)
                 super(InputLayer, self).__init__(
                     shape=shape,
                     input_var=input_var_stochastic_quantized,
                     name=name,
                     **kwargs)
             else:
                 input_var_deterministic_quantized = T.cast(
                     T.round(input_var_normed * 2**n_bits) / 2**n_bits,
                     theano.config.floatX)
                 super(InputLayer, self).__init__(
                     shape=shape,
                     input_var=input_var_deterministic_quantized,
                     name=name,
                     **kwargs)
     else:
         if deterministic == False:
             shape_rand = list(shape)
             if shape_rand[0] is None:
                 shape_rand[0] = batch_size
             shape_rand = tuple(shape_rand)
             # Bernoulli spikes
             input_var_stochastic_binarized = T.cast(
                 T.gt(
                     input_var,
                     self.rng_mrg.uniform(shape_rand,
                                          low=0.0,
                                          high=1.0,
                                          dtype=theano.config.floatX)),
                 theano.config.floatX)
             super(InputLayer,
                   self).__init__(shape=shape,
                                  input_var=input_var_stochastic_binarized,
                                  name=name,
                                  **kwargs)
         else:
             input_var_deterministic_binarized = T.cast(
                 T.switch(T.ge(input_var, threshold), 1.0, 0.),
                 theano.config.floatX)
             super(InputLayer, self).__init__(
                 shape=shape,
                 input_var=input_var_deterministic_binarized,
                 name=name,
                 **kwargs)
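
The stochastic branch above rounds each normalised input either down or up to the nearest multiple of 2**(-n_bits), choosing "up" with probability proportional to the distance from the floor. A standalone NumPy sketch of that rounding rule (an illustration of the idea, not a transcription of the layer):

import numpy as np

def stochastic_quantize(x, n_bits, rng):
    # x is assumed to lie in [0, 1]; normalise to [0, 1 - 2**-n_bits] as above
    x = x * (1 - 2.0 ** (-n_bits))
    floor = np.floor(x * 2 ** n_bits) / 2 ** n_bits
    ceil = np.ceil(x * 2 ** n_bits) / 2 ** n_bits
    go_up = (x - floor) >= rng.uniform(0.0, 2.0 ** (-n_bits), size=x.shape)
    return np.where(go_up, ceil, floor)

rng = np.random.RandomState(0)
x = np.full(100000, 0.3)
q = stochastic_quantize(x, n_bits=2, rng=rng)
print(np.unique(q), q.mean())   # grid values {0.0, 0.25}; mean close to 0.3 * (1 - 2**-2) = 0.225
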
Code example #59
File: minres.py Project: yo-ga/TextDetector
    def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn,
             Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm,
             relrnorm, relArnorml, Anorm, flag, *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params:1 * n_params]
        r1s = args[1 * n_params:2 * n_params]
        r2s = args[2 * n_params:3 * n_params]
        r3s = args[3 * n_params:4 * n_params]
        dls = args[4 * n_params:5 * n_params]
        ds = args[5 * n_params:6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)

        r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
        r3s = [
            TT.switch(TT.ge(niter, constantX(1.)), r3 - (beta / betal) * r1,
                      r3) for r3, r1 in zip(r3s, r1s)
        ]

        alpha = inner_product(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = sqrt_inner_product(r2s, r3s)
        else:
            betan = sqrt_inner_product(r3s)
        pnorml = pnorm
        pnorm = TT.switch(
            TT.eq(niter, constantX(0.)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [
            TT.switch(TT.neq(gamma, constantX(0.)),
                      (v - epln * dl2 - dlta * dl) / gamma, v)
            for v, dl2, dl in zip(vs, dl2s, dls)
        ]
        d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                           sqrt_inner_product(ds), constantX(numpy.inf))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = sqrt_inner_product(xs)
        xs = [
            TT.switch(TT.ge(xnorm, maxxnorm), dl2, x)
            for dl2, x in zip(dl2s, xs)
        ]

        flag = TT.switch(TT.ge(xnorm, maxxnorm), constantX(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, constantX(6.))
        Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(
            flag_no_6,
            TT.switch(
                TT.eq(niter, constantX(0.)),
                TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                TT.sqrt(
                    TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) +
                    TT.sqr(betan))), Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        #Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = constantX(1) + relrnorm
        t2 = constantX(1) + relArnorml

        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, constantX(0)), TT.eq(flag,
                                                           constantX(6))),
            multiple_switch(TT.le(t1, constantX(1)), constantX(3),
                            TT.le(t2, constantX(1)), constantX(4),
                            TT.le(relrnorm, rtol), constantX(1),
                            TT.le(Anorm, constantX(1e-20)), constantX(12),
                            TT.le(relArnorml, rtol), constantX(10),
                            TT.ge(epsx, beta1), constantX(5),
                            TT.ge(xnorm, maxxnorm), constantX(6),
                            TT.ge(niter, TT.cast(maxit, theano.config.floatX)),
                            constantX(8), flag), flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm), constantX(11.),
                         flag)
        return [niter + constantX(1.),
                beta,
                betan,
                phi,
                Acond,
                cs,
                dbarn,
                eplnn,
                rnorm,
                sn,
                Tnorm,
                rnorml,
                xnorm,
                Dnorm,
                gamma,
                pnorm,
                gammal,
                Axnorm,
                relrnorm,
                relArnorml,
                Anorm,
                flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag, 0))
Code example #60
def clip_gradients(gradients, grad_clip=5., hard_clip=False):
    """
    This returns the gradient parameters clipped according to the grad_clip value given in initialization.

    As described here: http://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/

    Code mostly taken from https://github.com/kastnerkyle/minet/blob/master/minet/net.py

    Based on:

    Pascanu, Razvan, Tomas Mikolov, and Yoshua Bengio. "On the difficulty of training
            recurrent neural networks." arXiv preprint arXiv:1211.5063 (2012).

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    grad_clip : float, optional
        How much to clip gradients (if at all).
    hard_clip : bool
        Whether to use hard clipping (keeping gradients at grad_clip level), or soft clipping (rescaling based
        on grad_clip).

    Returns
    -------
    clipgrads : dict
        A dictionary mapping from the model's parameters to their correctly clipped
        gradients. (If grad_clip is not set, this just returns the original `gradients` input parameter.)
    """
    if grad_clip:
        gradients = gradients.items()
        params = [item[0] for item in gradients]
        grads = [item[1] for item in gradients]

        # Gradient clipping
        # squared global norm; the square root is taken after the finiteness check
        grad_norm = sum([T.sqr(grad).sum() for grad in grads])
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        grad_norm = T.sqrt(grad_norm)
        scaling_num = grad_clip
        scaling_den = T.maximum(grad_clip, grad_norm)

        if hard_clip:
            # do the NaN/inf trick
            grads = [T.switch(not_finite,
                              0.1 * param,
                              grad)
                     for param, grad in gradients]
            # hard clip gradients above or below grad_clip to be = grad_clip
            grads = [T.switch(T.ge(grad_norm, grad_clip),
                              T.sgn(grad) * grad_clip,
                              grad)
                     for grad in grads]
        else:
            # NaN/inf trick combined with scaling.
            grads = [T.switch(not_finite,
                              0.1 * param,
                              grad * (scaling_num / scaling_den))
                     for param, grad in gradients]

        clipgrads = OrderedDict(zip(params, grads))
        return clipgrads
    else:
        return gradients
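
A minimal usage sketch for the soft-clipping path, assuming the clip_gradients function above is in scope; the toy cost and parameter names (w, b) are made up for illustration:

import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict

w = theano.shared(np.ones(3, dtype=theano.config.floatX), name='w')
b = theano.shared(np.asarray(2.0, dtype=theano.config.floatX), name='b')
x = T.vector('x')
cost = T.sum((T.dot(x, w) + b) ** 2)

gradients = OrderedDict(zip([w, b], T.grad(cost, [w, b])))
clipped = clip_gradients(gradients, grad_clip=1., hard_clip=False)

f = theano.function([x], [clipped[w], clipped[b]])
print(f(np.array([3., 4., 5.], dtype=theano.config.floatX)))
# gradients rescaled so their joint norm is at most grad_clip
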