Example no. 1
def filter_boxes(boxes, min_size):
    """Remove all boxes with any side smaller than min_size."""
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    # keep = np.where((ws >= min_size) & (hs >= min_size))[0]
    keep = (T.ge(ws, min_size) & T.ge(hs, min_size)).nonzero()[0]
    return keep
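A quick usage sketch of the snippet above (my addition; it assumes the filter_boxes function from this example is in scope and that boxes are given as (x1, y1, x2, y2) rows):

import numpy as np
import theano
import theano.tensor as T

boxes_var = T.matrix('boxes')                      # (N, 4) boxes as (x1, y1, x2, y2)
keep_var = filter_boxes(boxes_var, min_size=16)    # symbolic indices of surviving boxes
keep_fn = theano.function([boxes_var], boxes_var[keep_var])

boxes = np.array([[0, 0, 31, 31], [0, 0, 7, 7]], dtype=theano.config.floatX)
print(keep_fn(boxes))                              # only the 32x32 box passes min_size=16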
Example no. 2
def matrix_noise3d(input_vectors, perm, grad3, vertex_table):
    skew_factors = (input_vectors[:, 0] + input_vectors[:, 1] + input_vectors[:, 2]) * 1.0 / 3.0
    skewed_vectors = T.floor(input_vectors + skew_factors[:, np.newaxis])
    unskew_factors = (skewed_vectors[:, 0] + skewed_vectors[:, 1] + skewed_vectors[:, 2]) * 1.0 / 6.0
    offsets_0 = input_vectors - (skewed_vectors - unskew_factors[:, np.newaxis])
    vertex_table_x_index = T.ge(offsets_0[:, 0], offsets_0[:, 1])
    vertex_table_y_index = T.ge(offsets_0[:, 1], offsets_0[:, 2])
    vertex_table_z_index = T.ge(offsets_0[:, 0], offsets_0[:, 2])
    simplex_vertices = vertex_table[
        vertex_table_x_index,
        vertex_table_y_index,
        vertex_table_z_index].reshape((input_vectors.shape[0], 2, 3))
    offsets_1 = offsets_0 - simplex_vertices[:, 0] + 1.0 / 6.0
    offsets_2 = offsets_0 - simplex_vertices[:, 1] + 1.0 / 3.0
    offsets_3 = offsets_0 - 0.5
    masked_skewed_vectors = T.bitwise_and(skewed_vectors.astype('int32'), 255)
    gi0s = perm[masked_skewed_vectors[:, 0] + perm[
        masked_skewed_vectors[:, 1] + perm[
            masked_skewed_vectors[:, 2]].astype('int32')].astype('int32')] % 12
    gi1s = perm[masked_skewed_vectors[:, 0] + simplex_vertices[:, 0, 0] + perm[
        masked_skewed_vectors[:, 1] + simplex_vertices[:, 0, 1] + perm[
            masked_skewed_vectors[:, 2] + simplex_vertices[:, 0, 2]].astype('int32')].astype('int32')] % 12
    gi2s = perm[masked_skewed_vectors[:, 0] + simplex_vertices[:, 1, 0] + perm[
        masked_skewed_vectors[:, 1] + simplex_vertices[:, 1, 1] + perm[
            masked_skewed_vectors[:, 2] + simplex_vertices[:, 1, 2]].astype('int32')].astype('int32')] % 12
    gi3s = perm[masked_skewed_vectors[:, 0] + 1 + perm[
        masked_skewed_vectors[:, 1] + 1 + perm[
            masked_skewed_vectors[:, 2] + 1].astype('int32')].astype('int32')] % 12
    n0s = calculate_gradient_contribution(offsets_0, gi0s, grad3)
    n1s = calculate_gradient_contribution(offsets_1, gi1s, grad3)
    n2s = calculate_gradient_contribution(offsets_2, gi2s, grad3)
    n3s = calculate_gradient_contribution(offsets_3, gi3s, grad3)
    return 23.0 * (n0s + n1s + n2s + n3s)
Example no. 3
File: uniform.py Project: ibab/carl
    def __init__(self, random_state=None, low=0.0, high=1.0):
        super(Uniform, self).__init__(low=low, high=high,
                                      random_state=random_state,
                                      optimizer=None)

        # pdf
        self.pdf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            0.,
            1. / (self.high - self.low)).ravel()
        self.make_(self.pdf_, "pdf")

        # -log pdf
        self.nnlf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            np.inf,
            T.log(self.high - self.low)).ravel()
        self.make_(self.nnlf_, "nnlf")

        # cdf
        self.cdf_ = T.switch(
            T.lt(self.X, self.low),
            0.,
            T.switch(
                T.lt(self.X, self.high),
                (self.X - self.low) / (self.high - self.low),
                1.)).ravel()
        self.make_(self.cdf_, "cdf")

        # ppf
        self.ppf_ = self.p * (self.high - self.low) + self.low
        self.make_(self.ppf_, "ppf", args=[self.p])
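For reference, a standalone sketch (my addition, independent of carl's base-class machinery) of the same piecewise construction: T.switch combined with T.lt/T.ge zeroes the density outside [low, high):

import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
low, high = 0.0, 1.0
pdf = T.switch(T.or_(T.lt(X, low), T.ge(X, high)), 0., 1. / (high - low)).ravel()
pdf_fn = theano.function([X], pdf)
x = np.array([[-0.5], [0.25], [1.0]], dtype=theano.config.floatX)
print(pdf_fn(x))  # -> [0. 1. 0.]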
Example no. 4
	def _step_test(self,
			  x_t, xi_t, xf_t, xo_t, xc_t, mask_tm1,
			  pred1_tm1, pred2_tm1, pred3_tm1, pred4_tm1, h_tm1, c_tm1, ctx_tm1, 
			  u_i, u_f, u_o, u_c, x_encoder, attention_encoder, x_img, B_W, B_U, B_Wimg, B_Wctx):

		outer1 = pred1_tm1[:, :, np.newaxis] * pred2_tm1[:, np.newaxis, :]
		outer1 =  outer1.reshape((outer1.shape[0],-1))
		outer2 = pred3_tm1[:, :, np.newaxis] * pred4_tm1[:, np.newaxis, :]
		outer2 =  outer2.reshape((outer2.shape[0],-1))
		pred = outer1[:, :, np.newaxis] * outer2[:, np.newaxis, :]
		pred =	pred.reshape((pred.shape[0],-1))
		x_t = self.W_embedding[T.argmax(pred, axis = 1)] * B_W[4]

		h_mask_tm1 = mask_tm1 * h_tm1
		c_mask_tm1 = mask_tm1 * c_tm1

		attention_x = T.dot(x_t, self.W_x2a)
		attention_total = attention_x[:,None,:] + attention_encoder
		if self.prev_context:
			attention_prev = T.dot(ctx_tm1,self.W_ctx2a)
			attention_total += attention_prev[:,None,:]

		attention_activation = T.dot( T.tanh(attention_total), self.V) # attention -> scores
		attention_alpha = T.nnet.softmax(attention_activation[:,:,0])  # scores -> weights
		ctx_t = (x_encoder * attention_alpha[:,:,None]).sum(axis = 1)  # weighted average of context vectors

		xi_t = T.dot(x_t * B_W[0], self.W_i) + self.b_i + T.dot(x_img * B_Wimg[0], self.Wimg_i) + T.dot(ctx_t * B_Wctx[0], self.Wctx_i)
		xf_t = T.dot(x_t * B_W[1], self.W_f) + self.b_f + T.dot(x_img * B_Wimg[1], self.Wimg_f) + T.dot(ctx_t * B_Wctx[1], self.Wctx_f)
		xc_t = T.dot(x_t * B_W[2], self.W_c) + self.b_c + T.dot(x_img * B_Wimg[2], self.Wimg_c) + T.dot(ctx_t * B_Wctx[2], self.Wctx_c)
		xo_t = T.dot(x_t * B_W[3], self.W_o) + self.b_o + T.dot(x_img * B_Wimg[3], self.Wimg_o) + T.dot(ctx_t * B_Wctx[3], self.Wctx_o)

		i_t = self.inner_activation(xi_t + T.dot(h_mask_tm1 * B_U[0], u_i))
		f_t = self.inner_activation(xf_t + T.dot(h_mask_tm1 * B_U[1], u_f))
		c_t = f_t * c_mask_tm1 + i_t * self.activation(xc_t + T.dot(h_mask_tm1 * B_U[2], u_c))
		o_t = self.inner_activation(xo_t + T.dot(h_mask_tm1 * B_U[3], u_o))
		h_t = o_t * self.activation(c_t)

		pred1_t = T.dot(h_t, self.U_p1) + self.b_p1
		pred1_t = T.nnet.softmax(pred1_t.reshape((-1, pred1_t.shape[-1]))).reshape(pred1_t.shape)

		pred2_t = T.dot(h_t, self.U_p2) + self.b_p2
		pred2_t = T.nnet.softmax(pred2_t.reshape((-1, pred2_t.shape[-1]))).reshape(pred2_t.shape)

		pred3_t = T.dot(h_t, self.U_p3) + self.b_p3
		pred3_t = T.nnet.softmax(pred3_t.reshape((-1, pred3_t.shape[-1]))).reshape(pred3_t.shape)

		pred4_t = T.dot(h_t, self.U_p4) + self.b_p4
		pred4_t = T.nnet.softmax(pred4_t.reshape((-1, pred4_t.shape[-1]))).reshape(pred4_t.shape)

		pred1_t = T.ge(pred1_t, T.max(pred1_t, axis = 1).reshape((pred1_t.shape[0],1)))*1.0
		pred2_t = T.ge(pred2_t, T.max(pred2_t, axis = 1).reshape((pred2_t.shape[0],1)))*1.0
		pred3_t = T.ge(pred3_t, T.max(pred3_t, axis = 1).reshape((pred3_t.shape[0],1)))*1.0
		pred4_t = T.ge(pred4_t, T.max(pred4_t, axis = 1).reshape((pred4_t.shape[0],1)))*1.0

		return pred1_t, pred2_t, pred3_t, pred4_t, h_t, c_t, ctx_t
Example no. 5
def innerL_(sS, i):
    Ei = calcEk_(sS, i)
    
    # use "+" instead of "or" and "*" instead of "and"
    checkUselessAlpha1 = T.ge(sS.labels[i] * Ei, -sS.tol) + T.ge(sS.alphas[i], sS.C)
    checkUselessAlpha2 = T.le(sS.labels[i]*Ei, sS.tol) + T.lt(sS.alphas[i], 0)
    isUselessAlpha = toTheanoBool(checkUselessAlpha1 * checkUselessAlpha2)
    
    updateL = innerL_alphaInRange_(sS, i, Ei)
    earlyret = sS.retlist(0)
    return ifelse(isUselessAlpha, earlyret, updateL)
Example no. 6
 def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6):
     grads =  [T.grad(cost = cost, wrt = param) for param in params]
     sgrads = [T.grad(cost = cost, wrt = sparam) for sparam in sampled_params]
     updates = OrderedDict()
     if self.grad_cap>0:
         norm=T.cast(T.sqrt(T.sum([T.sum([T.sum(g**2) for g in g_list]) for g_list in grads]) + T.sum([T.sum(g**2) for g in sgrads])), theano.config.floatX)
         grads = [[T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g) for g in g_list] for g_list in grads]
         sgrads = [T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g) for g in sgrads]
     for p_list, g_list in zip(params, grads):
         for p, g in zip(p_list, g_list):
             if self.adapt:
                 if self.adapt == 'adagrad':
                     g = self.adagrad(p, g, updates)
                 if self.adapt == 'rmsprop':
                     g = self.rmsprop(p, g, updates)
                 if self.adapt == 'adadelta':
                     g = self.adadelta(p, g, updates)
                 if self.adapt == 'adam':
                     g = self.adam(p, g, updates)
             if self.momentum > 0:
                 velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True)
                 velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p)
                 updates[velocity] = velocity2
                 updates[p] = p + velocity2
             else:
                 updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32(self.learning_rate) * g
     for i in range(len(sgrads)):
         g = sgrads[i]
         fullP = full_params[i]
         sample_idx = sidxs[i]
         sparam = sampled_params[i]
         if self.adapt:
             if self.adapt == 'adagrad':
                 g = self.adagrad(fullP, g, updates, sample_idx)
             if self.adapt == 'rmsprop':
                 g = self.rmsprop(fullP, g, updates, sample_idx)
             if self.adapt == 'adadelta':
                 g = self.adadelta(fullP, g, updates, sample_idx)
             if self.adapt == 'adam':
                 g = self.adam(fullP, g, updates, sample_idx)
         if self.lmbd > 0:
             delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
         else:
             delta = np.float32(self.learning_rate) * g
         if self.momentum > 0:
             velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
             vs = velocity[sample_idx]
             velocity2 = self.momentum * vs - delta
             updates[velocity] = T.set_subtensor(vs, velocity2)
             updates[fullP] = T.inc_subtensor(sparam, velocity2)
         else:
             updates[fullP] = T.inc_subtensor(sparam, - delta)
     return updates
Example no. 7
    def compute_nonlinearity_derivative(lin, bias):
        n_h = bias.shape[0]
        lin_re = lin[:, :n_h]
        lin_im = lin[:, n_h:]        
        mod = T.sqrt(lin_re**2 + lin_im**2)

        ind = T.ge(mod + bias.dimshuffle('x', 0), 0)
        opt1 = 1.
        opt2 = 1. / (1 - mod - bias.dimshuffle('x', 0))**2
        ind = T.ge(mod, 1)
        dnonlindlin = T.tile(ind * opt1 + (1-ind) * opt2, [1, 2])         

        return dnonlindlin
Example no. 8
  def cubicBSpline(self, L):
    b = T.zeros_like(L)

    idx4 = T.ge(L, 0) * T.lt(L, 1)
    idx3 = T.ge(L, 1) * T.lt(L, 2)
    idx2 = T.ge(L, 2) * T.lt(L, 3)
    idx1 = T.ge(L, 3) * T.le(L, 4)

    b = T.switch(T.eq(idx4, 1), T.pow(L, 3) / 6, b)
    b = T.switch(T.eq(idx3, 1), (-3*T.pow(L-1,3) + 3*T.pow(L-1,2) + 3*(L-1) + 1) / 6, b)
    b = T.switch(T.eq(idx2, 1), ( 3*T.pow(L-2,3) - 6*T.pow(L-2,2)           + 4) / 6, b)
    b = T.switch(T.eq(idx1, 1), (-  T.pow(L-3,3) + 3*T.pow(L-3,2) - 3*(L-3) + 1) / 6, b)
    
    return b.T # b is K x K' and thus, as we multiply from the right with
Example no. 9
File: rae.py Project: zomux/nlpy
    def _decode_step(self, seq, regs):
        left, right, target = seq[0], seq[1], seq[2]

        left_is_not_token = T.ge(left, 0)
        right_is_not_token = T.ge(right, 0)

        rep = regs[target]

        left_dec, right_dec = self._decode_computation(rep)

        regs = ifelse(left_is_not_token, T.set_subtensor(regs[left], left_dec), regs)
        regs = ifelse(right_is_not_token, T.set_subtensor(regs[right], right_dec), regs)

        return  rep, left_dec, right_dec, regs
Example no. 10
	def __init__(self, input, nfeatures, C):
		""" Initialize the parameters of the SVM
		
		input: theano.tensor.TensorType
			symbolic variable that describes the input of the architecture (one minibatch)
		
		nfeatures: number of input units, the dimension of the space in which the datapoints lie
		
		C: error penalty
		"""
		self.nfeatures = nfeatures
		Wzeros, bzero = self.GetZeroWeights()
		
		#create a column vector with nfeatures rows
		self.W = theano.shared(value=Wzeros, name='W', borrow=True)
		
		# initialize bias: a scalar of the same data type as W
		self.b = theano.shared(bzero, name='b')#, borrow=True)
		
		# initialize the error penalty C
		self.C = C
		
		# hyperplane projection used in classification
		# T.dot(input,self.W) creates a vector of shape (rows,) == (# in minibatch,)
		# adding +self.b broadcasts the bias, adding it to each row, so the result is still of shape (rows,)
		self.hplaneproject = T.dot(input, self.W) + self.b
		
		# symbolic description of how to compute prediction as -1 or 1
		# the function sign() is not in Theano,
		# so I use (x >= 0)*2 - 1 via T.ge(), which returns 1 when true and 0 when false
		self.y_pred = T.ge(self.hplaneproject, 0)*2 - 1
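The (x >= 0)*2 - 1 trick used for y_pred above, checked in isolation (my addition); note that T.ge maps 0 itself to the +1 class:

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
sign_fn = theano.function([x], T.ge(x, 0) * 2 - 1)
print(sign_fn(np.array([-3., 0., 2.], dtype=theano.config.floatX)))  # -> [-1  1  1]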
Example no. 11
def clip_grad(grads, norm, grad_clip):
    # clip the grads, when over a threshold
    _grads = []
    for g in grads:
        _grads.append( TT.switch(TT.ge(norm, grad_clip), g*grad_clip/norm, g) )
    
    return _grads
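A usage sketch (my addition; the tiny least-squares cost here is only a stand-in) pairing clip_grad with the symbolic global norm that the surrounding examples compute:

import numpy as np
import theano
import theano.tensor as TT

w = theano.shared(np.ones(3, dtype=theano.config.floatX), name='w')
x = TT.vector('x')
cost = TT.sum(TT.dot(w, x) ** 2)
grads = TT.grad(cost, [w])
norm = TT.sqrt(sum(TT.sum(g ** 2) for g in grads))
clipped = clip_grad(grads, norm, grad_clip=1.0)
f = theano.function([x], clipped)
print(f(np.ones(3, dtype=theano.config.floatX)))  # gradient rescaled so its norm is <= 1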
Example no. 12
def huber_loss(y_hat, target, delta=1, center=0, std=1):

    l1_diff = abs((target - center - y_hat) / std)
    huber_loss = TT.switch(TT.ge(l1_diff, delta),
                           (2*l1_diff - 1) * delta,
                           l1_diff**2)
    return huber_loss
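A minimal check (my addition) that the switch on TT.ge(l1_diff, delta) gives the quadratic branch below delta and the linear branch above it:

import numpy as np
import theano
import theano.tensor as TT

y_hat = TT.vector('y_hat')
target = TT.vector('target')
loss_fn = theano.function([y_hat, target], huber_loss(y_hat, target, delta=1))
y = np.array([0.0, 0.0], dtype=theano.config.floatX)
t = np.array([0.5, 3.0], dtype=theano.config.floatX)
print(loss_fn(y, t))  # -> [0.25 5.  ]  (0.5**2, and 2*3 - 1)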
Example no. 13
    def _apply_hard_constraint_on_gradients(self, gradients, threshold=5, l_norm=2):
        """
        Function to apply a hard constraint on the parameter's gradients.

        :param gradients: theano.tensor
            Symbolic representation of the  parameter's gradients.

        :param threshold: int
            The threshold at which to apply the constraint. Defaults to 5 (i.e., if the norm
                exceeds 5, the constraint is applied).

        :param l_norm: int
            The number of the norm to compute. Defaults to 2 (i.e., L2-norm).

        :return: gradients: theano.tensor
            Symbolic representation of the parameter's gradients with/without the constraint
                applied.

        """

        for g in gradients:  # for all gradients
            g /= self.batch_size  # divide it by the size of the minibatch
            s = g.norm(l_norm)  # compute its norm
            if T.ge(s, threshold):  # if the norm is greater than the threshold
                g = (threshold * g) / s  # replace gradient

        return gradients
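As written, T.ge(s, threshold) is a symbolic tensor, so the Python if above is not a data-dependent test, and rebinding g inside the loop does not change the returned list. A hedged sketch of what a symbolic per-gradient constraint could look like (my rewrite, assuming the intent matches the T.switch-based clipping used elsewhere in these examples):

import theano.tensor as T

def apply_hard_constraint(gradients, batch_size, threshold=5, l_norm=2):
    # symbolic variant: every gradient becomes a switch expression
    clipped = []
    for g in gradients:
        g = g / batch_size                # average over the minibatch
        s = g.norm(l_norm)                # symbolic norm of this gradient
        clipped.append(T.switch(T.ge(s, threshold), threshold * g / s, g))
    return clipped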
Example no. 14
    def get_gradients(self, model, data, ** kwargs):

        cost = self.expr(model=model, data=data, **kwargs)

        params = list(model.get_params())

        grads = T.grad(cost, params, disconnected_inputs='ignore')

        gradients = OrderedDict(izip(params, grads))

        if self.gradient_clipping:
            norm_gs = 0.
            for grad in gradients.values():
                norm_gs += (grad ** 2).sum()
            not_finite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
            norm_gs = T.sqrt(norm_gs)
            norm_gs = T.switch(T.ge(norm_gs, self.max_magnitude),
                               self.max_magnitude / norm_gs,
                               1.)

            for param, grad in gradients.items():
                gradients[param] = T.switch(not_finite,
                                            .1 * param,
                                            grad * norm_gs)

        updates = OrderedDict()

        return gradients, updates
Example no. 15
	def __init__(self, embedding_dim=100, num_hidden_layers=2, hidden_dim=200, in_dropout_p=0.2, hidden_dropout_p=0.5, update_hyperparams={'learning_rate': 0.01}):
		self.embedding_dim = embedding_dim
		self.num_hidden_layers = num_hidden_layers
		self.hidden_dim = hidden_dim
		self.in_dropout_p = in_dropout_p
		self.hidden_dropout_p = hidden_dropout_p
	
		print >> sys.stderr, 'Building computation graph for discriminator...'		
		self.input_var = T.matrix('input')
		self.target_var = T.matrix('target')

		self.l_in = lasagne.layers.InputLayer(shape=(None, self.embedding_dim), input_var=T.tanh(self.input_var), name='l_in')
		self.l_in_dr = lasagne.layers.DropoutLayer(self.l_in, 0.2)
		self.layers = [self.l_in, self.l_in_dr]
		for i in xrange(self.num_hidden_layers):
			l_hid = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(self.layers[-1], num_units=self.hidden_dim, nonlinearity=lasagne.nonlinearities.leaky_rectify, W=lasagne.init.GlorotUniform(gain=leaky_relu_gain), name=('l_hid_%s' % i)))
			l_hid_dr = lasagne.layers.DropoutLayer(l_hid, 0.5)
			self.layers.append(l_hid)
			self.layers.append(l_hid_dr)
		self.l_preout = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(self.layers[-1], num_units=1, nonlinearity=None, name='l_preout'))
		self.l_out = lasagne.layers.NonlinearityLayer(self.l_preout, nonlinearity=lasagne.nonlinearities.sigmoid, name='l_out')

		self.prediction = lasagne.layers.get_output(self.l_out)
		self.loss = lasagne.objectives.binary_crossentropy(self.prediction, self.target_var).mean()
		self.accuracy = T.eq(T.ge(self.prediction, 0.5), self.target_var).mean()

		self.params = lasagne.layers.get_all_params(self.l_out, trainable=True)
		self.updates = lasagne.updates.adam(self.loss, self.params, **update_hyperparams)

		print >> sys.stderr, 'Compiling discriminator...'
		self.train_fn = theano.function([self.input_var, self.target_var], [self.loss, self.accuracy], updates=self.updates)
		self.eval_fn = theano.function([self.input_var, self.target_var], [self.loss, self.accuracy])
Example no. 16
def Adagrad(tparams, cost, inps, lr, epsilon=1e-6,clip_norm=5):
    """ default: lr=0.01 """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    # note: tensor.ge returns a symbolic tensor here, so this Python `if` does not perform
    # a per-minibatch check; a data-dependent clip would use tensor.switch, as in
    # clip_grad (Example no. 11) above
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)    
    
    updates = []
    
    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_t = acc + g ** 2
        updates.append((acc, acc_t))
        p_t = p - (lr / tensor.sqrt(acc_t + epsilon)) * g
        updates.append((p, p_t))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update 
Example no. 17
def pq_theano(y_true, y_pred):
    """
    Theano implementation of Pass Quality function.
    :param y_true:
    :param y_pred:
    :return:
    """
    y_pred = tt.ge(y_pred, tt.mean(y_pred)).T[-1].T
    y_true = y_true.T[-1].T

    tt_diffs = tt.extra_ops.diff(y_true + y_pred)

    tt_r = theano.shared(0., 'r')
    tt_height = theano.shared(0., 'h')
    tt_error = theano.shared(0., 'err')
    tt_current_error = theano.shared(0., 'c_err')
    tt_flag = theano.shared(0., 'flag')

    values, updates = scan(fn=one_step,
                           sequences=[tt_diffs, tt.abs_(tt_diffs)],
                           outputs_info=[tt_error,
                                         tt_r,
                                         tt_height,
                                         tt_current_error,
                                         tt_flag])

    epsilon = 0.0000000001

    tt_ret = (1 - (values[1][-1] + epsilon) / (values[1][-1] +
                                               values[0][-1] +
                                               epsilon))
    return tt_ret
Example no. 18
def uniq_with_lengths(seq, time_mask):
  """
  :param seq: (time,batch) -> label
  :param time_mask: (time,batch) -> 0 or 1
  :return: out_seqs, seq_lens.
  out_seqs is (max_seq_len,batch) -> label, where max_seq_len <= time.
  seq_lens is (batch,) -> len.
  """
  num_batches = seq.shape[1]
  diffs = T.ones_like(seq)
  diffs = T.set_subtensor(diffs[1:], seq[1:] - seq[:-1])
  time_range = T.arange(seq.shape[0]).dimshuffle([0] + ['x'] * (seq.ndim - 1))
  idx = T.switch(T.neq(diffs, 0) * time_mask, time_range, -1)  # (time,batch) -> idx or -1
  seq_lens = T.sum(T.ge(idx, 0), axis=0)  # (batch,) -> len
  max_seq_len = T.max(seq_lens)

  # I don't know any better way without scan.
  # http://stackoverflow.com/questions/31379971/uniq-for-2d-theano-tensor
  def step(batch_idx, out_seq_b1):
    #out_seq = seq[T.ge(idx[:, batch_idx], 0).nonzero(), batch_idx][0]
    out_seq = seq[:, batch_idx][T.ge(idx[:, batch_idx], 0).nonzero()]
    return T.concatenate((out_seq, T.zeros((max_seq_len - out_seq.shape[0],), dtype=seq.dtype)))

  out_seqs, _ = theano.scan(
    step,
    sequences=[T.arange(num_batches)],
    outputs_info=[T.zeros((max_seq_len,), dtype=seq.dtype)]
  )
  # out_seqs is (batch,max_seq_len)
  return out_seqs.T, seq_lens
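A usage sketch (my addition) with a single batch, checking that consecutive duplicate labels collapse:

import numpy as np
import theano
import theano.tensor as T

seq_var = T.imatrix('seq')           # (time, batch) labels
mask_var = T.imatrix('time_mask')    # (time, batch) 0/1 mask
out_var, len_var = uniq_with_lengths(seq_var, mask_var)
f = theano.function([seq_var, mask_var], [out_var, len_var])

seq = np.array([[1], [1], [2], [2], [3]], dtype='int32')
mask = np.ones_like(seq)
out, lens = f(seq, mask)
print(out.T, lens)  # -> [[1 2 3]] [3]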
Example no. 19
def pq_theano_f(y_true, y_pred):

    y_pred = tt.ge(y_pred, tt.mean(y_pred))
    y_true = y_true

    tt_diffs = tt.extra_ops.diff(y_true + y_pred)

    # tt_r = tt.shape_padleft(theano.shared(0., 'r'))
    tt_r = theano.shared(0., 'r')
    # tt_height = tt.shape_padleft(theano.shared(0., 'h'))
    tt_height = theano.shared(0., 'h')
    # tt_error = tt.shape_padleft(theano.shared(0., 'err',))
    tt_error = theano.shared(0., 'err')
    # tt_current_error = tt.shape_padleft(theano.shared(0., 'c_err'))
    tt_current_error = theano.shared(0., 'c_err')
    # tt_ret = theano.tensor.col('ret')
    tt_flag = theano.shared(0., 'flag')

    values, updates = scan(fn=one_step,
                           sequences=[tt_diffs, tt.abs_(tt_diffs)],
                           outputs_info=[tt_error,
                                         tt_r,
                                         tt_height,
                                         tt_current_error,
                                         tt_flag])

    # print values[0].type

    epsilon = 0.0000000001
    # print tt.ones_like(values[0]).type
    # print values[1].type
    tt_ret = 1 - (values[1] + epsilon) / (values[1] +
                                          values[0] +
                                          epsilon)
    return tt_ret
Example no. 20
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin=bins[i]
        if i == 0:
            binned=T.switch(T.lt(x,bin),i,binned)
        else:
            ineq = T.and_(T.ge(x,bins[i-1]),T.lt(x,bin))
            binned=T.switch(ineq,i,binned)
    binned=T.switch(T.isnan(x), len(bins), binned)
    return binned
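A quick comparison against numpy.digitize (my addition):

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
bins = np.array([0.0, 1.0, 2.0])
digitize_fn = theano.function([x], theano_digitize(x, bins))
data = np.array([-0.5, 0.5, 1.5, 2.5], dtype=theano.config.floatX)
print(digitize_fn(data))        # -> [0. 1. 2. 3.] (bin indices, returned in floatX)
print(np.digitize(data, bins))  # -> [0 1 2 3]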
Example no. 21
	def logp_loss3(self, x, y, fake_label,neg_label, pos_ratio = 0.5): #adopt maxout  for  negative   
		# pos_ratio means the positive examples' weight (0.5 means equal 1:1)


		print "adopt  positives  weight  ............. "+str(pos_ratio)
		y = y.dimshuffle((1,0))
		inx = x.dimshuffle((1,0))
		fake_mask = T.neq(y, fake_label)
		y = y*fake_mask

		pos_mask = T.and_(fake_mask, T.le(y, neg_label-1))*pos_ratio
		neg_mask = T.ge(y, neg_label)*(1- pos_ratio)


		pos_score, neg_score = self.structure2(inx,False)
		maxneg = T.max(neg_score, axis = -1)

		scores = T.concatenate((pos_score, maxneg.dimshuffle((0,1,'x'))), axis = 2)

		d3shape = scores.shape

		#seq*batch , label
		scores = scores.reshape((d3shape[0]*d3shape[1],  d3shape[2]))
		pro = T.nnet.softmax(scores)

		_logp = T.nnet.categorical_crossentropy(pro, y.flatten())

		_logp = _logp.reshape(fake_mask.shape)

		loss = (T.sum(_logp*pos_mask)+ T.sum(_logp*neg_mask))/ (T.sum(pos_mask)+T.sum(neg_mask))
		pos_loss = T.sum(_logp*pos_mask)
		neg_loss = T.sum(_logp*neg_mask)


		return loss, pos_loss, neg_loss
Example no. 22
def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)
    
    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m) 
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example no. 23
    def compute_updates(self, training_cost, params):
        updates = []
         
        grads = T.grad(training_cost, params)
        grads = OrderedDict(zip(params, grads))
        
        # Clip stuff
        c = numpy.float32(self.cutoff)
        clip_grads = []
        
        norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
        normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
        notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))
         
        for p, g in grads.items():
            clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))
        
        grads = OrderedDict(clip_grads)

        if self.updater == 'adagrad':
            updates = Adagrad(grads, self.lr)  
        elif self.updater == 'sgd':
            raise Exception("Sgd not implemented!")
        elif self.updater == 'adadelta':
            updates = Adadelta(grads)
        elif self.updater == 'rmsprop':
            updates = RMSProp(grads, self.lr)
        elif self.updater == 'adam':
            updates = Adam(grads)
        else:
            raise Exception("Updater not understood!") 
        return updates
Example no. 24
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8):
    """ default: lr=0.0002 """

    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g ** 2) for g in grads]))
    if tensor.ge(norm, 5):
        grads = [g * 5 / norm for g in grads]

    gshared = [theano.shared(p.get_value() * 0.0, name="%s_grad" % k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    i = theano.shared(numpy_floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - b1 ** (i_t)
    fix2 = 1.0 - b2 ** (i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.0)
        v = theano.shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update
Example no. 25
    def __init__(self, input, n_in, n_out, discriminant_threshold):

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
              
        # initialize the basis b as a vector of n_out 0s
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )
        
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
        # edited to reject events below a threshold
        #self.y_pred = T.argmax(self.p_y_given_x, axis=1) 
        #self.y_pred = T.and_(T.argmax(self.p_y_given_x, axis=1), T.ge(self.p_y_given_x[:,1], -1))#discriminant_threshold))
        self.y_pred = T.ge(self.p_y_given_x[:,1], discriminant_threshold)#discriminant_threshold))

        # parameters of the model
        self.params = [self.W, self.b]
Example no. 26
def Adadelta(tparams, cost, inps, lr, rho=0.95, epsilon=1e-6,clip_norm=5):
    """ default: lr=0.5 """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)
    
    updates = []

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_delta = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        updates.append((acc,acc_new)) 
        
        update = g * tensor.sqrt(acc_delta + epsilon) / tensor.sqrt(acc_new + epsilon)
        updated_p = p - lr * update
        updates.append((p, updated_p))
        
        acc_delta_new = rho * acc_delta + (1 - rho) * update ** 2
        updates.append((acc_delta,acc_delta_new))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update 
Example no. 27
def RMSprop_v1(tparams, cost, inps, lr, rho=0.9, epsilon=1e-6,clip_norm=5):
    """ default: lr=0.001 
        This is the implementation of the RMSprop algorithm used in
        http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf.
    """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)     
    
    updates = []

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        updates.append((acc, acc_new))
        
        updated_p = p - lr * (g / tensor.sqrt(acc_new + epsilon))
        updates.append((p, updated_p))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update
Example no. 28
def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.0)

    i = shared(floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - (1.0 - b1) ** i_t
    fix2 = 1.0 - (1.0 - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.0)
        v = shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        # e_t = shared(p.get_value() * 0.)
        # de_t = (srnd.normal(p.shape, std = 0.05, dtype=theano.config.floatX)*p_t - e_t)*0.05  #*p_t
        # p_t = p_t + de_t
        # updates.append((e_t, e_t + de_t))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates, norm
Example no. 29
def NAG(tparams, cost, inps, lr, momentum=0.9,clip_norm=5):
    """ default: lr=0.01 """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup) 
    
    updates = []

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        m_new = momentum * m - lr * g
        updates.append((m, m_new))        
        
        updated_p = p + momentum * m_new - lr * g
        updates.append((p, updated_p))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update 
Example no. 30
def rmax(x):

	xmax  = T.ge(x, T.max(x, axis = 1).reshape((x.shape[0],1)))
	shift = (T.ones_like(x) - xmax) * x
	max2  = T.max(shift,axis = 1).reshape((x.shape[0],1))
	out = T.nnet.relu(x - max2)

	return out
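A small check (my addition): rmax keeps only the entries that beat the second-largest value in each row, i.e. typically just the row maximum:

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
rmax_fn = theano.function([x], rmax(x))
a = np.array([[0.1, 0.7, 0.2]], dtype=theano.config.floatX)
print(rmax_fn(a))  # -> [[0.  0.5 0. ]]  (0.7 minus the runner-up 0.2)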
Example no. 31
 def dist(value):
     return switch (ge(value , 0) & gt(alpha , 0) & gt(beta , 0) & ge(n , value), 
                gammaln(alpha+beta) - gammaln(alpha) - gammaln(beta)+ gammaln(n+1)- gammaln(value+1)- gammaln(n-value +1) + gammaln(alpha+value)+ gammaln(n+beta-value)- gammaln(beta+alpha+n),
                -inf)
Example no. 32
 def dist(value):
     return switch(ge(value , 0) & gt(alpha , 0) & gt(beta , 0),
               -gammaln(alpha) + alpha*log(beta) - beta*value + switch(alpha != 1.0, (alpha - 1.0)*log(value), 0),
               -inf)
Example no. 33
 def dist(value):
     return switch(ge(p , 0) & le(p , 1), 
               switch(value, log(p), log(1-p)),
               -inf)
Example no. 34
    def __init__(self, model, state, data):
        """
        :type model: groundhog model class
        :param model: class depicting the model to be optimized

        :type state: dictionary or jobman DD object
        :param state: dictionary containing various hyper-parameters. The
            class will write into this dictionary updates like the current
            training error and so on

        :type data: groundhog dataset object
        :param data: data iterator over which training is done
        """

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        bs = state['bs']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])
        srng = RandomStreams(self.rng.randint(213))
        self.gs = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name) for p in model.params
        ]
        self.step = 0
        self.bs = bs
        self.state = state
        self.data = data
        self.step_timer = time.time()
        self.gdata = [
            theano.shared(numpy.zeros((2, ) * x.ndim, dtype=x.dtype),
                          name=x.name) for x in model.inputs
        ]

        if 'profile' not in self.state:
            self.state['profile'] = 0

        ###################################
        # Step 1. Compile training function
        ###################################
        print 'Constructing grad function'
        loc_data = self.gdata
        lr = TT.scalar('lr')
        self.prop_exprs = [x[1] for x in model.properties]
        self.prop_names = [x[0] for x in model.properties]
        self.update_rules = [x[1] for x in model.updates]
        rval = theano.clone(model.param_grads + self.update_rules + \
                            self.prop_exprs + [model.train_cost],
                            replace=zip(model.inputs, loc_data))
        nparams = len(model.params)
        nouts = len(self.prop_exprs)
        nrules = len(self.update_rules)
        gs = rval[:nparams]
        rules = rval[nparams:nparams + nrules]
        outs = rval[nparams + nrules:]

        # Clip the momentum-applied gradient
        moment_gs = [s * state['moment'] + g for s, g in zip(self.gs, gs)]
        norm_gs = TT.sqrt(
            sum(
                TT.sum(x**2) for x, p in zip(moment_gs, self.model.params)
                if p not in self.model.exclude_params_for_norm))
        if 'cutoff' in state and state['cutoff'] > 0:
            c = numpy.float32(state['cutoff'])
            if state['cutoff_rescale_length']:
                c = c * TT.cast(loc_data[0].shape[0], 'float32')

            notfinite = TT.or_(TT.isnan(norm_gs), TT.isinf(norm_gs))
            _gs = []
            for g, p in zip(moment_gs, self.model.params):
                if p not in self.model.exclude_params_for_norm:
                    tmpg = TT.switch(TT.ge(norm_gs, c), g * c / norm_gs, g)
                    _gs.append(
                        TT.switch(notfinite,
                                  numpy.float32(.1) * p, tmpg))
                else:
                    _gs.append(g)
            gs = _gs

        store_gs = [(s, g) for s, g in zip(self.gs, gs)]
        updates = store_gs + [(s[0], r) for s, r in zip(model.updates, rules)]
        print 'Compiling grad function'
        st = time.time()
        self.train_fn = theano.function([],
                                        outs,
                                        name='train_function',
                                        updates=updates,
                                        givens=zip(model.inputs, loc_data),
                                        profile=self.state['profile'])
        print 'took', time.time() - st

        self.lr = numpy.float32(state['lr'])
        new_params = [
            p - s * lr * g
            for s, p, g in zip(model.params_grad_scale, model.params, self.gs)
        ]
        self.update_fn = theano.function([lr], [],
                                         name='update_function',
                                         allow_input_downcast=True,
                                         updates=zip(model.params, new_params),
                                         profile=self.state['profile'])

        self.old_cost = 1e20
        self.schedules = model.get_schedules()
        self.return_names = self.prop_names + \
                ['cost',
                 'time_step',
                 'whole_time',
                  'lr']
Example no. 35
d = we_it.embedding_dim
input_var = T.matrix('input')
target_var = T.matrix('target')

l_in = lasagne.layers.InputLayer(shape=(None, d), input_var=input_var)
l_hid1 = lasagne.layers.DenseLayer(l_in,
                                   num_units=NUM_HIDDEN1,
                                   nonlinearity=lasagne.nonlinearities.rectify,
                                   W=lasagne.init.GlorotUniform())
l_out = lasagne.layers.DenseLayer(l_hid1,
                                  num_units=1,
                                  nonlinearity=lasagne.nonlinearities.sigmoid)

prediction = lasagne.layers.get_output(l_out)
loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean()
accuracy = T.eq(T.ge(prediction, 0.5), target_var).mean()

params = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=0.001)

print >> sys.stderr, 'Compiling...'
train_fn = theano.function([input_var, target_var], [loss, accuracy],
                           updates=updates)

X = np.zeros((2 * HALF_BATCH_SIZE, d), dtype=theano.config.floatX)
target_mat = np.vstack(
    [np.zeros((HALF_BATCH_SIZE, 1)),
     np.ones((HALF_BATCH_SIZE, 1))]).astype(theano.config.floatX)


def train_batch(batch_id=1, print_every_n=1):
Example no. 36
 def dist(value):
     return switch(ge(value , 0) & le(value , 1) &
               gt(alpha , 0) & gt(beta , 0),
               gammaln(alpha+beta) - gammaln(alpha) - gammaln(beta) + (alpha- 1)*log(value) + (beta-1)*log(1-value),
               -inf)
Example no. 37
def elu(X):
    return T.switch(T.ge(X, 0), X, T.exp(X) - 1.)
Example no. 38
    def __init__(self, model, state, data):
        """
        Parameters:
            :param model:
                Class describing the model used. It should provide the
                 computational graph to evaluate the model, and have a
                 similar structure to classes on the models folder
            :param state:
                Dictionary containing the current state of your job. This
                includes configuration of the job, specifically the seed,
                the starting damping factor, batch size, etc. See main.py
                for details
            :param data:
                Class describing the dataset used by the model
        """

        if 'adarho' not in state:
            state['adarho'] = 0.96
        if 'adaeps' not in state:
            state['adaeps'] = 1e-6

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        bs = state['bs']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])
        srng = RandomStreams(self.rng.randint(213))
        self.gs = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name) for p in model.params
        ]
        self.gnorm2 = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name + '_g2') for p in model.params
        ]
        self.dnorm2 = [
            theano.shared(numpy.zeros(p.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX),
                          name=p.name + '_d2') for p in model.params
        ]

        self.step = 0
        self.whole_time = 0.0
        self.bs = bs
        self.state = state
        self.data = data
        self.step_timer = time.time()
        self.gdata = [
            theano.shared(numpy.zeros((2, ) * x.ndim, dtype=x.dtype),
                          name=x.name) for x in model.inputs
        ]
        #training dataset stored in gpu. They are defined as shared variables from the
        #'inputs' variable in the encoder-decoder model.
        if 'profile' not in self.state:
            self.state['profile'] = 0

        ###################################
        # Step 1. Compile training function
        ###################################
        logger.debug('Constructing grad function')
        loc_data = self.gdata
        self.prop_exprs = [x[1] for x in model.properties]
        self.prop_names = [x[0] for x in model.properties]
        self.update_rules = [x[1] for x in model.updates]
        rval = theano.clone(model.param_grads + self.update_rules + \
                            self.prop_exprs + [model.train_cost],
                            replace={k:v for k, v in zip(model.inputs, loc_data)})
        nparams = len(model.params)
        nouts = len(self.prop_exprs)
        nrules = len(self.update_rules)
        gs = rval[:nparams]
        rules = rval[nparams:nparams + nrules]
        outs = rval[nparams + nrules:]

        norm_gs = TT.sqrt(
            sum(
                TT.sum(x**2) for x, p in zip(gs, self.model.params)
                if p not in self.model.exclude_params_for_norm))
        if 'cutoff' in state and state['cutoff'] > 0:
            c = numpy.float32(state['cutoff'])
            if state['cutoff_rescale_length']:
                c = c * TT.cast(loc_data[0].shape[0], 'float32')

            notfinite = TT.or_(TT.isnan(norm_gs), TT.isinf(norm_gs))
            _gs = []
            for g, p in zip(gs, self.model.params):
                if p not in self.model.exclude_params_for_norm:
                    tmpg = TT.switch(TT.ge(norm_gs, c), g * c / norm_gs, g)
                    _gs.append(
                        TT.switch(notfinite,
                                  numpy.float32(.1) * p, tmpg))
                else:
                    _gs.append(g)
            gs = _gs
        store_gs = [(s, g) for s, g in zip(self.gs, gs)]
        updates = store_gs + [(s[0], r) for s, r in zip(model.updates, rules)]

        rho = self.state['adarho']
        eps = self.state['adaeps']

        # grad2
        gnorm2_up = [
            rho * gn2 + (1. - rho) * (g**2.)
            for gn2, g in zip(self.gnorm2, gs)
        ]
        updates = updates + [(gn1, gn2)
                             for gn1, gn2 in zip(self.gnorm2, gnorm2_up)]

        logger.debug('Compiling grad function')
        st = time.time()
        self.train_fn = theano.function([],
                                        outs,
                                        name='train_function',
                                        updates=updates,
                                        givens=zip(model.inputs, loc_data))
        logger.debug('took {}'.format(time.time() - st))

        self.lr = numpy.float32(1.)
        new_params = [
            p - (TT.sqrt(dn2 + eps) / TT.sqrt(gn2 + eps)) * g for p, g, gn2,
            dn2 in zip(model.params, self.gs, self.gnorm2, self.dnorm2)
        ]

        updates = [(a, b) for a, b in zip(model.params, new_params)]
        # d2
        d2_up = [(dn2, rho * dn2 + (1. - rho) *
                  (((TT.sqrt(dn2 + eps) / TT.sqrt(gn2 + eps)) * g)**2.))
                 for dn2, gn2, g in zip(self.dnorm2, self.gnorm2, self.gs)]
        updates = updates + d2_up

        self.update_fn = theano.function([], [],
                                         name='update_function',
                                         allow_input_downcast=True,
                                         updates=updates)

        self.old_cost = 1e20
        self.schedules = model.get_schedules()
        self.return_names = self.prop_names + \
                ['cost',
                'error',
                'time_step',
                'whole_time', 'lr']
        self.prev_batch = None
Example no. 39
def SignTheano(x):
    return T.cast(2.*T.ge(x,0)-1., theano.config.floatX)
Example no. 40
    def build(self):
        # local graph context
        g_sym = T.imatrix('g')  # a pair of node index (an edge)
        gy_sym = T.vector(
            'gy')  # label of a pair (indicating whether it is a false edge)
        l_g_in = lasagne.layers.InputLayer(shape=(None, 2), input_var=g_sym)
        # l_gy_in = lasagne.layers.InputLayer(shape=(None,), input_var=gy_sym)
        # embedding of node i (pivot node)
        l_emb_local_i = lasagne.layers.SliceLayer(l_g_in, indices=0, axis=1)
        l_emb_local_i = lasagne.layers.EmbeddingLayer(
            l_emb_local_i,
            input_size=self.num_nodes,
            output_size=self.embedding_dim)
        # embedding of node j (context node)
        l_emb_local_j = lasagne.layers.SliceLayer(l_g_in, indices=1, axis=1)
        l_emb_local_j = lasagne.layers.EmbeddingLayer(
            l_emb_local_j,
            input_size=self.num_nodes,
            output_size=self.embedding_dim)
        l_gy = lasagne.layers.ElemwiseMergeLayer(
            [l_emb_local_i, l_emb_local_j], T.mul)
        pgy_sym = lasagne.layers.get_output(l_gy)
        g_loss = -T.log(T.nnet.sigmoid(T.sum(pgy_sym, axis=1) * gy_sym)).sum()
        g_params = lasagne.layers.get_all_params(l_gy, trainable=True)
        g_updates = lasagne.updates.sgd(g_loss,
                                        g_params,
                                        learning_rate=self.g_learning_rate)
        self.graph_fn = theano.function([g_sym, gy_sym],
                                        g_loss,
                                        updates=g_updates,
                                        on_unused_input='warn')

        self.embedding = l_emb_local_i.W

        # local attributes
        ind_sym = T.ivector('ind')
        l_ind_in = lasagne.layers.InputLayer(shape=(None, ), input_var=ind_sym)
        # embedding of current node
        l_emb_f = lasagne.layers.EmbeddingLayer(l_ind_in,
                                                input_size=self.num_nodes,
                                                output_size=self.embedding_dim,
                                                W=self.embedding)
        x_sym = {}
        y_sym = T.vector('y')
        l_x_in = {}
        l_x_hid = {}
        attr_loss = {}
        for n in self.schema["nodes"]:
            x_sym[n] = sparse.csr_matrix(n, dtype='float32')
            l_x_in[n] = lasagne.layers.InputLayer(
                shape=(None, self.schema["nodes"][n]), input_var=x_sym[n])
            l_x_hid[n] = layers.SparseLayer(l_x_in[n], self.embedding_dim)
            l_ay = lasagne.layers.ElemwiseMergeLayer([l_x_hid[n], l_emb_f],
                                                     T.mul)
            pay_sym = lasagne.layers.get_output(l_ay)
            attr_loss[n] = -T.log(
                T.nnet.sigmoid(T.sum(pay_sym, axis=1) * y_sym)).sum()
            attr_params = lasagne.layers.get_all_params(l_ay, trainable=True)
            attr_updates = lasagne.updates.sgd(
                attr_loss[n], attr_params, learning_rate=self.g_learning_rate)
            self.attr_fn[n] = theano.function([x_sym[n], y_sym, ind_sym],
                                              attr_loss[n],
                                              updates=attr_updates,
                                              on_unused_input='warn')

        # alignment
        anchor_sym = T.imatrix('anchor')
        anchor_y_sym = T.vector('anchor_y')
        l_a_in = lasagne.layers.InputLayer(shape=(None, 2),
                                           input_var=anchor_sym)
        l_emb_anchor_i = lasagne.layers.SliceLayer(l_a_in, indices=0, axis=1)
        l_emb_anchor_i = lasagne.layers.EmbeddingLayer(
            l_emb_anchor_i,
            input_size=self.num_nodes,
            output_size=self.embedding_dim,
            W=self.embedding)
        l_emb_anchor_j = lasagne.layers.SliceLayer(l_a_in, indices=1, axis=1)
        l_emb_anchor_j = lasagne.layers.EmbeddingLayer(
            l_emb_anchor_j,
            input_size=self.num_nodes,
            output_size=self.embedding_dim,
            W=self.embedding)
        l_anchor_y = lasagne.layers.ElemwiseMergeLayer(
            [l_emb_anchor_i, l_emb_anchor_j], T.mul)
        p_anchor_y_sym = lasagne.layers.get_output(l_anchor_y)
        anchor_loss = -T.log(
            T.nnet.sigmoid(
                T.sum(p_anchor_y_sym, axis=1) * anchor_y_sym)).sum()
        anchor_params = lasagne.layers.get_all_params(l_anchor_y,
                                                      trainable=True)
        anchor_updates = lasagne.updates.sgd(
            anchor_loss, anchor_params, learning_rate=self.g_learning_rate)
        self.anchor_fn = theano.function([anchor_sym, anchor_y_sym],
                                         anchor_loss,
                                         updates=anchor_updates,
                                         on_unused_input='warn')

        tp_anchor_y_sym = lasagne.layers.get_output(l_anchor_y,
                                                    deterministic=True)
        tp_anchor_y_sym = T.sum(tp_anchor_y_sym, axis=1)
        acc = T.mean(T.eq(T.ge(tp_anchor_y_sym, 0), anchor_y_sym))
        self.test_fn = theano.function([anchor_sym, anchor_y_sym],
                                       acc,
                                       on_unused_input='warn')
Example no. 41
 def __call__(self, p):
     p = theano.shared(p)
     p *= T.ge(p, 0.)
     return p
Example no. 42
def SGMGNHT_2(tparams, cost, inps, ntrain, lr, iterations, rho=0.9, epsilon=1e-6, resamp = 50, clip_norm=1):
    """ Additional parameters """
    mom_tparams = OrderedDict()
    xi_tparams = OrderedDict()
    #rng = np.random.RandomState(3435)
    #+ rng.normal(0,1,p0.shape())
    for k, p0 in tparams.iteritems():
        mom_tparams[k] = theano.shared(p0.get_value() * 0. +1e-1, name='%s_mom'%k) 
        xi_tparams[k] = theano.shared(p0.get_value() * 0. + 10.0, name='%s_xi'%k) 
    
    #a = theano.shared(numpy_floatX(2.))
    # m = theano.shared(numpy_floatX(1.))
    # c = theano.shared(numpy_floatX(1.))
    # sigma_p = theano.shared(numpy_floatX(10.))
    # sigma_xi = theano.shared(numpy_floatX(0.01))
    # sigma_theta = theano.shared(numpy_floatX(0.1))
    # gamma = theano.shared(numpy_floatX(1.))
    
    m = theano.shared(numpy_floatX(1.))
    c = theano.shared(numpy_floatX(3.))
    sigma_p = theano.shared(numpy_floatX(0.01))
    sigma_mom = theano.shared(numpy_floatX(10.))
    sigma_xi = theano.shared(numpy_floatX(0.01))
    gamma = theano.shared(numpy_floatX(1.0))
    
    logger = logging.getLogger('eval_ptb_sgmgnht')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('eval_ptb_sgmgnht.log')
    logger.info('a = 1, m {} c {} s_p{} s_mom{} s_xi{} g_xi{}'.format( m.get_value(), c.get_value(), sigma_p.get_value(), sigma_mom.get_value(), sigma_xi.get_value(), gamma.get_value()))
    
    p = tensor.vector('p', dtype='float32')
    
    """ default: lr=0.001 """
    
    trng = RandomStreams(123)
    
    grads = tensor.grad(cost, tparams.values())
    
    # clip norm
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
        
    gshared = [theano.shared(p0.get_value() * 0., name='%s_grad'%k) 
                for k, p0 in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)     
    
    updates = []
 
    for p, mom, xi, g in zip(tparams.values(),mom_tparams.values(),xi_tparams.values(), gshared):
        
        g_f = (tensor.sqrt(tensor.abs_(mom+1e-100)))/m
        K_f = g_f + 4/c/(1 + tensor.exp(c*g_f))
        
        psi_f_1 = -1 + 2/( 1 + tensor.exp(-c*g_f))
        f1_f_1 = 1/2.0/m**2 *psi_f_1**2 /g_f*tensor.sgn(mom)
        #f1_f_1 = 1/2.0/m*psi_f_1**2* tensor.abs_(mom+1e-100)**(-1/2) *tensor.sgn(mom)
        psi_grad_f_1 = 2*c*tensor.exp(- c*g_f)/(1 + tensor.exp(-c*g_f))**2
        f3_f_1 =  f1_f_1**2 - 1/2.0/m**2 * psi_f_1 * psi_grad_f_1 / tensor.abs_(mom) + 1/4.0/m * psi_f_1**2 * (tensor.abs_(mom+1e-100)**(-1.5))
        
        
#        psi_f = (tensor.exp(c*g_f) - 1)/(tensor.exp(c*g_f) + 1)
#        f1_f = 1/2/m*psi_f**2 * (tensor.abs_(mom+1e-100)**(-1/2))*tensor.sgn(mom)
#        psi_grad_f = 2*c*tensor.exp(c*g_f)/(tensor.exp(c*g_f) + 1)**2
#        f3_f =  f1_f**2 - c/2/m**2 * psi_f * psi_grad_f / tensor.abs_(mom) + 1/4/m * psi_f**2 * (tensor.abs_(mom+1e-100)**(-3/2))
 
#        temp_f1 = tensor.switch(tensor.ge(g_f,0), f1_f_1, f1_f)
#        temp_f3 = tensor.switch(tensor.ge(g_f,0), f3_f_1, f3_f)       

        temp_f1 = f1_f_1
        temp_f3 = f3_f_1
        
        noise_p = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        noise_mom = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        noise_xi = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')

        # generate gamma(a,2): N(0,1)^2 = gamma(1/2,2)
        noise_temp = tensor.zeros(p.get_value().shape)
        for aa in xrange(4):
            this_noise = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32')
            noise_temp = tensor.inc_subtensor(noise_temp[:], this_noise**2)
        randmg = (noise_temp*m/2)**2*tensor.sgn(trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32'))    

        updated_p = p +  temp_f1 * lr - g * lr * ntrain * sigma_p + tensor.sqrt(2*sigma_p*lr) * noise_p
        updated_mom = (mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_mom*lr) * noise_mom)* (1-tensor.eq(tensor.mod(iterations,resamp),0)) + randmg * tensor.eq(tensor.mod(iterations,resamp),0)
        #updated_mom = mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p
        temp_xi = trng.normal(p.get_value().shape, avg = sigma_mom, std = tensor.sqrt(sigma_xi/2) , dtype='float32')
        updated_xi = (xi + temp_f3* gamma * lr - (xi - sigma_mom)*sigma_xi/(gamma+1e-10)*lr + tensor.sqrt(2*sigma_xi*lr) * noise_xi) * (1-tensor.eq(tensor.mod(iterations,resamp),resamp/2)) + temp_xi * tensor.eq(tensor.mod(iterations,resamp),resamp/2)

        updates.append((p, updated_p))
        updates.append((mom, updated_mom))
        updates.append((xi, updated_xi))
    
    f_update = theano.function([lr,ntrain,iterations], [p,mom,xi], updates=updates)
    #f_params = theano.function([], [a, m, c, mom.shape])
    return f_grad_shared, f_update
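The momentum resampling above relies on the fact that a sum of k squared standard normals is chi-square with k degrees of freedom, i.e. Gamma(k/2, scale=2). A minimal numpy sketch (illustrative only, not part of the original optimizer) checking that claim for the k = 4 draws used in the loop:

import numpy as np

rng = np.random.RandomState(0)
k = 4                                    # number of squared N(0,1) draws, as in xrange(4) above
samples = np.sum(rng.randn(100000, k) ** 2, axis=1)

# chi-square with k dof has mean k and variance 2k, matching Gamma(k/2, scale=2)
print(samples.mean(), samples.var())     # roughly 4.0 and 8.0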
Esempio n. 43
0
def clip_norm(g, c, n):
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g
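A minimal usage sketch for the helper above (assuming Theano is available): the norm n is computed once over the gradient and passed in, so the gradient is rescaled only when that norm exceeds c.

import theano
import theano.tensor as T

g = T.vector('g')
n = T.sqrt(T.sum(g ** 2))                # gradient norm
clipped = clip_norm(g, 5., n)            # rescale g when its norm exceeds 5
f = theano.function([g], clipped)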
 def error(self, y, threshold=0.5):
     return tensor.mean(
         tensor.eq(tensor.ge(self.prediction(), threshold), y))
Esempio n. 45
0
 def dist(value):
     return switch(ge(value, 0) & ge(n, value) & ge(p, 0) & le(p, 1),
                   switch(ne(value, 0), value*log(p), 0)
                   + (n-value)*log(1-p) + factln(n) - factln(value) - factln(n-value),
                   -inf)
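The nested switch above encodes the binomial log-pmf, log C(n, k) + k*log(p) + (n-k)*log(1-p), with -inf outside the support. A small numpy/scipy check (an illustration, not part of the original snippet; factln is re-implemented here via gammaln):

import numpy as np
from scipy.special import gammaln
from scipy.stats import binom

def factln(x):
    return gammaln(x + 1)

def binom_logpmf(value, n, p):
    if 0 <= value <= n and 0 <= p <= 1:
        term = value * np.log(p) if value != 0 else 0.0
        return (term + (n - value) * np.log1p(-p)
                + factln(n) - factln(value) - factln(n - value))
    return -np.inf

print(binom_logpmf(3, 10, 0.4), binom.logpmf(3, 10, 0.4))   # the two agree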
Esempio n. 46
0
    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
        """Return the shape of the output from this op, for input of given
        shape and flags.

        Parameters
        ----------
        imgshape : tuple of integers or scalar Theano variables
            the shape of a tensor of images. The last two elements are
            interpreted as the number of rows, and the number of cols.
        ds : tuple of two ints
            downsample factor over rows and columns; this parameter
            indicates the size of the pooling region.
        st : tuple of two ints
            the stride size. This is the distance between the pooling
            regions. If set to None, it equals ds.
        ignore_border : bool
            if ds doesn't divide imgshape, do we include an extra
            row/col of partial downsampling (False) or ignore it
            (True).
        padding : tuple of two ints
            (pad_h, pad_w), pad zeros to extend beyond four borders of
            the images, pad_h is the size of the top and bottom
            margins, and pad_w is the size of the left and right
            margins.

        Returns
        -------
        list :
            the shape of the output from this op, for input of given
            shape.  This will have the same length as imgshape, but
            with last two elements reduced as per the downsampling &
            ignore_border flags.

        """
        if len(imgshape) < 2:
            raise TypeError('imgshape must have at least two elements '
                            '(rows, cols)')

        if st is None:
            st = ds
        r, c = imgshape[-2:]
        r += padding[0] * 2
        c += padding[1] * 2

        if ignore_border:
            out_r = (r - ds[0]) // st[0] + 1
            out_c = (c - ds[1]) // st[1] + 1
            if isinstance(r, theano.Variable):
                nr = tensor.maximum(out_r, 0)
            else:
                nr = numpy.maximum(out_r, 0)
            if isinstance(c, theano.Variable):
                nc = tensor.maximum(out_c, 0)
            else:
                nc = numpy.maximum(out_c, 0)
        else:
            if isinstance(r, theano.Variable):
                nr = tensor.switch(
                    tensor.ge(st[0], ds[0]), (r - 1) // st[0] + 1,
                    tensor.maximum(0, (r - 1 - ds[0]) // st[0] + 1) + 1)
            elif st[0] >= ds[0]:
                nr = (r - 1) // st[0] + 1
            else:
                nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1

            if isinstance(c, theano.Variable):
                nc = tensor.switch(
                    tensor.ge(st[1], ds[1]), (c - 1) // st[1] + 1,
                    tensor.maximum(0, (c - 1 - ds[1]) // st[1] + 1) + 1)
            elif st[1] >= ds[1]:
                nc = (c - 1) // st[1] + 1
            else:
                nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1

        rval = list(imgshape[:-2]) + [nr, nc]
        return rval
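A worked example of the arithmetic above in plain Python (concrete shapes chosen for illustration): for a (batch, channels, 32, 32) input with ds=(2, 2), st=None and ignore_border=True, the leading dimensions are kept and rows/cols reduce to (32 - 2)//2 + 1 = 16.

imgshape = (8, 3, 32, 32)      # (batch, channels, rows, cols)
ds = (2, 2)                    # pooling region
st = ds                        # stride defaults to ds when None
padding = (0, 0)

r = imgshape[-2] + 2 * padding[0]
c = imgshape[-1] + 2 * padding[1]
out_r = (r - ds[0]) // st[0] + 1    # ignore_border=True branch
out_c = (c - ds[1]) // st[1] + 1
print(list(imgshape[:-2]) + [out_r, out_c])   # [8, 3, 16, 16]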
Esempio n. 47
0
def SGMGHMC_old(tparams, cost, inps, ntrain, lr, iterations, rho=0.9, epsilon=1e-6, a_i = 2, clip_norm=5):
    """ Additional parameters """
    mom_tparams = OrderedDict()
    xi_tparams = OrderedDict()
    for k, p0 in tparams.iteritems():
        mom_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_mom'%k) 
        xi_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_xi'%k) 
    
    a = theano.shared(numpy_floatX(1.))
    m = theano.shared(numpy_floatX(1.))
    c = theano.shared(numpy_floatX(5.))
    sigma_p = theano.shared(numpy_floatX(10.))
    sigma_xi = theano.shared(numpy_floatX(1.))
    gamma_xi = theano.shared(numpy_floatX(0.001))
    logger = logging.getLogger('eval_ptb_sgmgnht')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('eval_ptb_sgmgnht.log')
    logger.info('a {} m {} c {} s_p{} s_xi{} g_xi{}'.format(a.get_value(), m.get_value(), c.get_value(), sigma_p.get_value(), sigma_xi.get_value(), gamma_xi.get_value()))
    
    p = tensor.vector('p', dtype='float32')
    
    """ default: lr=0.001 """
    
    trng = RandomStreams(123)
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g*clip_norm/norm, g)
             for g in grads]
        
    gshared = [theano.shared(p0.get_value() * 0., name='%s_grad'%k) 
                for k, p0 in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)     
    
    updates = []
 
    for p, mom, xi, g in zip(tparams.values(),mom_tparams.values(),xi_tparams.values(), gshared):
        
        g_f = tensor.sgn(mom)/m*(tensor.abs_(mom)**(1/a))
        K_f = -g_f + 2/c*(c*g_f + tensor.log(1+tensor.exp(-c*g_f)))
        
        psi_f_1 = (1- tensor.exp(-c*g_f) )/( 1 + tensor.exp(-c*g_f) )
        f1_f_1 = 1/m/a*psi_f_1*(tensor.abs_(mom+1e-100)**(1/a-1))
        psi_grad_f_1 = 2*c*tensor.exp(- c*g_f)/(1 + tensor.exp(-c*g_f))**2
        f3_f_1 =  1/m**2/a**2*(psi_f_1**2-psi_grad_f_1)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f_1*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2)
        
        psi_f = (tensor.exp(c*g_f) - 1)/(tensor.exp(c*g_f) + 1)
        f1_f = 1/m/a*psi_f*(tensor.abs_(mom+1e-100)**(1/a-1))
        psi_grad_f = 2*c*tensor.exp(c*g_f)/(tensor.exp(c*g_f) + 1)**2
        f3_f =  1/m**2/a**2*(psi_f**2-psi_grad_f)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2)
 
        temp_f1 = tensor.switch(tensor.ge(g_f,0), f1_f_1, f1_f)
        temp_f3 = tensor.switch(tensor.ge(g_f,0), f3_f_1, f3_f)       


        noise_p = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        noise_xi = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        # generate gamma(a,2) noise: N(0,1)^2 = gamma(1/2,2)
        noise_temp = tensor.zeros(p.get_value().shape)
        for aa in xrange(a_i*2):
            this_noise = trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32')
            noise_temp = tensor.inc_subtensor(noise_temp[:], this_noise**2)
        randmg = (noise_temp*m/2)**a*tensor.sgn(trng.normal(p.get_value().shape, avg = 0.0, std = 1., dtype='float32'))    

        
        updated_p = p +  temp_f1 * lr
        updated_mom = ((mom - temp_f1 * xi * lr - g * lr * ntrain
                        + tensor.sqrt(2*sigma_p*lr) * noise_p)
                       * (1 - tensor.eq(tensor.mod(iterations, 100), 0))
                       + randmg * tensor.eq(tensor.mod(iterations, 100), 0))
        #updated_mom = mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p
        temp_xi = trng.normal(p.get_value().shape, avg=sigma_p,
                              std=tensor.sqrt(sigma_xi/2), dtype='float32')
        updated_xi = ((xi + temp_f3 * sigma_xi * lr - (xi - sigma_p) * gamma_xi * lr
                       + tensor.sqrt(2*sigma_xi*gamma_xi*lr) * noise_xi)
                      * (1 - tensor.eq(tensor.mod(iterations, 100), 50))
                      + temp_xi * tensor.eq(tensor.mod(iterations, 100), 50))
        

        updates.append((p, updated_p))
        updates.append((mom, updated_mom))
        updates.append((xi, updated_xi))
    
    f_update = theano.function([lr,ntrain,iterations], [p,mom,xi], updates=updates)
    #f_params = theano.function([], [a, m, c, mom.shape])
    return f_grad_shared, f_update
Esempio n. 48
0
def huber_loss(y_hat, target, delta=1, center=0, std=1):

    l1_diff = abs((target - center - y_hat) / std)
    huber_loss = TT.switch(TT.ge(l1_diff, delta), (2 * l1_diff - 1) * delta,
                           l1_diff**2)
    return huber_loss
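A small numpy sketch of the piecewise behaviour above for delta=1 (the default): the loss is quadratic for residuals below the threshold and grows linearly beyond it.

import numpy as np

def huber_np(y_hat, target, delta=1.0, center=0.0, std=1.0):
    l1 = np.abs((target - center - y_hat) / std)
    return np.where(l1 >= delta, (2 * l1 - 1) * delta, l1 ** 2)

print(huber_np(0.0, 0.5))   # 0.25  (quadratic region)
print(huber_np(0.0, 3.0))   # 5.0   (linear region: 2*3 - 1)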
Esempio n. 49
0
def snelu(X):
    scale = 1.0507009873554804934193349852946
    alpha = 1.6732632423543772848170429916717
    return scale * T.switch(T.ge(X, 0), X, alpha * T.exp(X) - alpha)
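The two constants above are the standard SELU scale and alpha. A quick numpy check (illustrative only) that the two branches meet continuously at zero and that the negative branch saturates near -scale*alpha:

import numpy as np

scale = 1.0507009873554804934193349852946
alpha = 1.6732632423543772848170429916717

def snelu_np(x):
    return scale * np.where(x >= 0, x, alpha * np.exp(x) - alpha)

print(snelu_np(np.array([0.0, 1e-6, -1e-6])))   # values near 0 on both sides
print(snelu_np(np.array([-10.0])))              # close to -scale*alpha, about -1.758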
Esempio n. 50
0
    def apply(self, application_call, words, mask):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            steps and B is the batch size. Note that this axis order is
            different from what all RNN bricks consume, hence the axes
            should be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.

        """
        word_ids = self._word_to_id(words)

        # shortlisting
        input_word_ids = (tensor.lt(word_ids, self._num_input_words) * word_ids
                          + tensor.ge(word_ids, self._num_input_words) * self._vocab.unk)
        output_word_ids = (tensor.lt(word_ids, self._num_output_words) * word_ids
                          + tensor.ge(word_ids, self._num_output_words) * self._vocab.unk)

        application_call.add_auxiliary_variable(
            unk_ratio(input_word_ids, mask, self._vocab.unk),
            name='unk_ratio')

        # Run the main rnn with combined inputs
        rnn_inputs = self._main_lookup.apply(input_word_ids)

        encoder_rnn_states = self._encoder_rnn.apply(
            tensor.transpose(self._encoder_fork.apply(rnn_inputs), (1, 0, 2)),
            mask=mask.T)[0]

        # The first token is not predicted
        logits = self._pre_softmax.apply(main_rnn_states[:-1])
        targets = output_word_ids.T[1:]
        out_softmax = self._softmax.apply(logits, extra_ndim=1)
        application_call.add_auxiliary_variable(
                out_softmax.copy(), name="proba_out")
        minus_logs = self._softmax.categorical_cross_entropy(
            targets, logits, extra_ndim=1)

        targets_mask = mask.T[1:]
        costs = self.add_perplexity_measure(application_call, minus_logs,
                               targets_mask,
                               "perplexity")

        missing_embs = tensor.eq(input_word_ids, self._vocab.unk).astype('int32') # (bs, L)
        self.add_perplexity_measure(application_call, minus_logs,
                               targets_mask * missing_embs.T[:-1],
                               "perplexity_after_mis_word_embs")
        self.add_perplexity_measure(application_call, minus_logs,
                               targets_mask * (1-missing_embs.T[:-1]),
                               "perplexity_after_word_embs")

        word_counts = self._word_to_count(words)
        very_rare_masks = []
        for threshold in self._very_rare_threshold:
            very_rare_mask = tensor.lt(word_counts, threshold).astype('int32')
            very_rare_mask = targets_mask * (very_rare_mask.T[:-1])
            very_rare_masks.append(very_rare_mask)
            self.add_perplexity_measure(application_call, minus_logs,
                                   very_rare_mask,
                                   "perplexity_after_very_rare_" + str(threshold))

        if self._retrieval:
            has_def = tensor.zeros_like(output_word_ids)
            has_def = tensor.inc_subtensor(has_def[def_map[:,0], def_map[:,1]], 1)
            mask_targets_has_def = has_def.T[:-1] * targets_mask # (L-1, bs)
            self.add_perplexity_measure(application_call, minus_logs,
                                   mask_targets_has_def,
                                   "perplexity_after_def_embs")

            for thresh, very_rare_mask in zip(self._very_rare_threshold, very_rare_masks):
                self.add_perplexity_measure(application_call, minus_logs,
                                   very_rare_mask * mask_targets_has_def,
                                   "perplexity_after_def_very_rare_" + str(thresh))

            application_call.add_auxiliary_variable(
                    mask_targets_has_def.T, name='mask_def_emb')

        return costs, updates
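The "shortlisting" step above maps every word id at or beyond the vocabulary cut-off to the UNK id using an lt/ge mask. A minimal numpy sketch of the same trick (the cut-off and UNK id here are made-up values for illustration; the original uses self._num_input_words and self._vocab.unk):

import numpy as np

num_input_words = 5
unk = 0                                   # hypothetical UNK id
word_ids = np.array([[1, 4, 7, 2],
                     [9, 3, 5, 1]])

input_word_ids = ((word_ids < num_input_words) * word_ids
                  + (word_ids >= num_input_words) * unk)
print(input_word_ids)                     # ids >= 5 replaced by 0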
Esempio n. 51
0
 def leaky(self, X):
     return T.switch(T.ge(X, 0), X, self.leak * X)
Esempio n. 52
0
        gsums = [
            theano.shared(np.zeros_like(param.get_value(borrow=True)))
            for param in net.params
        ]

    cost = net.cost(y) + L2_REG * net.L2_sqr

    gparams = T.grad(cost, net.params)
    updates = OrderedDict()

    # Compute norm of gradients
    norm = T.sqrt(T.sum([T.sum(gparam**2) for gparam in gparams]))

    # Adagrad: "Adaptive subgradient methods for online learning and stochastic optimization" (2011)
    for gparam, param, gsum in zip(gparams, net.params, gsums):
        gparam = T.switch(T.ge(norm, CLIPPING_THRESHOLD),
                          gparam / norm * CLIPPING_THRESHOLD,
                          gparam)  # Clipping of gradients
        updates[gsum] = gsum + (gparam**2)
        updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6)))

    train_model = theano.function(inputs=[x, p, y, lr],
                                  outputs=cost,
                                  updates=updates)

    validate_model = theano.function(inputs=[x, p, y], outputs=net.cost(y))

    print("Training...")
    for epoch in range(starting_epoch, MAX_EPOCHS):
        t0 = time()
        total_neg_log_likelihood = 0
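For reference, a self-contained numpy restatement of the Adagrad update with global-norm clipping used above (hypothetical helper and names; unlike the symbolic Theano graph, concrete values allow a plain Python `if` for the clipping test):

import numpy as np

CLIPPING_THRESHOLD = 5.0
lr = 0.1

def adagrad_step(params, grads, gsums, eps=1e-6):
    norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    if norm >= CLIPPING_THRESHOLD:                    # global-norm clipping
        grads = [g / norm * CLIPPING_THRESHOLD for g in grads]
    for i, (p, g) in enumerate(zip(params, grads)):
        gsums[i] += g ** 2                            # accumulate squared gradients
        params[i] = p - lr * g / np.sqrt(gsums[i] + eps)
    return params, gsums

params = [np.ones(3)]
gsums = [np.zeros(3)]
params, gsums = adagrad_step(params, [np.array([0.5, -2.0, 10.0])], gsums)
print(params[0])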
Esempio n. 53
0
def SGMGHMC_p(tparams, cost, inps, ntrain, lr, rho=0.9, epsilon=1e-6, clip_norm=0.1):
    """ Additional parameters """
    mom_tparams = OrderedDict()
    xi_tparams = OrderedDict()
    for k, p0 in tparams.iteritems():
        mom_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_mom'%k) 
        xi_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10, name='%s_xi'%k) 
    
    a = theano.shared(numpy_floatX(2.))
    m_org = theano.shared(numpy_floatX(5.))
    c = theano.shared(numpy_floatX(5.))
    sigma_p = theano.shared(numpy_floatX(10.))
    sigma_xi = theano.shared(numpy_floatX(0.001))
    gamma_xi = theano.shared(numpy_floatX(1))
    logger = logging.getLogger('eval_ptb_sgmgnht')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('eval_ptb_sgmgnht.log')
    logger.info('a {} m {} c {} s_p{} s_xi{} g_xi{}'.format(a.get_value(), m_org.get_value(), c.get_value(), sigma_p.get_value(), sigma_xi.get_value(), gamma_xi.get_value()))
    
    p = tensor.vector('p', dtype='float32')
    
    """ default: lr=0.001 """
    
    trng = RandomStreams(123)
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    grads = [tensor.switch(tensor.ge(norm, clip_norm), g*clip_norm/norm, g)
             for g in grads]
        
    gshared = [theano.shared(p0.get_value() * 0., name='%s_grad'%k) 
                for k, p0 in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)     
    
    updates = []
    # reset mom
    # counter = theano.shared(numpy_floatX(0.))
    # updates.append((counter,counter+1))
 
    for p, mom, xi, g in zip(tparams.values(),mom_tparams.values(),xi_tparams.values(), gshared):
        # RMSprop-style estimate of the squared gradients, used to set the mass m
        t = theano.shared(p.get_value() * 0.)
        t_new = rho * t + (1-rho) * g**2
        updates.append((t, t_new))

        m = (tensor.sqrt(t_new) + 1e-10)
        m = m/tensor.max(m)*m_org
        #m = tensor.switch(tensor.ge(m,1*m_org), 1*m_org, m)
        m = tensor.switch(tensor.le(m,m_org*0.01), m_org*0.01, m)
        
        g_f = tensor.sgn(mom)/m*(tensor.abs_(mom)**(1/a))
        K_f = -g_f + 2/c*(c*g_f + tensor.log(1+tensor.exp(-c*g_f)))
        
        psi_f_1 = (1- tensor.exp(-c*g_f) )/( 1 + tensor.exp(-c*g_f) )
        f1_f_1 = 1/m/a*psi_f_1*(tensor.abs_(mom+1e-100)**(1/a-1))
        psi_grad_f_1 = 2*c*tensor.exp(- c*g_f)/(1 + tensor.exp(-c*g_f))**2
        f3_f_1 =  1/m**2/a**2*(psi_f_1**2-psi_grad_f_1)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f_1*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2)
        
        psi_f = (tensor.exp(c*g_f) - 1)/(tensor.exp(c*g_f) + 1)
        f1_f = 1/m/a*psi_f*(tensor.abs_(mom+1e-100)**(1/a-1))
        psi_grad_f = 2*c*tensor.exp(c*g_f)/(tensor.exp(c*g_f) + 1)**2
        f3_f =  1/m**2/a**2*(psi_f**2-psi_grad_f)*tensor.abs_(mom+1e-100)**(2/a-2) - (1/a-1)/m/a*psi_f*tensor.sgn(mom)*tensor.abs_(mom+1e-100)**(1/a-2)
 
        temp_f1 = tensor.switch(tensor.ge(g_f,0), f1_f_1, f1_f)
        temp_f3 = tensor.switch(tensor.ge(g_f,0), f3_f_1, f3_f)       


        noise_p = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        noise_xi = trng.normal(p.get_value().shape, avg = 0.0, std = 1., 
                          dtype='float32')
        
        #lr_new = 1 / tensor.sqrt(tensor.abs_(temp_f1)) * lr 
        lr_new = lr
        updated_p = p +  temp_f1 * lr_new
        #updated_mom = (mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p)* (1-tensor.eq(tensor.mod(iterations,100),0)) + randmg * tensor.eq(tensor.mod(iterations,100),0)
        updated_mom = mom - 1.2*temp_f1* xi *lr_new  - g * lr_new * ntrain + tensor.sqrt(2*sigma_p*lr_new) * noise_p
        updated_xi = xi + temp_f3* sigma_xi * lr_new - (xi - sigma_p)*gamma_xi*lr_new + tensor.sqrt(2*sigma_xi*gamma_xi*lr_new) * noise_xi 
        

        updates.append((p, updated_p))
            
        updates.append((mom, updated_mom))
        updates.append((xi, updated_xi))
    
    f_update = theano.function([lr,ntrain], [p,mom,m], updates=updates)
    
    return f_grad_shared, f_update
Esempio n. 54
0
 def elu(self, X):
     return T.switch(T.ge(X, 0), X, self.elu_param * (T.exp(X) - 1))
Esempio n. 55
0
def ge(x, y):
    return T.ge(x, y)
Esempio n. 56
0
    def RMSprop(self,
                cost,
                params,
                full_params,
                sampled_params,
                sidxs,
                epsilon=1e-6):
        grads = [T.grad(cost=cost, wrt=param) for param in params]
        sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
        updates = OrderedDict()
        if self.grad_cap > 0:
            norm = T.cast(
                T.sqrt(
                    T.sum([
                        T.sum([T.sum(g**2) for g in g_list])
                        for g_list in grads
                    ]) + T.sum([T.sum(g**2) for g in sgrads])),
                theano.config.floatX)
            grads = [[
                T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                         g) for g in g_list
            ] for g_list in grads]
            sgrads = [
                T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                         g) for g in sgrads
            ]
        for p_list, g_list in zip(params, grads):
            for p, g in zip(p_list, g_list):
                if self.adapt:
                    if self.adapt == 'adagrad':
                        g = self.adagrad(p, g, updates)
                    if self.adapt == 'rmsprop':
                        g = self.rmsprop(p, g, updates)
                    if self.adapt == 'adadelta':
                        g = self.adadelta(p, g, updates)
                    if self.adapt == 'adam':
                        g = self.adam(p, g, updates)
                if self.momentum > 0:
                    velocity = theano.shared(p.get_value(borrow=False) * 0.,
                                             borrow=True)
                    velocity2 = self.momentum * velocity - np.float32(
                        self.learning_rate) * (g + self.lmbd * p)
                    updates[velocity] = velocity2
                    updates[p] = p + velocity2
                else:
                    updates[p] = p * np.float32(1.0 - self.learning_rate *
                                                self.lmbd) - np.float32(
                                                    self.learning_rate) * g

        fgrads = [
            T.grad(cost=cost, wrt=full_param) for full_param in full_params
        ]
        for p, g in zip(full_params, fgrads):
            if self.adapt:
                if self.adapt == 'adagrad':
                    g = self.adagrad(p, g, updates)
                if self.adapt == 'rmsprop':
                    g = self.rmsprop(p, g, updates)
                if self.adapt == 'adadelta':
                    g = self.adadelta(p, g, updates)
                if self.adapt == 'adam':
                    g = self.adam(p, g, updates)
            if self.momentum > 0:
                velocity = theano.shared(p.get_value(borrow=False) * 0.,
                                         borrow=True)
                velocity2 = self.momentum * velocity - np.float32(
                    self.learning_rate) * (g + self.lmbd * p)
                updates[velocity] = velocity2
                updates[p] = p + velocity2
            else:
                updates[p] = p * np.float32(1.0 - self.learning_rate *
                                            self.lmbd) - np.float32(
                                                self.learning_rate) * g
        '''
        for i in range(len(sgrads)):
            g = sgrads[i]
            fullP = full_params[i]
            sample_idx = sidxs[i]
            sparam = sampled_params[i]
            if self.adapt:
                if self.adapt == 'adagrad':
                    g = self.adagrad(fullP, g, updates, sample_idx)
                if self.adapt == 'rmsprop':
                    g = self.rmsprop(fullP, g, updates, sample_idx)
                if self.adapt == 'adadelta':
                    g = self.adadelta(fullP, g, updates, sample_idx)
                if self.adapt == 'adam':
                    g = self.adam(fullP, g, updates, sample_idx)
            if self.lmbd > 0:
                delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
            else:
                delta = np.float32(self.learning_rate) * g
            if self.momentum > 0:
                velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
                vs = velocity[sample_idx]
                velocity2 = self.momentum * vs - delta
                updates[velocity] = T.set_subtensor(vs, velocity2)
                updates[fullP] = T.inc_subtensor(sparam, velocity2)
            else:
                updates[fullP] = T.inc_subtensor(sparam, - delta)
	'''
        return updates
Esempio n. 57
0
n_steps = tt.iscalar("generator/n_steps")
tau = tt.fscalar("generator/gumbel/tau")

# Generator's input variables for the Encoder
v_gen_input = tt.itensor3(name="generator/input")

# Generator's embedding subnetwork readout for the Encoder
v_gen_embed = lasagne.layers.get_output(l_embed_char, v_gen_input)

# Freeze the hidden inputs of the decoder layers, which do not tap into the encoder.
for layer in dec_rnn_layers:
    GRULayer_freeze(layer, v_gen_input)

# Readout the last state from the encoder.
inputs = {l_encoder_embed: v_gen_embed,
          l_encoder_mask: tt.ge(v_gen_input, 0)}
inputs[l_stack_aug_mask] = tt.gt(tt.sum(inputs[l_encoder_mask], axis=-1), 0)

outputs = [l.hid_init for l in dec_rnn_layers]

dec_hid_inits = lasagne.layers.get_output(outputs, inputs,
                                          deterministic=True)

# Prepare the initial values fed into the scan loop of the Generator
h_0 = tt.concatenate(dec_hid_inits, axis=-1)

x_0 = tt.fill(tt.zeros((v_gen_input.shape[0],), dtype="int32"),
              vocab.index("\x02"))
x_0 = lasagne.layers.get_output(l_embed_char, x_0)

m_0 = tt.ones((v_gen_input.shape[0],), 'bool')
Esempio n. 58
0
 def __init__(self,
              shape,
              input_var=None,
              name=None,
              binary=True,
              deterministic=False,
              threshold=0.5,
              batch_size=100,
              n_bits=-1,
              **kwargs):
     self.rng_mrg = RandomStreams(lasagne.random.get_rng().randint(
         1, 2394349593))
     if binary == False:
         if n_bits == -1:  # no quantization at all
             super(InputLayer, self).__init__(shape=shape,
                                              input_var=input_var,
                                              name=name,
                                              **kwargs)
         else:
             # Normalize to [0 ~ 1 - 2^(-n_bits)]
             input_var_normed = input_var * (1 - 2**(-n_bits))
             if deterministic == False:
                 shape_rand = list(shape)
                 if shape_rand[0] is None:
                     shape_rand[0] = batch_size
                 shape_rand = tuple(shape_rand)
                 input_var_ceil = T.ceil(
                     input_var_normed * 2**n_bits) / 2**n_bits
                 input_var_floor = T.floor(
                     input_var_normed * 2**n_bits) / 2**n_bits
                 input_var_above_floor = input_var - input_var_floor
                 input_var_stochastic_quantized = T.cast(
                     T.switch(
                         T.ge(
                             input_var_above_floor,
                             self.rng_mrg.uniform(
                                 shape_rand,
                                 low=0.0,
                                 high=2**(-n_bits),
                                 dtype=theano.config.floatX)),
                         input_var_ceil, input_var_floor),
                     theano.config.floatX)
                 super(InputLayer, self).__init__(
                     shape=shape,
                     input_var=input_var_stochastic_quantized,
                     name=name,
                     **kwargs)
             else:
                 input_var_deterministic_quantized = T.cast(
                     T.round(input_var_normed * 2**n_bits) / 2**n_bits,
                     theano.config.floatX)
                 super(InputLayer, self).__init__(
                     shape=shape,
                     input_var=input_var_deterministic_quantized,
                     name=name,
                     **kwargs)
     else:
         if deterministic == False:
             shape_rand = list(shape)
             if shape_rand[0] is None:
                 shape_rand[0] = batch_size
             shape_rand = tuple(shape_rand)
             # Bernoulli spikes
             input_var_stochastic_binarized = T.cast(
                 T.gt(
                     input_var,
                     self.rng_mrg.uniform(shape_rand,
                                          low=0.0,
                                          high=1.0,
                                          dtype=theano.config.floatX)),
                 theano.config.floatX)
             super(InputLayer,
                   self).__init__(shape=shape,
                                  input_var=input_var_stochastic_binarized,
                                  name=name,
                                  **kwargs)
         else:
             input_var_deterministic_binarized = T.cast(
                 T.switch(T.ge(input_var, threshold), 1.0, 0.),
                 theano.config.floatX)
             super(InputLayer, self).__init__(
                 shape=shape,
                 input_var=input_var_deterministic_binarized,
                 name=name,
                 **kwargs)
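A short numpy sketch (illustrative only, not the layer's API) of the stochastic quantization branch above: a value is rounded up to the next 1/2**n_bits grid point with probability proportional to its distance above the floor, so the quantized value is unbiased in expectation. The sketch measures the distance from the normalized value, which is the assumption behind that unbiasedness claim.

import numpy as np

rng = np.random.RandomState(0)
n_bits = 4
x = np.full(100000, 0.3) * (1 - 2 ** (-n_bits))   # normalize as above

ceil_ = np.ceil(x * 2 ** n_bits) / 2 ** n_bits
floor_ = np.floor(x * 2 ** n_bits) / 2 ** n_bits
above = x - floor_
u = rng.uniform(0.0, 2 ** (-n_bits), size=x.shape)
quantized = np.where(above >= u, ceil_, floor_)

print(quantized.mean(), x.mean())   # the two means are close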
Esempio n. 59
0
    def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn,
             Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm,
             relrnorm, relArnorml, Anorm, flag, *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params:1 * n_params]
        r1s = args[1 * n_params:2 * n_params]
        r2s = args[2 * n_params:3 * n_params]
        r3s = args[3 * n_params:4 * n_params]
        dls = args[4 * n_params:5 * n_params]
        ds = args[5 * n_params:6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)

        r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
        r3s = [
            TT.switch(TT.ge(niter, constantX(1.)), r3 - (beta / betal) * r1,
                      r3) for r3, r1 in zip(r3s, r1s)
        ]

        alpha = inner_product(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = sqrt_inner_product(r2s, r3s)
        else:
            betan = sqrt_inner_product(r3s)
        pnorml = pnorm
        pnorm = TT.switch(
            TT.eq(niter, constantX(0.)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [
            TT.switch(TT.neq(gamma, constantX(0.)),
                      (v - epln * dl2 - dlta * dl) / gamma, v)
            for v, dl2, dl in zip(vs, dl2s, dls)
        ]
        d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                           sqrt_inner_product(ds), constantX(numpy.inf))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = sqrt_inner_product(xs)
        xs = [
            TT.switch(TT.ge(xnorm, maxxnorm), dl2, x)
            for dl2, x in zip(dl2s, xs)
        ]

        flag = TT.switch(TT.ge(xnorm, maxxnorm), constantX(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, constantX(6.))
        Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(
            flag_no_6,
            TT.switch(
                TT.eq(niter, constantX(0.)),
                TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                TT.sqrt(
                    TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) +
                    TT.sqr(betan))), Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        #Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = constantX(1) + relrnorm
        t2 = constantX(1) + relArnorml

        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, constantX(0)), TT.eq(flag,
                                                           constantX(6))),
            multiple_switch(TT.le(t1, constantX(1)), constantX(3),
                            TT.le(t2, constantX(1)), constantX(4),
                            TT.le(relrnorm, rtol), constantX(1),
                            TT.le(Anorm, constantX(1e-20)), constantX(12),
                            TT.le(relArnorml, rtol), constantX(10),
                            TT.ge(epsx, beta1), constantX(5),
                            TT.ge(xnorm, maxxnorm), constantX(6),
                            TT.ge(niter, TT.cast(maxit, theano.config.floatX)),
                            constantX(8), flag), flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm), constantX(11.),
                         flag)
        return [niter + constantX(1.),
                beta,
                betan,
                phi,
                Acond,
                cs,
                dbarn,
                eplnn,
                rnorm,
                sn,
                Tnorm,
                rnorml,
                xnorm,
                Dnorm,
                gamma,
                pnorm,
                gammal,
                Axnorm,
                relrnorm,
                relArnorml,
                Anorm,
                flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag, 0))
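The long comment at the top of loop describes the symmetric Lanczos recurrence that drives this iteration. Below is a compact numpy sketch of that recurrence alone (illustrative only; it omits the preconditioner, Givens rotations, and stopping logic of the solver above):

import numpy as np

def lanczos(A, b, k):
    # Symmetric Lanczos recurrence (no preconditioning):
    #   p_j        = A v_j - beta_j v_{j-1}
    #   alpha_j    = v_j' p_j
    #   q          = p_j - alpha_j v_j
    #   beta_{j+1} = ||q||,  v_{j+1} = q / beta_{j+1}
    n = len(b)
    V = np.zeros((n, k + 1))
    alphas = []
    betas = [np.linalg.norm(b)]
    V[:, 0] = b / betas[0]
    v_prev = np.zeros(n)
    for j in range(k):
        p = A.dot(V[:, j]) - betas[-1] * v_prev
        alphas.append(V[:, j].dot(p))
        q = p - alphas[-1] * V[:, j]
        betas.append(np.linalg.norm(q))
        v_prev = V[:, j]
        V[:, j + 1] = q / betas[-1]
    return V[:, :k], np.array(alphas), np.array(betas[1:])

A = np.diag([1., 2., 3., 4.])
b = np.ones(4)
V, alphas, betas = lanczos(A, b, 3)
print(np.allclose(V.T.dot(V), np.eye(3)))   # the Lanczos vectors are orthonormal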
Esempio n. 60
0
def clip_gradients(gradients, grad_clip=5., hard_clip=False):
    """
    This returns the gradients clipped according to the given `grad_clip` value.

    As described here: http://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/

    Code mostly taken from https://github.com/kastnerkyle/minet/blob/master/minet/net.py

    Based on:

    Pascanu, Razvan, Tomas Mikolov, and Yoshua Bengio. "On the difficulty of training
            recurrent neural networks." arXiv preprint arXiv:1211.5063 (2012).

    Parameters
    ----------
    gradients : dict
        A dictionary mapping from the model's parameters to their
        gradients.
    grad_clip : float, optional
        How much to clip gradients (if at all).
    hard_clip : bool
        Whether to use hard clipping (keeping gradients at grad_clip level), or soft clipping (rescaling based
        on grad_clip).

    Returns
    -------
    clipgrads : dict
        A dictionary mapping from the model's parameters to their correctly clipped
        gradients. (If `grad_clip` is falsy, this just returns the original `gradients` input.)
    """
    if grad_clip:
        gradients = gradients.items()
        params = [item[0] for item in gradients]
        grads = [item[1] for item in gradients]

        # Gradient clipping
        # accumulate the squared gradient norm, then take the square root only once
        grad_norm = sum([T.sqr(grad).sum() for grad in grads])
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        grad_norm = T.sqrt(grad_norm)
        scaling_num = grad_clip
        scaling_den = T.maximum(grad_clip, grad_norm)

        if hard_clip:
            # do the NaN/inf trick
            grads = [T.switch(not_finite,
                              0.1 * param,
                              grad)
                     for param, grad in gradients]
            # hard clip gradients above or below grad_clip to be = grad_clip
            grads = [T.switch(T.ge(grad_norm, grad_clip),
                              T.sgn(grad) * grad_clip,
                              grad)
                     for grad in grads]
        else:
            # NaN/inf trick combined with scaling.
            grads = [T.switch(not_finite,
                              0.1 * param,
                              grad * (scaling_num / scaling_den))
                     for param, grad in gradients]

        clipgrads = OrderedDict(zip(params, grads))
        return clipgrads
    else:
        return gradients
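A minimal usage sketch for clip_gradients (assuming Theano is available and the function above is in scope; the parameter and cost here are made up for illustration):

import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict

W = theano.shared(np.ones((3,), dtype=theano.config.floatX), name='W')
x = T.vector('x')
cost = T.sum((W * x) ** 2)

gradients = OrderedDict([(W, T.grad(cost, W))])
clipped = clip_gradients(gradients, grad_clip=5., hard_clip=False)

updates = [(p, p - 0.01 * g) for p, g in clipped.items()]
train = theano.function([x], cost, updates=updates)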