Example no. 1
def gradients(cost, parameters, lr=0.001):

    updates = []

    c = 0
    for param in parameters:

        update = param - lr * theano.grad(cost, param)

        if c == 1 or c == 3:

            # update = t.minimum(t.abs_(update), np.pi) * (update / abs(update))
            #
            # update = t.maximum(update, 0)
            # update = t.minimum(update, np.pi)

            update = ifelse(t.lt(update, 0), np.pi * 2 - 0.001, update)
            update = ifelse(t.gt(update, np.pi * 2), 0.001, update)

        if c == 2:

            update = ifelse(t.lt(update, 2), float(20), update)

        elif c == 5 or c == 6:

            update = t.maximum(update, -5)
            update = t.minimum(update, 5)

        updates.append((param, update))

        c += 1

    return updates
Example no. 2
def theano_digitize(x, bins):
    """
    Equivalent to numpy digitize.

    Parameters
    ----------
    x : Theano tensor or array_like
        The array or matrix to be digitized
    bins : array_like
        The bins with which x should be digitized

    Returns
    -------
    A Theano tensor
        The indices of the bins to which each value in input array belongs.
    """
    binned = T.zeros_like(x) + len(bins)
    for i in range(len(bins)):
        bin=bins[i]
        if i == 0:
            binned=T.switch(T.lt(x,bin),i,binned)
        else:
            ineq = T.and_(T.ge(x,bins[i-1]),T.lt(x,bin))
            binned=T.switch(ineq,i,binned)
    binned=T.switch(T.isnan(x), len(bins), binned)
    return binned
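
As a quick sanity check of the helper above, the following minimal sketch (not part of the original listing; it assumes Theano and NumPy are installed and uses illustrative names) compiles the graph and compares it with numpy.digitize:

import numpy as np
import theano
import theano.tensor as T

x = T.dvector('x')
bins = np.array([0.0, 1.0, 2.0])
digitize = theano.function([x], theano_digitize(x, bins))

xs = np.array([-0.5, 0.3, 1.7, 5.0])
print(digitize(xs))           # bin indices [0, 1, 2, 3] (as floats)
print(np.digitize(xs, bins))  # reference: [0 1 2 3]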
Example no. 3
def irprop_minus_updates(params, grads):

    # IRPROP- parameters
    updates = []
    deltas = 0.1*numpy.ones(len(params))
    last_params = params
    
    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50.
    minStep = math.exp(-6)

    for param, gparam, delta, last_gparam in zip(params, grads, deltas, last_params):
        # calculate change
        change = T.sgn(gparam * last_gparam)
        if T.gt(change, 0) :
            delta = T.minimum(delta * positiveStep, maxStep)
            
            if T.lt(delta, minStep):
                delta = minStep
                
        elif T.lt(change, 0):
            delta = T.maximum(delta * negativeStep, minStep)
            
            if T.gt(delta, maxStep):
                delta = maxStep
            last_gparam = 0
            
        # update the weights
        updates.append((param, param - T.sgn(gparam) * delta))
        # store old change
        last_gparam = gparam

    return updates
Example no. 4
def _backward_negative_z(inputs, weights, normed_relevances, bias=None):
    inputs_plus = inputs * T.gt(inputs, 0)
    weights_plus = weights * T.gt(weights, 0)
    inputs_minus = inputs * T.lt(inputs, 0)
    weights_minus = weights * T.lt(weights, 0)
    # Compute weights+ * inputs- and weights- * inputs+
    negative_part_a = conv2d(
        normed_relevances, weights_plus.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full"
    )
    negative_part_a *= inputs_minus
    negative_part_b = conv2d(
        normed_relevances, weights_minus.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full"
    )
    negative_part_b *= inputs_plus

    together = negative_part_a + negative_part_b
    if bias is not None:
        bias_negative = bias * T.lt(bias, 0)
        bias_relevance = bias_negative.dimshuffle("x", 0, "x", "x") * normed_relevances
        # Divide bias by weight size before convolving back
        # mean across channel, 0, 1 dims (hope this is correct?)
        fraction_bias = bias_relevance / T.prod(weights.shape[1:]).astype(theano.config.floatX)
        bias_rel_in = conv2d(
            fraction_bias, T.ones_like(weights).dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full"
        )
        together += bias_rel_in
    return together
Example no. 5
	def build_update(self, alpha=0.01, beta=0.0):
		W = self.W
		lambda_mult=self.lambda_mult
		y=self.y
		C = self.C
		lower_bound = theano.shared(np.float32(0.0))
		
		updates = build_gradDescent_step(W, lambda_mult, alpha,beta)
		updatelambda_mult = updates[1]  # \Longleftrightarrow  <<===>> \lambda_i'(t+1)
		
		updatelambda_mult = updatelambda_mult - T.dot(y,updatelambda_mult)/T.dot(y,y) * y 	# Longleftrightarrow <<===>> \lambda_i''(t+1)
		
		# use theano.tensor.switch because we need an elementwise comparison 
		# if \lambda_I''(t+1)> C, C
		updatelambda_mult = T.switch( T.lt( C , updatelambda_mult), C, updatelambda_mult)
		updatelambda_mult = T.switch( T.lt( updatelambda_mult,lower_bound), lower_bound, updatelambda_mult)
		
		updatelambda_mult = sandbox.cuda.basic_ops.gpu_from_host( updatelambda_mult)

		updatefunction = theano.function(inputs=[], 
										outputs = W,
										updates=[(lambda_mult, updatelambda_mult)])

		self._update_lambda_mult_graph = updatelambda_mult
		self.update_function = updatefunction

		return updatelambda_mult, updatefunction
Example no. 6
 def __init__(self, x, lower, upper, *args, **kwargs):
     super(Uniform, self).__init__(*args, **kwargs)
     self._logp = T.log(T.switch(T.gt(x, upper), 0, T.switch(T.lt(x, lower), 0, 1/(upper - lower))))
     self._cdf = T.switch(T.gt(x, upper), 1, T.switch(T.lt(x, lower), 0, (x - lower)/(upper - lower)))
     self._add_expr('x', x)
     self._add_expr('lower', lower)
     self._add_expr('upper', upper)
Example no. 7
File: rae.py Project: zomux/nlpy
    def _recursive_step(self, i, regs, tokens, seqs, back_routes, back_lens):
        seq = seqs[i]
        # Encoding
        left, right, target = seq[0], seq[1], seq[2]

        left_rep = ifelse(T.lt(left, 0), tokens[-left], regs[left])
        right_rep = ifelse(T.lt(right, 0), tokens[-right], regs[right])

        rep = self._encode_computation(left_rep, right_rep)

        if self.deep:
            inter_rep = rep
            rep = self._deep_encode(inter_rep)
        else:
            inter_rep = T.constant(0)


        new_regs = T.set_subtensor(regs[target], rep)

        back_len = back_lens[i]

        back_reps, lefts, rights = self._unfold(back_routes[i], new_regs, back_len)
        gf_W_d1, gf_W_d2, gf_B_d1, gf_B_d2, distance, rep_gradient = self._unfold_gradients(back_reps, lefts, rights, back_routes[i],
                                                                    tokens, back_len)

        return ([rep, inter_rep, left_rep, right_rep, new_regs, rep_gradient, distance],
                self.decode_optimizer.setup([self.W_d1, self.W_d2, self.B_d1, self.B_d2],
                                    [gf_W_d1, gf_W_d2, gf_B_d1, gf_B_d2], method=self.optimization, beta=self.beta))
Example no. 8
def tnormal_icdf(size, avg, std, lbound, ubound, theano_rng, dtype):
    """
    Alternative Method:
    sample = -Phi_inv(Phi(-lbound)*(1-u) + Phi(-ubound)*u)
    """

    def Phi(x):
        erfarg = (x - avg) / (std * SQRT2)
        rval = 0.5 * (1. + T.erf(erfarg))
        return rval.astype(dtype)
    
    def Phi_inv(y, eps=3e-8):
        """ eps was calibrated for cublas.erfinv using float32 """
        temp = 2. * y - 1.
        erfinv_input = T.clip(temp, -1+eps, 1-eps)
        rval = avg + std * SQRT2 * T.erfinv(erfinv_input)
        return rval.astype(dtype)

    # center lower and upper bounds based on mean
    u = theano_rng.uniform(size=size, dtype=dtype)

    # Inverse CDF method. When method becomes numerically unstable, we simply
    # return the bounds based on whether avg < lbound, or ubound < avg.
    cdf_range = Phi(ubound) - Phi(lbound)
    sample = T.switch(
                T.or_(
                    T.lt(cdf_range, 3e-8),
                    T.gt(cdf_range, 1-3e-8)),
                T.switch(
                    T.lt(avg, lbound),
                    lbound,
                    ubound),
                Phi_inv(Phi(lbound) + u * cdf_range))

    return sample
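
A minimal usage sketch for the sampler above (not from the original source): it assumes a module-level SQRT2 = np.sqrt(2.), as the code implies, and uses Theano's shared random streams; all names and bounds are illustrative.

import numpy as np
import theano
from theano.tensor.shared_randomstreams import RandomStreams

SQRT2 = np.sqrt(2.)
rng = RandomStreams(seed=42)
sample = tnormal_icdf(size=(1000,), avg=0., std=1., lbound=-1., ubound=2.,
                      theano_rng=rng, dtype='float64')
draw = theano.function([], sample)
s = draw()
print(s.min(), s.max())  # every draw should lie inside [-1, 2]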
Example no. 9
def generate_subpop_input(r_E, r_I, n_pairs):
    
    c = T.scalar("c", dtype='float32')
    h = T.matrix("h", dtype='float32')
    W_EE = T.tensor3("W_EE", dtype='float32')
    W_EI = T.tensor3("W_EI", dtype='float32')
    W_IE = T.tensor3("W_IE", dtype='float32')
    W_II = T.tensor3("W_II", dtype='float32')

    r_e = T.matrix("r_e", dtype='float32')
    r_i = T.matrix("r_i", dtype='float32')

    I_E = T.matrix('I_E', dtype='float32')
    I_I = T.matrix('I_I', dtype='float32')

    I_thresh_E = T.matrix('I_thresh_E', dtype='float32')
    I_thresh_I = T.matrix('I_thresh_I', dtype='float32')

    # Compile functions:
    I_E = c*h + T.sum(T.sum(W_EE*r_e,1),1).reshape((n_pairs, n_pairs)).T - T.sum(T.sum(W_EI*r_i,1),1).reshape((n_pairs, n_pairs)).T
    I_I = c*h + T.sum(T.sum(W_IE*r_e,1),1).reshape((n_pairs, n_pairs)).T - T.sum(T.sum(W_II*r_i,1),1).reshape((n_pairs, n_pairs)).T

    I_thresh_E = T.switch(T.lt(I_E,0), 0, I_E)
    I_thresh_I = T.switch(T.lt(I_I,0), 0, I_I)

    inputs = theano.function(inputs=[c,h,W_EE,W_EI,W_IE,W_II],
                                outputs=[I_thresh_E, I_thresh_I],
                                givens={r_e:r_E, r_i:r_I},
                                allow_input_downcast=True)
    return inputs
Example no. 10
File: uniform.py Project: ibab/carl
    def __init__(self, random_state=None, low=0.0, high=1.0):
        super(Uniform, self).__init__(low=low, high=high,
                                      random_state=random_state,
                                      optimizer=None)

        # pdf
        self.pdf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            0.,
            1. / (self.high - self.low)).ravel()
        self.make_(self.pdf_, "pdf")

        # -log pdf
        self.nnlf_ = T.switch(
            T.or_(T.lt(self.X, self.low), T.ge(self.X, self.high)),
            np.inf,
            T.log(self.high - self.low)).ravel()
        self.make_(self.nnlf_, "nnlf")

        # cdf
        self.cdf_ = T.switch(
            T.lt(self.X, self.low),
            0.,
            T.switch(
                T.lt(self.X, self.high),
                (self.X - self.low) / (self.high - self.low),
                1.)).ravel()
        self.make_(self.cdf_, "cdf")

        # ppf
        self.ppf_ = self.p * (self.high - self.low) + self.low
        self.make_(self.ppf_, "ppf", args=[self.p])
Example no. 11
    def interval_reduction(a, b, c, d, tol):
        fc = f(c)
        fd = f(d)

        a, b, c, d = ifelse(T.lt(fc, fd), [a, d, d - golden_ratio * (d - a), c], [c, b, d, c + golden_ratio * (b - c)])

        stoprule = theano.scan_module.until(T.lt(T.abs_(c - d), tol))
        return [a, b, c, d], stoprule
Example no. 12
def rprop(param,learning_rate,gparam,mask,updates,current_cost,previous_cost,
          eta_plus=1.2,eta_minus=0.5,max_delta=50, min_delta=10e-6):
    previous_grad = sharedX(numpy.ones(param.shape.eval()),borrow=True)
    delta = sharedX(learning_rate * numpy.ones(param.shape.eval()),borrow=True)
    previous_inc = sharedX(numpy.zeros(param.shape.eval()),borrow=True)
    zero = T.zeros_like(param)
    one = T.ones_like(param)
    change = previous_grad * gparam

    new_delta = T.clip(
            T.switch(
                T.eq(gparam,0.),
                delta,
                T.switch(
                    T.gt(change,0.),
                    delta*eta_plus,
                    T.switch(
                        T.lt(change,0.),
                        delta*eta_minus,
                        delta
                    )
                )
            ),
            min_delta,
            max_delta
    )
    new_previous_grad = T.switch(
            T.eq(mask * gparam,0.),
            previous_grad,
            T.switch(
                T.gt(change,0.),
                gparam,
                T.switch(
                    T.lt(change,0.),
                    zero,
                    gparam
                )
            )
    )
    inc = T.switch(
            T.eq(mask * gparam,0.),
            zero,
            T.switch(
                T.gt(change,0.),
                - T.sgn(gparam) * new_delta,
                T.switch(
                    T.lt(change,0.),
                    zero,
                    - T.sgn(gparam) * new_delta
                )
            )
    )

    updates.append((previous_grad,new_previous_grad))
    updates.append((delta,new_delta))
    updates.append((previous_inc,inc))
    return param + inc * mask
Example no. 13
 def get_output_for(self, input, deterministic=False, **kwargs):
     if deterministic or self.rate == 0:
         return input
     else:
         drop = self._srng.uniform(input.shape)
         z = T.lt(drop, 0.5 * self.rate)
         o = T.lt(T.abs_(drop - 0.75 * self.rate), 0.25 * self.rate)
         input = T.set_subtensor(input[z.nonzero()], 0.)
         input = T.set_subtensor(input[o.nonzero()], 1.)
         return input
Example no. 14
def berhu(predictions, targets,s=0.2,l=0.5,m=1.2):
    # Compute mask
    mask = T.gt(targets, l) * T.lt(targets,m)

    # Compute n of valid pixels
    n_valid = T.sum(mask)
    # Redundant mult here 
    r = (predictions - targets) * mask
    c = s * T.max(T.abs_(r))
    a_r = T.abs_(r)
    b = T.switch(T.lt(a_r, c), a_r, ((r**2) + (c**2))/(2*c))
    return T.sum(b)/n_valid
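
A small illustrative call of the berHu loss above (not part of the original listing; the test values are made up). Targets outside the (l, m) = (0.5, 1.2) window are masked out of the average:

import numpy as np
import theano
import theano.tensor as T

pred = T.dmatrix('pred')
targ = T.dmatrix('targ')
berhu_loss = theano.function([pred, targ], berhu(pred, targ))

p = np.array([[0.6, 0.9], [1.0, 2.0]])
t = np.array([[0.7, 1.0], [1.1, 2.0]])  # the 2.0 target falls outside (0.5, 1.2) and is ignored
print(berhu_loss(p, t))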
Example no. 15
File: rae.py Project: zomux/nlpy
    def _bpts_step(self, i, gradient_reg, seqs, reps, inter_reps, left_subreps, right_subreps, rep_gradients):
        # BPTS
        seq = seqs[i]
        left, right, target = seq[0], seq[1], seq[2]

        left_is_token = T.lt(left, 0)
        right_is_token = T.lt(right, 0)

        bpts_gradient = gradient_reg[target]
        rep_gradient = rep_gradients[i] + bpts_gradient

        if self.deep:
            # Implementation note:
            # The gradient of the deep encoding function wrt W_ee includes the input representation.
            # Letting T.grad find that input representation directly would get stuck in an infinite loop,
            # so we must use SRG in this case.
            _fake_input_rep, = make_float_vectors("_fake_input_rep")
            deep_rep = self._deep_encode(_fake_input_rep)

            node_map = {deep_rep: reps[i], _fake_input_rep: inter_reps[i]}

            g_wee = SRG(T.grad(T.sum(deep_rep), self.W_ee), node_map) * rep_gradient
            g_bee = SRG(T.grad(T.sum(deep_rep), self.B_ee), node_map) * rep_gradient
            g_inter_rep = SRG(T.grad(T.sum(deep_rep), _fake_input_rep), node_map) * rep_gradient
            inter_rep = inter_reps[i]

        else:
            g_wee = T.constant(0)
            g_bee = T.constant(0)
            g_inter_rep = rep_gradient
            inter_rep = reps[i]

        # Accelerate computation by using saved internal values.
        # For the limitation of SRG, known_grads can not be used here.
        _fake_left_rep, _fake_right_rep = make_float_vectors("_fake_left_rep", "_fake_right_rep")
        rep_node = self._encode_computation(_fake_left_rep, _fake_right_rep)
        if self.deep:
            rep_node = self._deep_encode(rep_node)

        node_map = {_fake_left_rep: left_subreps[i], _fake_right_rep: right_subreps[i], rep_node: inter_rep}

        g_we1 = SRG(T.grad(T.sum(rep_node), self.W_e1), node_map) * g_inter_rep
        g_we2 = SRG(T.grad(T.sum(rep_node), self.W_e2), node_map) * g_inter_rep
        g_be = SRG(T.grad(T.sum(rep_node), self.B_e), node_map) * g_inter_rep

        g_left_p = SRG(T.grad(T.sum(rep_node), _fake_left_rep), node_map) * g_inter_rep
        g_right_p = SRG(T.grad(T.sum(rep_node), _fake_right_rep), node_map) * g_inter_rep

        gradient_reg = ifelse(left_is_token, gradient_reg, T.set_subtensor(gradient_reg[left], g_left_p))
        gradient_reg = ifelse(right_is_token, gradient_reg, T.set_subtensor(gradient_reg[right], g_right_p))

        return g_we1, g_we2, g_be, g_wee, g_bee, gradient_reg
Example no. 16
    def _step(
            i,
            pkm1, pkm2, qkm1, qkm2,
            k1, k2, k3, k4, k5, k6, k7, k8, r
    ):
        xk = -(x * k1 * k2) / (k3 * k4)
        pk = pkm1 + pkm2 * xk
        qk = qkm1 + qkm2 * xk
        pkm2 = pkm1
        pkm1 = pk
        qkm2 = qkm1
        qkm1 = qk

        xk = (x * k5 * k6) / (k7 * k8)
        pk = pkm1 + pkm2 * xk
        qk = qkm1 + qkm2 * xk
        pkm2 = pkm1
        pkm1 = pk
        qkm2 = qkm1
        qkm1 = qk

        old_r = r
        r = tt.switch(tt.eq(qk, zero), r, pk/qk)

        k1 += one
        k2 += k26update
        k3 += two
        k4 += two
        k5 += one
        k6 -= k26update
        k7 += two
        k8 += two

        big_cond = tt.gt(tt.abs_(qk) + tt.abs_(pk), BIG)
        biginv_cond = tt.or_(
            tt.lt(tt.abs_(qk), BIGINV),
            tt.lt(tt.abs_(pk), BIGINV)
        )

        pkm2 = tt.switch(big_cond, pkm2 * BIGINV, pkm2)
        pkm1 = tt.switch(big_cond, pkm1 * BIGINV, pkm1)
        qkm2 = tt.switch(big_cond, qkm2 * BIGINV, qkm2)
        qkm1 = tt.switch(big_cond, qkm1 * BIGINV, qkm1)

        pkm2 = tt.switch(biginv_cond, pkm2 * BIG, pkm2)
        pkm1 = tt.switch(biginv_cond, pkm1 * BIG, pkm1)
        qkm2 = tt.switch(biginv_cond, qkm2 * BIG, qkm2)
        qkm1 = tt.switch(biginv_cond, qkm1 * BIG, qkm1)

        return ((pkm1, pkm2, qkm1, qkm2,
                 k1, k2, k3, k4, k5, k6, k7, k8, r),
                until(tt.abs_(old_r - r) < (THRESH * tt.abs_(r))))
Example no. 17
    def rebuild(self):
        for i, (inputs, f) in enumerate(self.wiring):
            if not inputs:
                continue

            lin_comb = T.dot(T.concatenate([self._vlayers[j] for j in inputs], axis=1), self._vweights[i])
            add_biases = lin_comb + self._vbiases[i]
            self._vlayers[i] = f(add_biases)

        self._output = T.concatenate([self._vlayers[j] for j in self.output_layers], axis=1)

        self._targets = [T.matrix() for j in self.output_layers]
        crossentropy = sum([(T.nnet.categorical_crossentropy(self._vlayers[j], self._targets[i])
                             if self.wiring[j][1] == SOFTMAX_FUN
                             else ((self._vlayers[j] - self._targets[i]) ** 2 / (1+self._targets[i].max())**2).sum())
                            for i, j in enumerate(self.output_layers)
                            ])

        self._cost = (crossentropy.sum() + 
                      self.L2REG/(self.layers[i]) * sum((weight**2).sum() for weight in self._vweights if weight is not None)+ # + # L2 regularization
                      0.01* self.L2REG/math.sqrt(self.layers[i]) * sum((bias**2).sum() for j, bias in enumerate(self._vbiases) if bias is not None and self.wiring[j][1] != LINEAR_FUN))  # L2 regularization

        self._costnoreg = crossentropy.sum()

        self._derivatives = [None] * len(self.layers)
        self._updates = []

        MAX_DERIV = 1000
        for i, (inputs, f) in enumerate(self.wiring):
            if not inputs:
                continue
            deriv1 = T.grad(self._cost, self._vweights[i])
            deriv1p = T.switch(T.lt(deriv1, MAX_DERIV), deriv1, MAX_DERIV)
            deriv1pp = T.switch(T.gt(deriv1p, -MAX_DERIV), deriv1p, -MAX_DERIV)
            #deriv1ppp = T.switch(T.isnan(deriv1pp), 0, deriv1pp)
            deriv2 = T.grad(self._cost, self._vbiases[i])
            deriv2p = T.switch(T.lt(deriv2, MAX_DERIV), deriv2, MAX_DERIV)
            deriv2pp = T.switch(T.gt(deriv2p, -MAX_DERIV), deriv2p, -MAX_DERIV)
            #deriv2ppp = T.switch(T.isnan(deriv2pp), 0, deriv2pp)

            self._derivatives[i] = (deriv1pp, deriv2pp)

            self._updates.append((self._vweights[i], self._vweights[i] - self.learning_rate * self._derivatives[i][0]))
            self._updates.append((self._vbiases[i], self._vbiases[i] - self.learning_rate * self._derivatives[i][1]))
        self._prediction = theano.function(inputs=[self._vlayers[i] for i in self.input_layers],
                                           outputs=self._output)
        self._train = theano.function(inputs=self._targets+[self._vlayers[i] for i in self.input_layers],
                                      outputs=self._cost,
                                      updates=self._updates, allow_input_downcast=True)
                                      #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) # debug NaN
        self._costfun = theano.function(inputs=self._targets+[self._vlayers[i] for i in self.input_layers],
                                      outputs=self._costnoreg, allow_input_downcast=True)
Example no. 18
def _forward_negative_z(inputs, weights, bias=None):
    inputs_plus = inputs * T.gt(inputs, 0)
    weights_plus = weights * T.gt(weights, 0)
    inputs_minus = inputs * T.lt(inputs, 0)
    weights_minus = weights * T.lt(weights, 0)
    negative_part_a = conv2d(inputs_plus, weights_minus)
    negative_part_b = conv2d(inputs_minus, weights_plus)
    together = negative_part_a + negative_part_b
    if bias is not None:
        bias_negative = bias * T.lt(bias, 0)
        together += bias_negative.dimshuffle("x", 0, "x", "x")

    return together
Example no. 19
def relevance_conv_a_b_sign_switch(inputs, weights, out_relevances, a, b, bias=None):
    assert a is not None
    assert b is not None
    assert a - b == 1
    # For each input, determine what
    outputs = conv2d(inputs, weights)
    if bias is not None:
        outputs += bias.dimshuffle("x", 0, "x", "x")
        # do not use bias further, only to determine direction of outputs
        bias = None
    # stabilize
    # prevent division by 0 and division by small numbers
    eps = 1e-4
    outputs += T.sgn(outputs) * eps
    outputs += T.eq(outputs, 0) * eps
    positive_forward = _forward_positive_z(inputs, weights, bias)
    negative_forward = _forward_negative_z(inputs, weights, bias)
    rel_for_positive_outputs = out_relevances * T.gt(outputs, 0)
    rel_for_negative_outputs = out_relevances * T.lt(outputs, 0)

    positive_norm_with_trend = positive_forward * T.gt(outputs, 0)
    negative_norm_with_trend = negative_forward * T.lt(outputs, 0)
    # minus to make overall norm positive
    norm_with_trend = positive_norm_with_trend - negative_norm_with_trend
    # stabilize also
    norm_with_trend += T.eq(norm_with_trend, 0) * eps

    in_positive_with_trend = _backward_positive_z(inputs, weights, rel_for_positive_outputs / norm_with_trend, bias)
    in_negative_with_trend = _backward_negative_z(inputs, weights, rel_for_negative_outputs / norm_with_trend, bias)

    # Minus in_negative since in_with_trend should not switch signs
    in_with_trend = in_positive_with_trend - in_negative_with_trend

    positive_norm_against_trend = positive_forward * T.lt(outputs, 0)
    negative_norm_against_trend = negative_forward * T.gt(outputs, 0)
    # minus to make overall norm positive
    norm_against_trend = positive_norm_against_trend - negative_norm_against_trend
    # stabilize also
    norm_against_trend += T.eq(norm_against_trend, 0) * eps

    in_positive_against_trend = _backward_positive_z(
        inputs, weights, rel_for_negative_outputs / norm_against_trend, bias
    )
    in_negative_against_trend = _backward_negative_z(
        inputs, weights, rel_for_positive_outputs / norm_against_trend, bias
    )
    # Minus in_negative since switching signs is done below
    in_against_trend = in_positive_against_trend - in_negative_against_trend

    in_relevances = a * in_with_trend - b * in_against_trend
    return in_relevances
Example no. 20
  def cubicBSpline(self, L):
    b = T.zeros_like(L)

    idx4 = T.ge(L, 0) * T.lt(L, 1)
    idx3 = T.ge(L, 1) * T.lt(L, 2)
    idx2 = T.ge(L, 2) * T.lt(L, 3)
    idx1 = T.ge(L, 3) * T.le(L, 4)

    b = T.switch(T.eq(idx4, 1), T.pow(L, 3) / 6, b)
    b = T.switch(T.eq(idx3, 1), (-3*T.pow(L-1,3) + 3*T.pow(L-1,2) + 3*(L-1) + 1) / 6, b)
    b = T.switch(T.eq(idx2, 1), ( 3*T.pow(L-2,3) - 6*T.pow(L-2,2)           + 4) / 6, b)
    b = T.switch(T.eq(idx1, 1), (-  T.pow(L-3,3) + 3*T.pow(L-3,2) - 3*(L-3) + 1) / 6, b)
    
    return b.T # b is K x K' and thus, as we multiply from the right with
Example no. 21
 def learning_updates(self):
     step = self.learning_rate
     self.grads = []
     self.steps = []
     for param in self.params:
         v = param.get_value()
         n = param.name
         self.grads.append(theano.shared(np.zeros_like(v),
                                         name=n + '_grad'))
         self.steps.append(
             theano.shared(np.zeros_like(v) + step, name=n + '_step'))
     for param, step_tm1, grad_tm1 in zip(self.params, self.steps,
                                          self.grads):
         grad = TT.grad(self.J, param)
         test = grad * grad_tm1
         same = TT.gt(test, 0)
         diff = TT.lt(test, 0)
         step = TT.minimum(
             self.max_step,
             TT.maximum(
                 self.min_step,
                 step_tm1 * (TT.eq(test, 0) + same * self.step_increase +
                             diff * self.step_decrease)))
         grad = grad - diff * grad
         yield param, param - TT.sgn(grad) * step
         yield grad_tm1, grad
         yield step_tm1, step
Example no. 22
  def __init__(self, config, loss, params):
    self._lr = get_shared_floatX(config.learning_rate, 'lr')
    self._t = get_shared_floatX(1, 't')
    self._all_m_tm1 = []
    self._all_v_tm1 = []
    self._updates = [(self._t, self._t + 1)]

    if config.lr_decay:
      lr_coef = tt.pow(config.lr_decay, (self._t - 1) // config.lr_decay_freq)
      self._updates.append((self._lr, lr_coef * config.learning_rate))

    grads = theano.grad(loss, params)

    self._global_grad_norm = tt.sqrt(tt.sum(tt.stack([tt.sum(g**2.) for g in grads])))
    if config.max_grad_norm:
      global_clip_factor = ifelse(tt.lt(self._global_grad_norm, config.max_grad_norm),
        cast_floatX_np(1.),
        cast_floatX(config.max_grad_norm/self._global_grad_norm))
      grads = [global_clip_factor * g for g in grads]

    lr_t = self._lr * \
      clip_sqrt(1 - tt.pow(config.adam_beta2, self._t)) / (1 - tt.pow(config.adam_beta1, self._t))

    for p, g in zip(params, grads):
        m_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_m_' + p.name)
        v_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_v_' + p.name)
        self._all_m_tm1.append(m_tm1)
        self._all_v_tm1.append(v_tm1)
        m_t = config.adam_beta1 * m_tm1 + (1-config.adam_beta1) * g
        v_t = config.adam_beta2 * v_tm1 + (1-config.adam_beta2) * tt.sqr(g)
        delta_t = -lr_t * m_t / (clip_sqrt(v_t) + config.adam_eps)
        p_t = p + delta_t
        self._updates += [(m_tm1, m_t), (v_tm1, v_t), (p, p_t)]
Example no. 23
    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = ifelse(
            T.lt(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        mse_for_each_sample = T.mean((network_output - prediction_func)**2,
                                     axis=1)

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        J = compute_jaccobian(mse_for_each_sample, params)
        n_params = J.shape[1]

        updated_params = param_vector - T.nlinalg.matrix_inverse(
            J.T.dot(J) + new_mu * T.eye(n_params)).dot(
                J.T).dot(mse_for_each_sample)

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates
Example no. 24
def normal_lcdf(mu, sigma, x):
    z = (x - mu) / sigma
    return tt.switch(
        tt.lt(z, -1.0),
        tt.log(tt.erfcx(-z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2.,
        tt.log1p(-tt.erfc(z / tt.sqrt(2.)) / 2.)
    )
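
The switch above picks an erfcx-based expression in the far left tail, where log1p(-erfc(z/sqrt(2))/2) would lose precision. A quick check sketch against SciPy (not from the original source; it assumes tt is theano.tensor, as in the snippet):

import numpy as np
import theano
import theano.tensor as tt
from scipy import stats

x = tt.dvector('x')
lcdf = theano.function([x], normal_lcdf(0., 1., x))

xs = np.array([-8., -1.5, 0., 2.])
print(np.allclose(lcdf(xs), stats.norm.logcdf(xs)))  # expected: True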
Example no. 25
def incomplete_beta(a, b, value):
    '''Incomplete beta implementation
    Power series and continued fraction expansions chosen for best numerical
    convergence across the board based on inputs.
    '''
    machep = tt.constant(np.MachAr().eps, dtype='float64')
    one = tt.constant(1, dtype='float64')
    w = one - value

    ps = incomplete_beta_ps(a, b, value)

    flip = tt.gt(value, (a / (a + b)))
    aa, bb = a, b
    a = tt.switch(flip, bb, aa)
    b = tt.switch(flip, aa, bb)
    xc = tt.switch(flip, value, w)
    x = tt.switch(flip, w, value)

    tps = incomplete_beta_ps(a, b, x)
    tps = tt.switch(tt.le(tps, machep), one - machep, one - tps)

    # Choose which continued fraction expansion for best convergence.
    small = tt.lt(x * (a + b - 2.0) - (a - one), 0.0)
    cfe = incomplete_beta_cfe(a, b, x, small)
    w = tt.switch(small, cfe, cfe / xc)

    # Direct incomplete beta accounting for flipped a, b.
    t = tt.exp(a * tt.log(x) + b * tt.log(xc) + gammaln(a + b) - gammaln(a) -
               gammaln(b) + tt.log(w / a))

    t = tt.switch(flip, tt.switch(tt.le(t, machep), one - machep, one - t), t)
    return tt.switch(
        tt.and_(flip, tt.and_(tt.le((b * x), one), tt.le(x, 0.95))), tps,
        tt.switch(tt.and_(tt.le(b * value, one), tt.le(value, 0.95)), ps, t))
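
A check sketch against SciPy's regularized incomplete beta (not part of the original listing; it assumes the helpers referenced above, incomplete_beta_ps, incomplete_beta_cfe and gammaln, are importable from the same module):

import numpy as np
import theano
import theano.tensor as tt
from scipy import special

value = tt.dscalar('value')
betainc_fn = theano.function([value], incomplete_beta(2.0, 3.0, value))
print(np.allclose(betainc_fn(0.4), special.betainc(2.0, 3.0, 0.4)))  # expected: True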
Example no. 26
    def apply(self, application_call,
              defs, def_mask):
        """
        Returns vector per each word in sequence using the dictionary based lookup
        """
        # Short listing
        defs = (T.lt(defs, self._num_input_words) * defs
                + T.ge(defs, self._num_input_words) * self._vocab.unk)
        # Memory bottleneck:
        # For instance (16101,52,300) ~= 32GB.
        # [(16786, 52, 1), (16786, 52, 100)]
        # TODO: Measure memory consumption here and check if it is in sensible range
        # or maybe introduce some control in Retrieval?
        defs_emb = self._def_lookup.apply(defs)
        application_call.add_auxiliary_variable(
            unk_ratio(defs, def_mask, self._vocab.unk),
            name='def_unk_ratio')

        if self._translate:
            logger.info("Translating in MeanPoolReadDefinitions")
            # Translate. Crucial for recovering useful information from embeddings
            defs_emb = self._def_translate.apply(defs_emb)

        def_emb_mask = def_mask[:, :, None]
        defs_emb = (def_emb_mask * defs_emb).sum(axis=1)
        if self._normalize:
            defs_emb = defs_emb / def_emb_mask.sum(axis=1)

        return defs_emb
Example no. 27
    def output_probabilistic(self, m_x, v_x):

        m_linear = T.dot(m_x, self.m_W[ 0, :, : ]) + T.tile(self.m_b[ 0, :, : ], [ m_x.shape[ 0 ], 1 ])
        v_linear = T.dot(m_x**2, self.v_W[ 0, :, : ]) + T.dot(v_x, self.m_W[ 0, :, : ]**2) + T.dot(v_x, self.v_W[ 0, :, : ]) + \
            T.tile(self.v_b[ 0, :, : ], [ m_x.shape[ 0 ], 1 ])

        if not self.output_layer:

            # We compute the mean and variance after the ReLU activation

            alpha = m_linear / T.sqrt(v_linear)
            gamma = Network_layer.gamma(-alpha)
            gamma_robust = -alpha - 1.0 / alpha + 2.0 / alpha**3
            gamma_final = T.switch(T.lt(-alpha, T.fill(alpha, 30)), gamma, gamma_robust)

            v_aux = m_linear + T.sqrt(v_linear) * gamma_final

            m_a = Network_layer.n_cdf(alpha) * v_aux
            v_a = m_a * v_aux * Network_layer.n_cdf(-alpha) + Network_layer.n_cdf(alpha) * v_linear * (1 - gamma_final * (gamma_final + alpha))

            return (m_a, v_a)

        else:

            return (m_linear, v_linear)
Example no. 28
def normal_lcdf(mu, sigma, x):
    z = (x - mu) / sigma
    return tt.switch(
        tt.lt(z, -1.0),
        tt.log(tt.erfcx(-z / tt.sqrt(2.)) / 2.) - tt.sqr(z) / 2,
        tt.log1p(-tt.erfc(z / tt.sqrt(2.)) / 2.)
    )
Example no. 29
    def apply(self, v, **kwargs):
        input = v.output

        z = T.mean(input)
        stdev = T.std(input)

        nv = vcopy(v)
        histogram = []
        buckets = self.get_buckets()
        for beg, end in buckets:
            a = T.ge(input, beg)
            b = T.lt(input, end)
            percent = T.sum(a * b) / T.prod(input.shape).astype(floatX)
            histogram.append(percent)

        r = {
            'name': self.name,
            'mean': z,
            'stdev': stdev,
            'histogram': histogram
        }
        if 'activation_monitoring' in nv:
            nv.activation_monitoring.append(r)
        else:
            nv.activation_monitoring = [r]
        return self.post_apply(nv, **kwargs)
Example no. 30
 def get_embed_sampled(embed_tensor, sample=False):
     if not sample:
         randomization = 0.5
     else:
         randomization = theano_rng.uniform(size=embed_tensor.shape)
     return T.switch(T.lt(randomization, embed_tensor), 1.0,
                     0.0)  #(val,dim)
Example no. 31
    def piecewisePooling_feed(self, new_input):
        # mentions_batch = n_sentences x 1 x 88 x 60
        # e1i_batch, e2i_batch = n_sentences x 1
        mentions_batch, e1i_batch, e2i_batch = new_input
        # conv
        # input = n_sentences x 1 x 88 x 60
        # filter = 230 x 1 x 3 x 60
        self.conv_out = conv.conv2d(input=mentions_batch,
                                    filters=self.W,
                                    filter_shape=self.filter_shape,
                                    image_shape=self.image_shape)

        # conv_out = n_sentences x 230 x 86 x 1

        # nonlinear_out = n_sentences x 230 x 86 x 1
        if self.non_linear.lower() == "tanh":
            # b is 0
            self.nonlinear_out = T.tanh(self.conv_out +
                                        self.b.dimshuffle('x', 0, 'x', 'x'))
        elif self.non_linear.lower() == "relu":
            self.nonlinear_out = T.nnet.relu(
                self.conv_out + self.b.dimshuffle('x', 0, 'x', 'x'))
        else:
            raise NotImplementedError

        # pooling
        # filter_h = 3
        filter_h = self.filter_shape[2]
        # n_pad_head = 4
        n_pad_head = conf.getint('settings', 'max_filter_h') - 1
        assert n_pad_head == 4
        # numpy.floor rounds down
        # in any case, the shift is 3 positions
        idx_shift = n_pad_head - int(numpy.floor(
            filter_h / 2))  # how much e1i, e2i (before padding) are shifted after padding and convolution

        e1i_conved = e1i_batch + idx_shift
        e2i_conved = e2i_batch + idx_shift

        # get, for each sentence, (left entity position + 1) and (right entity position + 1)
        [m_seg2_st_batch, m_seg3_st_batch], _ = \
            theano.scan(fn=lambda e1i, e2i: ifelse(T.lt(e1i, e2i), (e1i + 1, e2i + 1), (e2i + 1, e1i + 1)),
                        sequences=[e1i_conved, e2i_conved])

        nonlinear_out_3d = self.nonlinear_out.flatten(3)

        def piecewise_pooling(conved_m, m_seg2_st, m_seg3_st):
            seg1_out = T.max(conved_m[:, :m_seg2_st], axis=1)
            seg2_out = T.max(conved_m[:, m_seg2_st:m_seg3_st], axis=1)
            seg3_out = T.max(conved_m[:, m_seg3_st:], axis=1)
            return T.transpose(T.stack(
                (seg1_out, seg2_out, seg3_out))).flatten()

        # returns a 230 x 3 vector for each sentence
        pooling_2d, _ = theano.scan(
            fn=piecewise_pooling,
            sequences=[nonlinear_out_3d, m_seg2_st_batch, m_seg3_st_batch])

        self.input = new_input
        self.output = pooling_2d
Example no. 32
def irprop_minus_updates(params, grads):

    # IRPROP- parameters
    updates = []
    deltas = 0.1 * numpy.ones(len(params), theano.config.floatX)
    last_params = params

    positiveStep = 1.2
    negativeStep = 0.5
    maxStep = 50  #1.
    minStep = math.exp(-6)

    for param, gparam, delta, last_gparam in zip(params, grads, deltas,
                                                 last_params):
        # calculate change
        change = T.sgn(gparam * last_gparam)
        if T.gt(change, 0):
            delta = T.minimum(delta * positiveStep, maxStep)

        elif T.lt(change, 0):
            delta = T.maximum(delta * negativeStep, minStep)

            last_gparam = 0
        delta = delta.astype('float32')

        # update the weights
        updates.append((param, param - T.sgn(gparam) * delta))
        # store old change
        last_gparam = gparam

    return updates
Example no. 33
def _span_sums(stt, end, p_lens, max_p_len, batch_size, dim, max_ans_len):
  # Sum of every start element and corresponding max_ans_len end elements.
  #
  # stt     (max_p_len, batch_size, dim)
  # end     (max_p_len, batch_size, dim)
  # p_lens  (batch_size,)
  max_ans_len_range = tt.shape_padleft(tt.arange(max_ans_len))          # (1, max_ans_len)
  offsets = tt.shape_padright(tt.arange(max_p_len))                     # (max_p_len, 1)
  end_idxs = max_ans_len_range + offsets                                # (max_p_len, max_ans_len)
  end_idxs_flat = end_idxs.flatten()                                    # (max_p_len*max_ans_len,)

  end_padded = tt.concatenate(                                          # (max_p_len+max_ans_len-1, batch_size, dim)
    [end, tt.zeros((max_ans_len-1, batch_size, dim))], axis=0)    
  end_structured = end_padded[end_idxs_flat]                            # (max_p_len*max_ans_len, batch_size, dim)
  end_structured = end_structured.reshape(                              # (max_p_len, max_ans_len, batch_size, dim)
    (max_p_len, max_ans_len, batch_size, dim))
  stt_shuffled = stt.dimshuffle((0,'x',1,2))                            # (max_p_len, 1, batch_size, dim)

  span_sums = stt_shuffled + end_structured                             # (max_p_len, max_ans_len, batch_size, dim)
  span_sums_reshaped = span_sums.dimshuffle((2,0,1,3)).reshape(         # (batch_size, max_p_len*max_ans_len, dim)
    (batch_size, max_p_len*max_ans_len, dim))

  p_lens_shuffled = tt.shape_padright(p_lens)                           # (batch_size, 1)
  end_idxs_flat_shuffled = tt.shape_padleft(end_idxs_flat)              # (1, max_p_len*max_ans_len)

  span_masks_reshaped = tt.lt(end_idxs_flat_shuffled, p_lens_shuffled)  # (batch_size, max_p_len*max_ans_len)
  span_masks_reshaped = cast_floatX(span_masks_reshaped)

  # (batch_size, max_p_len*max_ans_len, dim), (batch_size, max_p_len*max_ans_len)
  return span_sums_reshaped, span_masks_reshaped
Example no. 34
    def _activation(self, Y, L, M, W):
        """Returns the activation for a given input.

        Derived from the generative model formulation of hierarchical
        Poisson mixtures, the formular for the activation in the network
        reads as follows:
        I_c =
         \sum_d \log(W_{cd})y_d + \log(M_{lc})        for labeled data
         \sum_d \log(W_{cd})y_d + \log(\sum_k M_{kc}) for unlabeled data
        s_c = softmax(I_c)
        """
        # first: complete inference to find label
        # Input integration:
        I = T.tensordot(Y, T.log(W), axes=[1, 1])
        # recurrent term:
        vM = M[L]
        L_index = T.eq(L, -1).nonzero()
        vM = T.set_subtensor(vM[L_index], T.sum(M, axis=0))
        # numeric trick to prevent overflow in the exp-function
        max_exponent = 86. - T.ceil(T.log(I.shape[1].astype('float32')))
        scale = T.switch(T.gt(T.max(I, axis=1, keepdims=True), max_exponent),
                         T.max(I, axis=1, keepdims=True) - max_exponent, 0.)
        # numeric approximation to prevent underflow in the exp-function:
        # map too low values of I to a fixed minimum value
        min_exponent = -87. + T.ceil(T.log(I.shape[1].astype('float32')))
        I = T.switch(T.lt(I - scale, min_exponent), scale + min_exponent, I)
        # activation: recurrent softmax with overflow protection
        s = vM * T.exp(I - scale) / T.sum(
            vM * T.exp(I - scale), axis=1, keepdims=True)
        return s
Example no. 35
    def __init__(self,
                 f,
                 θs,
                 α=0.001,
                 β1=0.9,
                 β2=0.999,
                 β3=0.999,
                 k=0.1,
                 K=10.,
                 ε=1e-8,
                 dec=0.):
        α, β1, β2, β3, ε, dec = [
            np.cast[floatX](h) for h in [α, β1, β2, β3, ε, dec]
        ]

        t = theano.shared(0, name="t")
        t_u = (t, t + 1)

        f_prev = theano.shared(np.cast[floatX](0), name="f_prev")

        ch_fact_lbound = T.switch(T.gt(f, f_prev), 1 + k, 1 / (1 + K))
        ch_fact_ubound = T.switch(T.gt(f, f_prev), 1 + K, 1 / (1 + k))
        f_ch_fact = f / f_prev
        f_ch_fact = T.switch(T.lt(f_ch_fact, ch_fact_lbound), ch_fact_lbound,
                             f_ch_fact)
        f_ch_fact = T.switch(T.gt(f_ch_fact, ch_fact_ubound), ch_fact_ubound,
                             f_ch_fact)
        f_hat = T.switch(T.gt(t_u[1], 1), f_prev * f_ch_fact, f)
        f_u = (f_prev, f_hat)

        self.ms = [
            theano.shared(np.zeros(θ.shape.eval(), dtype=floatX),
                          borrow=True,
                          name="m") for θ in θs
        ]
        self.vs = [
            theano.shared(np.zeros(θ.shape.eval(), dtype=floatX),
                          borrow=True,
                          name="v") for θ in θs
        ]

        d = theano.shared(one, name="d")
        d_den = T.switch(T.gt(f_hat, f_prev), f_prev, f_hat)
        d_t = (β3 * d) + (one - β3) * T.abs_((f_hat - f_prev) / d_den)
        d_t = T.switch(T.gt(t_u[1], one), d_t, one)
        d_u = (d, d_t)

        gs = T.grad(f, θs)

        m_us = [(m, β1 * m + (one - β1) * g) for m, g in zip(self.ms, gs)]
        m_hats = [m_u[1] / (one - T.pow(β1, t_u[1])) for m_u in m_us]

        v_us = [(v, β2 * v + (one - β2) * T.sqr(g))
                for v, g in zip(self.vs, gs)]
        v_hats = [v_u[1] / (one - T.pow(β2, t_u[1])) for v_u in v_us]

        θ_us = [(θ, θ - (α / (one + (t_u[1] * dec))) * m_hat /
                 ((T.sqrt(v_hat) * d_t) + ε))
                for θ, m_hat, v_hat in zip(θs, m_hats, v_hats)]
        self.updates = m_us + v_us + [t_u, f_u, d_u] + θ_us
Example no. 36
    def __init__(self, inverse_scale=1.0):
        """Constructor.

        Parameters
        ----------
        * `inverse_scale` [float]:
            The inverse scale.
        """
        super(Exponential, self).__init__(inverse_scale=inverse_scale)

        # pdf
        self.pdf_ = T.switch(
            T.lt(self.X, 0.), 0.,
            self.inverse_scale * T.exp(-self.inverse_scale * self.X)).ravel()
        self._make(self.pdf_, "pdf")

        # -log pdf
        self.nll_ = bound(
            -T.log(self.inverse_scale) + self.inverse_scale * self.X, np.inf,
            self.inverse_scale > 0.).ravel()
        self._make(self.nll_, "nll")

        # cdf
        self.cdf_ = (1. - T.exp(-self.inverse_scale * self.X)).ravel()
        self._make(self.cdf_, "cdf")

        # ppf
        self.ppf_ = -T.log(1. - self.p) / self.inverse_scale
        self._make(self.ppf_, "ppf", args=[self.p])
Example no. 37
    def negativeLogLikelihood(self, y, weightPerClass):
        # Used in training.
        # param y: y = T.itensor4('y'). Dimensions [batchSize, r, c, z]
        # weightPerClass is a vector with 1 element per class.

        #Weighting the cost of the different classes in the cost-function, in order to counter class imbalance.
        e1 = np.finfo(np.float32).tiny
        addTinyProbMatrix = T.lt(self.p_y_given_x_train, 4 * e1) * e1

        weightPerClassBroadcasted = weightPerClass.dimshuffle(
            'x', 0, 'x', 'x', 'x')
        log_p_y_given_x_train = T.log(
            self.p_y_given_x_train + addTinyProbMatrix
        )  # add a tiny value so the probability does not go to zero and cause NaN problems again
        weighted_log_p_y_given_x_train = log_p_y_given_x_train * weightPerClassBroadcasted
        # return -T.mean( weighted_log_p_y_given_x_train[T.arange(y.shape[0]), y] )

        # Not a very elegant way to do the indexing but oh well...
        indexDim0 = T.arange(
            weighted_log_p_y_given_x_train.shape[0]).dimshuffle(
                0, 'x', 'x', 'x')
        indexDim2 = T.arange(
            weighted_log_p_y_given_x_train.shape[2]).dimshuffle(
                'x', 0, 'x', 'x')
        indexDim3 = T.arange(
            weighted_log_p_y_given_x_train.shape[3]).dimshuffle(
                'x', 'x', 0, 'x')
        indexDim4 = T.arange(
            weighted_log_p_y_given_x_train.shape[4]).dimshuffle(
                'x', 'x', 'x', 0)
        return -T.mean(weighted_log_p_y_given_x_train[indexDim0, y, indexDim2,
                                                      indexDim3, indexDim4])
Example no. 38
def relevance_conv_a_b_abs(inputs, weights, out_relevances, a, b, bias=None):
    assert a is not None
    assert b is not None
    assert a - b == 1
    weights_plus = weights * T.gt(weights, 0)
    weights_neg = weights * T.lt(weights, 0)

    plus_norm = conv2d(T.abs_(inputs), weights_plus)
    # stabilize, prevent division by 0
    eps = 1e-4
    plus_norm += T.eq(plus_norm, 0) * eps
    plus_rel_normed = out_relevances / plus_norm
    in_rel_plus = conv2d(plus_rel_normed, weights_plus.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full")
    in_rel_plus *= T.abs_(inputs)

    # minuses to get positive outputs, since will be subtracted
    # at end of function
    neg_norm = -conv2d(T.abs_(inputs), weights_neg)
    neg_norm += T.eq(neg_norm, 0) * eps
    neg_rel_normed = out_relevances / neg_norm
    in_rel_neg = -conv2d(neg_rel_normed, weights_neg.dimshuffle(1, 0, 2, 3)[:, :, ::-1, ::-1], border_mode="full")
    in_rel_neg *= T.abs_(inputs)

    in_relevance = a * in_rel_plus - b * in_rel_neg
    return in_relevance
Example no. 39
    def get_hid(self, X, X_embedded):
        """ Get the hidden layers for the recognition RNN given a batch of N sentences

        :param X:               (N x max(L)) matrix representing the text
        :param X_embedded:      (N x max(L) x E) tensor representing the embedded text

        :return:                rnn_depth -dimensional list of hidden states for the recognition RNN
        """

        # If x is less or equal than 0 then return 0, else 1 (exclude unused words)
        mask = T.switch(T.lt(X, 0), 0, 1)                                           # N x max(L)

        h_prev = X_embedded                                                         # N x max(L) x E

        all_h = []

        for h in range(len(self.rnn)):

            h_prev = self.rnn[h].get_output_for([h_prev, mask])                     # N x max(L) x dim(hid)

            all_h.append(h_prev[:, -1])

        hid = T.concatenate(all_h, axis=-1)

        return hid
Example no. 40
def test_elemwise_composite_float64():
    # Test that we don't fuse a composite elemwise with float64 somewhere inside:
    # nvcc by default downcasts them to float32. We would need to tell it not
    # to do so, but that is possible only on some devices.
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
    bv = numpy.ones((4, 4), dtype='float32')

    def get_all_basic_scalar(composite_op):
        l = []
        for i in composite_op.fgraph.toposort():
            if isinstance(i, theano.scalar.Composite):
                l += get_all_basic_scalar(i)
            else:
                l.append(i)
        return l
    for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'),
                 mode_with_gpu.excluding('elemwise_fusion')]:
        f = pfunc([a, b],
                  tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2,
                                               b),
                                     'float32'), mode=mode)

        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))
        for node in f.maker.fgraph.toposort():
            if isinstance(node.op, cuda.GpuElemwise):
                if isinstance(node.op.scalar_op, theano.scalar.Composite):
                    scals = get_all_basic_scalar(node.op.scalar_op)
                    for s in scals:
                        assert not any([i.type.dtype == 'float64'
                                        for i in s.inputs + s.outputs])
Example no. 41
def VINormal(dim, const_str, const_fx, K, nfit=30000):
    """\
    Normal (full-rank) sampling, fit with ADVI to a
    high-potential probability distribution.

    :input dim:       The dimensionality
    :input const_str: Constraint strings; used to define potentials
    :input const_fx:  Constraint callables, included for API compatibility
    :input K:         Number of points to sample
    :input nfit:      Number of gradient iterations for variational inference

    :returns: A set of points X drawn from N(μ, Σ), where the parameters are fit
              by variational inference to match the potential distribution formed
              by the potentials -c*g_i, for c=7500.
    """
    with pm.Model() as mod:
        x = pm.Uniform('x', shape=dim)
        for i, const in enumerate(const_str):
            cname = 'g%d' % i
            g = pm.Deterministic(cname, eval(const, {'__builtins__': None}, {'x': x } ))
            pname = '%s_pot' % cname
            pm.Potential(pname, tt.switch(tt.lt(g, 0), 7500*g, 0))
        fit_res = pm.fit(nfit, method='fullrank_advi', obj_n_mc=3)
        trace = fit_res.sample(K)
    return trace['x']
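
A hypothetical invocation sketch (the constraint string below is only an illustration; running it requires PyMC3 and performs the full ADVI fit):

# Sample 500 points whose density is pushed away from the region where x[0] + x[1] - 1 < 0.
X = VINormal(dim=2, const_str=['x[0] + x[1] - 1.0'], const_fx=None, K=500, nfit=5000)
print(X.shape)  # (500, 2)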
Example no. 42
    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = ifelse(
            T.lt(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        mse_for_each_sample = T.mean(
            (network_output - prediction_func) ** 2,
            axis=1
        )

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        J = compute_jaccobian(mse_for_each_sample, params)
        n_params = J.shape[1]

        updated_params = param_vector - T.nlinalg.matrix_inverse(
            J.T.dot(J) + new_mu * T.eye(n_params)
        ).dot(J.T).dot(mse_for_each_sample)

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates
Example no. 43
 def grad(self, args, g_outs):
     return [
         T.switch(
             T.or_(T.lt(g_out, self.lower_bound),
                   T.gt(g_out, self.upper_bound)),
             T.cast(0, dtype=g_out.dtype), g_out) for g_out in g_outs
     ]
Example no. 44
def relevance_conv_a_b_abs(inputs, weights, out_relevances, a,b, bias=None):
    assert a is not None
    assert b is not None
    assert a - b == 1
    weights_plus = weights * T.gt(weights, 0)
    weights_neg = weights * T.lt(weights, 0)
    
    plus_norm = conv2d(T.abs_(inputs), weights_plus)
    # stabilize, prevent division by 0
    eps=1e-4
    plus_norm += (T.eq(plus_norm,0) * eps)
    plus_rel_normed = out_relevances / plus_norm
    in_rel_plus = conv2d(plus_rel_normed, 
          weights_plus.dimshuffle(1,0,2,3)[:,:,::-1,::-1], 
          border_mode='full')
    in_rel_plus *= T.abs_(inputs)
    
    # minuses to get positive outputs, since will be subtracted
    # at end of function
    neg_norm = -conv2d(T.abs_(inputs), weights_neg)
    neg_norm += (T.eq(neg_norm,0) * eps)
    neg_rel_normed = out_relevances / neg_norm
    in_rel_neg = -conv2d(neg_rel_normed, 
          weights_neg.dimshuffle(1,0,2,3)[:,:,::-1,::-1], 
          border_mode='full')
    in_rel_neg *= T.abs_(inputs)

    in_relevance = a * in_rel_plus - b * in_rel_neg
    return in_relevance
Example no. 45
def test_elemwise_composite_float64():
    # Test that we don't fuse a composite elemwise with float64 somewhere inside:
    # nvcc by default downcasts them to float32. We would need to tell it not
    # to do so, but that is possible only on some devices.
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
    bv = numpy.ones((4, 4), dtype='float32')

    def get_all_basic_scalar(composite_op):
        l = []
        for i in composite_op.env.toposort():
            if isinstance(i, theano.scalar.Composite):
                l += get_all_basic_scalar(i)
            else:
                l.append(i)
        return l
    for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'),
                 mode_with_gpu.excluding('elemwise_fusion')]:
        f = pfunc([a, b],
                  tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2,
                                               b),
                                     'float32'), mode=mode)

        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))
        for node in f.maker.env.toposort():
            if isinstance(node.op, cuda.GpuElemwise):
                if isinstance(node.op.scalar_op, theano.scalar.Composite):
                    scals = get_all_basic_scalar(node.op.scalar_op)
                    for s in scals:
                        assert not any([i.type.dtype == 'float64'
                                        for i in s.inputs + s.outputs])
Example no. 46
def angle_axis_to_rotation_matrix(angle_axis):
    n = T.sqrt(T.sum(angle_axis**2))

    def aa2R():
        angle_axis_normalized = angle_axis / n
        x = angle_axis_normalized[0]
        y = angle_axis_normalized[1]
        z = angle_axis_normalized[2]
        s, c = T.sin(n), T.cos(n)
        R = T.zeros((3, 3), dtype=angle_axis.dtype)
        R = T.set_subtensor(R[0, 0], x * x + (1 - x * x) * c)
        R = T.set_subtensor(R[0, 1], x * y * (1 - c) - z * s)
        R = T.set_subtensor(R[0, 2], x * z * (1 - c) + y * s)

        R = T.set_subtensor(R[1, 0], x * y * (1 - c) + z * s)
        R = T.set_subtensor(R[1, 1], y * y + (1 - y * y) * c)
        R = T.set_subtensor(R[1, 2], y * z * (1 - c) - x * s)

        R = T.set_subtensor(R[2, 0], x * z * (1 - c) - y * s)
        R = T.set_subtensor(R[2, 1], z * y * (1 - c) + x * s)
        R = T.set_subtensor(R[2, 2], z * z + (1 - z * z) * c)
        return R

    return th.ifelse.ifelse(T.lt(n, .0001), T.eye(3, dtype=angle_axis.dtype),
                            aa2R())
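
A brief usage sketch of the conversion above (not from the original source; it assumes th is theano and T is theano.tensor, as the snippet implies):

import numpy as np
import theano
import theano.tensor as T

aa = T.dvector('aa')
to_rotation = theano.function([aa], angle_axis_to_rotation_matrix(aa))

R = to_rotation(np.array([0., 0., np.pi / 2]))  # 90 degrees about the z axis
print(np.allclose(R, [[0., -1., 0.], [1., 0., 0.], [0., 0., 1.]]))  # expected: True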
Example no. 47
def MCMC(dim, const_str, const_fx, K, chains=3, cores=3):
    """\
    MCMC sampling, with potentials lowering the probability
    of drawing failing points

    :input dim:       The dimensionality
    :input const_str: Constraint strings; used to define potentials
    :input const_fx:  Constraint callables, included for API compatibility
    :input K:         Number of points to sample
    :input chains:    Number of independent MCMC chains to run
    :input cores:     Number of CPU cores to run for parallelization

    :returns: A set of points X drawn from the potential -c*g_i; for c=[1, 10, 20].
              This involves three successive samplings, which should total K draws.

    """
    lambda_values = [1, 10, 20]
    k = int(K/(chains*len(lambda_values)))
    Xvals = list()
    for lam in lambda_values:
        with pm.Model() as mod:
            x = pm.Uniform('x', shape=dim)
            for i, const in enumerate(const_str):
                cname = 'g%d' % i
                g = pm.Deterministic(cname, eval(const, {'__builtins__': None}, {'x': x } ))
                pname = '%s_pot' % cname
                pm.Potential(pname, tt.switch(tt.lt(g, 0), lam*g, 0))
            trace = pm.sample(k, tune=1000, chains=chains, cores=cores)
        Xvals.append(trace['x'])
    return np.vstack(Xvals)
Example no. 48
def _best_path_decode(activations):
    """Calculate the CTC best-path decoding for a given activation sequence.
       In the returned matrix, shorter sequences are padded with -1s."""

    # For each timestep, get the highest output
    decoding = T.argmax(activations, axis=2)

    # prev_outputs[time][example] == decoding[time - 1][example]
    prev_outputs = T.concatenate([T.alloc(_BLANK, 1, decoding.shape[1]), decoding], axis=0)[:-1]

    # Filter all repetitions to zero (blanks are already zero)
    decoding = decoding * T.neq(decoding, prev_outputs)

    # Calculate how many blanks each sequence has relative to longest sequence
    blank_counts = T.eq(decoding, 0).sum(axis=0)
    min_blank_count = T.min(blank_counts, axis=0)
    max_seq_length = decoding.shape[0] - min_blank_count # used later
    padding_needed = blank_counts - min_blank_count

    # Generate the padding matrix by ... doing tricky things
    max_padding_needed = T.max(padding_needed, axis=0)
    padding_needed = padding_needed.dimshuffle('x',0).repeat(max_padding_needed, axis=0)
    padding = T.arange(max_padding_needed).dimshuffle(0,'x').repeat(decoding.shape[1],axis=1)
    padding = PADDING * T.lt(padding, padding_needed)

    # Apply the padding
    decoding = T.concatenate([decoding, padding], axis=0)

    # Remove zero values
    nonzero_vals = decoding.T.nonzero_values()
    decoding = T.reshape(nonzero_vals, (decoding.shape[1], max_seq_length)).T

    return decoding
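
A minimal sketch of compiling the decoder, assuming the module-level constants implied by the docstring and comments (these values are an assumption, not shown in the snippet):
# Assumed module constants, implied by the docstring/comments above:
#   _BLANK = 0    (blank label: "blanks are already zero")
#   PADDING = -1  ("shorter sequences are padded with -1s")
activations = T.tensor3('activations')    # (time, batch, classes)
decode = theano.function([activations], _best_path_decode(activations))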
Esempio n. 49
0
 def softmax(self, D, I):
   D = D * T.constant(self.attrs['sharpening'], 'float32')
   if self.attrs['norm'] == 'exp':
     E = T.exp(-D) * I
     E = E / T.maximum(T.sum(E,axis=0,keepdims=True),T.constant(1e-20,'float32'))
   elif self.attrs['norm'] == 'sigmoid':
     E = (numpy.float32(1) - T.tanh(D)**2) * I
   elif self.attrs['norm'] == 'lstm':
     n_out = self.attrs['template']
     def lstm(z, i_t, s_p, h_p):
       z += T.dot(h_p, self.N_re)
       i = T.outer(i_t, T.alloc(numpy.cast['int8'](1), n_out))
       ingate = T.nnet.sigmoid(z[:,n_out: 2 * n_out])
       forgetgate = T.nnet.sigmoid(z[:,2 * n_out:3 * n_out])
       outgate = T.nnet.sigmoid(z[:,3 * n_out:])
       input = T.tanh(z[:,:n_out])
       s_t = input * ingate + s_p * forgetgate
       h_t = T.tanh(s_t) * outgate
       return theano.gradient.grad_clip(s_t * i, -50, 50), h_t * i
     E, _ = theano.scan(lstm, sequences=[D,I], outputs_info=[T.zeros((n_out,), 'float32'), T.zeros((n_out,), 'int32')])
     E = T.nnet.sigmoid(T.dot(E,self.N_out))
   else:
     raise NotImplementedError()
   if self.attrs['nbest'] > 1:
     opt = T.minimum(self.attrs['nbest'], E.shape[0])
     score = (T.sort(E, axis=0)[-opt]).dimshuffle('x',0).repeat(E.shape[0],axis=0)
     E = T.switch(T.lt(E,score), T.zeros_like(E), E)
   return E
Esempio n. 50
0
def ALB_softmax_health_weighting(o, t, o2, health, v, alpha_0, beta_0, alpha_1, beta_1, d, tau_p, tau_n, unchosen_p, b,
                                 tau_p_w, tau_n_w, decay_w):

    # Without variance weighting
    b = 1. / b  # Convert inverse temperature to temperature

    unchosen_0 = T.switch(T.le(v, 0.5), 1, unchosen_p)
    unchosen_1 = T.switch(T.gt(v, 0.5), 1, unchosen_p)

    health = T.switch(T.lt(health, 0), 0, health)
    tau_p = T.switch(T.ge(tau_p, 0), tau_p * (1 - tau_p_w * health), tau_p * (1 - (1 + tau_p_w * health)))
    tau_n = T.switch(T.ge(tau_n, 0), tau_n * (1 - tau_n_w * health), tau_n * (1 - (1 + tau_n_w * health)))
    d = T.switch(T.ge(tau_p, 0), d * (1 - decay_w * health), d * (1 - (1 + decay_w * health)))

    # Only update if outcome isn't missing
    alpha_0 = T.switch(T.ge(o, 0), (1 - d) * alpha_0 + (o * tau_p * unchosen_0), alpha_0)
    beta_0 = T.switch(T.ge(o, 0), (1 - d) * beta_0 + ((1 - o) * tau_n * unchosen_0), beta_0)

    alpha_1 = T.switch(T.ge(o2, 0), (1 - d) * alpha_1 + (o2 * tau_p * unchosen_1), alpha_1)
    beta_1 = T.switch(T.ge(o2, 0), (1 - d) * beta_1 + ((1 - o2) * tau_n * unchosen_1), beta_1)

    value_0 = alpha_0 / (alpha_0 + beta_0)
    value_1 = alpha_1 / (alpha_1 + beta_1)

    value = ((value_0 - value_1) + 1) / 2.

    var_0 = (alpha_0 * beta_0) / (T.pow(alpha_0 + beta_0, 2) * (alpha_0 + beta_0 + 1))
    var_1 = (alpha_1 * beta_1) / (T.pow(alpha_1 + beta_1, 2) * (alpha_1 + beta_1 + 1))

    # Softmax over (value, 1 - value); T.exp keeps the expression symbolic
    value = T.exp(b * value) / (T.exp(b * value) + T.exp(b * (1 - value)))

    return (value, alpha_0, beta_0, alpha_1, beta_1, var_0, var_1, value_0, value_1, o, o2, unchosen_0, unchosen_1)
Esempio n. 51
0
def relevance_conv_z_b(out_relevances, inputs, weights, min_in, max_in, bias=None):
    # min_in / max_in can be symbolic or plain numbers, so no
    # assertions are made here
    if bias is not None:
        log.warning("Bias not respected for conv z_b")
    weights_b = T.lt(weights, 0) * weights * -max_in
    weights_b += T.gt(weights, 0) * weights * -min_in

    norms_for_relevances = conv2d(inputs, weights)
    norms_for_relevances += T.sum(weights_b, axis=(1,2,3)).dimshuffle(
        'x',0,'x','x')
    # prevent division by 0...
    norms_for_relevances += T.eq(norms_for_relevances, 0) * 1
    normed_relevances = out_relevances / norms_for_relevances
    # upconv data
    in_relevances_data = conv2d(normed_relevances, 
                           weights.dimshuffle(1,0,2,3)[:,:,::-1,::-1], 
                           border_mode='full')
    in_relevances_data *= inputs
    # upconv weight offsets to enforce positivity
    in_relevances_b = conv2d(normed_relevances, 
                           weights_b.dimshuffle(1,0,2,3)[:,:,::-1,::-1], 
                           border_mode='full')
    in_relevances = in_relevances_data + in_relevances_b
    return in_relevances
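
A minimal sketch of wiring this up symbolically (the tensor shapes in the comments are illustrative assumptions; conv2d is assumed to be imported as in the function above):
inputs = T.ftensor4('inputs')             # (batch, in_channels, h, w)
weights = T.ftensor4('weights')           # (out_channels, in_channels, kh, kw)
out_rel = T.ftensor4('out_relevances')    # shape of the convolution output
in_rel = relevance_conv_z_b(out_rel, inputs, weights, min_in=0., max_in=1.)
relevance_fn = theano.function([inputs, weights, out_rel], in_rel)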
Esempio n. 52
0
    def __init__(self, grads, p, b1, b2, alpha, epsilon=10e-8):

        # Perform Gradient Clipping
        grad_norm = grads.norm(L=2)
        grads = T.switch(T.lt(1.0, grad_norm), grads / grad_norm, grads)

        #self.L = L
        self.p = p
        self.b1 = b1
        self.b2 = b2
        self.alpha = alpha

        self.t = theano.shared(value=numpy.cast[theano.config.floatX](1.0))
        self.t_next = self.t + 1

        self.g = grads.astype(dtype=theano.config.floatX)
        self.m = theano.shared(value=numpy.zeros_like(
            p.get_value(), dtype=theano.config.floatX),
                               name='m',
                               borrow=True,
                               broadcastable=self.p.broadcastable)
        self.m_next = self.b1 * self.m + (1 - self.b1) * self.g
        self.v = theano.shared(value=numpy.zeros_like(
            p.get_value(), dtype=theano.config.floatX),
                               name='v',
                               borrow=True,
                               broadcastable=self.p.broadcastable)
        self.v_next = b2 * self.v + (1 - self.b2) * self.g * self.g
        self.m_ub = self.m / (1 - b1**self.t)
        self.v_ub = self.v / (1 - b2**self.t)
        self.update = self.p - alpha * self.m_ub / (T.sqrt(self.v_ub) +
                                                    epsilon)

        self.updates = [(self.t, self.t_next), (self.m, self.m_next),
                        (self.v, self.v_next), (self.p, self.update)]
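
A rough usage sketch; `AdamUpdater` is a hypothetical name standing in for the enclosing class, which is not shown above, and the toy loss is an illustration only:
p = theano.shared(numpy.zeros(10, dtype=theano.config.floatX), name='p')
loss = T.sum((p - 1.) ** 2)                                  # illustrative objective
grads = theano.grad(loss, p)

opt = AdamUpdater(grads, p, b1=0.9, b2=0.999, alpha=1e-3)    # hypothetical class name
train = theano.function([], loss, updates=opt.updates)
for _ in range(100):
    train()                                                  # p moves towards 1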
Esempio n. 53
0
    def convert_method(self, method_string):

        if method_string == 'sigmoid':
            return Tensor.nnet.sigmoid

        elif method_string == 'tanh':
            return Tensor.tanh

        elif method_string == 'scaled_tanh':
            return lambda x: 1.7159 * Tensor.tanh(0.66 * x)

        elif method_string == 'soft_sigmoid':
            return soft_sigmoid

        elif method_string == 'relu':
            return lambda x: x * (x > 0)

        elif method_string == 'relu2':
            return lambda x: Tensor.switch(Tensor.lt(x, -1), -1, x) * Tensor.switch(Tensor.gt(x, 1), 1, x) / x

        elif method_string == 'leakyrelu':
            return lambda x: x * (x > 0) + 0.01 * x * (x < 0)

        elif method_string == 'shiftedrelu':
            return lambda x: x * (x > -1)

        elif method_string == 'hard_sigmoid':
            return Tensor.nnet.hard_sigmoid

        elif method_string == 'none':
            return lambda x: x

        else:
            raise Exception('method unknown')
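
A small usage sketch, where `net` is a hypothetical instance of the enclosing class (its name is not shown above):
act = net.convert_method('leakyrelu')            # `net` is a hypothetical instance
x = Tensor.vector('x')
f = theano.function([x], act(x), allow_input_downcast=True)
f([-2., 0., 3.])                                 # -> approximately [-0.02, 0., 3.]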
Esempio n. 54
0
 def loop(idx):
     # A Python `if` cannot branch on a symbolic comparison such as
     # T.lt(idx, 0), so wrap the index into [0, size) with T.switch instead
     idx = T.switch(T.lt(idx, 0), size + idx, idx)
     idx = T.switch(T.ge(idx, size), idx - size, idx)
     return idx
Esempio n. 55
0
    def _editdist(s, t):
        """
        Levenshtein's edit distance function
        :param s: vector, source string
        :param t: vector, target string
        :return:  edit distance, scalar
        """
        def update(x, previous_row):
            current_row = previous_row + 1
            current_row = tensor.set_subtensor(
                current_row[1:],
                tensor.minimum(
                    current_row[1:],
                    tensor.add(previous_row[:-1], tensor.neq(target, x))))
            current_row = tensor.set_subtensor(
                current_row[1:],
                tensor.minimum(current_row[1:], current_row[0:-1] + 1))
            return current_row

        source, target = ifelse(tensor.lt(s.shape[0], t.shape[0]), (t, s),
                                (s, t))
        previous_row = tensor.arange(target.size + 1,
                                     dtype=theano.config.floatX)
        result, updates = theano.scan(fn=update,
                                      sequences=source,
                                      outputs_info=previous_row,
                                      name='editdist')
        return result[-1, -1]
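
A minimal sketch of compiling the distance, assuming `_editdist` is reachable at module scope (above it is defined as a nested helper):
s, t = tensor.ivectors('s', 't')
dist = theano.function([s, t], _editdist(s, t), allow_input_downcast=True)
dist([1, 2, 3], [1, 3])      # -> 1.0: one deletion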
Esempio n. 56
0
def rprop_core(params,
               gradients,
               rprop_increase=1.01,
               rprop_decrease=0.99,
               rprop_min_step=0,
               rprop_max_step=100,
               learning_rate=0.01):
    """
    Rprop optimizer.
    See http://sci2s.ugr.es/keel/pdf/algorithm/articulo/2003-Neuro-Igel-IRprop+.pdf.
    """
    for param, grad in zip(params, gradients):
        grad_tm1 = theano.shared(np.zeros_like(param.get_value()),
                                 name=param.name + '_grad')
        step_tm1 = theano.shared(np.zeros_like(param.get_value()) +
                                 learning_rate,
                                 name=param.name + '_step')

        test = grad * grad_tm1
        same = T.gt(test, 0)
        diff = T.lt(test, 0)
        step = T.minimum(
            rprop_max_step,
            T.maximum(
                rprop_min_step,
                step_tm1 * (T.eq(test, 0) + same * rprop_increase +
                            diff * rprop_decrease)))
        grad = grad - diff * grad
        yield param, param - T.sgn(grad) * step
        yield grad_tm1, grad
        yield step_tm1, step
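
Because rprop_core yields (shared variable, new value) pairs, its output can be passed directly as theano.function updates; a minimal sketch with an illustrative objective:
W = theano.shared(np.zeros(5, dtype=theano.config.floatX), name='W')
x = T.vector('x')
loss = T.sum((W - x) ** 2)                                   # illustrative objective
updates = list(rprop_core([W], theano.grad(loss, [W]), learning_rate=0.01))
train = theano.function([x], loss, updates=updates)
for _ in range(10):
    train(np.ones(5, dtype=theano.config.floatX))            # W creeps towards the all-ones target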
Esempio n. 57
0
    def test_ifelse(self):
        config1 = theano.config.profile
        config2 = theano.config.profile_memory

        try:
            theano.config.profile = True
            theano.config.profile_memory = True

            a, b = T.scalars('a', 'b')
            x, y = T.scalars('x', 'y')

            z = ifelse(T.lt(a, b), x * 2, y * 2)

            p = theano.ProfileStats(False)

            if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
                m = "FAST_RUN"
            else:
                m = None

            f_ifelse = theano.function([a, b, x, y], z, profile=p, name="test_ifelse",
                                       mode=m)

            val1 = 0.
            val2 = 1.
            big_mat1 = 10
            big_mat2 = 11

            out = f_ifelse(val1, val2, big_mat1, big_mat2)

        finally:
            theano.config.profile = config1
            theano.config.profile_memory = config2
Esempio n. 58
0
 def get_output_for(self, input, deterministic=False, **kwargs):
     if deterministic:
         # Test time: scale by the keep probability p
         return self.p * input
     else:
         # Train time: keep the entire input with probability p,
         # otherwise return zeros (all-or-nothing dropout on the whole layer)
         return theano.ifelse.ifelse(
             T.lt(self._srng.uniform((1, ), 0, 1)[0], self.p), input,
             T.zeros(input.shape))
Esempio n. 59
0
 def apply(self , src , mask_length , tgt):
     """
         viterbi algorithm
     """
     result , updates = theano.scan(
         fn = self.train_step,
         sequences = src,
         outputs_info = [self.A_start, None] ,
         non_sequences = self.A ,
         n_steps = mask_length
     )
     # the score of best path
     best_path_score = result[0][-1].max()
     idx = T.argmax(result[0][-1])
     #backtracking
     res2 , _ = theano.scan(
         fn = lambda dps , idx , idx2 : [dps[idx] , idx],
         sequences = result[1][::-1],
         outputs_info = [idx , idx],
         n_steps = mask_length
     )
     # the path of best score
     best_path = res2[1]
     #if len(best_path) < seq_len:
     #    best_path.extend((seq_len - len(best_path)) * [2])
     # the score of tgt path
     tgt_score = self.decode(src , mask_length , tgt)
     # max_margin
     max_margin = T.sum(T.neq(tgt[:mask_length] , best_path))
     cost = best_path_score + max_margin - tgt_score
     return T.switch(T.lt(cost , T.alloc(numpy.float32(0.)))
                     , T.alloc(numpy.float32(0.))
                     , cost
                     ),best_path
Esempio n. 60
0
  def __init__(self, config, loss, params):
    self._lr = get_shared_floatX(config.learning_rate, 'lr')
    self._t = get_shared_floatX(1, 't')
    self._all_m_tm1 = []
    self._all_v_tm1 = []
    self._updates = [(self._t, self._t + 1)]

    if config.lr_decay:
      lr_coef = tt.pow(config.lr_decay, (self._t - 1) // config.lr_decay_freq)
      self._updates.append((self._lr, lr_coef * config.learning_rate))

    grads = theano.grad(loss, params)
    #grads = theano.grad(loss, params, disconnected_inputs='ignore')

    self._global_grad_norm = tt.sqrt(tt.sum(tt.stack([tt.sum(g**2.) for g in grads])))
    if config.max_grad_norm:
      global_clip_factor = ifelse(tt.lt(self._global_grad_norm, config.max_grad_norm),
        cast_floatX_np(1.),
        cast_floatX(config.max_grad_norm/self._global_grad_norm))
      # global_clip_factor = tt.minimum(cast_floatX(config.max_grad_norm/self._global_grad_norm), cast_floatX_np(1))
      grads = [global_clip_factor * g for g in grads]

    lr_t = self._lr * \
      clip_sqrt(1 - tt.pow(config.adam_beta2, self._t)) / (1 - tt.pow(config.adam_beta1, self._t))

    for p, g in zip(params, grads):
      m_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_m_' + p.name)
      v_tm1 = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_v_' + p.name)
      self._all_m_tm1.append(m_tm1)
      self._all_v_tm1.append(v_tm1)
      m_t = config.adam_beta1 * m_tm1 + (1-config.adam_beta1) * g
      v_t = config.adam_beta2 * v_tm1 + (1-config.adam_beta2) * tt.sqr(g)
      delta_t = -lr_t * m_t / (clip_sqrt(v_t) + config.adam_eps)
      p_t = p + delta_t
      self._updates += [(m_tm1, m_t), (v_tm1, v_t), (p, p_t)]