Example #1
0
    def make_w_updates(self, loss, params):
        """Build the symbolic update dictionary for one optimisation step.

        A snapshot ("tilde") copy of every parameter is kept in its own
        shared variable, and ``loss`` is cloned onto those snapshots so the
        gradient at the snapshot can be evaluated next to the gradient at
        the current parameters.

        :param loss: symbolic loss expressed in terms of ``params``.
        :param params: shared variables to optimise.
        :return: ``OrderedDict`` mapping shared variables to their new
            symbolic values (parameters, snapshots, the gradient counter,
            ``self.L`` when ``self.adaptive`` is set, and the iteration
            counter).
        """
        w_updates = OrderedDict()

        # Snapshot parameters start from the current values; the cloned loss
        # is the same graph with every parameter swapped for its snapshot.
        snapshots = [theano.shared(p.get_value()) for p in params]
        snapshot_loss = theano.clone(loss, replace=zip(params, snapshots))

        current_grads = theano.grad(loss, params)
        snapshot_grads = theano.grad(snapshot_loss, snapshots)

        # int16 iteration counter; updates are expressed against the
        # incremented value.
        it_num = theano.shared(np.cast['int16'](0))
        it = it_num + 1

        per_param = zip(params, current_grads, self.mu, snapshots, snapshot_grads)
        for p, g, mu, p_snap, g_snap in per_param:
            # Step along (grad - snapshot grad + mu), scaled by 1 / L.
            stepped = p - (1. / self.L) * (g - g_snap + mu)
            w_updates[p] = stepped

            # The snapshot takes the new value whenever the incremented
            # counter is a multiple of self.m; otherwise it is left alone.
            refresh = T.eq(it % self.m, 0)
            w_updates[p_snap] = ifelse(refresh, stepped, p_snap)

            # Registered inside the loop, so it is absent when params is empty.
            w_updates[self.counted_gradient] = self.counted_gradient + 2

        if self.adaptive:
            w_updates[self.L] = self.L / 2

        self.it_num = it_num

        w_updates[it_num] = it
        return w_updates
Example #2
0
    def test_prod_no_zeros_in_input(self):
        # Exercises Prod built with no_zeros_in_input=True: the forward value
        # along axis=1, the first and second gradients of the all-elements
        # product, and verify_grad for both variants and for the gradient
        # expression itself.
        sym = theano.tensor.dmatrix()
        data = numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype='float32')

        # Forward pass: one product per row.
        by_row = Prod(axis=1, no_zeros_in_input=True)(sym)
        compiled = theano.function([sym], by_row, mode=self.mode)
        assert numpy.allclose(compiled(data), [6, 120, 504])

        # Product over every element, differentiated once and twice.
        total = Prod(no_zeros_in_input=True)(sym)
        d_total = theano.grad(total, sym)
        dd_total = theano.grad(d_total.sum(), sym)

        expected_first = [[362880., 181440., 120960.],
                          [90720., 72576., 60480.],
                          [51840., 45360., 40320.]]
        compiled = theano.function([sym], d_total, mode=self.mode)
        assert numpy.allclose(compiled(data), expected_first)

        expected_second = [[663696., 422568., 301872.],
                           [233964., 190800., 161016.],
                           [139248., 122652., 109584.]]
        compiled = theano.function([sym], dd_total, mode=self.mode)
        assert numpy.allclose(compiled(data), expected_second)

        # Numerical gradient checks for both reductions.
        unittest_tools.verify_grad(Prod(axis=1, no_zeros_in_input=True),
                                   [data], mode=self.mode)
        unittest_tools.verify_grad(Prod(no_zeros_in_input=True),
                                   [data], mode=self.mode)

        # The same check applied to the gradient, i.e. the second derivative.
        def second_deriv(x):
            return theano.grad(Prod(no_zeros_in_input=True)(x), x)
        unittest_tools.verify_grad(second_deriv, [data], mode=self.mode)
Example #3
0
    def test_grad_types(self):
        # This function simply tests the behaviour of the AbstractConv
        # Ops, not their optimizations: every gradient must have the same
        # type as the variable it was taken with respect to, for each
        # CPU/GPU combination of operands.
        def check_same_type(grad, var):
            assert grad.type == var.type, (grad, grad.type, var, var.type)

        cpu_input = tensor.ftensor4()
        cpu_filters = tensor.ftensor4()
        cpu_topgrad = tensor.ftensor4()
        gpu_input = gpu_ftensor4()
        gpu_filters = gpu_ftensor4()
        gpu_topgrad = gpu_ftensor4()

        out_shape = tensor.lvector()

        all_inputs = (cpu_input, gpu_input)
        all_filters = (cpu_filters, gpu_filters)
        all_topgrads = (cpu_topgrad, gpu_topgrad)

        # Check the gradient of the forward conv2d
        for inp, flt in itertools.product(all_inputs, all_filters):
            forward = conv.conv2d(inp, flt)
            d_inp, d_flt = theano.grad(forward.sum(), wrt=(inp, flt))
            check_same_type(d_inp, inp)
            check_same_type(d_flt, flt)

        # Check the gradient of gradweight
        for inp, top in itertools.product(all_inputs, all_topgrads):
            weights_grad = conv.AbstractConv2d_gradWeights()(inp, top, out_shape)
            d_inp, d_top = theano.grad(weights_grad.sum(), wrt=(inp, top))
            check_same_type(d_inp, inp)
            check_same_type(d_top, top)

        # Check the gradient of gradinputs
        for flt, top in itertools.product(all_filters, all_topgrads):
            inputs_grad = conv.AbstractConv2d_gradInputs()(flt, top, out_shape)
            d_flt, d_top = theano.grad(inputs_grad.sum(), wrt=(flt, top))
            check_same_type(d_flt, flt)
            check_same_type(d_top, top)
Example #4
0
 def test_fill_grad(self):
     # Regression test for the bug reported at
     # https://groups.google.com/d/topic/theano-users/nQshB8gUA6k/discussion
     # Only graph construction is exercised; the gradient is not evaluated.
     def make_var(name):
         # floatX variable with broadcast pattern (0, 1, 0).
         return TensorType(config.floatX, [0, 1, 0])(name)

     x = make_var('x')
     y = make_var('y')
     filled = tensor.second(x, y)
     theano.grad(filled.sum(), y)
Example #5
0
def test_dnn_conv_merge():
    # Compiles the same three expressions (forward conv, weight gradient,
    # input gradient, each combined with another term) twice: once with the
    # default GPU mode and once with the cuDNN alpha/output merge
    # optimizations excluded, then checks both give the same numbers.
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
    out = T.ftensor4()

    # batch, channels, filters, image height/width, kernel height/width
    b, c, f = 1, 4, 3
    ih, iw = 5, 8
    kh, kw = 2, 6
    img_val = numpy.random.random((b, c, ih, iw)).astype("float32")
    kern_val = numpy.random.random((f, c, kh, kw)).astype("float32")
    out_val = numpy.random.random((b, f, ih - kh + 1, iw - kw + 1)).astype("float32")

    conv = dnn.dnn_conv(img, kern)
    conv_total = conv.sum
    gw = theano.grad(conv_total(), kern)
    gi = theano.grad(conv_total(), img)

    lr = numpy.asarray(0.05, dtype="float32")

    if cuda.dnn.version() == -1:
        # Can't merge alpha with cudnn v1
        fr, wr, ir = conv + out, kern + gw, img + gi
    else:
        fr, wr, ir = lr * (conv + out), kern + lr * gw, img + lr * gi

    results = [fr, wr, ir]
    expected_ops = (dnn.GpuDnnConv, dnn.GpuDnnConvGradW, dnn.GpuDnnConvGradI)

    # With merging enabled, the op feeding each output's first input must be
    # the corresponding cuDNN op.
    f1 = theano.function([img, kern, out], results, mode=mode_with_gpu)
    for idx, op_class in enumerate(expected_ops):
        producer = f1.maker.fgraph.outputs[idx].owner.inputs[0].owner.op
        assert isinstance(producer, op_class)

    mode = mode_with_gpu
    for opt_name in ("local_dnn_conv_alpha_merge",
                     "local_dnn_convw_alpha_merge",
                     "local_dnn_convi_alpha_merge",
                     "local_dnn_conv_output_merge",
                     "local_dnn_convw_output_merge",
                     "local_dnn_convi_output_merge"):
        mode = mode.excluding(opt_name)

    # With merging excluded, that same position must hold a different op.
    f2 = theano.function([img, kern, out], results, mode=mode)
    for idx, op_class in enumerate(expected_ops):
        producer = f2.maker.fgraph.outputs[idx].owner.inputs[0].owner.op
        assert not isinstance(producer, op_class)

    out_f1 = f1(img_val, kern_val, out_val)
    out_f2 = f2(img_val, kern_val, out_val)

    assert len(out_f1) == len(out_f2)

    for merged, unmerged in zip(out_f1, out_f2):
        utt.assert_allclose(merged, unmerged)
    def update_opt(
        self, loss, target, leq_constraint, inputs, extra_inputs=None, constraint_name="constraint", *args, **kwargs
    ):
        """
        Build the symbolic graphs for the constrained problem and register the
        functions evaluating them; each function is compiled lazily on first use.

        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
        that the first dimension of these inputs should correspond to the number of data points
        :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
        :param constraint_name: Label stored on the optimizer for this constraint.
        :param args: Accepted for signature compatibility; not used here.
        :param kwargs: Accepted for signature compatibility; not used here.
        :return: No return value.
        """

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        constraint_term, constraint_value = leq_constraint

        params = target.get_params(trainable=True)
        grads = theano.grad(loss, wrt=params)
        flat_grad = ext.flatten_tensor_variables(grads)

        # Hessian-vector product of the constraint: differentiate the inner
        # product <grad(constraint), x> once more with respect to the params.
        # `xs` are fresh symbolic placeholders, one per parameter.
        constraint_grads = theano.grad(constraint_term, wrt=params)
        xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])
        # Builtin zip instead of itertools.izip: izip does not exist on
        # Python 3, and the pairs are consumed immediately either way.
        Hx_plain_splits = TT.grad(TT.sum([TT.sum(g * x) for g, x in zip(constraint_grads, xs)]), wrt=params)
        Hx_plain = TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        if self._debug_nan:
            from theano.compile.nanguardmode import NanGuardMode

            mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        else:
            mode = None

        # Each entry is a thunk, so a function is only compiled when looked up.
        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(
                inputs=inputs + extra_inputs, outputs=loss, log_name="f_loss", mode=mode
            ),
            f_grad=lambda: ext.compile_function(
                inputs=inputs + extra_inputs, outputs=flat_grad, log_name="f_grad", mode=mode
            ),
            f_Hx_plain=lambda: ext.compile_function(
                inputs=inputs + extra_inputs + xs, outputs=Hx_plain, log_name="f_Hx_plain", mode=mode
            ),
            f_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs, outputs=constraint_term, log_name="constraint", mode=mode
            ),
            f_loss_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs, outputs=[loss, constraint_term], log_name="f_loss_constraint", mode=mode
            ),
        )
Example #7
0
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    # Compares `op(mode, subsample)` against tensor.nnet.conv2d applied to the
    # spatially flipped kernel, on random float32 data: forward value, first
    # gradients w.r.t. image and kernel, and (when the reference supports it)
    # the mixed second gradients.
    #
    # bs/ch/nf: batch size, channels, number of filters.
    # rImg1/rImg2, rFlt1/rFlt2: image and filter spatial sizes.
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    # Dimensions of length 1 are declared broadcastable.
    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
    # try to compile reference implementation without shape,
    # so we don't have to compile hundreds of versions
    conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                 border_mode=mode, subsample=subsample)
    try:
        conv_op_di = theano.grad(conv_op.sum(), i)
        conv_op_dk = theano.grad(conv_op.sum(), k)
    except Exception:
        # compile with shape information only when needed; the gradients are
        # rebuilt here only, instead of unconditionally after the try block
        conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1],
                                     ishape, kshape, mode, subsample)
        conv_op_di = theano.grad(conv_op.sum(), i)
        conv_op_dk = theano.grad(conv_op.sum(), k)
    corr_op_di = theano.grad(corr_op.sum(), i)
    corr_op_dk = theano.grad(corr_op.sum(), k)
    # Outputs alternate (tested op, reference) so they can be paired below.
    outputs = [corr_op, conv_op,
               corr_op_di, conv_op_di,
               corr_op_dk, conv_op_dk]
    try:
        conv_op_dik = theano.grad(conv_op_di.sum(), k)
        conv_op_dki = theano.grad(conv_op_dk.sum(), i)
        corr_op_dik = theano.grad(corr_op_di.sum(), k)
        corr_op_dki = theano.grad(corr_op_dk.sum(), i)
        outputs.extend([corr_op_dik, conv_op_dik,
                        corr_op_dki, conv_op_dki])
    except Exception:
        # skip if the reference implementation can't do it
        pass

    f = theano.function([i, k], outputs, mode=theano_mode.excluding('conv_dnn', 'conv_gemm'))

    allvals = f(npy_img, npy_kern)

    # zip stops at the shortest sequence, so the two second-order labels are
    # simply unused when the second gradients were skipped above.
    for a, b, oa, ob, p in zip(allvals[::2], allvals[1::2],
                               outputs[::2], outputs[1::2],
                               ('top', 'dtop/dbottom', 'dtop/dweight',
                                'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
        # The label names the failing quantity in the assertion message.
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2], p

        assert_allclose(a, b, rtol=1e-4)
Example #8
0
        def oneStep(w):
            """Return ``w`` after one step on a single randomly chosen sample.

            The step direction is the gradient at ``w`` minus the gradient at
            ``param`` on the same sample, plus ``mu``; ``param``, ``mu``,
            ``rng``, ``n``, ``data``, ``target``, ``objective``, ``getpred``
            and ``learning_rate`` come from the enclosing scope.
            """
            # One index drawn via rng.choice with a=n (presumably uniform over
            # range(n) -- confirm against the RandomStreams in use).
            idx = rng.choice(size=(1,), a=n)
            sample_x = data[idx]
            sample_y = target[idx]

            # Gradient of the sample loss evaluated at the reference parameter.
            ref_loss = objective(getpred(sample_x, param), sample_y).mean()
            ref_grad = theano.grad(ref_loss, param)

            # Gradient of the same sample loss evaluated at the current iterate.
            cur_loss = objective(getpred(sample_x, w), sample_y).mean()
            cur_grad = theano.grad(cur_loss, w)

            return w - learning_rate * (cur_grad - ref_grad + mu)
Example #9
0
def test_normal_logEI():
    # Checks gpr_math.s_normal_logEI against hyperopt's logEI_gaussian on
    # 2000 thresholds in [-10, 50] with mean 0 and variance 1.
    # The `if 0:` sections are disabled debugging aids kept for reference.

    N = 2000
    thresh = np.linspace(-10, 50, N)
    # For a close-up, e.g.: N = 100; thresh = np.linspace(37, 38, N)
    mean = thresh * 0
    var = thresh * 0 + 1

    s_t, s_m, s_v = theano.tensor.dvectors('tmv')

    fn = theano.function([s_t, s_m, s_v],
                         gpr_math.s_normal_logEI(s_t, s_m, s_v))

    if 0:
        # Disabled: least-squares fit of a quadratic in s_t to the logEI curve.
        a = theano.tensor.dvector()
        y = s_t ** 2 * a[2] + s_t * a[1] + a[0]
        cost = ((y - gpr_math.s_normal_logEI(s_t, s_m, s_v)) ** 2).sum()
        da = theano.grad(cost, a)
        foo = theano.function([a, s_t, s_m, s_v], [cost, da])
        res = scipy.optimize.minimize(foo, [0, -1, -1], jac=True,
                                      args=(thresh, mean, var),
                                      method='L-BFGS-B')
        # Parenthesised so the module also parses on Python 3; with a single
        # argument the output is the same as the Python 2 print statement.
        print(res.x)

    from hyperopt.criteria import logEI_gaussian
    if 0:
        # Disabled: plot both implementations on top of each other.
        import matplotlib.pyplot as plt
        y = fn(thresh, mean, var)
        z = logEI_gaussian(mean, var, thresh)
        plt.plot(thresh, y)
        plt.plot(thresh, z)
        plt.show()

    # -- the gpr_math logEI uses a quadratic approximation for very
    #    hopeless points, which gives the right derivative, but the
    #    slightly wrong value
    assert np.allclose(logEI_gaussian(mean, var, thresh),
                       fn(thresh, mean, var),
                       atol=1e-3, rtol=1e-4)

    if 0:
        # Disabled: plot the derivative of logEI with respect to the threshold.
        d_t = theano.grad(gpr_math.s_normal_logEI(s_t, s_m, s_v).sum(), s_t)
        d_fn = theano.function([s_t, s_m, s_v], d_t)

        import matplotlib.pyplot as plt
        plt.plot(thresh, d_fn(thresh, mean, var))
        plt.show()
Example #10
0
 def get_gradients(self):
     """Return hand-assembled gradients ``[C, W1, b1, W2, b2]`` of ``self.netS``.

     Only the gradient with respect to ``self.outputs`` and the final one
     with respect to ``params["C"]`` use ``theano.grad``; the rest is
     propagated manually through ``self.layers[-3]`` and ``self.layers[0]``.
     """
     matmul = theano.dot
     hidden = self.layers[-3]
     net_input = self.layers[0]

     # Output layer: bias gradient sums over axis 0, weights get hidden^T . delta.
     delta_out = theano.grad(self.netS, self.outputs)
     grad_b2 = T.sum(delta_out, axis=0)
     grad_W2 = matmul(hidden.T, delta_out)

     # Back through W2 and the hidden nonlinearity; h - h*h is the derivative
     # of a logistic unit (presumably what the hidden layer is -- confirm).
     delta_hidden_out = matmul(delta_out, self.seg.params["W2"].T)
     delta_hidden = delta_hidden_out * (hidden - hidden * hidden)
     grad_b1 = T.sum(delta_hidden, axis=0)
     grad_W1 = matmul(net_input.T, delta_hidden)

     # Gradient reaching the network input, pushed into C by differentiating
     # sum(input * delta_input) with respect to C.
     delta_input = matmul(delta_hidden, self.seg.params["W1"].T)
     grad_C = theano.grad(T.sum(net_input * delta_input), self.seg.params["C"])
     return [grad_C, grad_W1, grad_b1, grad_W2, grad_b2]
Example #11
0
 def _collins_grad(scores):
     """Build updates from a list of ``(network_score, transition_score)`` pairs.

     Transition scores are differentiated with respect to ``params["A"]``
     and network scores with respect to every other parameter. Per-score
     gradients are summed, divided by ``self.batchsize`` and turned into
     ``(param, param + self.alfa[param].getupdate(grad))`` pairs.
     Returns the transition updates followed by the network updates.
     """
     def mean_grads(score_list, wrt):
         # One gradient list per score, then reduced parameter by parameter.
         per_score = [theano.grad(score, wrt) for score in score_list]
         return [sum([g[j] for g in per_score]) / self.batchsize
                 for j in range(len(wrt))]

     def to_updates(wrt, grads):
         return [(p, p + self.alfa[p].getupdate(g)) for p, g in zip(wrt, grads)]

     trans_p = [self.params["A"]]
     net_p = [p for name, p in self.params.items() if name != "A"]
     net_S = [pair[0] for pair in scores]
     trans_S = [pair[1] for pair in scores]

     # transition score updates
     trans_upd = to_updates(trans_p, mean_grads(trans_S, trans_p))
     # network parameters update
     net_upd = to_updates(net_p, mean_grads(net_S, net_p))
     return trans_upd + net_upd
Example #12
0
def get_or_compute_grads(loss_or_grads, params, regularizers=None):
    """Helper function returning a list of gradients.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params : list of shared variables
        The variables to return the gradients for
    regularizers : dict, optional
        'c' : threshold; when greater than 0 every gradient is replaced by
            ``clip_norm(g, c, n)`` where ``n`` is the joint L2 norm of all
            gradients.
        'func' : sequence with one entry per parameter, each either ``None``
            (gradient kept as is) or a callable ``r(g, p)`` (e.g. l2 or l1)
            returning the regularized gradient. A sequence shorter than
            `params` is padded with ``None``.

    Returns
    -------
    list of expressions
        If `loss_or_grads` is a list, it is assumed to be a list of
        gradients and returned as is, unless it does not match the length
        of `params`, in which case a `ValueError` is raised.
        Otherwise, `loss_or_grads` is assumed to be a cost expression and
        the function returns `theano.grad(loss_or_grads, params)`, clipped
        and regularized as requested by `regularizers`.
    """
    if isinstance(loss_or_grads, list):
        if len(loss_or_grads) != len(params):
            raise ValueError("Got %d gradient expressions for %d parameters" %
                             (len(loss_or_grads), len(params)))
        return loss_or_grads

    if regularizers is None:
        regularizers = {}
    c = regularizers.get('c', 0.0)
    regularizers_funcs = regularizers.get('func', [])

    grads = theano.grad(loss_or_grads, params)
    if len(regularizers_funcs) == 0 and c == 0.0:
        return grads

    # Max-Norm
    if c > 0:
        norm = T.sqrt(sum([T.sum(g**2) for g in grads]))
        grads = [clip_norm(g, c, norm) for g in grads]

    # Pad with None so that zip() below cannot drop gradients when fewer
    # regularizer functions than parameters were supplied (previously
    # clipping without any 'func' returned an empty list).
    missing = len(params) - len(regularizers_funcs)
    regularizers_funcs = list(regularizers_funcs) + [None] * missing

    new_grads = []
    for p, g, r in zip(params, grads, regularizers_funcs):
        if r is None:
            new_grads.append(g)
        else:
            # L1 or L2 func
            new_grads.append(r(g, p))

    return new_grads
Example #13
0
def adam(loss, all_params, learn_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
    """Build ADAM updates for ``all_params``.

    Kingma, Diederik, and Jimmy Ba. "Adam: A Method for Stochastic Optimization." arXiv preprint arXiv:1412.6980 (2014). http://arxiv.org/pdf/1412.6980v4.pdf

    ``b1``/``b2`` weigh the previous first/second moment estimates, ``e`` is
    added to the denominator, and ``gamma`` decays the first-moment
    coefficient as the step counter grows.

    Returns a list of ``(shared_variable, new_expression)`` pairs: the two
    moments and the parameter for every entry of ``all_params``, followed
    by the step counter.
    """
    def zeros_like_shared(param):
        shape = param.get_value().shape
        return theano.shared(np.zeros(shape, dtype=theano.config.floatX))

    updates = []
    gradients = theano.grad(loss, all_params)
    step_size = learn_rate
    t = theano.shared(np.float32(1.))
    # decay the first moment running average coefficient
    b1_t = b1 * gamma ** (t - 1.)

    for theta_prev, grad in zip(all_params, gradients):
        m_prev = zeros_like_shared(theta_prev)
        v_prev = zeros_like_shared(theta_prev)

        # biased first and second raw moment estimates
        m = b1_t * m_prev + (1. - b1_t) * grad
        v = b2 * v_prev + (1. - b2) * grad ** 2
        # bias-corrected estimates
        m_hat = m / (1. - b1 ** t)
        v_hat = v / (1. - b2 ** t)
        # parameter step
        theta = theta_prev - (step_size * m_hat) / (T.sqrt(v_hat) + e)

        updates.extend([
            (m_prev, m),
            (v_prev, v),
            (theta_prev, theta),
        ])
    updates.append((t, t + 1.))
    return updates
	def __init__(self, word_vec_width, batch_size, num_hidden, learning_rate=0.1):
		"""Store the hyper-parameters and compile the scoring and gradient functions.

		``exec_fn`` evaluates the element-wise logistic score of a one-hot word
		batch; ``grad_fn`` returns the gradients of the hinge-style objective
		``max(0, 1 - sum(score) + sum(score_c))`` with respect to ``b``, ``W``
		and the vocabulary matrix, where ``score_c`` is computed from
		``word_onehot_c``.
		"""
		self.num_hidden = num_hidden
		self.learning_rate = learning_rate
		self.word_vec_width = word_vec_width
		self.batch_size = batch_size

		self.vocab_mat = T.fmatrix('vocab')
		self.word_onehot = T.fmatrix('word_onehot')
		b = T.fvector('b')
		W = T.fmatrix('W')

		# Score of the first batch: logistic of W * (embedding + b).
		embedded = self.word_onehot.dot(self.vocab_mat)
		f = 1 / (1 + T.exp(-(W * (embedded + b))))
		s = T.sum(f)

		self.exec_fn = theano.function(
			[self.word_onehot, b, W, self.vocab_mat],
			f,
			allow_input_downcast=True)

		self.word_onehot_c = T.fmatrix('word_onehot_c')
		# NOTE(review): here b is added AFTER multiplying by W, unlike the
		# expression for `f` above -- confirm which form is intended.
		embedded_c = self.word_onehot_c.dot(self.vocab_mat)
		f_c = 1 / (1 + T.exp(-(W * embedded_c + b)))
		s_c = T.sum(f_c)

		J = T.largest(0, 1 - s + s_c)
		self.grad = theano.grad(J, [b, W, self.vocab_mat])

		self.grad_fn = theano.function(
			[self.word_onehot, self.word_onehot_c, b, W, self.vocab_mat],
			self.grad,
			allow_input_downcast=True)
Example #15
0
	def __init__(self, theano_mat, input_vars, cost, learning_rate=1e-2, delta=1e-2):
		""" Set up AdaGrad state and compile the update function for one shared variable.

			theano_mat should be a theano.shared; it is the variable being updated
			input_vars should be a list of variables for which you'll provide values when you call update()
			cost should be a theano variable that this var should take gradients wrt
			learning_rate is the step size, wrapped in a theano constant
			delta is the initial value of every entry of the squared-gradient accumulator
		"""
		# Single formatted string: emits the same text as the former Python 2
		# print statement and also parses on Python 3.
		print('[AdaGradParam init] %s has learning rate of %s' % (theano_mat.name, learning_rate))
		self.tvar = theano_mat	# should be a theano.shared
		# Accumulator of squared gradients, same shape and dtype as the variable.
		self.gg = theano.shared(np.ones_like(self.tvar.get_value(), dtype=theano_mat.dtype) * delta)

		# TODO upgrade to >=0.6rc5 and switch to float32s

		# there is a bug that has been fixed by 0.6rc5 where when you
		# multiply a variable with dtype='float32' with a theano.tensor.constant,
		# you get something back with dtype='float64'
		# if you get errors in this code, that is almost certainly why (unless you're on >=0.6rc5)
		self.lr = T.constant(learning_rate)

		grad = theano.grad(cost=cost, wrt=self.tvar)

		gg_update = self.gg + (grad ** 2)
		# The step is scaled by the accumulator value from BEFORE this update.
		tvar_update = self.tvar - self.lr * grad / (self.gg ** 0.5)
		self.updates = [(self.gg, gg_update), (self.tvar, tvar_update)]
		# Calling f_update applies both updates and returns the gradient.
		self.f_update = theano.function(input_vars, grad, updates=self.updates)
Example #16
0
    def test_grad(self):
        # Compares symbolic gradients of every element of every checked
        # output (all outputs except the last two) against central finite
        # differences taken over every element of every input.
        eps = 1e-7
        f, args, vals = self.get_args()
        output0 = f(*vals)
        n_checked = len(output0) - 2

        # Symbolic side: grad0[i][j] holds the gradients of element j of
        # output i with respect to each input.
        grad0 = []
        for i in range(n_checked):
            per_element = []
            grad0.append(per_element)
            for j in range(output0[i].size):
                out_idx = np.unravel_index(j, output0[i].shape)
                symbolic = theano.grad(self.op(*args)[i][out_idx], args)
                g = theano.function(args, symbolic)
                per_element.append(g(*vals))

        # Numeric side: perturb one input element at a time (in place) and
        # restore it before moving on.
        for k in range(len(vals)):
            for flat_in in range(vals[k].size):
                in_idx = np.unravel_index(flat_in, vals[k].shape)
                vals[k][in_idx] += eps
                plus = f(*vals)
                vals[k][in_idx] -= 2*eps
                minus = f(*vals)
                vals[k][in_idx] += eps

                # Compare to the backpropagated gradients
                for i in range(n_checked):
                    for j in range(output0[i].size):
                        out_idx = np.unravel_index(j, output0[i].shape)
                        delta = 0.5 * (plus[i][out_idx] - minus[i][out_idx]) / eps
                        ref = grad0[i][j][k][in_idx]
                        report = (k, flat_in, i, j, delta, ref, delta-ref)
                        assert np.abs(delta - ref) < 2*eps, \
                            "{0}".format(report)
Example #17
0
def adam_v2(loss, all_params, learning_rate=0.0002, beta1=0.1, beta2=0.001, epsilon=1e-8, l_decay=1 - 1e-8):
    """
    Adam update rule by Kingma and Ba, ICLR 2015, version 2 (with momentum decay).

    learning_rate: alpha in the paper, the step size

    beta1: weight of the NEW gradient in the 1st moment estimate
    beta2: weight of the NEW squared gradient in the 2nd moment estimate
    epsilon: added to the denominator of the step
    l_decay: per-step decay applied to (1 - beta1)

    Returns a list of (shared_variable, new_expression) pairs. Every
    parameter gets its own timestep counter.
    """
    float_type = theano.config.floatX
    gradients = theano.grad(loss, all_params)
    updates = []

    for param, grad in zip(all_params, gradients):
        step = theano.shared(1)  # timestep, for bias correction
        first_moment = theano.shared(np.zeros(param.get_value().shape, dtype=float_type))
        second_moment = theano.shared(np.zeros(param.get_value().shape, dtype=float_type))

        step_f = step.astype(float_type)

        # Effective first-moment weight, approaching 1 as the step count grows.
        beta1_now = 1 - (1 - beta1) * l_decay ** (step_f - 1)
        new_first = beta1_now * grad + (1 - beta1_now) * first_moment
        new_second = beta2 * T.sqr(grad) + (1 - beta2) * second_moment

        # Bias correction of both moments.
        first_hat = new_first / (1 - (1 - beta1) ** step_f)
        second_hat = new_second / (1 - (1 - beta2) ** step_f)
        new_param = param - learning_rate * first_hat / (T.sqrt(second_hat) + epsilon)

        updates.extend([
            (first_moment, new_first),
            (second_moment, new_second),
            (step, step + 1),
            (param, new_param),
        ])

    return updates
Example #18
0
def build_updates_with_micro(loss, all_params, learning_rate, beta1=0.1, beta2=0.001,
                             epsilon=1e-8):
    """Adam update rule by Kingma and Ba, ICLR 2015, with gradient accumulation.

    Returns ``(updates, micro_updates)``. ``micro_updates`` add the current
    gradient to a per-parameter accumulator. ``updates`` take an Adam step
    on the accumulated gradient divided by
    ``mini_batch_size // batch_size``, reset the accumulators to zero,
    scale ``learning_rate`` by ``1 - learning_rate_decay`` and advance the
    timestep. ``mini_batch_size``, ``batch_size`` and
    ``learning_rate_decay`` are read from the enclosing scope, and
    ``learning_rate`` is itself the target of an update.
    """
    gradients = theano.grad(loss, all_params)
    updates = []
    micro_updates = []
    step = theano.shared(1)  # timestep, for bias correction

    for param, grad in zip(all_params, gradients):
        blank = np.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX)
        first_moment = theano.shared(blank)
        second_moment = theano.shared(blank.copy())
        grad_sum = theano.shared(blank.copy())

        micro_updates.append((grad_sum, grad_sum + grad))

        # Accumulated gradient divided by the number of micro batches.
        mean_grad = grad_sum / np.float32(mini_batch_size//batch_size)
        new_first = beta1 * mean_grad + (1 - beta1) * first_moment
        new_second = beta2 * T.sqr(mean_grad) + (1 - beta2) * second_moment

        # Bias correction of both moments.
        first_hat = new_first / (1 - (1 - beta1) ** step.astype(theano.config.floatX))
        second_hat = new_second / (1 - (1 - beta2) ** step.astype(theano.config.floatX))
        new_param = param - learning_rate * first_hat / (T.sqrt(second_hat) + epsilon)

        updates.extend([
            (first_moment, new_first),
            (second_moment, new_second),
            (param, new_param),
            (grad_sum, blank.copy()),  # reset the accumulator
        ])

    updates.append((learning_rate, learning_rate * (1-learning_rate_decay)))
    updates.append((step, step + 1))

    return updates, micro_updates
Example #19
0
def get_or_compute_grads(loss_or_grads, params):
    """Return one gradient expression per entry of `params`.

    Parameters
    ----------
    loss_or_grads : symbolic expression or list of expressions
        Either a scalar loss to differentiate, or gradients that were
        already computed.
    params : list of shared variables
        The variables the gradients belong to.

    Returns
    -------
    list of expressions
        A list passed as `loss_or_grads` is handed back unchanged.
        Anything else is treated as a cost and differentiated with
        `theano.grad(loss_or_grads, params)`.

    Raises
    ------
    ValueError
        If `loss_or_grads` is a list whose length differs from that of
        `params`.
    """
    if not isinstance(loss_or_grads, list):
        return theano.grad(loss_or_grads, params)

    n_grads, n_params = len(loss_or_grads), len(params)
    if n_grads != n_params:
        raise ValueError("Got %d gradient expressions for %d parameters" %
                         (n_grads, n_params))
    return loss_or_grads
 def forward_jacobian_log_det(self, x):
     """Sum of logs of the per-element derivatives of ``self.forward_func``.

     The derivative is taken with ``th.scan`` over the flattened ``x``, one
     element at a time. When ``self.fudge`` is non-zero it is added to the
     derivatives before the log.
     """
     def derivative_at(x_i):
         return th.grad(self.forward_func(x_i), x_i)

     dy_dx, _ = th.scan(derivative_at, sequences=[x.flatten()])
     if self.fudge == 0.:
         return tt.log(dy_dx).sum()
     return tt.log(dy_dx + self.fudge).sum()
Example #21
0
def gen_updates_sgd(loss, all_parameters, learning_rate):
    """Build plain stochastic-gradient-descent updates.

    :param loss: symbolic scalar loss to minimise.
    :param all_parameters: shared variables to update.
    :param learning_rate: step size multiplying each gradient.
    :return: list of ``(parameter, new_value)`` pairs, one per parameter,
        in the form accepted by ``theano.function(updates=...)``.
    """
    all_grads = [theano.grad(loss, param) for param in all_parameters]
    # Each entry must be a (shared variable, new expression) pair; the step
    # is learning_rate * gradient and is not scaled by the parameter itself.
    return [(param_i, param_i - learning_rate * grad_i)
            for param_i, grad_i in zip(all_parameters, all_grads)]
 def forward_jacobian_log_det(self, x):
     """Sum of logs of the gradient of ``self.forward_map(x).sum()`` w.r.t. ``x``.

     When ``self.fudge`` is non-zero it is added to the gradient before the
     log.
     """
     total = self.forward_map(x).sum()
     dy_dx = th.grad(total, x)
     if self.fudge == 0.:
         return tt.log(dy_dx).sum()
     return tt.log(dy_dx + self.fudge).sum()
Example #23
0
def custom_svrg2(loss, params, m, learning_rate=0.01, objective=None, data=None, target=None, getpred=None):
    """Build updates that run ``m`` single-sample inner steps per parameter.

    :param loss: symbolic loss; its gradient divided by the number of rows
        of ``data`` is the ``mu`` term of every inner step.
    :param params: variables to update.
    :param m: number of inner steps, passed to ``theano.scan`` as ``n_steps``.
    :param learning_rate: step size of each inner step.
    :param objective: callable ``objective(prediction, target)`` returning
        per-sample losses (``.mean()`` is taken of its result).
    :param data: input data; ``data.shape[0]`` is the sample count.
    :param target: targets indexed the same way as ``data``.
    :param getpred: callable ``getpred(inputs, parameter)`` returning
        predictions.
    :return: ``OrderedDict`` with the scan updates and, for each parameter,
        the value after the last inner step.
    """
    grads = theano.grad(loss, params)
    n = data.shape[0]

    updates = OrderedDict()
    rng = T.shared_randomstreams.RandomStreams(seed=149)

    for param, grad in zip(params, grads):
        mu = grad / n

        def oneStep(w):
            # One random sample index for this inner step.
            t = rng.choice(size=(1,), a=n)

            # Gradient of the sample loss at the outer parameter.
            loss_part_tilde = objective(getpred(data[t], param), target[t])
            loss_part_tilde = loss_part_tilde.mean()
            g_tilde = theano.grad(loss_part_tilde, param)

            # Gradient of the same sample loss at the current iterate.
            loss_part = objective(getpred(data[t], w), target[t])
            loss_part = loss_part.mean()
            g = theano.grad(loss_part, w)

            w = w - learning_rate * (g - g_tilde + mu)
            return w

        # scan is built inside the loop iteration, so oneStep sees this
        # iteration's `param` and `mu`.
        w_tilde, scan_updates = theano.scan(fn=oneStep, outputs_info=param, n_steps=m)

        updates.update(scan_updates)
        updates[param] = w_tilde[-1]

    return updates
Example #24
0
def adadelta(loss, all_params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    """
    Build Adadelta updates for ``all_params`` minimising ``loss``.

    The paper uses no learning rate, hence the default ``learning_rate=1.0``;
    it is probably best left at that value. ``epsilon`` matters for the very
    first update, where it keeps the numerator from being 0.

    ``rho = 0.95`` and ``epsilon = 1e-6`` are the values suggested in the paper
    and reported to work on several datasets (MNIST, speech).

    See "Adadelta: an adaptive learning rate method" by Matthew Zeiler.

    Returns a list of ``(shared_variable, new_expression)`` pairs; for every
    parameter the order is gradient accumulator, parameter, update accumulator.
    """
    def zero_state(param):
        # floatX zeros with the shape of the parameter's current value
        return theano.shared(np.zeros(param.get_value().shape, dtype=theano.config.floatX))

    all_grads = [theano.grad(loss, param) for param in all_params]
    # running averages of squared gradients
    grad_sq_avgs = [zero_state(param) for param in all_params]
    # running averages of squared updates (recursive!)
    step_sq_avgs = [zero_state(param) for param in all_params]

    updates = []
    for param, grad, grad_sq_avg, step_sq_avg in zip(all_params, all_grads, grad_sq_avgs, step_sq_avgs):
        grad_sq_avg_new = rho * grad_sq_avg + (1 - rho) * grad**2

        # the step uses the *old* update accumulator and the *new* gradient accumulator
        step = grad * T.sqrt(step_sq_avg + epsilon) / T.sqrt(grad_sq_avg_new + epsilon)
        step_sq_avg_new = rho * step_sq_avg + (1 - rho) * step**2

        updates.extend([
            (grad_sq_avg, grad_sq_avg_new),
            (param, param - learning_rate * step),
            (step_sq_avg, step_sq_avg_new),
        ])

    return updates
Example #25
0
 def get_partial_diff(self, differentiable_var_name):
     """Return ``(self.f, df)`` where ``df`` is the compiled partial derivative.

     ``df`` takes ``self.variables`` as inputs and evaluates the gradient of
     ``self.output_expression`` with respect to the variable registered in
     ``self.var_lookup`` under ``differentiable_var_name``.
     """
     wrt = self.var_lookup[differentiable_var_name]
     derivative_expr = theano.grad(self.output_expression, wrt)
     derivative_fn = theano.function(self.variables,
                                     derivative_expr,
                                     allow_input_downcast=True)
     return self.f, derivative_fn
Example #26
0
  def __init__(self, config, loss, params):
    """Assemble the Adam update list for `params` minimising `loss`.

    Reads from `config`: learning_rate, lr_decay, lr_decay_freq,
    max_grad_norm, adam_beta1, adam_beta2 and adam_eps.  The resulting
    (variable, new_value) pairs are collected in `self._updates`; the
    per-parameter moment variables are kept in `self._all_m_tm1` and
    `self._all_v_tm1`.
    """
    beta1 = config.adam_beta1
    beta2 = config.adam_beta2

    self._lr = get_shared_floatX(config.learning_rate, 'lr')
    self._t = get_shared_floatX(1, 't')
    self._all_m_tm1 = []
    self._all_v_tm1 = []
    # the step counter is advanced by one on every call
    self._updates = [(self._t, self._t + 1)]

    if config.lr_decay:
      # stepwise decay: lr_decay ** floor((t - 1) / lr_decay_freq)
      decay_steps = (self._t - 1) // config.lr_decay_freq
      lr_coef = tt.pow(config.lr_decay, decay_steps)
      self._updates.append((self._lr, lr_coef * config.learning_rate))

    grads = theano.grad(loss, params)

    # norm over all gradients taken together
    squared_sums = [tt.sum(g**2.) for g in grads]
    self._global_grad_norm = tt.sqrt(tt.sum(tt.stack(squared_sums)))
    if config.max_grad_norm:
      within_limit = tt.lt(self._global_grad_norm, config.max_grad_norm)
      shrink = cast_floatX(config.max_grad_norm/self._global_grad_norm)
      global_clip_factor = ifelse(within_limit, cast_floatX_np(1.), shrink)
      grads = [global_clip_factor * g for g in grads]

    # bias-corrected step size
    scaled_lr = self._lr * clip_sqrt(1 - tt.pow(beta2, self._t))
    lr_t = scaled_lr / (1 - tt.pow(beta1, self._t))

    for p, g in zip(params, grads):
      m_prev = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_m_' + p.name)
      v_prev = get_shared_floatX(np.zeros_like(p.get_value()), 'adam_v_' + p.name)
      self._all_m_tm1.append(m_prev)
      self._all_v_tm1.append(v_prev)

      m_new = beta1 * m_prev + (1-beta1) * g
      v_new = beta2 * v_prev + (1-beta2) * tt.sqr(g)
      step = -lr_t * m_new / (clip_sqrt(v_new) + config.adam_eps)
      self._updates.extend([(m_prev, m_new), (v_prev, v_new), (p, p + step)])
    def gradients_and_updates(self, grad_normalize):
        """Compute gradients (t_gparams) using cost and trainable weights (t_params).

        Sets the following attributes:

        * ``t_gparams``: ``OrderedDict`` mapping ``'g_' + name`` to the gradient
          of ``self.t_outputs['T_cost']`` w.r.t. ``self.t_params[name]``, in
          the iteration order of ``self.t_params``.
        * ``out_gnorm``: stacked L2 norms of the raw gradients, same order.
        * ``g_norm``: per-gradient norms, filled only when clipping is active.
        * ``update_params``: list of ``[param, new_value]`` SGD updates using
          ``self.t_inputs['T_lr']`` as learning rate.

        :param grad_normalize: mapping; when it contains ``'max_norm'`` every
            gradient whose norm exceeds that value is rescaled to it.
        """
        cost = self.t_outputs['T_cost']

        # ------ Compute gradient parameters
        # Built from (key, value) pairs: feeding OrderedDict a dict
        # comprehension would inherit the temporary dict's arbitrary order.
        self.t_gparams = OrderedDict(
            ('g_' + k, theano.grad(cost=cost, wrt=p))
            for k, p in self.t_params.items())

        # ------ Compute norm and stack it like a vector (to analyze outside)
        self.out_gnorm = T.stack([T.sqrt(T.sum(gp ** 2)) for gp in self.t_gparams.values()])

        # ------ Normalize gradients
        self.g_norm = {}
        if 'max_norm' in grad_normalize:      # maximum gradient norm limited
            mn = grad_normalize['max_norm']
            # iterate over a snapshot of the keys because values are replaced
            for k in list(self.t_gparams):
                raw_grad = self.t_gparams[k]
                norm = T.sqrt(T.sum(raw_grad ** 2))
                self.g_norm[k] = norm
                # 1e-6 keeps the division finite
                self.t_gparams[k] = ifel(T.gt(norm, mn),
                                         mn * raw_grad / (norm + 1e-6),
                                         raw_grad)

        # ------ Update parameters (SGD!)
        self.update_params = [
            [param, param - self.t_inputs['T_lr'] * self.t_gparams['g_' + k]]
            for k, param in self.t_params.items()]
Example #28
0
    def _training_updates(self, **kwargs):
        r"""Returns the update expressions for updating the model parameters
        during training. The formula for updating an argument is

        .. math::

           \theta^{(k+1)} = \theta^{(k)} - learning\_rate * \frac{\partial cost}{\partial \theta}

        Expects a 'learning_rate' and 'cost' kwarg; the kwargs are validated
        with ``utils.check_kwargs``.

        :type learning_rate: theano.config.floatX
        :param learning_rate: The learning rate for parameter updates.

        :type cost: theano.tensor.TensorType
        :param cost: The cost function of which we are computing
                     the gradient.

        :returns: A list of pairs (parameter, update_expression), to
                  be passed directly to ``theano.function`` as the
                  ``updates`` parameter.
        """
        # The docstring is a raw string so that \theta and \frac are not
        # read as tab / form-feed escape sequences.
        utils.check_kwargs(kwargs, ['learning_rate', 'cost'])

        learning_rate = kwargs['learning_rate']
        bound_cost = kwargs['cost']

        return [
            (param, param - learning_rate * theano.grad(cost=bound_cost, wrt=param))
            for param in self.params
        ]
Example #29
0
	def _get_rmsprop_updates(self, loss, params, lr, grad_momentum,
							sqr_momentum, min_grad):
		"""Build RMSProp updates that track both gradient moments.

		For every parameter two running averages are kept (of the gradient
		and of the squared gradient); the step is the gradient divided by
		``sqrt(avg_sq - avg ** 2 + min_grad)``.  When ``self.max_norm`` is
		positive the step is additionally multiplied by the factor returned
		by ``self._clip_gradient_norm``.

		Returns an ``OrderedDict`` of shared variable -> new expression.
		"""
		# Modified from the Lasagne package:
		# 	https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py

		grads = theano.grad(loss, params)
		if self.max_norm > 0:
			scale_factor = self._clip_gradient_norm(grads, self.max_norm)
		else:
			scale_factor = 1.0

		# Using theano constant to prevent upcasting of float32
		one = T.constant(1)
		updates = OrderedDict()
		for param, grad in zip(params, grads):
			current = param.get_value(borrow=True)

			sq_avg = theano.shared(
				np.zeros(current.shape, dtype=current.dtype),
				broadcastable=param.broadcastable)
			sq_avg_next = sqr_momentum * sq_avg + (one - sqr_momentum) * grad ** 2

			avg = theano.shared(
				np.zeros(current.shape, dtype=current.dtype),
				broadcastable=param.broadcastable)
			avg_next = grad_momentum * avg + (one - grad_momentum) * grad

			denominator = T.sqrt(sq_avg_next - avg_next ** 2 + min_grad)

			updates[avg] = avg_next
			updates[sq_avg] = sq_avg_next
			updates[param] = param - (lr * grad * scale_factor / denominator)
		return updates
Example #30
0
def prior_dlogp(vars, model, flat_view):
    """Return the gradient of the prior on the parameters as one flat vector.

    Each variable's ``logpt`` is differentiated with respect to the variable
    itself, the results are flattened and concatenated, and the expression is
    then cloned with ``flat_view.replacements`` applied.  ``model`` is not used.
    """
    flat_grads = []
    for var in vars:
        flat_grads.append(theano.grad(var.logpt, var).flatten())
    stacked = tt.concatenate(flat_grads, axis=0)
    return theano.clone(stacked, flat_view.replacements, strict=False)
# Random stream handed to the model; its seed is derived from config.seed.
_theano_rng = RandomStreams(config.seed // 2 +
                            321)  # generates random numbers directly on GPU
# Build the model graph; it returns the flattened output probabilities, the
# parameter list, its own update list and the hidden states.
flat_probs, params, rhn_updates, hidden_states = stacked.model(
    _input_data, _noise_x, _lr, _is_training, config, _theano_rng)

# loss
_targets = T.imatrix('targets')  # (batch_size, num_steps)
# transpose before flattening so the targets line up with flat_probs
# NOTE(review): presumably flat_probs is ordered time-major -- confirm.
flat_targets = _targets.T.flatten()
xentropies = T.nnet.categorical_crossentropy(
    flat_probs, flat_targets)  # (batch_size * num_steps,)
# summed cross-entropy, divided by the batch size only
pred_loss = xentropies.sum() / config.batch_size
l2_loss = 0.5 * T.sum(T.stack([T.sum(p**2) for p in params]))  # regularization
loss = pred_loss + config.weight_decay * l2_loss

# compute gradients
grads = theano.grad(loss, params)
# norm of all gradients taken together, used for gradient clipping
global_grad_norm = T.sqrt(T.sum(T.stack([T.sum(g**2) for g in grads
                                         ])))  # gradient clipping
# 1 while the norm is below config.max_grad_norm, otherwise the ratio that
# rescales the gradients to that norm
clip_factor = theano.ifelse.ifelse(
    global_grad_norm < config.max_grad_norm, cast_floatX(1),
    T.cast(config.max_grad_norm / global_grad_norm, theano.config.floatX))

# plain SGD step on the clipped gradients
param_updates = [(p, p - _lr * clip_factor * g) for p, g in zip(params, grads)]
# total number of scalar parameters
num_params = np.sum([param.get_value().size for param in params])

# Training function: returns the loss and applies the model's own updates
# followed by the parameter updates; _is_training is fixed to 1.
train = theano.function([_input_data, _targets, _noise_x],
                        loss,
                        givens={_is_training: np.int32(1)},
                        updates=rhn_updates + param_updates)

evaluate = theano.function(
Example #32
0
    def update_opt(self,
                   loss,
                   target,
                   leq_constraint,
                   inputs,
                   extra_inputs=None,
                   constraint_name="constraint",
                   *args,
                   **kwargs):
        """
        Prepare the optimizer for a new loss / constraint pair.

        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
        that the first dimension of these inputs should correspond to the number of data points
        :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
        :param constraint_name: Name stored for the constraint.
        :return: No return value.
        """
        inputs = tuple(inputs)
        extra_inputs = tuple() if extra_inputs is None else tuple(extra_inputs)
        all_inputs = inputs + extra_inputs

        constraint_term, constraint_value = leq_constraint

        # flattened gradient of the loss w.r.t. the trainable parameters
        trainable = target.get_params(trainable=True)
        flat_grad = ext.flatten_tensor_variables(
            theano.grad(loss, wrt=trainable, disconnected_inputs='warn'))

        self._hvp_approach.update_opt(f=constraint_term,
                                      target=target,
                                      inputs=all_inputs,
                                      reg_coeff=self._reg_coeff)

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        def deferred(outputs, log_name):
            # compilation only happens when the returned thunk is called
            return lambda: ext.compile_function(
                inputs=all_inputs,
                outputs=outputs,
                log_name=log_name,
            )

        self._opt_fun = ext.lazydict(
            f_loss=deferred(loss, "f_loss"),
            f_grad=deferred(flat_grad, "f_grad"),
            f_constraint=deferred(constraint_term, "constraint"),
            f_loss_constraint=deferred([loss, constraint_term],
                                       "f_loss_constraint"),
        )
Example #33
0
def test_pooling3d():
    """Check cuDNN 3d pooling and its gradient against ``pool3d2d``.

    For every pooling mode / padding combination the forward result of
    ``cuda.dnn.dnn_pool`` (compiled with ``mode_with_gpu``) is compared with
    ``pool3d2d`` (compiled without the GPU), then the gradient is verified
    numerically and compared with the gradient of the ``pool3d2d`` graph.
    """
    # CuDNN 3d pooling requires CuDNN v3. Don't test if the CuDNN version is
    # too old.
    if not cuda.dnn.dnn_available() or cuda.dnn.version() < (3000, 3000):
        raise SkipTest(cuda.dnn.dnn_available.msg)

    # 5d float32 input with no broadcastable dimension
    x = T.TensorType(broadcastable=(False, False, False, False, False),
                     dtype='float32')()
    for mode, pad in product(('max', 'average_inc_pad', 'average_exc_pad'),
                             ((0, 0, 0), (1, 0, 0), (0, 1, 0), (0, 0, 1),
                              (2, 3, 2), (3, 2, 2), (2, 2, 3))):
        # reduction used by the pool3d2d counterpart
        if mode == 'max':
            func = T.max
        else:
            func = T.mean
        # padding is skipped when cuda.dnn.version() reports -1
        if pad != (0, 0, 0) and cuda.dnn.version() == -1:
            continue

        # padded average pooling is not compared
        if pad != (0, 0, 0) and func is T.mean:
            continue

        for ws in (4, 2, 5):
            for stride in (2, 3):
                if stride > ws:
                    continue
                if pad[0] > stride or pad[1] > stride or pad[2] > stride:
                    # Not implemented
                    continue
                out1 = cuda.dnn.dnn_pool(x, (ws, ws, ws),
                                         stride=(stride, stride, stride),
                                         pad=pad, mode=mode)
                out2 = pool3d2d(x, ds=(ws, ws, ws),
                                strides=(stride, stride, stride),
                                pad=pad, pool_func=func)

                # For max pooling pool3d2d explicitly pads the input with
                # -inf. Because of this, the compilation mode for the function
                # that uses pool3d2d should not check for infinite values or
                # it will falsely believe there is a error in the graph.
                mode_without_gpu2 = mode_without_gpu.including()
                mode_without_gpu2.check_isfinite = False

                # the GPU graph must contain the cuDNN pooling op ...
                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                            for node in f1.maker.fgraph.apply_nodes])
                # ... and the non-GPU graph must not
                f2 = theano.function([x], out2, mode=mode_without_gpu2)
                assert not any([isinstance(node.op, cuda.dnn.GpuDnnPool)
                                for node in f2.maker.fgraph.apply_nodes])
                for shp in [(1, 10, 100, 100, 100),
                            (1, 3, 99, 99, 99),
                            (32, 1, 147, 197, 37),
                            ]:
                    data = numpy.random.normal(0, 1, shp).astype("float32")
                    a = f1(data).__array__()

                    b = f2(data).__array__()

                    # both implementations must agree to float32 epsilon
                    utt.assert_allclose(a, b,
                                        atol=numpy.finfo(numpy.float32).eps)

        # Test the grad
        # (runs once per mode/pad combination, with a fixed window and stride)
        for shp in [(1, 1, 2, 2, 2),
                    (1, 1, 3, 3, 3),
                    (1, 1, 3, 3, 4),
                    (1, 1, 3, 4, 3),
                    (1, 1, 4, 3, 3),
                    (1, 1, 4, 4, 4),
                    (1, 1, 5, 5, 5)]:
            data = numpy.random.normal(0, 1, shp).astype("float32") * 10

            ws = 2
            stride = 2
            if pad[0] > stride or pad[1] > stride or pad[2] > stride:
                # Not implemented
                continue

            # Test the GPU grad + GPU implementation
            def fn(x):
                dnn_op = cuda.dnn.dnn_pool(
                    x, ws=(ws, ws, ws),
                    stride=(stride, stride, stride),
                    pad=pad,
                    mode=mode)
                return dnn_op
            theano.tests.unittest_tools.verify_grad(
                fn, [data],
                cast_to_output_type=False,
                mode=mode_with_gpu)
            # Confirm that we get the good op.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any([isinstance(node.op, cuda.dnn.GpuDnnPoolGrad)
                        for node in fg.maker.fgraph.toposort()])
            g_out = fg(data)

            # Compare against the CPU result
            out = pool3d2d(x, (ws, ws, ws),
                           strides=(stride, stride, stride),
                           pad=pad, pool_func=func)
            fc = theano.function([x], theano.grad(out.sum(), x),
                                 mode=mode_without_gpu)
            c_out = fc(data)
            assert numpy.allclose(c_out, g_out)
Example #34
0
 def get_opt_output():
     """Return ``[penalized_loss, flat_gradient]``, both cast to float64.

     The gradient is taken w.r.t. the trainable parameters of ``target``;
     parameters disconnected from the loss are ignored.
     """
     trainable = target.get_params(trainable=True)
     grads = theano.grad(penalized_loss, trainable, disconnected_inputs='ignore')
     flat_grad = flatten_tensor_variables(grads)
     return [penalized_loss.astype('float64'), flat_grad.astype('float64')]
Example #35
0
from theano import tensor as T
import theano
import theano.printing

# Symbolic scalar input.
a = T.scalar()

# a squared; note that this name shadows the builtin ``pow`` in this module.
pow = a**2

# Symbolic derivative of a**2 with respect to a.
g = theano.grad(pow, a)

# Show the gradient graph as built, then the graph of the compiled function.
# NOTE(review): debugprint appears to write to stdout itself, so the outer
# print may only add its return value -- confirm.
print(theano.printing.debugprint(g))
print(theano.printing.debugprint(theano.function([a], g)))
def flatten_hessian(cost,
                    wrt,
                    consider_constant=None,
                    disconnected_inputs='raise',
                    block_diagonal=True):
    """
    Build the Hessian of ``cost`` with every gradient flattened first.

    :type cost: Scalar (0-dimensional) Variable.
    :type wrt: Variable or list/tuple of Variables; each one is handled
               through its flattened gradient.

    :param consider_constant: a list of expressions not to backpropagate
        through

    :type disconnected_inputs: string
    :param disconnected_inputs: Defines the behaviour if some of the variables
        in ``wrt`` are not part of the computational graph computing ``cost``
        (or if all links are non-differentiable). The possible values are:
        - 'ignore': considers that the gradient on these parameters is zero.
        - 'warn': consider the gradient zero, and print a warning.
        - 'raise': raise an exception.

    :param block_diagonal: when true, one Hessian block is computed per
        element of ``wrt`` from that element's own gradient; otherwise the
        gradients of all elements are concatenated first and the per-element
        results are joined along axis 1.

    :return: with ``block_diagonal`` the blocks formatted like ``wrt``
        (single Variable, list or tuple); otherwise one concatenated Variable.
    """
    import theano
    from theano.tensor import arange
    import theano.tensor as TT
    from theano import Variable
    from theano import grad

    # Check inputs have the right format
    assert isinstance(cost, Variable), \
        "tensor.hessian expects a Variable as `cost`"
    assert cost.ndim == 0, \
        "tensor.hessian expects a 0 dimensional variable as `cost`"

    using_list = isinstance(wrt, list)
    using_tuple = isinstance(wrt, tuple)
    wrt_vars = list(wrt) if (using_list or using_tuple) else [wrt]

    def flat_cost_grad(var):
        # flattened first derivative of the cost w.r.t. one variable
        return grad(cost,
                    var,
                    consider_constant=consider_constant,
                    disconnected_inputs=disconnected_inputs).flatten()

    def hessian_row(i, y, x):
        # It is possible that the inputs are disconnected from the gradient
        # expression even if they are connected to cost; that must not be
        # an error, hence 'ignore'.
        return grad(y[i],
                    x,
                    consider_constant=consider_constant,
                    disconnected_inputs='ignore').flatten()

    if not block_diagonal:
        expr = TT.concatenate([flat_cost_grad(var) for var in wrt_vars])

    hessians = []
    for var in wrt_vars:
        assert isinstance(var, Variable), \
            "tensor.hessian expects a (list of) Variable as `wrt`"
        if block_diagonal:
            expr = flat_cost_grad(var)

        hess, updates = theano.scan(
            hessian_row,
            sequences=arange(expr.shape[0]),
            non_sequences=[expr, var])
        assert not updates, \
            ("Scan has returned a list of updates. This should not "
             "happen! Report this to theano-users (also include the "
             "script that generated the error)")
        hessians.append(hess)

    if not block_diagonal:
        return TT.concatenate(hessians, axis=1)

    from theano.gradient import format_as
    return format_as(using_list, using_tuple, hessians)
Example #37
0
    dtype=theano.config.floatX)
# Shared buffers that feed the compiled functions through `givens`:
# an int32 matrix and a floatX mask, both (max_sen_length, batch_size),
# plus a float32 scalar.
sh_s = theano.shared(
    np.zeros((params['max_sen_length'], params['batch_size']), dtype=np.int32))
sh_mask = theano.shared(
    np.zeros((params['max_sen_length'], params['batch_size']),
             dtype=theano.config.floatX))
sh_w = theano.shared(np.float32(0.0))

print "================= Compiling Theano.functions ====================== "
# substitute the symbolic inputs by the shared buffers at compile time
givens = [(sym_s, sh_s), (sym_mask, sh_mask), (sym_w, sh_w)]
import theano.gradient
# clip the gradient flowing through sym_s to [-10, 10]
s_clip = theano.gradient.grad_clip(sym_s, -10.0,
                                   10.0)  # see Graves, "Generating Sequences"
cost = l_vae.get_cost(params['keep_rate'], params['drop_out'], s_clip,
                      sym_mask, sym_w)
all_grads = theano.grad(cost, all_params)
# step_clipping also returns the step norm and the applied multiplier,
# both of which are reported by the training function below
all_grads, step_norm, multiplier = step_clipping(all_grads,
                                                 threshold=10.0,
                                                 to_zero=False)
# pick the optimizer; anything other than 'adam' / 'adagrad' falls back to adadelta
if params['opt_function'] == 'adam':
    updates, steps = adam(all_grads, all_params, learning_rate=params['lr'], beta1=params['beta1'],\
                          beta2=params['beta2'],decay_factor=1.0-params['decay_rate'] )
elif params['opt_function'] == 'adagrad':
    updates, steps = adagrad(all_grads, all_params, learning_rate=params['lr'])
else:
    updates, steps = adadelta(all_grads, all_params)

# outputs reported by the training function
outputs = [cost, step_norm] + l_vae.get_train_results()
train = theano.function([],
                        outputs + [multiplier],
                        givens=givens,
Example #38
0
 def grad_ii(i):
     """Return element ``i`` of the gradient of ``f[i]`` with respect to ``x``."""
     grad_of_fi = theano.grad(f[i], x)
     return grad_of_fi[i]
Example #39
0
    def __init__(self, inputs, outputs, cost, scopes, **option):
        """
        Compile the gradient-computation and parameter-update functions.

        Two Theano functions are built: ``self.optimize`` evaluates
        ``outputs`` and stores the (possibly clipped) gradients in shared
        buffers; ``self.update`` applies the chosen algorithm to those
        buffers and takes the hyper-parameters as keyword arguments.

        :param inputs: symbolic inputs of ``optimize``
        :param outputs: list of symbolic outputs of ``optimize``; when
            ``option["norm"]`` is true the global gradient norm is appended
            to a copy of it
        :param cost: symbolic cost differentiated w.r.t. the parameters
        :param scopes: scopes whose trainable variables are optimised when
            ``option["variables"]`` is missing or empty
        :param option: optional settings: ``variables`` (explicit parameter
            list for fine-tuning), ``algorithm`` (default ``"sgd"``),
            ``variant``, ``constraint`` (``(method, value)`` with method
            ``"value"`` or ``"norm"``), ``momentum``, ``norm`` (default
            true), ``nesterov``, ``initialize`` (values restored into the
            optimizer state) and ``nanguard``
        :raises ValueError: if ``algorithm`` is not supported
        """
        if "variables" not in option or not option["variables"]:
            # not fine-tuning: optimise every trainable variable of the scopes
            # (no regularization term is added to the cost here)
            params = [param for scope in scopes
                      for param in ops.trainable_variables(scope)]
        else:
            # fine-tuning: optimise only the caller-supplied variables
            params = option["variables"]

        grads = theano.grad(cost, params)

        # unmodified gradients, used by the NaN/Inf guard below
        gradsref = grads

        # one shared buffer per parameter holding the most recent gradient
        vec = [theano.shared(numpy.zeros_like(p.get_value())) for p in params]

        option.setdefault("algorithm", "sgd")
        option.setdefault("variant", None)
        option.setdefault("constraint", None)
        option.setdefault("momentum", False)
        option.setdefault("norm", True)
        option.setdefault("nesterov", False)
        option.setdefault("initialize", False)
        option.setdefault("nanguard", False)

        algorithm = option["algorithm"]
        variant = option["variant"]
        variant = [variant] if variant is not None else []

        if option["norm"]:
            normval = constraint.global_norm(grads)
            # copy so that the caller's list is left untouched
            outputs = outputs[:]
            outputs.append(normval)

        if option["constraint"]:
            method, value = option["constraint"]
            if method == "value":
                grads = constraint.clip_by_value(grads, value[0], value[1])
            if method == "norm":
                grads = constraint.clip_by_global_norm(grads, value)

        if option["nanguard"]:
            # when the raw gradient norm is NaN/Inf, use 0.1 * p in place of
            # every gradient
            gnorm = constraint.global_norm(gradsref)
            isnan = theano.tensor.isnan(gnorm)
            isinf = theano.tensor.isinf(gnorm)
            notfinite = theano.tensor.or_(isnan, isinf)
            grads = [theano.tensor.switch(notfinite, 0.1 * p, g)
                     for p, g in zip(params, grads)]

        if option["nesterov"]:
            # nesterov takes precedence over plain momentum
            option["momentum"] = False

        gup = []
        scan_updates = ops.get_updates()

        # append update rules
        if isinstance(scan_updates, OrderedDict):
            gup.extend(scan_updates.items())
        else:
            gup.extend(scan_updates)

        gup.extend(zip(vec, grads))

        if algorithm == "sgd":
            alpha = theano.tensor.scalar()
            hparams = [alpha]
            defaults = [("alpha", 1.0)]
            svar, pup = updates.sgd_updates(params, vec, *hparams)
        elif algorithm == "adagrad":
            alpha = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, epsilon]
            defaults = [("alpha", 1.0), ("epsilon", 1e-6)]
            svar, pup = updates.adagrad_updates(params, vec, *hparams)
        elif algorithm == "rmsprop":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon]
            defaults = [("alpha", 1e-2), ("rho", 0.99), ("epsilon", 1e-8)]
            rmsparam = hparams + variant
            svar, pup = updates.rmsprop_updates(params, vec, *rmsparam)
        elif algorithm == "rmsprop_momentum":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            momentum = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon, momentum]
            defaults = [("alpha", 1e-4), ("rho", 0.95), ("epsilon", 1e-4)]
            defaults.append(("moment", 0.9))
            svar, pup = updates.rmsprop_momentum_updates(params, vec, *hparams)
        elif algorithm == "adadelta":
            alpha = theano.tensor.scalar()
            rho = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, rho, epsilon]
            defaults = [("alpha", 1.0), ("rho", 0.95), ("epsilon", 1e-6)]
            svar, pup = updates.adadelta_updates(params, vec, *hparams)
        elif algorithm == "adam":
            alpha = theano.tensor.scalar()
            beta1 = theano.tensor.scalar()
            beta2 = theano.tensor.scalar()
            epsilon = theano.tensor.scalar()
            hparams = [alpha, beta1, beta2, epsilon]
            defaults = [("alpha", 0.001), ("beta1", 0.9), ("beta2", 0.999)]
            defaults.append(("epsilon", 1e-8))
            svar, pup = updates.adam_updates(params, vec, *hparams)
        else:
            # a bare string cannot be raised; use a real exception type
            raise ValueError("Error: %s is not supported" % (algorithm,))

        # restore variables used by optimizer
        if option["initialize"]:
            values = option["initialize"]
            for v1, v2 in zip(svar, values):
                v1.set_value(v2)

        if option["momentum"]:
            momentum = theano.tensor.scalar()
            hparams.append(momentum)
            defaults.append(("momentum", 0.9))
            pup = updates.apply_momentum(pup, params, momentum)

        if option["nesterov"]:
            # NOTE(review): this calls apply_momentum exactly like the plain
            # momentum branch -- confirm whether a nesterov flag is expected.
            momentum = theano.tensor.scalar()
            hparams.append(momentum)
            defaults.append(("momentum", 0.9))
            pup = updates.apply_momentum(pup, params, momentum)

        optimize = theano.function(inputs, outputs, updates=gup, on_unused_input='warn')
        update = theano.function(hparams, [], updates=pup, on_unused_input='warn')

        def wrapper(**option):
            # hyper-parameters are passed in the order of `defaults`, each
            # falling back to its default when not given
            values = [option.get(name, val) for name, val in defaults]
            return update(*values)

        self.optimize = optimize
        self.update = wrapper
        self.option = option
        self.algorithm = algorithm
        self.parameter = svar
Example #40
0
def test_dnn_conv_alpha_output_merge():
    """Check that merging alpha/output into the cuDNN conv ops keeps the results.

    The same three expressions (forward conv, weight gradient, input
    gradient) are compiled once with ``mode_with_gpu`` and once with the six
    ``local_dnn_*_merge`` optimizations excluded; the first graph must end
    directly in the cuDNN ops, the second must not, and both must return
    matching values.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
    out = T.ftensor4()

    b, c, f = 1, 4, 3
    ih, iw = 5, 8
    kh, kw = 2, 6
    img_val = numpy.random.random((b, c, ih, iw)).astype('float32')
    kern_val = numpy.random.random((f, c, kh, kw)).astype('float32')
    out_shape = (b, f, ih - kh + 1, iw - kw + 1)
    out_val = numpy.random.random(out_shape).astype('float32')

    conv = dnn.dnn_conv(img, kern)
    gw = theano.grad(conv.sum(), kern)
    gi = theano.grad(conv.sum(), img)

    lr = numpy.asarray(0.05, dtype='float32')

    if cuda.dnn.version() == -1:
        # Can't merge alpha with cudnn v1
        fr, wr, ir = conv + out, kern + gw, img + gi
    else:
        fr, wr, ir = lr * (conv + out), kern + lr * gw, img + lr * gi

    expected_ops = (dnn.GpuDnnConv, dnn.GpuDnnConvGradW, dnn.GpuDnnConvGradI)

    def final_op(fn, idx):
        # op that produces the input of the idx-th output node
        return fn.maker.fgraph.outputs[idx].owner.inputs[0].owner.op

    f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu)
    for idx, op_cls in enumerate(expected_ops):
        assert isinstance(final_op(f1, idx), op_cls)

    mode = mode_with_gpu
    for opt_name in ('local_dnn_conv_alpha_merge',
                     'local_dnn_convw_alpha_merge',
                     'local_dnn_convi_alpha_merge',
                     'local_dnn_conv_output_merge',
                     'local_dnn_convw_output_merge',
                     'local_dnn_convi_output_merge'):
        mode = mode.excluding(opt_name)

    f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode)
    for idx, op_cls in enumerate(expected_ops):
        assert not isinstance(final_op(f2, idx), op_cls)

    out_f1 = f1(img_val, kern_val, out_val)
    out_f2 = f2(img_val, kern_val, out_val)

    assert len(out_f1) == len(out_f2)

    for v1, v2 in zip(out_f1, out_f2):
        utt.assert_allclose(v1, v2)
    def setUp(self):
        """Build a small Keras conv model plus forward and gradient functions.

        Sets ``self.keras_version``, ``self.inp`` (random input of shape
        (10, 10, 51, 51)), ``self.keras_model``,
        ``self.keras_output_fprop_func`` and ``self.grad_func``.  The
        gradient is that of the summed first column of the second-to-last
        layer's output with respect to the model input.  Which Keras API is
        used depends on the detected version.
        """
        if hasattr(keras, '__version__'):
            self.keras_version = float(keras.__version__[0:3])
        else:
            self.keras_version = 0.2  # didn't have the __version__ tag

        self.inp = (np.random.randn(10 * 10 * 51 * 51).reshape(10, 10, 51, 51))

        model = keras.models.Sequential()
        self.keras_model = model
        conv_layers = keras.layers.convolutional
        core_layers = keras.layers.core

        model.add(conv_layers.Convolution2D(
            nb_filter=2,
            nb_row=4,
            nb_col=4,
            subsample=(2, 2),
            activation="relu",
            input_shape=(10, 51, 51)))
        if self.keras_version > 0.2:
            model.add(conv_layers.MaxPooling2D(pool_size=(4, 4),
                                               strides=(2, 2)))
            model.add(conv_layers.AveragePooling2D(pool_size=(4, 4),
                                                   strides=(2, 2)))
        else:
            print(self.keras_version)
            # There is no average pooling in version 0.2.0
            model.add(conv_layers.MaxPooling2D(pool_size=(4, 4),
                                               stride=(2, 2)))
        model.add(core_layers.Flatten())
        model.add(core_layers.Dense(output_dim=1))
        model.add(core_layers.Activation("sigmoid"))
        model.compile(loss="mse", optimizer="sgd")

        model_input = model.layers[0].input
        if self.keras_version <= 0.3:
            # older API: outputs come from get_output(False)
            self.keras_output_fprop_func = compile_func(
                [model_input], model.layers[-1].get_output(False))
            pre_activation = model.layers[-2].get_output(False)
            grad = theano.grad(theano.tensor.sum(pre_activation[:, 0]),
                               model_input)
            self.grad_func = theano.function(
                [model_input],
                grad,
                allow_input_downcast=True,
                on_unused_input='ignore')
        else:
            # newer API: the learning phase is an extra input, fixed to False
            keras_output_fprop_func = compile_func(
                [model_input, keras.backend.learning_phase()],
                model.layers[-1].output)
            self.keras_output_fprop_func =\
                lambda x: keras_output_fprop_func(x,False)
            grad = theano.grad(
                theano.tensor.sum(model.layers[-2].output[:, 0]),
                model_input)
            grad_func = theano.function(
                [model_input, keras.backend.learning_phase()],
                grad,
                allow_input_downcast=True,
                on_unused_input='ignore')
            self.grad_func = lambda x: grad_func(x, False)
Example #42
0
 def get_output_for(self, inputs, **kwargs):
     """Back-propagate a given gradient from ``layer_out`` to ``layer_in``.

     ``inputs`` unpacks into ``(gradient, layer_out, layer_in)``; the given
     gradient is declared as the known gradient at ``layer_out`` and the
     resulting gradient w.r.t. ``layer_in`` is returned.
     """
     upstream_grad, layer_out, layer_in = inputs
     return theano.grad(None, wrt=layer_in,
                        known_grads={layer_out: upstream_grad})
Example #43
0
# Generator loss: mean squared error between fake_out and c, plus the
# loss_gen_fm term defined earlier in the script.
gen_loss = lasagne.objectives.squared_error(fake_out, c).mean() + loss_gen_fm

# Disabled alternative: binary cross-entropy with a 5x weight on loss_gen_fm.
#igen_loss = lasagne.objectives.binary_crossentropy(fake_out, c).mean() + 5 * loss_gen_fm
# The string literal below is a disabled contour-penalty experiment; it is
# never executed.
'''
# adding extra penalty for MSE contour 
pred_contour = extract_contour_tensor(gen_output)
real_contour = extract_contour_tensor(GAN.input_c)

loss_gen_contour = lasagne.objectives.squared_error(pred_contour, real_contour).mean()
gen_loss += 20 * loss_gen_contour
'''
print 'loss and function setup'

#%
# Gradients of each loss with respect to its own parameter set.
gen_grads = theano.grad(gen_loss, wrt=gen_params)
critic_grads = theano.grad(critic_loss, wrt=critic_params)
# Despite the name these are not norms: each is the sum of squared gradient
# entries divided by the number of gradient tensors (no square root).
gen_grads_norm = sum(T.sum(T.square(grad))
                     for grad in gen_grads) / len(gen_grads)
critic_grads_norm = sum(T.sum(T.square(grad))
                        for grad in critic_grads) / len(critic_grads)

# RMSProp on the precomputed generator gradients.
gen_updates = lasagne.updates.rmsprop(gen_grads,
                                      gen_params,
                                      learning_rate=initial_eta)

# Disabled: running average of the generator parameters.
#gen_param_avg = [th.shared(np.cast[th.config.floatX](0.*p.get_value())) for p in gen_params]
#gen_avg_updates = [(a,a + 0.0001*(p-a)) for p,a in zip(gen_params,gen_param_avg)]
#gen_updates = gen_avg_updates

critic_updates = lasagne.updates.rmsprop(critic_grads,
Example #44
0
def test_pooling():
    """Compare cuDNN pooling against the CPU pooling implementations.

    For every pooling mode / padding combination that is not skipped:

    * the forward pass of ``max_pool_2d`` compiled with ``mode_with_gpu``
      (which must contain a ``GpuDnnPool`` node) is compared numerically
      with ``pool_2d_i2n`` compiled without the GPU;
    * the gradient is verified with ``verify_grad`` both for the CPU graph
      compiled with ``mode_with_gpu`` and for a direct ``dnn_pool`` call,
      and the resulting GPU gradient is compared with the CPU gradient.

    Raises ``SkipTest`` when cuDNN is not available.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    def graph_has(nodes, op_class):
        # True when at least one apply node in ``nodes`` runs ``op_class``.
        return any(isinstance(node.op, op_class) for node in nodes)

    x = T.ftensor4()
    pool_modes = ('max', 'average_inc_pad', 'average_exc_pad')
    paddings = ((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))
    for mode, pad in product(pool_modes, paddings):
        func = T.max if mode == 'max' else T.mean
        # Padded cases are skipped when cuDNN reports version -1 and for
        # the mean-based reference function.
        if pad != (0, 0) and (cuda.dnn.version() == -1 or func is T.mean):
            continue

        for ws, stride in product((4, 2, 5), (2, 3)):
            # Skip strides larger than the window; padding larger than the
            # stride is not implemented.
            if stride > ws or pad[0] > stride or pad[1] > stride:
                continue

            window = (ws, ws)
            steps = (stride, stride)
            # We will check that the opt introduced the cuDNN op.
            gpu_expr = max_pool_2d(x, window,
                                   st=steps,
                                   ignore_border=True,
                                   padding=pad, mode=mode)
            cpu_expr = pool_2d_i2n(x, ds=window, strides=steps,
                                   pad=pad,
                                   pool_function=func)
            cpu_mode = mode_without_gpu.including()
            cpu_mode.check_isfinite = False

            gpu_fn = theano.function([x], gpu_expr, mode=mode_with_gpu)
            assert graph_has(gpu_fn.maker.fgraph.apply_nodes,
                             cuda.dnn.GpuDnnPool)
            cpu_fn = theano.function([x], cpu_expr, mode=cpu_mode)
            assert not graph_has(cpu_fn.maker.fgraph.apply_nodes,
                                 cuda.dnn.GpuDnnPool)

            for shp in [(1, 10, 100, 100),
                        (1, 3, 99, 99),
                        (32, 1, 147, 197),
                        ]:
                data = numpy.random.normal(0, 1, shp).astype("float32")
                gpu_val = gpu_fn(data).__array__()
                cpu_val = cpu_fn(data).__array__()
                assert numpy.allclose(gpu_val, cpu_val,
                                      atol=numpy.finfo(numpy.float32).eps)

        # Test the grad
        for shp in [(1, 1, 2, 2),
                    (1, 1, 3, 3)]:
            # The sample is drawn before the skip check below, so the
            # random stream advances even for skipped shapes.
            data = numpy.random.normal(0, 1, shp).astype("float32") * 10

            grad_ws = 2
            grad_stride = 2
            if pad[0] > grad_stride or pad[1] > grad_stride:
                # Not implemented
                continue

            # This tests the CPU grad + opt + GPU implementation
            def cpu_pool(inp):
                return max_pool_2d(inp, (grad_ws, grad_ws),
                                   ignore_border=True,
                                   padding=pad, mode=mode)

            theano.tests.unittest_tools.verify_grad(
                cpu_pool, [data],
                cast_to_output_type=False,
                mode=mode_with_gpu)
            # Confirm that the opt would have inserted it.
            opt_grad_fn = theano.function(
                [x], theano.grad(cpu_pool(x).sum(), x),
                mode=mode_with_gpu)
            assert graph_has(opt_grad_fn.maker.fgraph.toposort(),
                             cuda.dnn.GpuDnnPoolGrad)

            # Test the GPU grad + GPU implementation
            def gpu_pool(inp):
                return cuda.dnn.dnn_pool(
                    inp, ws=(grad_ws, grad_ws),
                    stride=(grad_stride, grad_stride),
                    pad=pad,
                    mode=mode)

            theano.tests.unittest_tools.verify_grad(
                gpu_pool, [data],
                cast_to_output_type=False,
                mode=mode_with_gpu)
            # Confirm that we get the good op.
            gpu_grad_fn = theano.function(
                [x], theano.grad(gpu_pool(x).sum(), x),
                mode=mode_with_gpu)
            assert graph_has(gpu_grad_fn.maker.fgraph.toposort(),
                             cuda.dnn.GpuDnnPoolGrad)
            g_out = gpu_grad_fn(data)

            # Compare against the CPU result
            cpu_out = max_pool_2d(x, (grad_ws, grad_ws),
                                  padding=pad,
                                  ignore_border=True, mode=mode)
            cpu_grad_fn = theano.function(
                [x], theano.grad(cpu_out.sum(), x),
                mode=mode_without_gpu)
            expected_grad_op = (MaxPoolGrad if mode == 'max'
                                else AveragePoolGrad)
            assert graph_has(cpu_grad_fn.maker.fgraph.toposort(),
                             expected_grad_op)
            c_out = cpu_grad_fn(data)
            assert numpy.allclose(c_out, g_out)
Example #45
0
    def computeLosses(self, y, std, regMultiplier, deterministic):
        """Build the symbolic classification loss and all regularisation terms.

        Regularisation weights are read from ``self.hyperp`` with
        ``setdefault``, so any missing key is stored there with value 0.

        :param y: target labels passed to the loss function and to
            ``lasagne.utils.one_hot``
        :param std: if not None, every input gradient is multiplied by it
        :param regMultiplier: common factor applied to every enabled penalty
        :param deterministic: forwarded to ``get_output``
        :return: tuple ``(classificationLoss, totalLoss, l1Loss, l2Loss,
            logitSensLoss, logitDiffSensLoss, logitSqSensLoss, probSensLoss,
            lossSensLoss)``; all penalty expressions are built regardless of
            their weight, but only those with a truthy weight enter
            ``totalLoss``

        Input gradients are summed over axes (1, 2, 3), so the network input
        is expected to be 4-dimensional.
        """
        # Key order matters only for the insertion order into self.hyperp.
        (logitSens, logitDiffSens, logitSqSens, probSens, lossSens, l1, l2) = [
            self.hyperp.setdefault(name, 0.)
            for name in ('logitSens', 'logitDiffSens', 'logitSqSens',
                         'probSens', 'lossSens', 'l1', 'l2')]

        layers = self.classifier.layers_
        lossFunction = lasagne.objectives.categorical_crossentropy
        aggregate = T.mean  # otherwise lasagne.objectives.aggregate

        outputLayer, logitLayer, inputLayer = layers[-1], layers[-2], layers[0]
        networkInput = inputLayer.input_var
        networkOutput = get_output(outputLayer, deterministic=deterministic)
        logitOutput = get_output(logitLayer, deterministic=deterministic)
        numClasses = outputLayer.output_shape[1]

        def inputGradient(perSample, gradient=T.grad):
            # Gradient of the batch sum w.r.t. the network input, optionally
            # rescaled by ``std``.
            g = gradient(T.sum(perSample), networkInput)
            if std is not None:
                g = std * g
            return g

        def batchPenalty(elementwise):
            # Sum over the non-batch axes, then aggregate over the batch.
            return aggregate(T.sum(elementwise, axis=(1, 2, 3)))

        ######################################################################
        # Both loss variants are needed: the loss-sensitivity gradient is
        # taken from the one-hot version, while the plain version is used
        # for training because the one-hot one lacks a Theano stability
        # optimisation and leads to NaNs (per the original author's note).
        L = lossFunction(networkOutput, y)
        y_oneHot = lasagne.utils.one_hot(y, numClasses)
        L_oneHot = lossFunction(networkOutput, y_oneHot)
        ######################################################################

        classificationLoss = aggregate(L)

        allLayers = layers.values()
        l1Loss = regularization.regularize_layer_params(
            allLayers, regularization.l1)
        l2Loss = regularization.regularize_layer_params(
            layers.values(), regularization.l2)

        # Logit sensitivity: sparse (absolute) and squared saliency.
        G_logit = inputGradient(T.sum(logitOutput * y_oneHot, axis=1))
        logitSensLoss = batchPenalty(T.abs_(G_logit))
        logitSqSensLoss = batchPenalty(G_logit ** 2)

        # Probability sensitivity.
        G_prob = inputGradient(T.sum(networkOutput * y_oneHot, axis=1))
        probSensLoss = batchPenalty(T.abs_(G_prob))

        # Loss sensitivity (uses the one-hot loss, see above).
        G_loss = inputGradient(L_oneHot, gradient=theano.grad)
        lossSensLoss = batchPenalty(T.abs_(G_loss))

        ####### !!!!!!!!!!!!!!!!!!! EXPERIMENTAL !!!!!!!!!!!!!!!!!! ##########
        #### !!!! only makes sense for 2-class problems in this case !!!! ####
        # Logit-difference sensitivity: the one-hot label matrix is replaced
        # by 2 * one_hot(1, numClasses) - ones, so summing each row of the
        # weighted logits leaves a signed combination of the class logits.
        plusMinusOneMatrix = 2 * lasagne.utils.one_hot(
            1, numClasses) - T.ones_like(y_oneHot)
        G_logitDiff = inputGradient(
            T.sum(logitOutput * plusMinusOneMatrix, axis=1))
        logitDiffSensLoss = batchPenalty(T.abs_(G_logitDiff))

        # Sum up: only penalties with a truthy weight are added.
        totalLoss = classificationLoss
        weightedPenalties = (
            (l1, l1Loss),
            (l2, l2Loss),
            (logitSens, logitSensLoss),
            (logitDiffSens, logitDiffSensLoss),
            (logitSqSens, logitSqSensLoss),
            (probSens, probSensLoss),
            (lossSens, lossSensLoss),
        )
        for weight, penalty in weightedPenalties:
            if weight:
                totalLoss = totalLoss + regMultiplier * weight * penalty

        return (classificationLoss, totalLoss, l1Loss, l2Loss, logitSensLoss,
                logitDiffSensLoss, logitSqSensLoss, probSensLoss, lossSensLoss)
Example #46
0
def modifiedObjective(layers,
                      loss_function,
                      target,
                      aggregate=aggregate,
                      deterministic=False,
                      l1=0,
                      l2=0,
                      logitSens=0,
                      probSens=0,
                      lossSens=0,
                      std=None,
                      get_output_kw=None):
    """
    Modified implementation of the NeuralNet objective.

    The base loss is ``aggregate(loss_function(output, one_hot(target)))``.
    Each optional penalty is added only when its weight is truthy. The
    sensitivity penalties take the gradient of a per-sample quantity with
    respect to the network input, optionally scale it by ``std``, take the
    absolute value, sum over axes (1, 2, 3) (so a 4-dimensional input is
    expected) and aggregate over the batch.

    :param layers: The underlying layers of the NeuralNetwork; indexed with
                   0, -1 and -2 and queried with ``.values()``
    :param loss_function: The callable loss function to use
    :param target: the expected output (converted to one-hot internally)
    :param aggregate: the aggregation function to use
    :param deterministic: Whether or not to get a deterministic output
    :param l1: Optional l1 regularization parameter
    :param l2: Optional l2 regularization parameter
    :param logitSens: Optional logit sensitivity regularization parameter
                      (gradient of the target-class output of ``layers[-2]``)
    :param probSens: Optional probability sensitivity regularization
                     parameter (gradient of the target-class network output)
    :param lossSens: Optional loss sensitivity regularization parameter
                     (gradient of the un-aggregated loss)
    :param std: Optional factor multiplied into every input gradient before
                the absolute value is taken
    :param get_output_kw: optional kwargs to pass to
                          :meth:`NeuralNetwork.get_output`
    :return: The total calculated loss
    """
    if get_output_kw is None:
        get_output_kw = {}
    output_layer = layers[-1]
    logit_layer = layers[-2]
    input_layer = layers[0]
    network_input = input_layer.input_var
    network_output = get_output(output_layer,
                                deterministic=deterministic,
                                **get_output_kw)
    logit_output = get_output(logit_layer,
                              deterministic=deterministic,
                              **get_output_kw)

    # Built once and shared by the loss and both class-selecting penalties.
    one_hot_target = lasagne.utils.one_hot(target,
                                           output_layer.output_shape[1])

    def sparse_input_sensitivity(per_sample):
        # Aggregated L1 norm of d(sum(per_sample)) / d(network_input).
        grad = theano.grad(T.sum(per_sample), network_input)
        if std is not None:
            grad = std * grad
        return aggregate(T.sum(T.abs_(grad), axis=(1, 2, 3)))

    L = loss_function(network_output, one_hot_target)
    loss = aggregate(L)

    if l1:
        loss += regularization.regularize_layer_params(layers.values(),
                                                       regularization.l1) * l1

    if l2:
        loss += regularization.regularize_layer_params(layers.values(),
                                                       regularization.l2) * l2

    # logit sensitivity
    if logitSens:
        logit = T.sum(logit_output * one_hot_target, axis=1)
        loss += sparse_input_sensitivity(logit) * logitSens

    # probability sensitivity
    if probSens:
        prob = T.sum(network_output * one_hot_target, axis=1)
        loss += sparse_input_sensitivity(prob) * probSens

    # Loss sensitivity
    if lossSens:
        loss += sparse_input_sensitivity(L) * lossSens

    return loss
Example #47
0
def set_network_trainer(input_data,
                        input_mask,
                        target_data,
                        target_mask,
                        num_outputs,
                        network,
                        updater,
                        learning_rate,
                        grad_max_norm=10.,
                        l2_lambda=1e-5,
                        load_updater_params=None):
    """Compile the Theano training function for ``network``.

    The training cost is the masked negative log-likelihood of the one-hot
    targets under a log-softmax of the network output (computed by hand
    with max subtraction), divided by the leading dimension of the network
    output, plus an L2 penalty weighted by ``l2_lambda``.

    :param input_data, input_mask, target_data, target_mask: symbolic
        variables used as the inputs of the compiled function
    :param num_outputs: number of classes (last dimension after reshaping)
    :param network: output layer passed to ``get_output`` /
        ``get_all_params``
    :param updater: callable returning ``(updates, updater_params)``; it is
        called with ``loss_or_grads``, ``params``, ``learning_rate`` and
        ``load_params_dict``
    :param learning_rate: initial value of the shared learning-rate variable
    :param grad_max_norm: if greater than 0, gradients are rescaled with
        ``total_norm_constraint``; otherwise only their norm is computed
    :param l2_lambda: weight of the L2 regulariser
    :param load_updater_params: forwarded to ``updater``
    :return: ``(training_fn, trainer_params)`` where ``training_fn`` outputs
        the per-frame cost (masked cost divided by the mask sum) and the
        gradient norm
    """
    # one-hot encoding of the flattened targets
    target_one_hot = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                            nb_class=num_outputs,
                                            dtype=floatX)

    # raw network output; its leading dimension normalises the model cost
    raw_output = get_output(network, deterministic=False)
    num_seqs = raw_output.shape[0]

    # numerically stable log-softmax over the class axis
    scores = T.reshape(x=raw_output,
                       newshape=(-1, num_outputs),
                       ndim=2)
    shifted = scores - T.max(scores, axis=-1, keepdims=True)
    log_norm = T.log(T.sum(T.exp(shifted), axis=-1, keepdims=True))
    log_probs = shifted - log_norm

    # masked negative log-likelihood per flattened position
    position_nll = -T.sum(T.mul(target_one_hot, log_probs), axis=-1)
    masked_nll = position_nll * T.flatten(target_mask, 1)
    train_model_cost = masked_nll.sum() / num_seqs
    train_frame_cost = masked_nll.sum() / target_mask.sum()

    # L2 regulariser over the network parameters
    train_regularizer_cost = regularize_network_params(
        network, penalty=l2) * l2_lambda

    # gradients of the regularised cost w.r.t. the trainable parameters
    network_params = get_all_params(network, trainable=True)
    objective = train_model_cost + train_regularizer_cost
    network_grads = theano.grad(cost=objective, wrt=network_params)

    if grad_max_norm > 0.:
        network_grads, network_grads_norm = total_norm_constraint(
            tensor_vars=network_grads,
            max_norm=grad_max_norm,
            return_norm=True)
    else:
        squared_sums = (T.sum(g ** 2) for g in network_grads)
        network_grads_norm = T.sqrt(sum(squared_sums))

    # the learning rate lives in a shared variable handed to the updater
    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, trainer_params = updater(
        loss_or_grads=network_grads,
        params=network_params,
        learning_rate=train_lr,
        load_params_dict=load_updater_params)

    # compile the training (update) function
    fn_inputs = [input_data, input_mask, target_data, target_mask]
    fn_outputs = [train_frame_cost, network_grads_norm]
    training_fn = theano.function(inputs=fn_inputs,
                                  outputs=fn_outputs,
                                  updates=train_updates)
    return training_fn, trainer_params
Example #48
0
        (onefeatureplane.dimshuffle('x', 0, 1) * featuremap).sum(2).sum(1),
        outputs_info=None,
        sequences=[expression[0, :, :, :]],
        non_sequences=expression[0, :, :, :])

    layer_style_cost.append(
        ((grammian_testimage - grammian_original)**2).sum() /
        (2 * (styleimage_layer.shape[2] * styleimage_layer.shape[3])**2 *
         (styleimage_layer.shape[1])**2))
    #layer_style_cost_function.append(theano.function([input_var], layer_style_cost[-1]))

    #DEFINE TOTAL COST AS WEIGHTED SUM OF CONTENT AND STYLE COST
    totalcost += contentweights[layerindex] * layer_content_cost[
        layerindex] + styleweights[layerindex] * layer_style_cost[layerindex]

# Symbolic gradient of the accumulated total cost with respect to `input_var`.
totalgrad = theano.grad(totalcost, input_var)

#COMPILE THEANO FUNCTIONS:
# `cost` evaluates the total cost and `grad` evaluates its gradient for a
# given value of `input_var`.
cost = theano.function([input_var], totalcost)
grad = theano.function([input_var], totalgrad)


#CONJGRAD BASED OPTIMIZATION FOR POTENTIALLY FASTER OPTIMIZATION (REQUIRES minimize.py):
def conjgrad(im, maxnumlinesearch=10, imshape=styleimage.shape):
    import minimize
    im_flat, fs, numlinesearches = minimize.minimize(
        im.flatten(),
        lambda x: cost(x.reshape(imshape)),
        lambda x: grad(x.reshape(imshape)).flatten(),
        args=[],
        maxnumlinesearch=maxnumlinesearch,
Example #49
0
    name = layer.__class__.__name__
    num_param = sum([np.prod(p.get_value().shape) for p in layer.get_params()])
    num_param = num_param.__str__()
    print('    %s %s %s' % (name, num_param, layer.output_shape))

# Targets: every column of `x` except the first, flattened to one int32 vector.
y = T.cast(T.flatten(x[:, 1:]), 'int32')
# training loss: masked log-probability of each target, summed per row and
# averaged over rows, negated
p1 = T.reshape(T.log(predictions[T.arange(y.shape[0]), y]), mask.shape)
loss = -1. * T.mean(T.sum(mask * p1, axis=1), axis=0)

# validation loss (with disabled dropout)
p1_det = T.reshape(T.log(predictions_det[T.arange(y.shape[0]), y]), mask.shape)
loss_det = -1. * T.mean(T.sum(mask * p1_det, axis=1), axis=0)

# The learning rate is kept in a shared variable and that variable (not the
# raw config value) is handed to the optimiser, so a later
# `learning_rate.set_value(...)` actually changes the step size.
learning_rate = theano.shared(np.float32(config.learning_rate))
grads = theano.grad(loss, all_params)
updates = nn.updates.rmsprop(grads, all_params, learning_rate)

# `train` applies the parameter updates and returns the training loss;
# `validate` only evaluates the deterministic loss.
train = theano.function([x, mask], loss, updates=updates)
validate = theano.function([x, mask], loss_det)


def create_batch(idxs):
    """Assemble a zero-padded batch and its mask for the tunes in ``idxs``.

    Both arrays always have ``config.batch_size`` rows. ``x`` is as wide as
    the longest selected tune; ``mask`` is one column narrower and holds 1
    in the first ``tune_lens[j] - 1`` positions of each filled row.

    Returns ``(x, mask)`` as float32 arrays.
    """
    longest = max([len(tunes[k]) for k in idxs])
    batch = np.zeros((config.batch_size, longest), dtype='float32')
    batch_mask = np.zeros((config.batch_size, longest - 1), dtype='float32')
    for row, tune_idx in enumerate(idxs):
        batch[row, :tune_lens[tune_idx]] = tunes[tune_idx]
        batch_mask[row, :tune_lens[tune_idx] - 1] = 1
    return batch, batch_mask
Example #50
0
 def train_function(self, semi_supervised=True, unlabel_stable=False):
     '''
     use_unlabel == True, semi-superviesd learning
     return: train function for 1 epoch use
     '''
     self.semi_supervised = semi_supervised
     sym_klw = T.scalar(
         'sym_klw',
         dtype=theano.config.floatX)  # symbolic scalar of warming up
     sym_cw = T.scalar('sym_cw',
                       dtype=theano.config.floatX)  # classifier warm up
     sym_s = T.matrix('sym_s', dtype='int64')
     sym_mask = T.matrix('sym_mask', dtype=theano.config.floatX)
     sym_y = T.matrix('sym_label', dtype=theano.config.floatX)
     sym_s_u = T.matrix('sym_s_u', dtype='int64')
     sym_mask_u = T.matrix('sym_mask_u', dtype=theano.config.floatX)
     num_l, num_u = sym_s.shape[0].astype(theano.config.floatX), 0.0
     if self.semi_supervised:
         print 'Train with unlabel data.'
         num_u = sym_s_u.shape[0].astype(theano.config.floatX)
     #get labeled/unlabeled cost
     outs1 = self.cost_label([sym_s, sym_mask, sym_y],
                             dev_stage=False,
                             return_mode='mean')
     loss_recons, loss_kl, valid_words, word_drop_num, loss_classifier, batch_ppl, acc = outs1
     loss_recons_u, loss_kl_u, loss_entropy_u, batch_ppl_u = 0.0, 0.0, 0.0, 0.0
     valid_words_u = 0
     if self.semi_supervised:
         outs2 = self.cost_unlabel([sym_s_u, sym_mask_u],
                                   dev_stage=unlabel_stable,
                                   sample_by_prob=self.sample_unlabel)
         loss_recons_u, loss_kl_u, valid_words_u, loss_entropy_u, batch_ppl_u = outs2
     '''
     total Loss:
     L = Loss_labeled(s,mask,y) + beta*(n_l+n_u)/n_l * Loss_classisifer(s,mask,y)
         + Loss_unlabel(s_u, mask_u)
     L = recons_term + sym_klw_term + loss_classifier_term - loss_entropy_u
     '''
     alpha = sym_cw * self.cost_beta * (num_l + num_u) / num_l
     total_cost = loss_recons * num_l + loss_recons_u * num_u\
                  + sym_klw * ( loss_kl * num_l + loss_kl_u * num_u)\
                  + alpha * loss_classifier * num_l\
                  - loss_entropy_u * num_u
     total_cost /= (num_l + num_u)
     train_params = self.get_params(only_trainable=True)
     all_grads = theano.grad(total_cost, train_params)
     all_grads = [
         T.clip(g, -self.grad_clipping, self.grad_clipping)
         for g in all_grads
     ]
     all_grads = total_norm_constraint(all_grads, max_norm=self.max_norm)
     #all_grads = [T.clip(g, -self.grad_clipping, self.grad_clipping) for g in all_grads]
     updates = adam(all_grads, train_params, self.lr, self.beta1,
                    self.beta2)
     if self.semi_supervised:
         train_input = [
             sym_s, sym_mask, sym_y, sym_s_u, sym_mask_u, sym_klw, sym_cw
         ]
         train_output = [
             total_cost, loss_recons, loss_recons_u, loss_kl, loss_kl_u,
             alpha, loss_classifier, loss_entropy_u, batch_ppl, batch_ppl_u,
             valid_words, valid_words_u, word_drop_num, acc
         ]
     else:
         train_input = [sym_s, sym_mask, sym_y, sym_klw, sym_cw]
         train_output = [
             total_cost, loss_recons, loss_kl, loss_classifier, batch_ppl,
             valid_words, word_drop_num, acc
         ]
     train_f = theano.function(inputs=train_input,
                               outputs=train_output,
                               updates=updates,
                               name='train_function')
     return train_f
Example #51
0
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
snap_dist_info_vars = snap_policy.dist_info_sym(observations_var)

# Surrogate objective: the negated log-likelihood term weighted by
# `d_rewards_var`, summed to a scalar.
surr = TT.sum(
    -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
    d_rewards_var)

# Trainable parameters of the current policy and of the snapshot policy.
params = policy.get_params(trainable=True)
snap_params = snap_policy.get_params(trainable=True)

# Symbolic likelihood ratio built from the snapshot and current distributions.
# NOTE(review): this expression is not used in the visible statements below;
# `surr_on1` uses `importance_weights_var` instead - confirm against the
# rest of the file.
importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(
    actions_var, snap_dist_info_vars, dist_info_vars)

# Gradient of the surrogate w.r.t. each policy parameter (one entry per parameter).
grad = theano.grad(surr, params)

# Symbolic placeholders whose dtypes follow the first four gradient entries.
# This hard-codes four parameters with matrix / vector / col / vector shapes.
# NOTE(review): the Theano names of the first two placeholders
# ('eval_grad0', 'eval_grad1') do not match their Python names - confirm
# whether that is intentional.
eval_grad1 = TT.matrix('eval_grad0', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad1', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)

# Same surrogate as `surr`, additionally weighted by `importance_weights_var`.
surr_on1 = TT.sum(
    -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
    d_rewards_var * importance_weights_var)
# Snapshot-policy counterpart; note it is NOT negated and uses `snap_dist`.
surr_on2 = TT.sum(
    snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars)
    * d_rewards_var)
grad_SVRG = [
    sum(x) for x in zip([eval_grad1, eval_grad2, eval_grad3, eval_grad4],
                        theano.grad(surr_on1, params),
Example #52
0
    def _setup(self):

        self.all_params = []

        self.all_conv_results = []
        self.all_conv_pool_results = []
        self.all_conv_names = []

        self.x_document_input = T.imatrix(
            'x_doc')  # words from the source document

        self.x_document_id = T.ivector(
            'x_doc_id')  # index of which source doucment this is from
        self.x_surface_text_input = T.imatrix(
            'x_surface_link')  # text of the surface link
        self.x_surface_context_input = T.imatrix(
            'x_surface_cxt')  #  words surrounding the surface link

        self.x_target_input = T.ivector('x_target')  # id of the target vector
        self.x_target_words = T.imatrix(
            'x_target_words')  # words from the target title link
        self.x_matches_surface = T.ivector(
            'x_match_surface'
        )  # indicator if the target title matches the surface
        self.x_matches_counts = T.imatrix(
            'x_matches_counts')  # info about the link counts
        self.x_target_document_words = T.imatrix(
            'x_target_document_words'
        )  # words from the body of target document
        self.x_link_id = T.ivector(
            'x_link_id')  # indx of what link to compare to in the matrix

        self.x_denotaiton_features = T.matrix(
            'x_denotation_ind_feats',
            dtype='int8')  # the joint denotation query features
        self.x_query_featurs = T.matrix('x_query_ind_feats',
                                        dtype='int8')  # the query features
        self.x_query_link_id = T.ivector(
            'x_match_query')  # the query that a denotation links to
        self.x_denotation_ranges = T.imatrix(
            'x_denotation_ranges'
        )  # the range of joint denotations to sum over

        self.x_target_link_id = T.ivector(
            'x_match_target'
        )  # the target document that maches with a given denotation

        self.y_isgold = T.vector(
            'y_gold', dtype='int8')  # is 1 if the gold item, 0 otherwise
        self.y_grouping = T.imatrix(
            'y_grouping')  # matrix containing [start_idx, end_idx, gold_idx]

        self.embedding_W = theano.shared(
            self.wordvecs.get_numpy_matrix().astype(theano.config.floatX),
            name='embedding_W')
        self.embedding_W_docs = theano.shared(
            self.documentvecs.get_numpy_matrix().astype(theano.config.floatX),
            name='embedding_W_docs')

        def augRectify(x):
            # if x is zero, then the gradient failes due to computation: x / |x|
            return T.maximum(x, -.01 * x)

        simpleConvNonLin = augRectify

        self.document_l = lasagne.layers.InputLayer(
            (None, self.document_length), input_var=self.x_document_input)

        self.document_embedding_l = EmbeddingLayer(
            self.document_l,
            W=self.embedding_W,
            add_word_params=self.enable_train_wordvecs,
        )

        self.document_simple_conv1_l = lasagne.layers.Conv2DLayer(
            self.document_embedding_l,
            num_filters=self.dim_compared_vec,
            filter_size=(self.num_words_to_use_conv,
                         self.wordvecs.vector_size),
            name='document_simple_conv',
            nonlinearity=simpleConvNonLin,
        )

        self.document_simple_sum_l = lasagne.layers.Pool2DLayer(
            self.document_simple_conv1_l,
            name='document_simple_pool',
            pool_size=(self.document_length - self.num_words_to_use_conv, 1),
            mode='sum',
        )

        self.all_conv_pool_results.append(
            lasagne.layers.get_output(self.document_simple_sum_l))

        self.document_output = lasagne.layers.get_output(
            lasagne.layers.reshape(self.document_simple_sum_l, ([0], -1)))

        self.all_params += lasagne.layers.get_all_params(
            self.document_simple_sum_l)

        ##########################################
        ## surface text

        self.surface_context_l = lasagne.layers.InputLayer(
            (None, self.sentence_length),
            input_var=self.x_surface_context_input,
        )

        self.surface_context_embedding_l = EmbeddingLayer(
            self.surface_context_l,
            W=self.embedding_W,
            add_word_params=self.enable_train_wordvecs,
        )

        self.surface_context_conv1_l = lasagne.layers.Conv2DLayer(
            self.surface_context_embedding_l,
            num_filters=self.dim_compared_vec,
            filter_size=(self.num_words_to_use_conv,
                         self.wordvecs.vector_size),
            name='surface_cxt_conv1',
            nonlinearity=simpleConvNonLin,
        )

        self.surface_context_pool1_l = lasagne.layers.Pool2DLayer(
            self.surface_context_conv1_l,
            name='surface_cxt_pool1',
            pool_size=(self.sentence_length - self.num_words_to_use_conv, 1),
            mode='sum',  # WAS 'MAX' FOR SOME REASON
        )

        self.all_conv_pool_results.append(
            lasagne.layers.get_output(self.surface_context_pool1_l))

        self.surface_output = lasagne.layers.get_output(
            lasagne.layers.reshape(self.surface_context_pool1_l, ([0], -1)))

        self.all_params += lasagne.layers.get_all_params(
            self.surface_context_pool1_l)

        self.surface_input_l = lasagne.layers.InputLayer(
            (None, self.sentence_length_short),
            input_var=self.x_surface_text_input)

        self.surface_embedding_l = EmbeddingLayer(
            self.surface_input_l,
            W=self.embedding_W,
            add_word_params=self.enable_train_wordvecs,
        )

        self.surface_conv1_l = lasagne.layers.Conv2DLayer(
            self.surface_embedding_l,
            num_filters=self.dim_compared_vec,
            filter_size=(self.num_words_to_use_conv,
                         self.wordvecs.vector_size),
            name='surface_conv1',
            nonlinearity=simpleConvNonLin,
        )

        self.surface_pool1_l = lasagne.layers.Pool2DLayer(
            self.surface_conv1_l,
            name='surface_pool1',
            pool_size=(self.sentence_length_short - self.num_words_to_use_conv,
                       1),
            mode='sum',
        )

        self.all_conv_pool_results.append(
            lasagne.layers.get_output(self.surface_pool1_l))

        self.surface_words_output = lasagne.layers.get_output(
            lasagne.layers.reshape(self.surface_pool1_l, ([0], -1)))

        self.all_params += lasagne.layers.get_all_params(self.surface_pool1_l)

        ###################################################
        ## dealing with the target side

        # matched_surface_reshaped = self.x_matches_surface.reshape(
        #     (self.x_matches_surface.shape[0], 1, 1, 1)).astype(theano.config.floatX)

        self.target_input_l = lasagne.layers.InputLayer(
            (None, ), input_var=self.x_target_input)

        #################################
        ## target indicators features

        ## these have been replaced with the indicatores as provided by the scala system
        # self.target_matched_surface_input_l = lasagne.layers.InputLayer(
        #     (None,1,1,1),
        #     input_var=matched_surface_reshaped,
        # )

        # self.target_matched_counts_input_l = lasagne.layers.InputLayer(
        #     (None,5),
        #     input_var=self.x_matches_counts.astype(theano.config.floatX),
        # )

        # words from the title of the target
        self.target_words_input_l = lasagne.layers.InputLayer(
            (None, self.sentence_length_short),
            input_var=self.x_target_words,
        )

        self.target_words_embedding_l = EmbeddingLayer(
            self.target_words_input_l,
            W=self.embedding_W,
            add_word_params=self.enable_train_wordvecs,
        )

        self.target_words_conv1_l = lasagne.layers.Conv2DLayer(
            self.target_words_embedding_l,
            name='target_wrds_conv1',
            filter_size=(self.num_words_to_use_conv,
                         self.wordvecs.vector_size),
            num_filters=self.dim_compared_vec,
            nonlinearity=simpleConvNonLin,
        )

        self.target_words_pool1_l = lasagne.layers.Pool2DLayer(
            self.target_words_conv1_l,
            name='target_wrds_pool1',
            pool_size=(self.sentence_length_short - self.num_words_to_use_conv,
                       1),
            mode='sum',
        )

        self.all_conv_pool_results.append(
            lasagne.layers.get_output(self.target_words_pool1_l))

        self.target_title_out = lasagne.layers.get_output(
            lasagne.layers.reshape(self.target_words_pool1_l, ([0], -1)))

        self.all_params += lasagne.layers.get_all_params(
            self.target_words_pool1_l)

        # words from the body of the target
        self.target_body_words_input_l = lasagne.layers.InputLayer(
            (None, self.sentence_length),
            input_var=self.x_target_document_words,
        )

        self.target_body_words_embedding_l = EmbeddingLayer(
            self.target_body_words_input_l,
            W=self.embedding_W,
            add_word_params=self.enable_train_wordvecs,
        )

        self.target_body_simple_conv1_l = lasagne.layers.Conv2DLayer(
            self.target_body_words_embedding_l,
            name='target_body_simple_conv',
            filter_size=(self.num_words_to_use_conv,
                         self.wordvecs.vector_size),
            num_filters=self.dim_compared_vec,
            nonlinearity=simpleConvNonLin,
        )

        self.target_body_simple_sum_l = lasagne.layers.Pool2DLayer(
            self.target_body_simple_conv1_l,
            name='target_body_simple_sum',
            pool_size=(self.sentence_length - self.num_words_to_use_conv, 1),
            mode='sum',
        )

        self.all_conv_pool_results.append(
            lasagne.layers.get_output(self.target_body_simple_sum_l))

        self.target_out = lasagne.layers.get_output(
            lasagne.layers.reshape(self.target_body_simple_sum_l, ([0], -1)))

        self.all_params += lasagne.layers.get_all_params(
            self.target_body_simple_sum_l)

        #########################################################
        ## compute the cosine distance between the two layers

        # there are going to be multiple entity links per document, so we have the `_id` ivectors that represent how
        # we need to reshuffle the inputs; this saves on computation

        # source body
        self.source_aligned_l = self.document_output[self.x_document_id, :][
            self.x_link_id, :]
        # source context
        self.source_context_aligned_l = self.surface_output[self.x_link_id, :]
        # source surface words
        self.source_surface_words_aligned_l = self.surface_words_output[
            self.x_link_id, :]

        def augNorm(v):
            """Return the L2 norm of ``v`` along axis 1 with .001 added under the root.

            The additive constant keeps the result strictly positive, so it
            can be used as a divisor even when a row is all zeros.
            """
            squared = T.basic.pow(T.basic.abs_(v), 2)
            smoothed = squared.sum(axis=1) + .001
            return T.basic.pow(smoothed, .5)

        def cosinsim(a, b):
            """Cosine similarity of ``a`` and ``b`` via ``T.batched_dot``.

            The denominator uses ``augNorm``, so it never reaches zero.
            """
            norm_product = augNorm(a) * augNorm(b)
            return T.batched_dot(a, b) / norm_product

        def comparedVLayers(a, b):
            """Wrap the cosine similarity of ``a`` and ``b`` in an ``InputLayer``.

            The similarity is reshaped to one column so that several such
            layers can be concatenated along axis 1.
            """
            similarity = cosinsim(a, b)
            as_column = similarity.reshape((similarity.shape[0], 1))
            return lasagne.layers.InputLayer((None, 1), input_var=as_column)

        self.cosine_conv_layers = []

        for i, l in enumerate([
                comparedVLayers(self.target_out, self.source_aligned_l),
                comparedVLayers(self.target_out,
                                self.source_context_aligned_l),
                comparedVLayers(self.target_out,
                                self.source_surface_words_aligned_l),
                comparedVLayers(self.target_title_out, self.source_aligned_l),
                comparedVLayers(self.target_title_out,
                                self.source_context_aligned_l),
                comparedVLayers(self.target_title_out,
                                self.source_surface_words_aligned_l),
        ]):
            if i not in disable_convs:
                self.cosine_conv_layers.append(l)

        if len(self.cosine_conv_layers) != 0:
            self.cosine_combined = lasagne.layers.concat(
                self.cosine_conv_layers, axis=1)

            self.cosine_weighted = lasagne.layers.DenseLayer(
                self.cosine_combined,
                name='cosine_dens1',
                num_units=1,
                b=None,
                nonlinearity=lasagne.nonlinearities.linear,
            )

            # encourage these weights to be positive
            self.cosine_weighted.W.get_value(borrow=True)[:] += 1

            self.cosine_output = lasagne.layers.get_output(
                lasagne.layers.reshape(self.cosine_weighted, (-1, )))

            self.all_params += lasagne.layers.get_all_params(
                self.cosine_weighted)

            self.aligned_cosine = self.cosine_output[self.x_target_link_id]

        ######################################################
        ## indicator feature input

        self.query_feat_l = lasagne.layers.InputLayer(
            (None, self.num_indicator_features),
            input_var=self.x_query_featurs,
        )

        #rank_feats = [f[0] for f in enumerate(featuresNames) if f[1].startswith('Rank=')]

        self.denotation_join_feat_l = lasagne.layers.InputLayer(
            (None, self.num_indicator_features),
            input_var=self.x_denotaiton_features,  #[:, rank_feats],
        )

        ## the query and denotation features are now combined when input into the same denotation vector

        # self.query_layer_l = lasagne.layers.DenseLayer(
        #     self.query_feat_l,
        #     name='query_lin',
        #     num_units=1,
        #     nonlinearity=lasagne.nonlinearities.linear,
        # )

        # self.query_output = lasagne.layers.get_output(
        #     lasagne.layers.reshape(self.query_layer_l, (-1,))
        # )

        # self.all_params += lasagne.layers.get_all_params(self.query_layer_l)

        # self.aligned_queries = self.query_output[self.x_query_link_id]

        self.denotation_layer_l = lasagne.layers.DenseLayer(
            self.denotation_join_feat_l,
            name='denotation_lin',
            num_units=1,
            nonlinearity=lasagne.nonlinearities.linear,
            #W=self.query_layer_l.W,
        )

        self.denotation_output = lasagne.layers.get_output(
            lasagne.layers.reshape(self.denotation_layer_l, (-1, )))

        self.all_params += lasagne.layers.get_all_params(
            self.denotation_layer_l)

        ###########################
        ## multiply the two parts of the join scores

        self.unmerged_scores = (
            (  #(self.aligned_queries) +
                (self.denotation_output if 1000 not in disable_convs else 0)) +
            (self.aligned_cosine if len(self.cosine_conv_layers) != 0 else 0))

        #############################################
        ## normalizing the scores and recombining
        ## the output if there were multiple entries
        ## for the same target document
        #############################################

        def sloppyMathLogSum(vals):
            """Return log(sum(exp(vals))), shifted by the maximum of ``vals``.

            Subtracting the maximum before exponentiating keeps the largest
            exponent at zero; the maximum is added back after the log.
            """
            peak = vals.max()
            shifted_total = T.exp(vals - peak).sum()
            return T.log(shifted_total) + peak

        def mergingSum(indx, unmerged):
            """Scan step: log-sum-exp of ``unmerged`` over ``[indx[0], indx[1])``."""
            start, stop = indx[0], indx[1]
            members = T.arange(start, stop)
            return sloppyMathLogSum(unmerged[members])

        self.merged_scores, _ = theano.scan(
            mergingSum,
            sequences=[self.x_denotation_ranges],
            non_sequences=[self.unmerged_scores])

        ########################################
        ## true output values
        ########################################

        self.unscaled_output = self.merged_scores

        def scaleRes(indx, outputs, res):
            """Scan step: normalize one group of scores in log space.

            The entries of ``res`` in ``[indx[0], indx[1])`` have their
            log-sum-exp subtracted and are written into the same positions of
            ``outputs``; the updated ``outputs`` tensor is returned.
            """
            members = T.arange(indx[0], indx[1])
            group = res[members]
            log_total = sloppyMathLogSum(group)
            return T.set_subtensor(outputs[members], group - log_total)

        self.scaled_scores, _ = theano.scan(
            scaleRes,
            sequences=[self.y_grouping],
            non_sequences=[self.unscaled_output],
            outputs_info=T.zeros((self.unscaled_output.shape[0], )))

        self.true_output = self.scaled_scores[-1]

        ############################
        ## compute the loss
        ############################

        def lossSum(indx, res):
            """Scan step: log-sum-exp of ``res`` over ``[indx[0], indx[1])``."""
            start, stop = indx[0], indx[1]
            members = T.arange(start, stop)
            return sloppyMathLogSum(res[members])

        self.groupped_res, _ = theano.scan(
            lossSum,
            sequences=[self.y_grouping],
            non_sequences=[self.true_output],
        )

        def selectGolds(indx, res, golds):
            """Scan step: log-sum-exp over the gold entries of one group.

            ``indx`` holds the ``[start, stop)`` bounds of the group.  Each
            score is weighted by its gold flag and the complement of the flag
            is multiplied by -1000000, so (assuming 0/1 flags -- TODO confirm)
            non-gold entries contribute approximately nothing to the sum.
            """
            members = T.arange(indx[0], indx[1])
            # Works around an apparent Theano issue: the gold flags come
            # straight from the input, so there should be no need to
            # disconnect their gradient, yet it is done explicitly here.
            gold_mask = theano.gradient.disconnected_grad(golds[members])
            kept = gold_mask * res[members]
            suppressed = (1 - gold_mask) * -1000000  # approx 0 after exp()
            return sloppyMathLogSum(kept + suppressed)

        self.gold_res, _ = theano.scan(
            selectGolds,
            sequences=[self.y_grouping],
            non_sequences=[self.true_output, self.y_isgold],
        )

        self.loss_vec = self.groupped_res - self.gold_res

        self.loss_scalar = self.loss_vec.sum()

        self.updates = lasagne.updates.adadelta(
            theano.grad(self.loss_scalar / self.loss_vec.shape[0],
                        self.all_params,
                        disconnected_inputs='warn'), self.all_params)

        self.func_inputs = [
            self.x_document_input,
            self.x_surface_text_input,
            self.x_surface_context_input,
            self.x_document_id,
            self.x_target_input,
            self.x_matches_surface,
            self.x_matches_counts,
            self.x_link_id,
            self.x_target_words,
            self.x_target_document_words,
            self.x_denotaiton_features,
            self.x_query_featurs,
            self.x_query_link_id,
            self.x_denotation_ranges,
            self.x_target_link_id,
            self.y_grouping,
            self.y_isgold,
        ]

        self.func_outputs = [
            self.true_output,
            self.loss_vec.sum(),
            self.loss_scalar,
            self.loss_vec,
            #self.res_l,
        ]

        dsc_out = lasagne.layers.get_output(self.document_simple_conv1_l)
        scc_out = lasagne.layers.get_output(self.surface_context_conv1_l)
        sc_out = lasagne.layers.get_output(self.surface_conv1_l)

        ttc_out = lasagne.layers.get_output(self.target_words_conv1_l)
        tbc_out = lasagne.layers.get_output(self.target_body_simple_conv1_l)

        # def cmp_convs(input, against):
        #     #T.dot(

        self.all_conv_names.append('document_conv')
        self.all_conv_results.append(
            dsc_out
        )  #cmp_convs(dsc_out[self.x_document_id][self.x_link_id], [ttc_out, tbc_out]))

        self.all_conv_names.append('surface_context_conv')
        self.all_conv_results.append(
            scc_out)  #cmp_convs(scc_out[self.x_link_id], [ttc_outp, tbc_out]))

        self.all_conv_names.append('surface_conv')
        self.all_conv_results.append(sc_out)

        self.all_conv_names.append('target_title_conv')
        self.all_conv_results.append(ttc_out)

        self.all_conv_names.append('target_body_conv')
        self.all_conv_results.append(tbc_out)

        self.train_func = theano.function(
            self.func_inputs,
            self.func_outputs,
            updates=self.updates,
            on_unused_input='ignore',
        )

        self.test_func = theano.function(
            self.func_inputs,
            self.func_outputs,
            on_unused_input='ignore',
        )

        self.find_conv_active_func = theano.function(
            self.func_inputs,
            self.all_conv_results,
            on_unused_input='ignore',
        )
Example #53
0
    def build_model(self, Dir_features, args):
        """Compile the Theano training and evaluation functions for the net.

        Args:
            Dir_features: forwarded unchanged to ``self._set_model_param``.
            args: options object.  ``args.lr`` initialises the shared
                learning rate and ``args.optimizer`` selects the update rule:
                one of ``'sgd'``, ``'rmsprop'``, ``'adam'``, ``'adadelta'``
                or ``'adagrad'``.

        Returns:
            ``(f_train, f_eval)``.  Both take the network input, the integer
            label matrix and the mask input, in that order.  ``f_train``
            returns ``[cost, output_train]`` and applies the parameter
            updates; ``f_eval`` returns ``[cost_eval, output_eval]``.

        Raises:
            ValueError: if ``args.optimizer`` is not one of the names above.
        """
        self._set_model_param(Dir_features)

        # try to scale the gradients on the level of parameters like caffe
        # by now only change the code with sgd
        scale_grad = True

        # Constant added to the network output before it enters the CTC cost.
        TOL = 1e-5

        sym_y = T.imatrix()

        # W is regularizable, b is not regularizable (correspondence with caffe)
        if scale_grad:
            bias_scaled_layers = ('conv1a', 'conv2a', 'conv3a', 'conv3b',
                                  'conv4a', 'conv4b', 'conv5a', 'conv5b',
                                  'fc6-1')
            for layer_name in bias_scaled_layers:
                self.net[layer_name].b.tag.grad_scale = 2
            self.net['fc8-1'].W.tag.grad_scale = 10
            self.net['fc8-1'].b.tag.grad_scale = 20

        output_train = lasagne.layers.get_output(self.net['prob'],
                                                 deterministic=False)
        output_eval = lasagne.layers.get_output(self.net['prob'],
                                                deterministic=True)

        ##############
        # compute cost
        ##############
        # compute the cost for training
        output_flat = T.reshape(
            output_train,
            (self.batch_size, self.clip_length, self.num_classes))
        cost = T.mean(ctc_cost.cost(output_flat + TOL, sym_y))

        # L2 penalty over the layers' regularizable parameters, training only
        l2_w = 0.0005
        all_layers = lasagne.layers.get_all_layers(self.net['prob'])
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * l2_w
        cost += l2_penalty

        # compute the cost for evaluation
        output_eval_flat = T.reshape(
            output_eval,
            (self.num_batch_eval, self.clip_length, self.num_classes))
        cost_eval = T.mean(ctc_cost.cost(output_eval_flat + TOL, sym_y))

        trainable_params = lasagne.layers.get_all_params(self.net['prob'],
                                                         trainable=True)

        sh_lr = theano.shared(lasagne.utils.floatX(args.lr))

        #################
        # compute updates
        #################
        # adam works with lr 0.001
        adaptive_rules = {
            'rmsprop': lasagne.updates.rmsprop,
            'adam': lasagne.updates.adam,
            'adadelta': lasagne.updates.adadelta,
            'adagrad': lasagne.updates.adagrad,
        }

        if args.optimizer == 'sgd':
            # Stochastic Gradient Descent (SGD) with momentum
            loss_or_grads = cost
            if scale_grad:
                grads = theano.grad(cost, trainable_params)
                for idx, param in enumerate(trainable_params):
                    # The scale was stored on each parameter's ``tag`` above;
                    # reading it from the parameter list would always give 1.
                    grad_scale = getattr(param.tag, 'grad_scale', 1)
                    if grad_scale != 1:
                        grads[idx] *= grad_scale
                loss_or_grads = grads
            updates = lasagne.updates.momentum(loss_or_grads,
                                               trainable_params,
                                               learning_rate=sh_lr,
                                               momentum=0.9)
        elif args.optimizer in adaptive_rules:
            # Each adaptive rule is wrapped with the same momentum term.
            updates_opt = adaptive_rules[args.optimizer](cost,
                                                         trainable_params,
                                                         learning_rate=sh_lr)
            updates = lasagne.updates.apply_momentum(updates_opt,
                                                     trainable_params,
                                                     momentum=0.9)
        else:
            # Fail here instead of with an unbound ``updates`` further down.
            raise ValueError('unsupported optimizer: %r' % (args.optimizer, ))

        #############################
        # set train and eval function
        #############################
        func_inputs = [
            self.net['input'].input_var, sym_y, self.net['mask'].input_var
        ]
        f_train = theano.function(func_inputs, [cost, output_train],
                                  updates=updates)
        f_eval = theano.function(func_inputs, [cost_eval, output_eval])

        return f_train, f_eval
def jax_model_and_grad(x):
    """Return ``(jax_model(x), gradient of jax_model at x)``.

    ``jax.value_and_grad`` produces both results from one evaluation of the
    model instead of running it once for the value and again for the gradient.
    """
    return jax.value_and_grad(jax_model)(x)


def jax_logp_dlogp_func(x):
    """Evaluate the JAX model and its gradient at ``x`` as NumPy arrays."""
    value, gradient = jax_model_and_grad(x)
    return np.asarray(value), np.asarray(gradient)


# PyMC3 version of the model: three flat-prior parameters.  The first two are
# the slope and intercept of the mean; the third is the log of the noise scale.
with pm.Model() as pm_model:
    pm_params = pm.Flat("pm_params", shape=3)
    mean = pm_params[0] * x + pm_params[1]
    # sigma is exp(pm_params[2]), so it stays positive for any parameter value
    pm.Normal("obs", mu=mean, sigma=pm.math.exp(pm_params[2]), observed=y_obs)

# Callable built from the model log-probability followed by its gradients with
# respect to every model variable (one list: [logp, dlogp/dvar_0, ...]).
pm_model_and_grad = pm_model.fastfn([pm_model.logpt] +
                                    theano.grad(pm_model.logpt, pm_model.vars))


def pm_logp_dlogp_func(x):
    """Evaluate the PyMC3 log-probability and its gradients at ``x``.

    ``x`` is first passed through ``pm_model.bijection.rmap`` (presumably
    turning a flat array into the point the compiled function expects).
    """
    point = pm_model.bijection.rmap(x)
    return pm_model_and_grad(point)


@pytest.mark.parametrize(
    "framework",
    ["pytorch", "jax", "pymc3"],
)
def test_multiprocessing_with_various_frameworks(framework):
    logp_dlogp_funcs = {
        "pytorch": torch_logp_dlogp_func,
        "jax": jax_logp_dlogp_func,
        "pymc3": pm_logp_dlogp_func,
Example #55
0
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)
snap_dist_info_vars = snap_policy.dist_info_sym(observations_var)

# Surrogate loss of the current policy: negated log-likelihood weighted by the
# discounted rewards, summed over the trajectory.
surr = TT.sum(
    -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
    d_rewards_var)

params = policy.get_params(trainable=True)
snap_params = snap_policy.get_params(trainable=True)

# Likelihood ratio between the current policy and the snapshot policy.
importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(
    actions_var, dist_info_vars, snap_dist_info_vars)

grad = theano.grad(surr, params)

# Symbolic placeholders typed after the entries of ``grad``.  The debug label
# of each placeholder now matches its Python name (the first two used to be
# labelled 'eval_grad0' and 'eval_grad1').
eval_grad1 = TT.matrix('eval_grad1', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad2', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)
# NOTE(review): this reuses grad[3].dtype rather than grad[4].dtype -- confirm
# whether ``params`` has a fifth entry this placeholder should follow.
eval_grad5 = TT.vector('eval_grad5', dtype=grad[3].dtype)

# Surrogates evaluated on the snapshot policy.
# NOTE(review): surr_on1 is not negated while surr and surr_on2 are -- confirm
# the sign is intended.
surr_on1 = TT.sum(
    dist.log_likelihood_sym_1traj_GPOMDP(actions_var, snap_dist_info_vars) *
    d_rewards_var * importance_weights_var)
surr_on2 = TT.sum(
    -snap_dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
    d_rewards_var)
grad_imp = theano.grad(surr_on1, snap_params)
        sharedY = theano.shared(np.random.randn(
            bs, no, (ih - kh) / dh + 1, (iw - kw) / dw + 1).astype('float32'),
                                name='sharedY')
        sharedW = theano.shared(
            np.random.randn(*filter_shape).astype('float32'), name='sharedW')
    except MemoryError, e:
        print "SKIPPING config due to the memory error below"
        print e
        continue
    X = theano.tensor.tensor4('X')
    Y = theano.tensor.nnet.conv.conv2d(X,
                                       sharedW,
                                       input_shape,
                                       filter_shape,
                                       subsample=(dh, dw))
    gW = theano.grad(None, wrt=sharedW, known_grads={Y: sharedY})
    gX = theano.grad(None, wrt=X, known_grads={Y: sharedY})
    #    if 'legacy' not in skip_tests:
    #       benchmark_three_ways('theano.tensor.nnet.conv.conv2d',
    #                           sharedX, sharedY, sharedW, X, Y, gW, gX,
    #                          mode.excluding('conv_gemm', 'conv_dnn'))

    # benchmark Theano meta-optimizer
    # Mimic THEANO_FLAGS=optimizer_including=conv_meta
    #    if 'meta' not in skip_tests:
    #       benchmark_three_ways('(experimental) meta-optimizer',
    #                           sharedX, sharedY, sharedW, X, Y, gW, gX,
    #                          mode.including('conv_meta'))

    # benchmark Theano FFT convolution
    # Mimic THEANO_FLAGS=optimizer_including=conv_fft
Example #57
0
# Build the controller network and collect the parameters reachable from its
# output layer.
controller = build_controller()

controller_parameters = lasagne.layers.helper.get_all_params(
    controller["output"])

states, all_parameters, updates = build_model()
fitness = build_objectives(states)
# Replace NaN/inf fitness entries with 0 before they enter the mean below.
fitness = T.switch(T.isnan(fitness) + T.isinf(fitness), np.float32(0), fitness)

#import theano.printing
#theano.printing.debugprint(T.mean(fitness), print_type=True)
print "Finding gradient since %s..." % strftime("%H:%M:%S", localtime())
# Minimising the negated mean fitness maximises the fitness.
loss = -T.mean(fitness)

grads = theano.grad(loss, all_parameters)
# Constrain the joint norm of all gradients with a threshold of 1.0.
# NOTE(review): this runs before the NaN/inf scrub below; a non-finite
# gradient would presumably make the joint norm non-finite as well -- confirm
# that this ordering is intended.
grads = lasagne.updates.total_norm_constraint(grads, 1.0)

# Zero out any gradient entries that are NaN or inf.
grads = [T.switch(T.isnan(g) + T.isinf(g), np.float32(0), g) for g in grads]

#grad_norm = T.sqrt(T.sum([(g**2).sum() for g in theano.grad(loss, all_parameters)])+1e-9)
#theano_to_print.append(grad_norm)
# Merge the Adam updates (third positional argument 0.0001) into the updates
# returned by build_model().
updates.update(lasagne.updates.adam(grads, all_parameters,
                                    0.0001))  # we maximize fitness
print "Compiling since %s..." % strftime("%H:%M:%S", localtime())
# Input-free function compiled without ``updates``: calling it evaluates the
# three state expressions and applies no parameter update.
iter_test = theano.function([], [states[1], states[2], states[3]])
st = iter_test()
with open("state-dump-%s.pkl" % EXP_NAME, 'wb') as f:
    pickle.dump({
        "states": st,
        "json": open(jsonfile, "rb").read()
Example #58
0
def main():
    """Build a small synthetic GP problem and time two Theano gradient functions.

    The function draws ten 1-D training points from a GP prior, compiles a
    Theano function producing GP samples, prints the shape of one batch of
    samples, and then times ten calls each of two compiled gradient functions.

    NOTE(review): the function returns right after the timing loops.  The
    finite-difference gradient check and the plotting code further down are
    unreachable as written (each is preceded by a bare ``return``).
    """
    from fast_gp import sparse_w
    # Fixed seed: every random draw below depends on this statement order.
    np.random.seed(0)
    n_data = 10
    x = np.random.uniform(size=n_data)
    #x = np.float32(x)
    x = np.sort(x)
    # Kernel amplitude (a), inverse length-scale factor (b) and the constant
    # added on the diagonal (c) of the covariance built below.
    a = .1
    b = 10
    c = .001
    mu = np.zeros(n_data)
    # cov[i, j] = a * exp(-b * (x_i - x_j)^2) + c * [i == j]
    cov = a * np.exp(-b * (x[:, np.newaxis] - x)**2) + c * np.eye(n_data)
    y = np.random.multivariate_normal(mu, cov)
    #print x
    #print y
    x_min, x_max = x.min(), x.max()
    #len_u = 2048 + 1
    len_u = 1024 + 1
    #len_u = 128 + 1
    #len_u = 64
    extra_u = 2
    # Regular grid ``u`` that extends ``margin`` beyond the data on both sides.
    margin = (x_max - x_min) / (len_u - extra_u * 2) * 2
    u = np.linspace(x_min - margin, x_max + margin, len_u)
    # Test inputs are all grid points except the first.
    x_test = u[1:]
    #x_test = np.linspace(x_min, x_max, 20)
    # presumably interpolation indices/weights of the inputs on ``u``;
    # verify against fast_gp.sparse_w
    idx_train, w_train = sparse_w(u, x)
    idx_test, w_test = sparse_w(u, x_test)

    # Symbolic inputs of the Theano graphs compiled below.
    t_idx_train = T.imatrix()
    t_w_train = T.matrix()
    t_idx_test = T.imatrix()
    t_w_test = T.matrix()
    t_gp_params = T.vector()
    t_indep_noise = T.scalar()
    t_ys = T.matrix()
    t_y = T.vector()

    cov_vec = CovVec(u, kernel, symbolic_kernel)

    def linear_op(zs):
        # Apply ``cov_vec`` to ``zs`` with all other symbolic inputs fixed.
        return cov_vec(t_idx_train, t_w_train, t_idx_test, t_w_test,
                       t_gp_params, t_indep_noise, zs)

    n_lanczos_basis = 10
    batch_size = 10
    cov_zs = lanczos(linear_op, t_ys, n_lanczos_basis, batch_size)

    post_mean = PosteriorMean(u, kernel, symbolic_kernel)
    # From here on ``mu`` is the symbolic posterior mean; it replaces the
    # NumPy zero vector of the same name used for the prior draw above.
    mu = post_mean(t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
                   t_indep_noise, t_y)

    # Add the mean (with a broadcastable leading axis) to ``cov_zs``.
    gp_samples = mu.dimshuffle('x', 0) + cov_zs

    gp_samples_fn = theano.function([
        t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
        t_indep_noise, t_y, t_ys
    ], gp_samples)

    len_test = len(x_test)
    # Standard-normal draws fed to the graph as ``t_ys``.
    y_test = np.random.normal(size=(batch_size, len_test))
    gdraws = gp_samples_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                           y_test)
    print gdraws.shape

    # Scalar test objective: the samples weighted elementwise by a projection
    # matrix and summed.
    t_random_proj = T.matrix()
    val = (gp_samples * t_random_proj).sum()

    val_fn = theano.function([
        t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
        t_indep_noise, t_y, t_ys, t_random_proj
    ], val)

    # Gradient of ``val`` with respect to the kernel parameters and the noise.
    grad_val_fn = theano.function([
        t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
        t_indep_noise, t_y, t_ys, t_random_proj
    ],
                                  theano.grad(val,
                                              wrt=[t_gp_params, t_indep_noise],
                                              consider_constant=[
                                                  t_idx_train, t_w_train,
                                                  t_idx_test, t_w_test, t_y,
                                                  t_ys, t_random_proj
                                              ]))

    # Gradient of ``val`` with respect to the projection matrix only.
    grad_val_fn1 = theano.function([
        t_idx_train, t_w_train, t_idx_test, t_w_test, t_gp_params,
        t_indep_noise, t_y, t_ys, t_random_proj
    ],
                                   theano.grad(val,
                                               wrt=[t_random_proj],
                                               consider_constant=[
                                                   t_idx_train, t_w_train,
                                                   t_idx_test, t_w_test, t_y,
                                                   t_ys
                                               ]))

    # Time ten calls of each gradient function and print the elapsed seconds.
    random_proj = np.random.rand(batch_size, len_test)
    t1 = time.time()
    for _ in xrange(10):
        grad_val_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y, y_test,
                    random_proj)
    t2 = time.time()
    print t2 - t1
    t1 = time.time()
    for _ in xrange(10):
        grad_val_fn1(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                     y_test, random_proj)
    t2 = time.time()
    print t2 - t1
    # Early exit: everything below this line is currently unreachable.
    return

    # Unreachable: compares ``grad_val_fn`` against finite differences of
    # ``val_fn`` for several random projections.
    n_test = 10
    for _ in xrange(n_test):
        random_proj = np.random.rand(batch_size, len_test)
        print 'test grad'
        print val_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                     y_test, random_proj)
        print grad_val_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                          y_test, random_proj)

        def val_fn1(x):
            # Objective as a function of the packed vector (a, b, c).
            a, b, c = x
            return val_fn(idx_train, w_train, idx_test, w_test, (a, b), c, y,
                          y_test, random_proj)

        # Rebinds the name ``grad_val_fn1`` (the compiled function above) to
        # a wrapper returning the gradient as one packed array.
        def grad_val_fn1(x):
            a, b, c = x
            [a, b], c = grad_val_fn(idx_train, w_train, idx_test, w_test,
                                    (a, b), c, y, y_test, random_proj)
            return np.array([a, b, c])

        print scipy.optimize.check_grad(val_fn1, grad_val_fn1,
                                        np.array([a, b, c]))

    return
    # Unreachable: plots every drawn sample against the training points.
    import pylab as pl
    pl.figure()

    for each_sample in gdraws:
        pl.plot(x_test, each_sample, '-', c='b', alpha=.5)
    pl.plot(x, y, 'o', c='r')
    pl.show()
Example #59
0
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and the logarithm of the standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

# policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
# distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
# the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
# rllab.distributions.DiagonalGaussian
dist = policy.distribution

# Note that we negate the objective, since most optimizers assume a minimization problem
surr = -TT.mean(
    dist.log_likelihood_sym(actions_var, dist_info_vars) * returns_var)

# Get the list of trainable parameters.
params = policy.get_params(trainable=True)
grads = theano.grad(surr, params)

# Compile the training step. outputs=None means the function is called only for its side effect of applying
# the Adam updates to the parameters; allow_input_downcast=True lets Theano cast the supplied arrays down to
# the dtype of the symbolic inputs.
f_train = theano.function(inputs=[observations_var, actions_var, returns_var],
                          outputs=None,
                          updates=adam(grads,
                                       params,
                                       learning_rate=learning_rate),
                          allow_input_downcast=True)

for _ in range(n_itr):

    paths = []

    for _ in range(N):
        observations = []
        actions = []
Example #60
0
def adam_opt(model,
             train_set,
             valid_set,
             model_save_dir,
             minibatch=64,
             valid_period=1,
             total_period=0,
             disp_period=1,
             n_iters=1,
             lr=0.001,
             beta1=0.1,
             beta2=0.001,
             epsilon=1e-8,
             gamma=1 - 1e-8):
    """
    Adam optimizer (ICLR 2015)

    Each update accumulates the gradients of ``model.costs[0]`` over every
    batch yielded by ``train_set.iterate(True)``, divides the sum by
    ``minibatch`` and applies one Adam step to ``model.params``.

    Note the hyper-parameter convention used here: ``beta1`` and ``beta2``
    are the weights given to the *new* gradient (and squared gradient) in
    the moving averages, i.e. ``m_t = beta1_t * g + (1 - beta1_t) * m``.

    Parameters
    ----------
    model : object
        Must expose ``params`` (Theano shared variables), ``costs`` (list of
        symbolic outputs, ``costs[0]`` being the training cost), ``inputs``
        and ``save_to_file(dir, index)``. Validation unpacks the result of
        evaluating ``model.costs`` as ``(loss, acc)``.
    train_set : object
        Must expose ``iterate(True)`` yielding argument tuples for the
        compiled functions.
    valid_set : object or None
        Same protocol as ``train_set``. When ``None`` the periodic
        loss/accuracy evaluation is skipped (checkpoints are still saved).
    model_save_dir : str
        Prefix that is concatenated directly with the file names
        ``'lr.txt'``, ``'_lr.txt'`` and ``'log.txt'``, and is also passed to
        ``model.save_to_file``.
    minibatch : int
        Divisor applied to the summed gradients.
    valid_period : int
        Checkpoint/evaluate every ``valid_period`` updates (never at 0).
    total_period : int
        Offset added to the checkpoint index passed to ``save_to_file``.
    disp_period : int
        Print progress every ``disp_period`` updates (never at 0).
    n_iters : int
        Number of parameter updates to run.
    lr : float
        Initial learning rate. It is written to ``model_save_dir + 'lr.txt'``
        and re-read every 10 updates from ``model_save_dir + '_lr.txt'``.
        NOTE(review): the two file names differ; confirm that is intended.
    beta1, beta2 : float
        Weights of the new gradient / squared gradient in the moment
        estimates (see note above).
    epsilon : float
        Added to the denominator of the Adam step.
    gamma : float
        Per-step decay factor applied through ``beta1_t``.

    A ``KeyboardInterrupt`` during training is caught and reported instead
    of propagating.
    """
    # initialize learning rate
    with open(model_save_dir + 'lr.txt', 'w') as lr_file:
        lr_file.write(str(lr))
    lr = theano.shared(numpy.array(lr).astype(theano.config.floatX))

    updates = []
    all_grads = theano.grad(model.costs[0], model.params)
    # Step counter shared variable and the bias-correction terms derived
    # from it.
    i = theano.shared(numpy.float32(1))
    i_t = i + 1.
    fix1 = 1. - (1. - beta1)**i_t
    fix2 = 1. - (1. - beta2)**i_t
    beta1_t = 1 - (1 - beta1) * gamma**(i_t - 1)
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(model.params, all_grads):
        # First and second moment accumulators, same shape as the parameter.
        m = theano.shared(
            numpy.zeros(p.get_value().shape, dtype=theano.config.floatX))
        v = theano.shared(
            numpy.zeros(p.get_value().shape, dtype=theano.config.floatX))

        m_t = (beta1_t * g) + ((1. - beta1_t) * m)
        v_t = (beta2 * g**2) + ((1. - beta2) * v)
        g_t = m_t / (T.sqrt(v_t) + epsilon)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    # train_grad_f returns every gradient followed by the cost.  The update
    # function takes the gradient variables themselves as inputs so that
    # gradients accumulated in numpy over several batches can be fed back.
    grad_and_cost = all_grads + [model.costs[0]]
    train_grad_f = theano.function(model.inputs,
                                   grad_and_cost,
                                   on_unused_input='warn')
    train_update_params_f = theano.function(all_grads,
                                            None,
                                            updates=updates)
    # Only compiled when there is a validation set; the training loop checks
    # for None before evaluating.
    valid_f = None
    if valid_set is not None:
        valid_f = theano.function(model.inputs,
                                  model.costs,
                                  on_unused_input='warn')

    # create log file
    with open(model_save_dir + 'log.txt', 'w') as log_file:
        log_file.write('adam_optimizer\n')
        log_file.write('lr=%f, beta1=%f, beta2=%f, epsilon=%f, gamma=%f\n' %
                       (lr.get_value(), beta1, beta2, epsilon, gamma))

    # time.clock was removed in Python 3.8; prefer perf_counter when it
    # exists and fall back to clock on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    print('... training with Adam optimizer')
    cap_count = 0
    train_cost = []
    t0 = timer()
    try:
        for u in range(n_iters):
            if u % 10 == 0:
                # refresh lr (best effort: a missing or unreadable file
                # simply keeps the current value)
                try:
                    with open(model_save_dir + '_lr.txt', 'r') as lr_file:
                        lr.set_value(float(lr_file.readline().rstrip()))
                except IOError:
                    pass
                except ValueError:
                    # e.g. the file is empty or being edited right now
                    print('Ignoring malformed learning rate in %s' %
                          (model_save_dir + '_lr.txt'))

            # NOTE(review): zeros_like is applied to the shared variable
            # itself rather than to its value; confirm this produces the
            # intended accumulators.
            grads = [
                numpy.zeros_like(p).astype(theano.config.floatX)
                for p in model.params
            ]
            mb_cost = []
            for batch in train_set.iterate(True):
                outputs = train_grad_f(*batch)
                mb_cost.append(outputs[-1])
                grads = [g1 + g2 for g1, g2 in zip(grads, outputs[0:-1])]
            grads = [g / numpy.array(minibatch) for g in grads]
            train_update_params_f(*grads)
            train_cost.append(numpy.mean(mb_cost))

            # output some information
            if u % disp_period == 0 and u > 0:
                p_now = numpy.concatenate(
                    [p.get_value().flatten() for p in model.params])
                # The reference point is reset to zero for the first few
                # reports; this also defines p_last/delta_last on first use.
                if u < 4 * disp_period:
                    p_last = numpy.zeros_like(p_now)
                    delta_last = numpy.zeros_like(p_now)
                delta_now = p_now - p_last
                # Angle (degrees) between this displacement and the last.
                angle = numpy.arccos(
                    numpy.dot(delta_now, delta_last) /
                    numpy.linalg.norm(delta_now) /
                    numpy.linalg.norm(delta_last))
                angle = angle / numpy.pi * 180
                p_last = p_now
                delta_last = delta_now
                t1 = timer()
                print('period=%d, update=%d, mb_cost=[%.4f], |delta|=[%.2e], angle=[%.1f], lr=[%.6f], t=[%.2f]sec' %
                      (u // valid_period, u, numpy.mean(train_cost),
                       numpy.mean(abs(delta_now[0:10000])), angle,
                       lr.get_value(), (t1 - t0)))
                t0 = timer()
                train_cost = []

            if u % valid_period == 0 and u > 0:
                # u is an exact multiple of valid_period here; floor division
                # keeps the checkpoint index an integer on Python 3 as well.
                model.save_to_file(model_save_dir,
                                   total_period + u // valid_period)
                output_info = None
                if valid_f is not None:
                    valid_loss = []
                    valid_acc = []
                    train_loss = []
                    train_acc = []
                    for batch in valid_set.iterate(True):
                        loss, acc = valid_f(*batch)
                        valid_loss.append(loss)
                        valid_acc.append(acc)
                    for batch in train_set.iterate(True):
                        loss, acc = valid_f(*batch)
                        train_loss.append(loss)
                        train_acc.append(acc)
                    output_info = 'period=%i, valid loss=[%.4f], valid acc=[%.4f], train loss=[%.4f], train acc=[%.4f]' % \
                                  (u // valid_period, numpy.mean(valid_loss), numpy.mean(valid_acc), numpy.mean(train_loss), numpy.mean(train_acc))
                cap_count += valid_period * minibatch
                if output_info is not None:
                    print(output_info)
                    with open(model_save_dir + 'log.txt', 'a') as log_file:
                        log_file.write(output_info + '\n')
    except KeyboardInterrupt:
        print('Training interrupted.')