Example #1
 def step(x_t,y_t,h_tm1,Wx,Wh,bh,Wy,by,lr,switch):
     h_t = relu(T.dot(x_t,Wx)+T.dot(h_tm1,Wh)+bh)
     yo_t = relu(T.dot(h_t,Wy)+by)
     
     updates = OrderedDict()
     
     # Train the RNN: backprop (loss + DNI output)
     loss = T.mean(T.square(yo_t-y_t))
     dni_out = self.dni.output(h_t)
     for param in self.params:
         dlossdparam = T.grad(loss,param)
         dniJ = T.Lop(h_t,param,dni_out,disconnected_inputs='ignore')
         updates[param] = param-lr*T.switch(T.gt(switch,0),
                                            dlossdparam+dniJ,
                                            dlossdparam)
                     
     # Update the DNI (from the last step)
     # re-calculate the DNI prediction from the last step
     # note: can't be passed through scan or T.grad won't work
     dni_out_old = self.dni.output(h_tm1)
     # dni_target: current loss backprop'ed + new dni backprop'ed
     dni_target = T.grad(loss,h_tm1) \
                  +T.Lop(h_t,h_tm1,dni_out)
     dni_error = T.sum(T.square(dni_out_old-dni_target))
     for param in self.dni.params:
         gparam = T.grad(dni_error,param)
         updates[param] = param-lr*gparam
     
     return [h_t,loss,dni_error],updates
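A hedged sketch of the DNI term used above, reduced to one input and one made-up weight matrix (Wx_demo): T.Lop(h_t, param, dni_out) is the synthetic gradient backpropagated through one step, i.e. T.grad of the inner product dni_out . h_t with dni_out held constant.

# Illustrative only; Wx_demo and the toy graph are assumptions, not the model above.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
dni_out = T.vector('dni_out')
Wx = theano.shared(np.ones((3, 3), dtype=theano.config.floatX), name='Wx_demo')

h_t = T.nnet.relu(T.dot(x, Wx))

dniJ_lop = T.Lop(h_t, Wx, dni_out)
dniJ_ref = T.grad(T.sum(dni_out * h_t), Wx, consider_constant=[dni_out])

f = theano.function([x, dni_out], [dniJ_lop, dniJ_ref])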
Example #2
    def test_multiple_outputs(self):
        m = tensor.matrix('m')
        v = tensor.vector('v')
        m_ = tensor.matrix('m_')
        v_ = tensor.vector('v_')

        mval = self.rng.uniform(size=(3, 7)).astype(theano.config.floatX)
        vval = self.rng.uniform(size=(7, )).astype(theano.config.floatX)
        m_val = self.rng.uniform(size=(3, 7)).astype(theano.config.floatX)
        v_val = self.rng.uniform(size=(7, )).astype(theano.config.floatX)

        rop_out1 = tensor.Rop([m, v, m + v], [m, v], [m_, v_])
        assert isinstance(rop_out1, list)
        assert len(rop_out1) == 3
        rop_out2 = tensor.Rop((m, v, m + v), [m, v], [m_, v_])
        assert isinstance(rop_out2, tuple)
        assert len(rop_out2) == 3
        lop_out1 = tensor.Lop([m, v, m + v], (m, v), [m_, v_])
        assert isinstance(lop_out1, tuple)
        assert len(lop_out1) == 2
        lop_out2 = tensor.Lop((m, v, m + v), [m, v], [m_, v_])
        assert isinstance(lop_out2, list)
        assert len(lop_out2) == 2

        all_outs = []
        for o in rop_out1, rop_out2, lop_out1, lop_out2:
            all_outs.extend(o)
        f = theano.function([m, v, m_, v_], all_outs)
        f(mval, vval, m_val, v_val)
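A minimal, self-contained sketch (not taken from any of the projects above) of the identity these tests rely on: Lop(y, x, v) is the vector-Jacobian product v . dy/dx, so on a differentiable graph it matches T.grad of the scalar surrogate (v * y).sum().

# Hedged sketch: verify T.Lop against T.grad on a toy elementwise graph.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
v = T.vector('v')                      # must have the shape of y
y = T.tanh(x) * 3                      # any differentiable graph

vJ_lop = T.Lop(y, x, v)                # v . dy/dx without building the Jacobian
vJ_ref = T.grad((v * y).sum(), x)      # same quantity via a scalar surrogate

f = theano.function([x, v], [vJ_lop, vJ_ref])
a, b = f(np.random.rand(5).astype(theano.config.floatX),
         np.random.rand(5).astype(theano.config.floatX))
assert np.allclose(a, b)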
Example #3
            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.gc_outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.gc_outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

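                # NOTE: the code below is unreachable; it is an alternative Gv
                # computation left in place after the return above.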
                nw_cost, nw_preactiv_out = safe_clone(
                    [model.train_cost, model.preactiv_out], replace)
                nw_gvs = TT.Lop(
                    nw_preactiv_out, model.params,
                    TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                           args))

                Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                return [gv_args[0] + const(1)] + Gvs
Example #4
def hypergrad(params_ele,
              params_hyper,
              dvalid_dtheta,
              loss_ele,
              loss_hyper,
              loss_ele_penalty=0.):
    """ Function defining the hypergradients: gradients of validation cost
        with respect to various hyperparameters.     
    
        The function is separating penalty hyperparameters 
        (which is assumed to depend only on w) from noise and other hyperparameters,
        due to otherwise dependancy errors in the Lop operator.
        
        Inputs: 
        
        paramsT1, paramsT2 :: T1 and T2 parameters
        c1, c2 :: cross-entropy on training and validation set
        p1, p2 :: penalty terms on training and validation set (p2 assumed 0)
        
    """
    # initializations
    reg_penalty, reg_noise, grad_penalty, grad_noise, w, dvalid_dw = [], [], [], [], [], []

    # separate different types of parameters
    for regular in params_hyper:
        reg_type, _ = regular.name.split('_')
        if reg_type in penalty_list:
            reg_penalty += [regular]
        elif reg_type in noise_list:
            reg_noise += [regular]
        else:
            print 'Hypergrad not implemented for ', reg_type

    # separate weight parameters and gradients
    for (param, grad) in zip(params_ele, dvalid_dtheta):
        paramType, _ = param.name.split('_')
        if paramType == 'W':
            w += [param]
            dvalid_dw += [grad]

    # hyper-gradients
    if reg_penalty:
        dpenalty_dw = T.grad(loss_ele_penalty, w)
        dpenalty_dw = [-grad for grad in dpenalty_dw]
        grad_penalty = T.Lop(dpenalty_dw, reg_penalty, dvalid_dw)
    if reg_noise:
        dele_dtheta = T.grad(loss_ele, params_ele)
        dele_dtheta = [-grad for grad in dele_dtheta]
        grad_noise = T.Lop(dele_dtheta, reg_noise, dvalid_dtheta)

    # outputs
    params_hyper = reg_penalty + reg_noise
    dvalid_dgamma = grad_penalty + grad_noise

    return params_hyper, dvalid_dgamma
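A hedged sketch of the chain rule this function relies on, with a single weight vector and one hypothetical L2 penalty hyperparameter (W_demo and lam_demo are made-up names): the hypergradient is the validation gradient contracted, via T.Lop, with the Jacobian of the negated penalty gradient with respect to the hyperparameter.

# Illustrative only; not the authors' full model.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.ones(3, dtype=theano.config.floatX), name='W_demo')
lam = theano.shared(np.asarray(0.1, dtype=theano.config.floatX), name='lam_demo')

penalty = lam * T.sum(w ** 2)              # training penalty, depends only on w and lam
dpenalty_dw = [-g for g in T.grad(penalty, [w])]
dvalid_dw = [2 * w]                        # stand-in for the validation gradient

# dC_valid/dlam = dvalid_dw . d(-dpenalty/dw)/dlam, the same T.Lop call as above
grad_penalty = T.Lop(dpenalty_dw, [lam], dvalid_dw)
print(theano.function([], grad_penalty)())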
Example #5
def hypergrad(paramsT1, paramsT2, gradC2T1, c1, c2, p1=0., p2=0.):
    ''' Function defining the hypergradients: gradients of the validation cost
        with respect to various hyperparameters.

        The function separates penalty hyperparameters (which are assumed to
        depend only on W) from noise and other hyperparameters, because handling
        them together leads to dependency errors in the Lop operator.

        Inputs:

        paramsT1, paramsT2 :: T1 and T2 parameters
        gradC2T1 :: gradient of the validation cost C2 w.r.t. the T1 parameters
        c1, c2 :: cross-entropy on training and validation set
        p1, p2 :: penalty terms on training and validation set (p2 assumed 0)

    '''
    # initializations
    rglrzPenal = []
    rglrzNoiz = []
    gradPenal = []
    gradNoiz = []
    W = []
    gradC2W = []

    # separate different types of parameters
    for rglrz in paramsT2:
        rglrzType, _ = rglrz.name.split('_')
        if rglrzType in penalList:
            rglrzPenal += [rglrz]
        elif rglrzType in noizList:
            rglrzNoiz += [rglrz]
        else:
            print 'Hypergrad not implemented for ', rglrzType

    # separate weight parameters and gradients
    for (param, grad) in zip(paramsT1, gradC2T1):
        paramType, _ = param.name.split('_')
        if paramType == 'W':
            W += [param]
            gradC2W += [grad]

    # hyper-gradients
    if rglrzPenal != []:
        gradPW = T.grad(p1, W)
        gradPW = [-grad for grad in gradPW]
        gradPenal = T.Lop(gradPW, rglrzPenal, gradC2W)
    if rglrzNoiz != []:
        gradE1T1 = T.grad(c1, paramsT1)
        gradE1T1 = [-grad for grad in gradE1T1]
        gradNoiz = T.Lop(gradE1T1, rglrzNoiz, gradC2T1)

    # outputs
    paramsT2 = rglrzPenal + rglrzNoiz
    gradC2T2 = gradPenal + gradNoiz

    return paramsT2, gradC2T2
Example #6
    def Gvs(self, *args):
        # Contribution of hid_sig
        nw_args1 = TT.Lop(
            self.hid_sig, self.params,
            TT.Rop(self.hid_sig, self.params, args) /
            ((1 - self.hid_sig) * self.hid_sig * self.mbs))
        nw_args2 = TT.Lop(
            self.hid_sftmax, self.params,
            TT.Rop(self.hid_sftmax, self.params, args) /
            (self.hid_sftmax * self.mbs))

        return [x + y for x, y in zip(nw_args1, nw_args2)]
Example #7
def reinforce_no_baseline(params, policy, cost, lr, regularising_cost=None):
    """
    return reinforce updates
    @policy and @cost should be of shape (minibatch_size, 1)
    @policy should be the probability of the sampled actions
    """
    log_pol = T.log(policy)
    if regularising_cost is None:
        return [(i, i - lr * gi) for i, gi in zip(
            params, T.Lop(f=log_pol, wrt=params, eval_points=cost))]
    else:
        return [(i, i - lr * (gi + gr)) for i, gi, gr in zip(
            params, T.Lop(f=log_pol, wrt=params, eval_points=cost),
            T.grad(regularising_cost, params))]
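A hedged sketch of the identity this update exploits (toy Bernoulli policy, made-up names): T.Lop(f=log_pol, wrt=params, eval_points=cost) accumulates cost_i * d log pi_i / d theta over the minibatch, i.e. the score-function gradient without a baseline.

# Illustrative only; 'theta' and the toy policy below are assumptions.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')                           # features of the sampled actions
theta = T.vector('theta')
cost = T.vector('cost')                     # one return per sampled action

policy = T.nnet.sigmoid(T.dot(x, theta))    # probability of the sampled action
log_pol = T.log(policy)

g_lop = T.Lop(f=log_pol, wrt=theta, eval_points=cost)
g_ref = T.grad(T.sum(cost * log_pol), theta)    # cost does not depend on theta

f = theano.function([x, theta, cost], [g_lop, g_ref])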
Example #8
 def compute_Gv(*args):
     (hid_sig, hid_sftmax) = self.get_hiddens()
     nw_args1 = TT.Lop(
         hid_sig, self.params,
         TT.Rop(hid_sig, self.params, args) /
         ((1 - hid_sig) * hid_sig * self.batchsize))
     nw_args2 = TT.Lop(
         hid_sftmax, self.params,
         TT.Rop(hid_sftmax, self.params, args) /
         (hid_sftmax * self.batchsize))
     fin_vals = [x + y for x, y in zip(nw_args1, nw_args2)]
     new_vals = safe_clone(fin_vals, [self.X, self.Y],
                           [self.loc_x, self.loc_y])
     return new_vals, {}
Example #9
 def setup(self, bottom, top):
     input = T.tensor4("input")
     v = T.matrix("v")
     result = T.sum(input, axis=(2, 3))
     result_g = T.Lop(result, input, v)
     self.f = theano.function([input], result)
     self.b = theano.function([input, v], result_g)
Example #10
            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(cgv, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
Example #11
def create_esgd_updates(updates, params, gparams, gsums, xsums, lr, eps, gamma,
                        momentum):
    has_momentum = momentum.get_value() > 0.0
    samples = [
        default_mrng.normal(size=p.shape,
                            avg=0,
                            std=1,
                            dtype=theano.config.floatX) for p in params
    ]
    HVs = T.Lop(gparams, params, samples)

    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omg_t = 1.0 - gamma**i_t
    for p, g, m, D, Hv in zip(params, gparams, gsums, xsums, HVs):
        if is_subtensor_op(p):
            raise Exception("ESGD subtensor update not implemented!")
        else:
            D_t = D * gamma + T.sqr(Hv) * (1.0 - gamma)
            if has_momentum:
                m_t = m * momentum + g
                updates[m] = m_t
            else:
                m_t = g
            g_t = m_t / (T.sqrt(D_t / omg_t + eps))
            updates[D] = D_t
            updates[p] = p - lr * g_t
    updates[i] = i_t
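A minimal sketch (single parameter vector, toy cost; not the full ESGD update) of the trick behind T.Lop(gparams, params, samples): the Jacobian of the gradient is the Hessian, so Lop of the gradient gives Hessian-vector products with the sampled Gaussian vectors.

# Hedged sketch: Lop applied to a gradient yields a Hessian-vector product.
import numpy as np
import theano
import theano.tensor as T

w = T.vector('w')
v = T.vector('v')
cost = T.sum(T.exp(w) + w ** 2)        # any twice-differentiable scalar cost

g = T.grad(cost, w)
Hv_lop = T.Lop(g, w, v)                # v . dg/dw, equals H v (H is symmetric)
Hv_ref = T.grad(T.sum(g * v), w, consider_constant=[v])

f = theano.function([w, v], [Hv_lop, Hv_ref])
a, b = f(np.random.rand(4).astype(theano.config.floatX),
         np.random.rand(4).astype(theano.config.floatX))
assert np.allclose(a, b)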
Example #12
            def compute_Ax(x):

                # There are three ways to compute the Fisher-vector product:

                # 1. https://github.com/joschu/modular_rl/blob/master/modular_rl/trpo.py#L54
                # Use theano.gradient.disconnected_grad and call theano.tensor.grad() twice.
                # WARNING: In our case (with the attention mechanism) it is extremely slow.

                # 2. http://deeplearning.net/software/theano/tutorial/gradients.html#hessian-times-a-vector
                # Use only theano.tensor.Rop, but you will need to calculate the fixed_output outside
                # of the compiled function, because disconnected_grad will not work with Rop.

                # 3. https://github.com/pascanur/natgrad/blob/master/model_convMNIST_standard.py
                # Rop divided by the output, because the metric F is based on the gradient of log(output).
                # Here we also split the vector of parameters. Not checked, but it may be
                # faster than supplying a few vectors to minresQLP.

                xs = []
                offset = 0
                for p in params:
                    shape = p.get_value().shape
                    size = np.prod(shape)
                    xs.append(x[offset:offset + size].reshape(shape))
                    offset += size

                jvp = T.Rop(new_output, params, xs) / (
                    new_output * self.batch_size * self.history + TINY)
                fvp = T.Lop(new_output, params, jvp)
                fvp = T.concatenate([g.flatten() for g in fvp])

                return [fvp], {}
Example #13
    def __init__(self, t_cost, t_traj_info, t_inputs, params, reg=1e-5):
        t_new_params = [
            _np2theano(p.name, p.get_value(borrow=True)) for p in params
        ]

        t_mean = t_traj_info['act_mean']
        t_mean = t_mean.reshape((-1, t_mean.shape[-1]))
        t_logstd = t_traj_info['act_logstd']
        t_logstd = t_logstd.reshape((-1, t_logstd.shape[-1]))
        t_new_mean = t_traj_info['new_act_mean']
        t_new_mean = t_new_mean.reshape((-1, t_new_mean.shape[-1]))
        t_new_logstd = t_traj_info['new_act_logstd']
        t_new_logstd = t_new_logstd.reshape((-1, t_new_logstd.shape[-1]))

        print 'Compiling cost function ... ',
        s = time()
        self.cost = theano.function(inputs=t_inputs,
                                    outputs=t_cost,
                                    on_unused_input='ignore')
        print 'finished in %f seconds' % (time() - s)

        print 'Building cost grad function ... ',
        s = time()
        _t_cost_grad = T.grad(-t_cost, wrt=params)
        print 'finished in %f seconds' % (time() - s)

        print 'Compiling cost grad function ... ',
        s = time()
        self._cost_grad = theano.function(inputs=t_inputs,
                                          outputs=[t_cost] + _t_cost_grad,
                                          on_unused_input='ignore')
        print 'finished in %f seconds' % (time() - s)

        print 'Building Hx function ... ',
        s = time()
        mu = T.concatenate([t_new_mean, t_new_logstd], axis=-1)
        Jx = sum([T.Rop(mu, p, x) for (p, x) in zip(params, t_new_params)])
        M = T.tile(T.eye(2), (mu.shape[0], 1, 1))
        Jx = Jx.reshape((Jx.shape[0], Jx.shape[1], 1))
        Jx = T.tile(Jx, (1, 1, Jx.shape[1]))
        MJx = Jx
        JMJx = [
            T.Lop(MJx, p, x, disconnected_inputs='ignore')
            for (p, x) in zip(params, t_new_params)
        ]
        Hx = [h + reg * p for (h, p) in zip(JMJx, t_new_params)]
        print 'finished in %f seconds' % (time() - s)

        # TODO: Use mask to handle  different lengths.

        print 'Compiling Hx function ...',
        s = time()
        self._constraint_Hx = theano.function(inputs=t_inputs + t_new_params,
                                              outputs=Hx,
                                              on_unused_input='ignore')

        self.constraint_Hx = lambda inputs, params: self._constraint_Hx(*(
            inputs + params))
        print 'finished in %f seconds' % (time() - s)
Example #14
 def mean_weighted_grad(weights, loss):
     # Lop to the rescue! Here I was calling T.jacobian and trying to
     # broadcast things and elementwise-multiply through the resulting lists,
     # when a function already existed to do all of that for me...
     return T.Lop(loss,
                  params,
                  weights / T.cast(weights.shape[0], 'float32'),
                  disconnected_inputs='ignore')
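A hedged sketch of what mean_weighted_grad computes, assuming loss is a per-example vector: Lop(loss, params, weights / N) equals the gradient of mean(weights * loss), without materialising the Jacobian that T.jacobian would build.

# Illustrative only; the toy per-example loss below is an assumption.
import numpy as np
import theano
import theano.tensor as T

w = T.vector('w')
weights = T.vector('weights')
loss = w ** 2                           # one loss entry per "example"

n = T.cast(weights.shape[0], 'float32')
g_lop = T.Lop(loss, w, weights / n)
g_ref = T.grad(T.mean(weights * loss), w)

f = theano.function([w, weights], [g_lop, g_ref])
a, b = f(np.arange(3, dtype=theano.config.floatX),
         np.ones(3, dtype=theano.config.floatX))
assert np.allclose(a, b)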
Example #15
    def check_rop_lop(self, y, out_shape):
        """
        As check_mat_rop_lop, except the input is self.x which is a
        vector. The output is still a vector.
        """
        # TEST ROP
        vx = np.asarray(self.rng.uniform(size=self.in_shape),
                        theano.config.floatX)
        vv = np.asarray(self.rng.uniform(size=self.in_shape),
                        theano.config.floatX)

        yv = tensor.Rop(y, self.x, self.v)
        rop_f = function([self.x, self.v], yv, on_unused_input="ignore")
        J, _ = theano.scan(
            lambda i, y, x: tensor.grad(y[i], x),
            sequences=tensor.arange(y.shape[0]),
            non_sequences=[y, self.x],
        )
        sy = tensor.dot(J, self.v)

        scan_f = function([self.x, self.v], sy, on_unused_input="ignore")

        v1 = rop_f(vx, vv)
        v2 = scan_f(vx, vv)
        assert np.allclose(v1, v2), "ROP mismatch: %s %s" % (v1, v2)

        known_fail = False
        try:
            tensor.Rop(theano.clone(y, replace={self.x: break_op(self.x)}),
                       self.x, self.v)
        except ValueError:
            known_fail = True

        # TEST LOP

        vx = np.asarray(self.rng.uniform(size=self.in_shape),
                        theano.config.floatX)
        vv = np.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)

        yv = tensor.Lop(y, self.x, self.v)
        lop_f = function([self.x, self.v], yv, on_unused_input="ignore")
        J, _ = theano.scan(
            lambda i, y, x: tensor.grad(y[i], x),
            sequences=tensor.arange(y.shape[0]),
            non_sequences=[y, self.x],
        )
        sy = tensor.dot(self.v, J)

        scan_f = function([self.x, self.v], sy)

        v1 = lop_f(vx, vv)
        v2 = scan_f(vx, vv)
        assert np.allclose(v1, v2), "LOP mismatch: %s %s" % (v1, v2)

        if known_fail:
            pytest.skip("Rop does not handle non-differentiable inputs "
                        "correctly. Bug exposed by fixing Add.grad method.")
Example #16
 def setup(self, bottom, top):
     import theano.tensor as T
     import theano
     x = T.dvector('x')
     v = T.dvector('v')
     y = x * 2
     yg = T.Lop(y, x, v)
     self.f = theano.function([x], y)
     self.b = theano.function([x, v], yg, on_unused_input='warn')
Example #17
def test_rop_lop():
    mx = tensor.matrix('mx')
    mv = tensor.matrix('mv')
    v = tensor.vector('v')
    y = matrix_inverse(mx).sum(axis=0)

    yv = tensor.Rop(y, mx, mv)
    yv2 = tensor.Rop_via_Lop(y, mx, mv)
    rop_f = function([mx, mv], [yv, yv2])

    sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
                        sequences=tensor.arange(y.shape[0]),
                        non_sequences=[y, mx, mv])
    scan_f = function([mx, mv], sy)

    rng = np.random.RandomState(utt.fetch_seed())
    vx = np.asarray(rng.randn(4, 4), theano.config.floatX)
    vv = np.asarray(rng.randn(4, 4), theano.config.floatX)

    v1 = scan_f(vx, vv)
    v2, v3 = rop_f(vx, vv)

    assert _allclose(v2, v1), ('Rop mismatch: %s %s' % (v2, v1))
    assert _allclose(v3, v1), ('Rop_via_Lop mismatch: %s %s' % (v3, v1))

    raised = False
    try:
        tensor.Rop(theano.clone(y, replace={mx: break_op(mx)}), mx, mv)
    except ValueError:
        raised = True
    if not raised:
        raise Exception(('Op did not raise an error even though the function'
                         ' is not differentiable'))

    try:
        tensor.Rop_via_Lop(theano.clone(y, replace={mx: break_op(mx)}), mx, mv)
    except theano.gradient.NullTypeGradError:
        raised = True
    except theano.gradient.DisconnectedInputError:
        raised = True

    if not raised:
        raise Exception((
            'Rop_via_Lop for Op did not raise an error even though the function'
            ' is not differentiable'))

    vv = np.asarray(rng.uniform(size=(4, )), theano.config.floatX)
    yv = tensor.Lop(y, mx, v)
    lop_f = function([mx, v], yv)

    sy = tensor.grad((v * y).sum(), mx)
    scan_f = function([mx, v], sy)

    v1 = lop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert _allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
Example #18
    def check_rop_lop(self, y, out_shape):
        """
        As check_mat_rop_lop, except the input is self.x which is a
        vector. The output is still a vector.

        """
        # TEST ROP
        vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                           theano.config.floatX)
        vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                           theano.config.floatX)

        yv = tensor.Rop(y, self.x, self.v)
        rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
        J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                           sequences=tensor.arange(y.shape[0]),
                           non_sequences=[y, self.x])
        sy = tensor.dot(J, self.v)

        scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

        v1 = rop_f(vx, vv)
        v2 = scan_f(vx, vv)
        assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))
        known_fail = False
        try:
            self.check_nondiff_rop(
                theano.clone(y, replace={self.x: break_op(self.x)}))
        except AssertionError:
            known_fail = True

        # TEST LOP

        vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                           theano.config.floatX)
        vv = numpy.asarray(self.rng.uniform(size=out_shape),
                           theano.config.floatX)

        yv = tensor.Lop(y, self.x, self.v)
        lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
        J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                           sequences=tensor.arange(y.shape[0]),
                           non_sequences=[y, self.x])
        sy = tensor.dot(self.v, J)

        scan_f = function([self.x, self.v], sy)

        v1 = lop_f(vx, vv)
        v2 = scan_f(vx, vv)
        assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))

        if known_fail:
            raise KnownFailureTest(
                "Rop doesn't handle non-differentiable "
                "inputs correctly. Bug exposed by fixing Add.grad"
                " method.")
Example #19
    def __init__(self, p, inputs, s, costs):
        # useful data for reshaping
        self.shapes = [i.get_value().shape for i in p]
        self.sizes = map(np.prod, self.shapes)
        self.positions = np.cumsum([0] + self.sizes)[:-1]

        self.p = p
        self.inputs = inputs
        self.s = s
        self.costs = costs

        g = T.grad(costs[0], p)
        g = map(T.as_tensor_variable, g)  # for CudaNdarray
        self.f_gc = theano.function(inputs, g + costs)  # gradient computation
        self.f_cost = theano.function(inputs, costs)  # quick cost evaluation

        symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4

        coefficient = T.scalar()  # this is lambda*mu

        # this computes the product Gv = J'HJv (G is the Gauss-Newton matrix)
        v = [symbolic_types[len(i)]() for i in self.shapes]
        Jv = T.Rop(s, p, v)
        HJv = T.grad(T.sum(T.grad(costs[0], s) * Jv),
                     s,
                     consider_constant=[Jv])
        Gv = T.grad(T.sum(HJv * s), p, consider_constant=[HJv, Jv])
        Gv = map(T.as_tensor_variable, Gv)  # for CudaNdarray
        self.function_Gv = theano.function(inputs + v + [coefficient],
                                           Gv,
                                           givens={},
                                           on_unused_input='ignore')
        # compute J'sqrt(diag(H))v for jacobi preconditioner
        r = T.matrix()
        sqrt_Hv = T.sqrt(T.grad(T.sum(T.grad(costs[0], s)), s)) * r
        J_sqrt_Hv = T.Lop(s, p, sqrt_Hv)
        J_sqrt_Hv = map(T.as_tensor_variable, J_sqrt_Hv)  # for CudaNdarray

        self.function_J_sqrt_Hv = theano.function(inputs + [r],
                                                  J_sqrt_Hv,
                                                  givens={},
                                                  on_unused_input='ignore')
        # compute Hv
        dp = T.grad(costs[0], p)
        total = 0
        for dp_, v_ in zip(dp, v):
            total += T.sum(dp_ * v_)

        Hv = T.grad(total, p)
        Hv = map(T.as_tensor_variable, Hv)  # for CudaNdarray
        self.function_Hv = theano.function(inputs + v + [coefficient],
                                           Hv,
                                           on_unused_input='ignore')
Example #20
    def check_mat_rop_lop(self, y, out_shape):
        """
        Test the Rop/Lop when input is a matrix and the output is a vector

        :param y: the output variable of the op applied to self.mx
        :param out_shape: Used to generate a random tensor
                          corresponding to the evaluation point of the Rop
                          (i.e. the tensor with which you multiply the
                          Jacobian). It should be a tuple of ints.

        If the Op has more than 1 input, one of them must be mx, while
        others must be shared variables / constants. We will test only
        against the input self.mx, so you must call
        check_mat_rop_lop/check_rop_lop for the other inputs.

        We expect all inputs/outputs have dtype floatX.

        If you want to test an Op with an output matrix, add a sum
        after the Op you want to test.
        """
        vx = np.asarray(self.rng.uniform(size=self.mat_in_shape),
                        theano.config.floatX)
        vv = np.asarray(self.rng.uniform(size=self.mat_in_shape),
                        theano.config.floatX)
        yv = tensor.Rop(y, self.mx, self.mv)
        yv2 = tensor.Rop_via_Lop(y, self.mx, self.mv)
        rop_f = function([self.mx, self.mv], [yv, yv2],
                         on_unused_input='ignore')
        sy, _ = theano.scan(lambda i, y, x, v:
                            (tensor.grad(y[i], x) * v).sum(),
                            sequences=tensor.arange(y.shape[0]),
                            non_sequences=[y, self.mx, self.mv])
        scan_f = function([self.mx, self.mv], sy, on_unused_input='ignore')

        v1, v2 = rop_f(vx, vv)
        v3 = scan_f(vx, vv)

        assert np.allclose(v1, v3), ('ROP mismatch: %s %s' % (v1, v3))
        assert np.allclose(v2, v3), ('ROP_VIA_LOP mismatch: %s %s' % (v2, v3))

        self.check_nondiff_rop(
            theano.clone(y, replace={self.mx: break_op(self.mx)}))

        vv = np.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)
        yv = tensor.Lop(y, self.mx, self.v)
        lop_f = function([self.mx, self.v], yv)

        sy = tensor.grad((self.v * y).sum(), self.mx)
        scan_f = function([self.mx, self.v], sy)

        v1 = lop_f(vx, vv)
        v2 = scan_f(vx, vv)
        assert np.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
Example #21
 def _get_updates_for(self, param, grad):
     D_tm1 = shared_like(param, 'D_ewma')
     v = self.rng.normal(param.shape)
     if self.hv_method == 'rop':
         Hv = TT.Rop(grad, param, v)
     if self.hv_method == 'lop':
         Hv = TT.Lop(grad, param, v)
     if self.hv_method == 'grad':
         Hv = TT.grad(TT.sum(grad * v), param)
     D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv
     den = TT.sqrt(D_t) + self.epsilon
     yield D_tm1, D_t
     yield param, param - grad * self.learning_rate / den
Example #22
    def setup(self, bottom, top):
        weights = T.matrix("weights")
        weights_bc = weights.dimshuffle((0, 1, "x", "x"))
        feats = T.tensor4("weights")
        v = T.tensor3("v")

        dot = weights_bc * feats
        result = T.sum(dot, axis=1)

        g_w, g_f = T.Lop(result, [weights, feats], v)
        self.f = theano.function([weights, feats], result)
        self.b_w = theano.function([weights, feats, v], g_w)
        self.b_f = theano.function([weights, feats, v], g_f)
Example #23
    def setup(self, bottom, top):
        small_size = bottom[0].shape[1]
        small = T.matrix("small")
        big = T.tensor4("big")
        v = T.tensor4("v")
        small_bc = small.dimshuffle(0, 1, "x", "x")
        small_bc = T.addbroadcast(small_bc, 0)
        result = big + small_bc

        g_small, g_big = T.Lop(result, [small, big], v)
        self.f = theano.function([small, big], result)
        self.b_small = theano.function([v], g_small)
        self.b_big = theano.function([v], g_big)
Example #24
            def step(x_t, y_t, h_tmT, Wx, Wh, bh, Wy, by, lr, switch):

                # manually build the graph for the inner loop...
                # passing correct h_tm1 is impossible in nested scans
                yo_t = []
                h_tm1 = h_tmT
                for t in range(self.steps):
                    h_t = relu(T.dot(x_t[t], Wx) + T.dot(h_tm1, Wh) + bh)
                    yo_t.append(relu(T.dot(h_t, Wy) + by))
                    h_tm1 = h_t

                updates = OrderedDict()

                # Train the RNN: backprop (loss + DNI output)
                loss = T.mean(T.square(yo_t - y_t))
                dni_out = self.dni.output(h_t)
                for param in self.params:
                    dlossdparam = T.grad(loss, param)
                    dniJ = T.Lop(h_t,
                                 param,
                                 dni_out,
                                 disconnected_inputs='ignore')
                    updates[param] = param - lr * T.switch(
                        T.gt(switch, 0), dlossdparam + dniJ, dlossdparam)

                # Update the DNI (from the last step)
                # re-calculate the DNI prediction from the last step
                # note: can't be passed through scan or T.grad won't work
                dni_out_old = self.dni.output(h_tmT)
                # dni_target: current loss backprop'ed + new dni backprop'ed
                dni_target = T.grad(loss,h_tmT) \
                             +T.Lop(h_t,h_tmT,dni_out)
                dni_error = T.sum(T.square(dni_out_old - dni_target))
                for param in self.dni.params:
                    gparam = T.grad(dni_error, param)
                    updates[param] = param - lr * gparam

                return [h_t, loss, dni_error], updates
Example #25
 def gauss_vect_mult(v):
     """
     Multiply a vector by the Gauss-Newton matrix JHJ',
       where J is the Jacobian of the output with respect to the params
       and H is the Hessian of the cost with respect to the output.
       H should be diagonal and positive.
     Also adds the ridge term.
     """
     Jv = T.Rop(output, params, v)
     HJv = T.Rop(T.grad(opt_cost, output), output, Jv)
     JHJv = T.Lop(output, params, HJv)
     if not isinstance(JHJv, list):
         JHJv = [JHJv]
     JHJv = [a + ridge * b for a, b in zip(JHJv, v)]
     return JHJv
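A hedged sketch of the Gauss-Newton pattern used above, reduced to one parameter vector and a quadratic cost so that H is the identity; the variable names are illustrative.

# With J = d(output)/d(params): Gv = J' H J v, built from an Rop (forward)
# pass followed by an Lop (reverse) pass.
import numpy as np
import theano
import theano.tensor as T

w = T.vector('w')
v = T.vector('v')
output = T.tanh(w)                      # model output as a function of the params
cost = 0.5 * T.sum(output ** 2)         # quadratic cost => H = identity

Jv = T.Rop(output, w, v)                # forward-mode: J v
HJv = Jv                                # H = I here; in general scale by the output Hessian
Gv = T.Lop(output, w, HJv)              # reverse-mode: J' (H J v)

f = theano.function([w, v], Gv)
print(f(np.zeros(3, dtype=theano.config.floatX),
        np.ones(3, dtype=theano.config.floatX)))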
Example #26
 def setup(self, bottom, top):
     attention = T.tensor4("attention")
     input = T.tensor4("input")
     v = T.matrix("v")
     attention_bc = T.addbroadcast(attention, 1)
     attended = T.mul(input, attention_bc)
     result = T.sum(attended, axis=(2, 3))
     result_g_attention, result_g_input = T.Lop(result, [attention, input],
                                                v)
     self.f = theano.function([attention, input], result)
     self.b_attention = theano.function([attention, input, v],
                                        result_g_attention)
     self.b_input = theano.function([attention, input, v],
                                     result_g_input)
Example #27
    def parse_args(self, bottom, top):
        function_str = self.pythonargs[0]
        top_shape = self.pythonargs[1]

        old_function_str = self.function_str
        old_top_shape = self.top_shape
        self.function_str = function_str
        self.top_shape = top_shape
        if function_str != old_function_str or len(top_shape) != len(
                old_top_shape):
            if old_function_str != '':
                print(
                    'TheanoGPU function string different from cache: recompiling'
                )
            import theano.tensor as T
            import theano
            from theano.sandbox.cuda.basic_ops import gpu_from_host
            x = []
            for i in range(len(bottom)):
                if len(bottom[i].shape) == 1:
                    x.append(T.vector('x%d' % i))
                if len(bottom[i].shape) == 2:
                    x.append(T.matrix('x%d' % i))
                if len(bottom[i].shape) == 3:
                    x.append(T.tensor3('x%d' % i))
                if len(bottom[i].shape) == 4:
                    x.append(T.tensor4('x%d' % i))

            y = eval(function_str)
            self.f = theano.function(x,
                                     gpu_from_host(y),
                                     on_unused_input='ignore')

            if len(self.top_shape) == 1:
                v = T.vector('v')
            elif len(self.top_shape) == 2:
                v = T.matrix('v')
            elif len(self.top_shape) == 3:
                v = T.tensor3('v')
            elif len(self.top_shape) == 4:
                v = T.tensor4('v')
            self.b = []
            for i in range(len(bottom)):
                yg = T.Lop(y, x[i], v)
                self.b.append(
                    theano.function(x + [v],
                                    gpu_from_host(yg),
                                    on_unused_input='ignore'))
Example #28
    def check_rop_lop(self, y, out_shape):
        """
        As check_mat_rop_lop, except the input is self.x which is a
        vector. The output is still a vector.

        """
        # TEST ROP
        vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                           theano.config.floatX)
        vv = numpy.asarray(self.rng.uniform(size=self.in_shape),
                           theano.config.floatX)

        yv = tensor.Rop(y, self.x, self.v)
        rop_f = function([self.x, self.v], yv, on_unused_input='ignore')
        J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                           sequences=tensor.arange(y.shape[0]),
                           non_sequences=[y, self.x])
        sy = tensor.dot(J, self.v)

        scan_f = function([self.x, self.v], sy, on_unused_input='ignore')

        v1 = rop_f(vx, vv)
        v2 = scan_f(vx, vv)
        assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))
        self.check_nondiff_rop(
            theano.clone(y, replace={self.x: break_op(self.x)}))

        # TEST LOP

        vx = numpy.asarray(self.rng.uniform(size=self.in_shape),
                           theano.config.floatX)
        vv = numpy.asarray(self.rng.uniform(size=out_shape),
                           theano.config.floatX)

        yv = tensor.Lop(y, self.x, self.v)
        lop_f = function([self.x, self.v], yv, on_unused_input='ignore')
        J, _ = theano.scan(lambda i, y, x: tensor.grad(y[i], x),
                           sequences=tensor.arange(y.shape[0]),
                           non_sequences=[y, self.x])
        sy = tensor.dot(self.v, J)

        scan_f = function([self.x, self.v], sy)

        v1 = lop_f(vx, vv)
        v2 = scan_f(vx, vv)
        assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
Example #29
def compute_Lx(energies, params, deltas):
    # expectations and derivatives are commutative.
    cenergies = energies - T.mean(energies)
    Minv = T.cast(1. / energies.shape[0], floatX)

    rhs_terms = []
    for param_j, delta_j in zip(params, deltas):
        rhs_term = T.Rop(cenergies, param_j, delta_j)
        rhs_terms += [rhs_term]

    Lx_terms = []
    for param_i in params:
        Lx_term = 0
        for rhs in rhs_terms:
            Lx_term += Minv * T.Lop(cenergies, param_i, rhs)
        Lx_terms += [Lx_term]
    return Lx_terms
Example #30
                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               cgv))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs