Example #1
 def compute_Gv(*args):
     (hid_sig, hid_sftmax) = self.get_hiddens()
     nw_args1 = TT.Lop(
         hid_sig, self.params,
         TT.Rop(hid_sig, self.params, args) /
         ((1 - hid_sig) * hid_sig * self.batchsize))
     nw_args2 = TT.Lop(
         hid_sftmax, self.params,
         TT.Rop(hid_sftmax, self.params, args) /
         (hid_sftmax * self.batchsize))
     fin_vals = [x + y for x, y in zip(nw_args1, nw_args2)]
     new_vals = safe_clone(fin_vals, [self.X, self.Y],
                           [self.loc_x, self.loc_y])
     return new_vals, {}
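This compute_Gv builds a metric-vector product without ever materializing the metric: TT.Rop gives the Jacobian-vector product of the hidden activations with respect to the parameters, the result is rescaled by the per-unit Gauss-Newton/Fisher factor of the sigmoid and softmax units, and TT.Lop multiplies by the transposed Jacobian. Below is a minimal NumPy sketch of the same product for a single sigmoid layer; the shapes and names are illustrative assumptions, not taken from the original code.

import numpy as np

# Toy setup: one sigmoid layer h = sigmoid(X @ W). We compute
# Gv = J^T diag(1 / (h*(1-h)*batchsize)) J v, where J is the Jacobian of h
# with respect to W and v is the vector being multiplied by the metric.
rng = np.random.RandomState(0)
batchsize, n_in, n_out = 4, 3, 2
X = rng.randn(batchsize, n_in)
W = rng.randn(n_in, n_out)
v = rng.randn(n_in, n_out)

h = 1.0 / (1.0 + np.exp(-X.dot(W)))          # (batchsize, n_out)

Rh = h * (1.0 - h) * X.dot(v)                # J v, what TT.Rop returns here
scaled = Rh / (h * (1.0 - h) * batchsize)    # rescale by the unit-wise metric
Gv = X.T.dot(h * (1.0 - h) * scaled)         # J^T (...), what TT.Lop returns
print(Gv.shape)                              # (3, 2), the same shape as v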
Example #2
 def compute_Gv(*args):
     (hid_sig, hid_sftmax) = self.get_hiddens()
     nw_args1 = TT.Lop(hid_sig,
                      self.params,
                      TT.Rop(hid_sig,
                             self.params,
                             args)/((1-hid_sig)*hid_sig*self.batchsize))
     nw_args2 = TT.Lop(hid_sftmax,
                      self.params,
                      TT.Rop(hid_sftmax,
                             self.params,
                             args)/(hid_sftmax*self.batchsize))
     fin_vals = [x+y for x,y in zip(nw_args1, nw_args2)]
     new_vals = safe_clone(fin_vals, [self.X, self.Y], [self.loc_x,
                                                        self.loc_y])
     return new_vals, {}
Example #3
    def __init__(self,
                 model,
                 state,
                 data):
        """
        Parameters:
            :param model:
                Class describing the model used. It should provide the
                computational graph to evaluate the model
            :param state:
                Dictionary containing the current state of your job. This
                includes the configuration of the job, specifically the seed,
                the starting damping factor, batch size, etc. See main.py
                for details
            :param data:
                Class describing the dataset used by the model
        """

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        n_params = len(model.params)
        cbs = state['cbs']
        bs = state['bs']
        ebs = state['ebs']
        mbs = state['mbs']
        profile = state['profile']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])
        self.damping = theano.shared(numpy.float32(state['damp']))

        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]

        self.loop_inps = [theano.shared(
            numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape]
        self.loop_outs = [theano.shared(
            numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape]
        self.step = 0
        self.cbs = cbs
        self.bs = bs
        self.ebs = ebs
        self.mbs = mbs
        self.state = state
        self.profile = profile
        self.data = data
        self.step_timer = time.time()

        ############################################################
        # Step 1. Compile function for computing Euclidean gradients
        ############################################################
        print 'Constructing grad function'
        bdx = TT.iscalar('batch_idx')
        loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in
                    self.data.variables]
        cost = safe_clone(model.train_cost, model.inputs, loc_data)
        gs = TT.grad(cost, model.params)
        ratio = numpy.float32(float(bs) / cbs)
        update = [(g, g + lg / ratio) for g, lg in zip(self.gs, gs)]

        print 'Compiling grad function'
        st = time.time()
        self.loc_grad_fn = theano.function(
            [bdx],
            [],
            updates=update, name='loc_fn_grad',
            profile=profile)
        print 'took', time.time() - st

        #############################################################
        # Step 2. Compile function for computing Riemannian gradients
        #############################################################
        loc_x = self.data._natgrad[bdx*cbs: (bdx+1)*cbs]
        loc_y = self.data._natgrady[bdx*cbs:(bdx+1)*cbs]
        loc_Gvs = safe_clone(model.Gvs(*self.loop_inps), [model.X, model.Y],
                             [loc_x, loc_y])
        updates = [(l, l + lg) for l, lg in zip(self.loop_outs, loc_Gvs)]
        st = time.time()
        loc_Gv_fn = theano.function(
            [bdx], [], updates=updates, name='loc_fn_rop', profile=profile)
        print 'took', time.time() - st

        def compute_Gv(*args):
            rval = forloop(loc_Gv_fn,
                           mbs // cbs,
                           self.loop_inps,
                           self.loop_outs)(*args)
            return rval, {}

        print 'Constructing riemannian gradient function'
        st = time.time()
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        if not state['minresQLP']:
            self.msgs = minres_messages
            rvals = minres(compute_Gv,
                           [x / norm_grads for x in self.gs],
                           rtol=state['mrtol'],
                           damp=self.damping,
                           maxit=state['miters'],
                           profile=state['profile'])
        else:
            self.msgs = minresQLP_messages[1:]
            rvals = minresQLP(compute_Gv,
                              [x / norm_grads for x in self.gs],
                              model.params_shape,
                              rtol=state['mrtol'],
                              damp=self.damping,
                              maxit=state['miters'],
                              TranCond=state['trancond'],
                              profile=state['profile'])

        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = TT.cast(rvals[1], 'int32')
        niters = rvals[2]
        rel_residual = rvals[3]
        Anorm = rvals[4]
        Acond = rvals[5]

        norm_rs_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in nw_rs))
        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))
        updates = zip(self.rs, nw_rs)
        print 'took', time.time() - st
        print 'Compiling riemannian gradient function'
        st = time.time()
        self.compute_natural_gradients = theano.function(
            [],
            [flag, niters, rel_residual, Anorm, Acond,
             norm_grads, norm_rs_grads, norm_ord0],
            updates=updates,
            allow_input_downcast = True,
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            profile=profile)
        print 'took', time.time() - st
        ###########################################################
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        ###########################################################
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(state['lr'])
        loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in
                    self.data.variables]
        old_cost = safe_clone(model.train_cost, model.inputs, loc_data)
        self.loc_old_cost = theano.function(
            [bdx], old_cost, name='loc_old_cost', profile=profile)
        new_params = [p - lr * r for p, r in zip(model.params, self.rs)]
        new_cost = safe_clone(model.train_cost,
                              model.inputs + model.params,
                              loc_data + new_params)
        new_err = safe_clone(model.error,
                             model.inputs + model.params,
                             loc_data + new_params)
        self.loc_new_cost = theano.function(
            [bdx, lr], [new_cost, new_err], name='loc_new_cost',
            profile=profile)

        self.lr = numpy.float32(state['lr'])
        updates = dict(zip(model.params, new_params))
        model.dbm_class.censor_updates(updates)
        self.update_params = theano.function(
            [lr], [], updates=updates,
            name='update_params')
        old_cost = TT.scalar('old_cost')
        new_cost = TT.scalar('new_cost')
        p_norm = TT.scalar('p_norm')
        prod = sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
        #pnorm = TT.sqrt(sum(TT.sum(g*g) for g in self.gs)) * \
        #        TT.sqrt(sum(TT.sum(r*r) for r in self.rs))
        dist = -lr * prod
        angle = prod / p_norm
        rho = (new_cost - old_cost) / dist
        self.compute_rho = theano.function(
            [old_cost, new_cost, lr, p_norm], [rho, dist, angle],
            name='compute_rho', profile=profile)
        self.old_cost = 1e20
        self.__new_cost = 0
        self.__error = 0
        self.return_names = ['cost',
                             'old_cost',
                             'error',
                             'time_grads',
                             'time_metric',
                             'time_eval',
                             'minres_flag',
                             'minres_iters',
                             'minres_relres',
                             'minres_Anorm',
                             'minres_Acond',
                             'norm_ord0',
                             'norm_grad',
                             'norm_nat',
                             'lr',
                             'grad_angle',
                             #'r_g',
                             #'icost',
                             'damping',
                             'rho'
                            ]
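The state dictionary documented in the docstring above drives the whole construction. Judging from the keys this example reads, a configuration could look roughly like the following; the values are placeholders, not the actual defaults from main.py.

state = {
    'seed': 123,          # numpy RNG seed
    'bs': 1000,           # gradient batch size
    'cbs': 100,           # chunk size used when slicing a batch
    'mbs': 1000,          # batch size for the metric (Gv) computation
    'ebs': 1000,          # batch size used for evaluation
    'damp': 5.0,          # initial damping added to the metric
    'minresQLP': True,    # use MinresQLP instead of plain minres
    'mrtol': 1e-4,        # relative tolerance of the linear solver
    'miters': 100,        # maximal number of solver iterations
    'trancond': 1e-4,     # MinresQLP transition threshold
    'lr': 0.1,            # learning rate
    'profile': False,     # enable Theano profiling
}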
Example #4
    def __init__(self,
                 model,
                 state,
                 data):
        """
        Parameters:
            :param model:
                Class describing the model used. It should provide the
                computational graph to evaluate the model
            :param state:
                Dictionary containing the current state of your job. This
                includes the configuration of the job, specifically the seed,
                the starting damping factor, batch size, etc. See main.py
                for details
            :param data:
                Class describing the dataset used by the model
        """

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        n_params = len(model.params)
        cbs = state['cbs']
        bs = state['bs']
        mbs = state['mbs']
        ebs = state['ebs']
        profile = state['profile']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])
        srng = RandomStreams(self.rng.randint(213))

        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]

        self.loop_inps = [theano.shared(
            numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape]
        self.loop_outs = [theano.shared(
            numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape]
        self.step = 0
        self.cbs = cbs
        self.bs = bs
        self.mbs = mbs
        self.ebs = ebs
        self.state = state
        self.profile = profile
        self.data = data
        self.step_timer = time.time()

        ############################################################
        # Step 1. Compile function for computing Euclidean gradients
        ############################################################
        print 'Constructing grad function'
        bdx = TT.iscalar('batch_idx')
        loc_data = [x(bdx * cbs, (bdx + 1) * cbs) for x in
                    self.data.variables]
        cost = safe_clone(model.train_cost, model.inputs, loc_data)
        gs = TT.grad(cost, model.params)
        ratio = numpy.float32(float(bs) / cbs)
        update = [(g, g + lg / ratio) for g, lg in zip(self.gs, gs)]
        print 'Compiling grad function'
        st = time.time()
        self.loc_grad_fn = theano.function(
            [bdx], [], updates=update, name='loc_fn_grad', profile=profile)
        print 'took', time.time() - st

        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        ###########################################################
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        ###########################################################
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(state['lr'])
        loc_data = [x(bdx * cbs, (bdx + 1) * cbs) for x in
                    self.data.variables]
        old_cost = safe_clone(model.train_cost, model.inputs, loc_data)
        self.loc_old_cost = theano.function(
            [bdx], old_cost, name='loc_old_cost', profile=profile)
        new_params = [p - lr * r for p, r in zip(model.params, self.gs)]
        new_cost = safe_clone(model.train_cost,
                              model.inputs + model.params,
                              loc_data + new_params)
        new_err = safe_clone(model.error,
                             model.inputs + model.params,
                             loc_data + new_params)
        self.loc_new_cost = theano.function(
            [bdx, lr], [new_cost, new_err], name='loc_new_cost',
            profile=profile)

        loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in
                    self.data.eval_variables]
        new_cost = safe_clone(model.train_cost,
                              model.inputs + model.params,
                              loc_data + new_params)
        new_err = safe_clone(model.error,
                             model.inputs + model.params,
                             loc_data + new_params)
        self.loc_new_cost_all = theano.function(
            [bdx, lr], [new_cost, new_err], name='loc_new_cost',
            profile=profile)
        self.update_params = theano.function(
            [lr], [], updates=zip(model.params, new_params),
            name='update_params')
        old_cost = TT.scalar('old_cost')
        new_cost = TT.scalar('new_cost')
        dist = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.gs)])
        rho = (new_cost - old_cost) / dist
        self.compute_rho = theano.function(
            [old_cost, new_cost, lr], [rho, norm_grads],
            name='compute_rho', profile=profile)
        self.old_cost = 1e20
        self.return_names = ['cost',
                             'error',
                             'time_grads',
                             'time_eval',
                             'norm_grad',
                             'rho',
                             'lr']
Example #5
    def __init__(self, model, state, data):
        """
        Parameters:
            :param model:
                Class describing the model used. It should provide the
                computational graph to evaluate the model
            :param state:
                Dictionary containing the current state of your job. This
                includes the configuration of the job, specifically the seed,
                the starting damping factor, batch size, etc. See main.py
                for details
            :param data:
                Class describing the dataset used by the model
        """

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        n_params = len(model.params)
        bs = state['bs']
        profile = state['profile']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])

        self.gs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]

        self.bs = bs
        self.state = state
        self.profile = profile
        self.data = data
        self.data.set_iterator(order='sequence',
                               rng=self.rng,
                               batchsize=self.state['bs'])
        self.data_iter = data.__iter__()
        self.step_timer = time.time()

        ############################################################
        # Step 1. Compile function for computing Euclidean gradients
        ############################################################
        print 'Constructing grad function'
        gs = TT.grad(self.model.train_cost, model.params)
        update = [(g, lg) for g, lg in zip(self.gs, gs)]
        print 'Compiling grad function'
        st = time.time()
        self.grad_fn = theano.function(self.model.inputs, [],
                                       updates=update,
                                       name='loc_fn_grad',
                                       profile=profile)
        print 'took', time.time() - st

        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        ###########################################################
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        ###########################################################
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(state['lr'])
        old_cost = model.train_cost
        self.compute_old_cost = theano.function(self.model.inputs,
                                                old_cost,
                                                name='loc_old_cost',
                                                profile=profile)
        new_params = [p - lr * r for p, r in zip(model.params, self.gs)]
        new_cost = safe_clone(model.train_cost, model.params, new_params)
        new_err = safe_clone(model.error, model.params, new_params)
        self.compute_new_cost = theano.function([lr] + self.model.inputs,
                                                [new_cost, new_err],
                                                name='loc_new_cost',
                                                profile=profile)

        self.update_params = theano.function([lr], [],
                                             updates=zip(
                                                 model.params, new_params),
                                             name='update_params')
        old_cost = TT.scalar('old_cost')
        new_cost = TT.scalar('new_cost')
        dist = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.gs)])
        rho = (new_cost - old_cost) / dist
        self.compute_rho = theano.function([old_cost, new_cost, lr],
                                           [rho, norm_grads],
                                           name='compute_rho',
                                           profile=profile)
        self.old_cost = 1e20
        self.step = 0
        self.return_names = [
            'cost', 'error', 'time_grads', 'time_eval', 'norm_grad', 'rho',
            'lr'
        ]
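The compiled functions in this simplest variant take raw minibatches as inputs rather than batch indices into shared data. A hypothetical driver for a single step might look as follows, assuming opt is an instance of the class, model.inputs is [X, Y], and the data iterator yields (x, y) pairs; this is a sketch, not code from the original training loop.

x, y = next(opt.data_iter)                    # assumed to yield an (x, y) pair
opt.grad_fn(x, y)                             # writes the gradients into opt.gs
old_cost = opt.compute_old_cost(x, y)
new_cost, new_err = opt.compute_new_cost(opt.lr, x, y)
rho, grad_norm = opt.compute_rho(old_cost, new_cost, opt.lr)
if new_cost < old_cost:                       # accept the step only if it helps
    opt.update_params(opt.lr)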
Example #6
    def __init__(self,
                 X,
                 Y,
                 dbm,
                 cost,
                 batchsize=200,
                 init_damp = 5.,
                 min_damp = .001,
                 damp_ratio = 5./4.,
                 mrtol = 1e-4,
                 miters = 100,
                 trancond = 1e-4,
                 lr = .1,
                 adapt_rho = 1):
        """
        X: theano design matrix of inputs
        Y: theano design matrix of features
        batchsize: int, describing the batch size
        init_damp: float, initial damping value
        min_damp: float, minimal damping value allowed
        damp_ratio: float, ratio used to increase damping (we decrease by
                    1./ratio)
        mrtol: float, relative tolerance for the inversion of the metric
        miters: int, maximal number of iterations for minres
        trancond: float, (ignore) threshold for switching from MinresQLP to Minres
        lr: float/shared variable, learning rate
        adapt_rho: 0 or 1, whether the damping should be heuristically adapted
        """
        self.batchsize = batchsize
        self.adapt_rho = adapt_rho
        self.damp_ratio = damp_ratio
        self.min_damp = min_damp

        self.dbm = dbm
        self.cost = cost

        self.X = X
        self.Y = Y

        descr = self.cost.get_fixed_var_descr(self.dbm, X, Y)

        self._on_load_batch = descr.on_load_batch[0]

        self.drop_mask = descr.fixed_vars['drop_mask']
        self.drop_mask_Y = descr.fixed_vars['drop_mask_Y']
        self.params = self.get_params()
        self.params_shape = [x.get_value(borrow=True).shape for x in
                             self.params]

        self.damping = theano.shared(numpy.float32(init_damp))

        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in self.params_shape]
        self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in self.params_shape]

        cost = self.get_cost()
        gs = TT.grad(cost, self.params)
        self.loc_grad_fn = theano.function([self.X, self.Y],
                                           [],
                                           updates = zip(self.gs, gs),
                                           name = 'loc_fn_grad')

        ### ### ### ### ### ### ###
        self.loc_x = theano.shared(numpy.zeros((20,784), dtype='float32'))
        self.loc_y = theano.shared(numpy.zeros((20,10), dtype='float32'))

        def compute_Gv(*args):
            (hid_sig, hid_sftmax) = self.get_hiddens()
            nw_args1 = TT.Lop(hid_sig,
                             self.params,
                             TT.Rop(hid_sig,
                                    self.params,
                                    args)/((1-hid_sig)*hid_sig*self.batchsize))
            nw_args2 = TT.Lop(hid_sftmax,
                             self.params,
                             TT.Rop(hid_sftmax,
                                    self.params,
                                    args)/(hid_sftmax*self.batchsize))
            fin_vals = [x+y for x,y in zip(nw_args1, nw_args2)]
            new_vals = safe_clone(fin_vals, [self.X, self.Y], [self.loc_x,
                                                               self.loc_y])
            return new_vals, {}


        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        self.msgs = minresQLP_messages[1:]
        rvals = minresQLP(compute_Gv,
                          [x / norm_grads for x in self.gs],
                          self.params_shape,
                          rtol=mrtol,
                          damp=self.damping,
                          maxit=miters,
                          TranCond=trancond)
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = TT.cast(rvals[1], 'int32')
        niters = rvals[2]
        rel_residual = rvals[3]
        Anorm = rvals[4]
        Acond = rvals[5]

        norm_rs_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in nw_rs))
        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))
        updates = zip(self.rs, nw_rs)
        self.compute_natural_gradients = theano.function(
            [],
            [flag, niters, rel_residual, Anorm, Acond,
             norm_grads, norm_rs_grads, norm_ord0],
            updates=updates,
            allow_input_downcast = True,
            name='compute_riemannian_gradients',
            on_unused_input='warn')

        self.loc_old_cost = theano.function(
            [self.X, self.Y], cost, name='loc_old_cost')
        new_params = [p - lr * r for p, r in zip(self.params, self.rs)]
        new_cost = safe_clone(cost,
                              self.params,
                              new_params)
        new_err = safe_clone(cost,
                             self.params,
                             new_params)
        self.loc_new_cost = theano.function(
            [self.X, self.Y], [new_cost, new_err], name='loc_new_cost')

        updates = dict(zip(self.params, new_params))
        self.censor_updates(updates)
        self.update_params = theano.function(
            [], [], updates=updates,
            name='update_params')
        old_cost = TT.scalar('old_cost')
        new_cost = TT.scalar('new_cost')
        p_norm = TT.scalar('p_norm')
        prod = sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
        #pnorm = TT.sqrt(sum(TT.sum(g*g) for g in self.gs)) * \
        #        TT.sqrt(sum(TT.sum(r*r) for r in self.rs))
        dist = -lr * prod
        angle = prod / p_norm
        rho = (new_cost - old_cost) / dist
        self.compute_rho = theano.function(
            [old_cost, new_cost, p_norm], [rho, dist, angle],
            name='compute_rho')
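The adapt_rho, damp_ratio and min_damp arguments in the signature above suggest the usual Levenberg-Marquardt style heuristic: compare the actual cost reduction with the reduction predicted by the local model (the rho returned by compute_rho) and raise or lower the damping accordingly. A sketch of such a rule follows; the 0.25/0.75 thresholds are conventional choices and are not taken from this code.

def adapt_damping(damping, rho, damp_ratio=5. / 4., min_damp=.001):
    # rho well below 1: the quadratic model over-predicted the improvement,
    # so trust it less and increase the damping.
    if rho < 0.25:
        return damping * damp_ratio
    # rho close to (or above) 1: the model predicted well; decrease the
    # damping by 1/ratio, but never below the allowed minimum.
    if rho > 0.75:
        return max(damping / damp_ratio, min_damp)
    return damping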
Example #7
    def __init__(self,
                 X,
                 Y,
                 dbm,
                 cost,
                 batchsize=200,
                 init_damp=5.,
                 min_damp=.001,
                 damp_ratio=5. / 4.,
                 mrtol=1e-4,
                 miters=100,
                 trancond=1e-4,
                 lr=.1,
                 adapt_rho=1):
        """
        X: theano design matrix of inputs
        Y: theano design matrix of features
        batchsize: int, describing the batch size
        init_damp: float, initial damping value
        min_damp: float, minimal damping value allowed
        damp_ratio: float, ratio used to increase damping (we decrease by
                    1./ratio)
        mrtol: float, relative tolerance for the inversion of the metric
        miters: int, maximal number of iterations for minres
        trancond: float, (ignore) threshold for switching from MinresQLP to Minres
        lr: float/shared variable, learning rate
        adapt_rho: 0 or 1, whether the damping should be heuristically adapted
        """
        self.batchsize = batchsize
        self.adapt_rho = adapt_rho
        self.damp_ratio = damp_ratio
        self.min_damp = min_damp

        self.dbm = dbm
        self.cost = cost

        self.X = X
        self.Y = Y

        descr = self.cost.get_fixed_var_descr(self.dbm, X, Y)

        self._on_load_batch = descr.on_load_batch[0]

        self.drop_mask = descr.fixed_vars['drop_mask']
        self.drop_mask_Y = descr.fixed_vars['drop_mask_Y']
        self.params = self.get_params()
        self.params_shape = [
            x.get_value(borrow=True).shape for x in self.params
        ]

        self.damping = theano.shared(numpy.float32(init_damp))

        self.gs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in self.params_shape
        ]
        self.rs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in self.params_shape
        ]

        cost = self.get_cost()
        gs = TT.grad(cost, self.params)
        self.loc_grad_fn = theano.function([self.X, self.Y], [],
                                           updates=zip(self.gs, gs),
                                           name='loc_fn_grad')

        ### ### ### ### ### ### ###
        self.loc_x = theano.shared(numpy.zeros((20, 784), dtype='float32'))
        self.loc_y = theano.shared(numpy.zeros((20, 10), dtype='float32'))

        def compute_Gv(*args):
            (hid_sig, hid_sftmax) = self.get_hiddens()
            nw_args1 = TT.Lop(
                hid_sig, self.params,
                TT.Rop(hid_sig, self.params, args) /
                ((1 - hid_sig) * hid_sig * self.batchsize))
            nw_args2 = TT.Lop(
                hid_sftmax, self.params,
                TT.Rop(hid_sftmax, self.params, args) /
                (hid_sftmax * self.batchsize))
            fin_vals = [x + y for x, y in zip(nw_args1, nw_args2)]
            new_vals = safe_clone(fin_vals, [self.X, self.Y],
                                  [self.loc_x, self.loc_y])
            return new_vals, {}

        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        self.msgs = minresQLP_messages[1:]
        rvals = minresQLP(compute_Gv, [x / norm_grads for x in self.gs],
                          self.params_shape,
                          rtol=mrtol,
                          damp=self.damping,
                          maxit=miters,
                          TranCond=trancond)
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = TT.cast(rvals[1], 'int32')
        niters = rvals[2]
        rel_residual = rvals[3]
        Anorm = rvals[4]
        Acond = rvals[5]

        norm_rs_grads = TT.sqrt(sum(TT.sum(x**2) for x in nw_rs))
        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
        updates = zip(self.rs, nw_rs)
        self.compute_natural_gradients = theano.function(
            [], [
                flag, niters, rel_residual, Anorm, Acond, norm_grads,
                norm_rs_grads, norm_ord0
            ],
            updates=updates,
            allow_input_downcast=True,
            name='compute_riemannian_gradients',
            on_unused_input='warn')

        self.loc_old_cost = theano.function([self.X, self.Y],
                                            cost,
                                            name='loc_old_cost')
        new_params = [p - lr * r for p, r in zip(self.params, self.rs)]
        new_cost = safe_clone(cost, self.params, new_params)
        new_err = safe_clone(cost, self.params, new_params)
        self.loc_new_cost = theano.function([self.X, self.Y],
                                            [new_cost, new_err],
                                            name='loc_new_cost')

        updates = dict(zip(self.params, new_params))
        self.censor_updates(updates)
        self.update_params = theano.function([], [],
                                             updates=updates,
                                             name='update_params')
        old_cost = TT.scalar('old_cost')
        new_cost = TT.scalar('new_cost')
        p_norm = TT.scalar('p_norm')
        prod = sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
        #pnorm = TT.sqrt(sum(TT.sum(g*g) for g in self.gs)) * \
        #        TT.sqrt(sum(TT.sum(r*r) for r in self.rs))
        dist = -lr * prod
        angle = prod / p_norm
        rho = (new_cost - old_cost) / dist
        self.compute_rho = theano.function([old_cost, new_cost, p_norm],
                                           [rho, dist, angle],
                                           name='compute_rho')
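The Lop/Rop expressions inside compute_Gv implement a metric-vector product without ever materializing the metric. Writing J_sig and J_soft for the Jacobians of the sigmoid activations h and the softmax activations p with respect to the parameters, B for the batch size, and v for the input vectors, the returned value is

G v = \frac{1}{B}\left[ J_{\mathrm{sig}}^{\top}\,\operatorname{diag}\!\left(\frac{1}{h(1-h)}\right) J_{\mathrm{sig}}\, v \;+\; J_{\mathrm{soft}}^{\top}\,\operatorname{diag}\!\left(\frac{1}{p}\right) J_{\mathrm{soft}}\, v \right]

minresQLP then approximately solves the damped system (G + damping * I) x = g / ||g|| with this operator, and the solution is rescaled by ||g|| to give the natural-gradient direction nw_rs.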