Example #1
def test_2():
    h = 1
    a = -10
    b = -a
    n = 2 * b // h + 1
    A = numpy.zeros((n, n), dtype=theano.config.floatX)
    v = a
    for k in xrange(n):
        A[k, k] = v
        v += h
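    # A is diagonal with entries -10, ..., 10: symmetric, indefinite and
    # singular (one zero eigenvalue), so with b = ones the system is
    # inconsistent; maxxnorm presumably caps the solution norm in that case.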
    b = numpy.ones((n, ), dtype=theano.config.floatX)
    rtol = numpy.asarray(1e-6, theano.config.floatX)
    maxxnorm = 1e8
    maxit = 50
    tA = theano.shared(A.astype(theano.config.floatX))
    tb = theano.shared(b.astype(theano.config.floatX))
    compute_Av = lambda x: ([TT.dot(tA, x)], {})
    xs, flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm, updates = \
            minres.minres(compute_Av,
                   [tb],
                   rtol=rtol,
                   maxit=maxit,
                   maxxnorm=maxxnorm,
                   profile=0)

    func = theano.function(
        [],
        xs + [flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm],
        name='func',
        profile=0,
        updates=updates,
        mode=theano.Mode(linker='cvm'))
    rvals = func()
    print 'flag', rvals[1]
    print minres.messages[int(rvals[1])]
    print 'iters', rvals[2]
    print 'relres', rvals[3]
    print 'relAres', rvals[4]
    print 'Anorm', rvals[5]
    print 'Acond', rvals[6]
    print 'xnorm', rvals[7]
    print 'Axnorm', rvals[8]
    print rvals[0]
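Note: the snippets on this page omit their import headers. A minimal set they appear to assume, inferred from usage (the exact path of the MINRES solver module in the original repository may differ), would be:

import numpy
import theano
import theano.tensor as TT

import minres  # module providing minres.minres(...) and minres.messages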
Example #2
def test_1():
    n = 100
    on = numpy.ones((n, 1), dtype=theano.config.floatX)
    A = numpy.zeros((n, n), dtype=theano.config.floatX)
    for k in xrange(n):
        A[k, k] = 4.
        if k > 0:
            A[k - 1, k] = -2.
            A[k, k - 1] = -2.
    b = A.sum(axis=1)
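    # b = A.dot(ones), so the exact solution is the all-ones vector; the
    # 'error' printed at the end is the residual norm ||A x - b||.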
    rtol = numpy.asarray(1e-10, dtype=theano.config.floatX)
    maxit = 50
    M = numpy.ones((n, ), dtype=theano.config.floatX) * 4.
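    # M holds the diagonal of A (all 4's); it is presumably used through
    # Ms=[tM] as a Jacobi-style diagonal preconditioner.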
    tA = theano.shared(A.astype(theano.config.floatX))
    tb = theano.shared(b.astype(theano.config.floatX))
    tM = theano.shared(M.astype(theano.config.floatX))
    compute_Av = lambda x: ([TT.dot(tA, x)], {})
    xs, flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm, updates = \
            minres.minres(compute_Av,
                   [tb],
                   rtol=rtol,
                   maxit=maxit,
                   Ms=[tM],
                   profile=0)

    func = theano.function(
        [],
        xs + [flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm],
        name='func',
        profile=0,
        updates=updates,
        mode=theano.Mode(linker='cvm'))
    rvals = func()
    print 'flag', rvals[1]
    print minres.messages[int(rvals[1])]
    print 'iters', rvals[2]
    print 'relres', rvals[3]
    print 'relAres', rvals[4]
    print 'Anorm', rvals[5]
    print 'Acond', rvals[6]
    print 'xnorm', rvals[7]
    print 'Axnorm', rvals[8]
    print 'error', numpy.sqrt(numpy.sum((numpy.dot(rvals[0], A) - b)**2))
    print
Example #3
def test_1():
    n = 100
    on = numpy.ones((n, 1), dtype=theano.config.floatX)
    A = numpy.zeros((n, n), dtype=theano.config.floatX)
    for k in xrange(n):
        A[k, k] = 4.
        if k > 0:
            A[k - 1, k] = -2.
            A[k, k - 1] = -2.
    b = A.sum(axis=1)
    rtol = numpy.asarray(1e-10, dtype=theano.config.floatX)
    maxit = 50
    M = numpy.ones((n,), dtype=theano.config.floatX) * 4.
    tA = theano.shared(A.astype(theano.config.floatX))
    tb = theano.shared(b.astype(theano.config.floatX))
    tM = theano.shared(M.astype(theano.config.floatX))
    compute_Av = lambda x: ([TT.dot(tA, x)], {})
    xs, flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm, updates = \
            minres.minres(compute_Av,
                   [tb],
                   rtol=rtol,
                   maxit=maxit,
                   Ms=[tM],
                   profile=0)

    func = theano.function([],
                           xs + [flag, iters, relres, relAres, Anorm, Acond,
                                 xnorm, Axnorm],
                           name='func',
                           profile=0,
                           updates=updates,
                           mode=theano.Mode(linker='cvm'))
    rvals = func()
    print 'flag', rvals[1]
    print minres.messages[int(rvals[1])]
    print 'iters', rvals[2]
    print 'relres', rvals[3]
    print 'relAres', rvals[4]
    print 'Anorm', rvals[5]
    print 'Acond', rvals[6]
    print 'xnorm', rvals[7]
    print 'Axnorm', rvals[8]
    print 'error', numpy.sqrt(numpy.sum((numpy.dot(rvals[0], A) - b) ** 2))
    print
Example #4
def test_2():
    h = 1
    a = -10
    b = -a
    n = 2 * b // h + 1
    A = numpy.zeros((n, n), dtype=theano.config.floatX)
    v = a
    for k in xrange(n):
        A[k, k] = v
        v += h
    b = numpy.ones((n,), dtype=theano.config.floatX)
    rtol = numpy.asarray(1e-6, theano.config.floatX)
    maxxnorm = 1e8
    maxit = 50
    tA = theano.shared(A.astype(theano.config.floatX))
    tb = theano.shared(b.astype(theano.config.floatX))
    compute_Av = lambda x: ([TT.dot(tA, x)], {})
    xs, flag, iters, relres, relAres, Anorm, Acond, xnorm, Axnorm, updates = \
            minres.minres(compute_Av,
                   [tb],
                   rtol=rtol,
                   maxit=maxit,
                   maxxnorm=maxxnorm,
                   profile=0)

    func = theano.function([],
                           xs + [flag, iters, relres, relAres, Anorm, Acond,
                                 xnorm, Axnorm],
                           name='func',
                           profile=0,
                           updates=updates,
                           mode=theano.Mode(linker='cvm'))
    rvals = func()
    print 'flag', rvals[1]
    print minres.messages[int(rvals[1])]
    print 'iters', rvals[2]
    print 'relres', rvals[3]
    print 'relAres', rvals[4]
    print 'Anorm', rvals[5]
    print 'Acond', rvals[6]
    print 'xnorm', rvals[7]
    print 'Axnorm', rvals[8]
    print rvals[0]
Example #5
    def __init__(self,
                 model,
                 state,
                 data):
        """
        Parameters:
            :param model:
                Class describing the model used. It should provide the
                computational graph to evaluate the model
            :param state:
                Dictionary containing the current state of your job. This
                includes the configuration of the job, specifically the seed,
                the starting damping factor, the batch size, etc. See main.py
                for details
            :param data:
                Class describing the dataset used by the model
        """

        #####################################
        # Step 0. Constructs shared variables
        #####################################
        n_params = len(model.params)
        cbs = state['cbs']
        bs = state['bs']
        ebs = state['ebs']
        mbs = state['mbs']
        profile = state['profile']
        self.model = model
        self.rng = numpy.random.RandomState(state['seed'])
        self.damping = theano.shared(numpy.float32(state['damp']))

        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]

        self.loop_inps = [theano.shared(
            numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape]
        self.loop_outs = [theano.shared(
            numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape]
        self.step = 0
        self.cbs = cbs
        self.bs = bs
        self.ebs = ebs
        self.mbs = mbs
        self.state = state
        self.profile = profile
        self.data = data
        self.step_timer = time.time()

        ############################################################
        # Step 1. Compile function for computing euclidean gradients
        ############################################################
        print 'Constructing grad function'
        bdx = TT.iscalar('batch_idx')
        loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in
                    self.data.variables]
        cost = safe_clone(model.train_cost, model.inputs, loc_data)
        gs = TT.grad(cost, model.params)
        ratio = numpy.float32(float(bs) / cbs)
        update = [(g, g + lg / ratio) for g, lg in zip(self.gs, gs)]

        print 'Compiling grad function'
        st = time.time()
        self.loc_grad_fn = theano.function(
            [bdx],
            [],
            updates=update, name='loc_fn_grad',
            profile=profile)
        print 'took', time.time() - st

        #############################################################
        # Step 2. Compile function for computing Riemannian gradients
        #############################################################
        loc_x = self.data._natgrad[bdx*cbs: (bdx+1)*cbs]
        loc_y = self.data._natgrady[bdx*cbs:(bdx+1)*cbs]
        loc_Gvs = safe_clone(model.Gvs(*self.loop_inps), [model.X, model.Y],
                             [loc_x, loc_y])
        updates = [(l, l + lg) for l, lg in zip(self.loop_outs, loc_Gvs)]
        st = time.time()
        loc_Gv_fn = theano.function(
            [bdx], [], updates=updates, name='loc_fn_rop', profile=profile)
        print 'took', time.time() - st

        def compute_Gv(*args):
            rval = forloop(loc_Gv_fn,
                           mbs // cbs,
                           self.loop_inps,
                           self.loop_outs)(*args)
            return rval, {}
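
        # compute_Gv applies the (presumably Gauss-Newton) metric to a list of
        # vectors by looping loc_Gv_fn over mbs // cbs chunks and accumulating
        # the results in self.loop_outs; minres/minresQLP below use it as the
        # linear operator to invert.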

        print 'Constructing riemannian gradient function'
        st = time.time()
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        if not state['minresQLP']:
            self.msgs = minres_messages
            rvals = minres(compute_Gv,
                           [x / norm_grads for x in self.gs],
                           rtol=state['mrtol'],
                           damp=self.damping,
                           maxit=state['miters'],
                           profile=state['profile'])
        else:
            self.msgs = minresQLP_messages[1:]
            rvals = minresQLP(compute_Gv,
                              [x / norm_grads for x in self.gs],
                              model.params_shape,
                              rtol=state['mrtol'],
                              damp=self.damping,
                              maxit=state['miters'],
                              TranCond=state['trancond'],
                              profile=state['profile'])

        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = TT.cast(rvals[1], 'int32')
        niters = rvals[2]
        rel_residual = rvals[3]
        Anorm = rvals[4]
        Acond = rvals[5]

        norm_rs_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in nw_rs))
        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))
        updates = zip(self.rs, nw_rs)
        print 'took', time.time() - st
        print 'Compiling riemannian gradient function'
        st = time.time()
        self.compute_natural_gradients = theano.function(
            [],
            [flag, niters, rel_residual, Anorm, Acond,
             norm_grads, norm_rs_grads, norm_ord0],
            updates=updates,
            allow_input_downcast=True,
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            profile=profile)
        print 'took', time.time() - st
        ###########################################################
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        ###########################################################
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(state['lr'])
        loc_data = [x[bdx * cbs: (bdx + 1) * cbs] for x in
                    self.data.variables]
        old_cost = safe_clone(model.train_cost, model.inputs, loc_data)
        self.loc_old_cost = theano.function(
            [bdx], old_cost, name='loc_old_cost', profile=profile)
        new_params = [p - lr * r for p, r in zip(model.params, self.rs)]
        new_cost = safe_clone(model.train_cost,
                              model.inputs + model.params,
                              loc_data + new_params)
        new_err = safe_clone(model.error,
                             model.inputs + model.params,
                             loc_data + new_params)
        self.loc_new_cost = theano.function(
            [bdx, lr], [new_cost, new_err], name='loc_new_cost',
            profile=profile)

        self.lr = numpy.float32(state['lr'])
        updates = dict(zip(model.params, new_params))
        model.dbm_class.censor_updates(updates)
        self.update_params = theano.function(
            [lr], [], updates=updates,
            name='update_params')
        old_cost = TT.scalar('old_cost')
        new_cost = TT.scalar('new_cost')
        p_norm = TT.scalar('p_norm')
        prod = sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
        #pnorm = TT.sqrt(sum(TT.sum(g*g) for g in self.gs)) * \
        #        TT.sqrt(sum(TT.sum(r*r) for r in self.rs))
        dist = -lr * prod
        angle = prod / p_norm
        rho = (new_cost - old_cost) / dist
        self.compute_rho = theano.function(
            [old_cost, new_cost, lr, p_norm], [rho, dist, angle],
            name='compute_rho', profile=profile)
        self.old_cost = 1e20
        self.__new_cost = 0
        self.__error = 0
        self.return_names = ['cost',
                             'old_cost',
                             'error',
                             'time_grads',
                             'time_metric',
                             'time_eval',
                             'minres_flag',
                             'minres_iters',
                             'minres_relres',
                             'minres_Anorm',
                             'minres_Acond',
                             'norm_ord0',
                             'norm_grad',
                             'norm_nat',
                             'lr',
                             'grad_angle',
                             #'r_g',
                             #'icost',
                             'damping',
                             'rho'
                            ]
Example #6
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Maximum number of iterations for inverting the metric
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag indicating whether profiling should be enabled
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model: the model
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']],
                               name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store euclidean gradients
            self.gs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
            # Store Riemannian gradients
            self.rs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
        else:
            # Store euclidean gradients
            self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
            # Store Riemannian gradients
            self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        rbpos = rbdx * options['mbs']

        if options['device'] == 'gpu':
            mode = gpu_mode
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(args, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name='cgv%d' % idx)
                       for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(cgv, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            rtol=options['mrtol'],
            shift=-options['mreg'],
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))


        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:
                           (rbdx + 1) * options['mbs']])
                     for x,y in zip(loc_inputs[:1], shared_data[:1])]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'compiling evaluation function'
        self.eval_fn = theano.function(
            [ebdx, lr],
            final_cost,
            givens=dict(grad_inps),
            on_unused_input='warn',
            updates=updates,
            name='eval_fn',
            mode=gpu_mode,
            profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_dict,
            name='update_params',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
Example #7
    def init_gpu(self, options, channel, data, model):
        # Step 1. Compile function for computing euclidean gradients
        eps = numpy.float32(1e-24)
        gbdx = TT.iscalar('grad_batch_idx')
        n_params = len(self.model.params)
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            # Compute the Jacobi (diagonal) estimate of the metric
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout *
                                     (numpy.float32(1) - tnwout)) * factor
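                # Random +/-1 (Rademacher) probe scaled by `factor`; squaring its
                # Jacobian product below yields a stochastic (Hutchinson-style)
                # estimate of the diagonal of the metric.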
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [
                oj + final_results[p]
                for oj, p in zip(args[1 + n_params:1 +
                                      2 * n_params], model.params)
            ]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        ij = [
            TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            allow_input_downcast=True,
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        self.damping = theano.shared(numpy.float32(options['mreg']))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              Ms=self.js,
                              rtol=options['mrtol'],
                              shift=self.damping,
                              maxit=options['miters'],
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        reset = TT.scalar(dtype='int8', name='reset')

        norm_kkm1 = sum([(r * g).sum() for r, g in zip(self.rs, self.gs)])
        norm_kk = sum([(r * g).sum() for r, g in zip(nw_rs, self.gs)])
        norm_dk = sum([(d * g).sum() for d, g in zip(self.ds, self.gs)])
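        # The inner products above feed a conjugate-gradient style momentum
        # coefficient beta_k below; it is zeroed when `reset` is set or when
        # the resulting value is not finite.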

        norm_y = norm_kk - 2 * norm_kkm1 + self.norm_km1km1
        beta_k = (norm_kk - norm_kkm1) / (norm_dk - self.norm_dkm1) - \
                2 * norm_y * (norm_dk / ((norm_dk - self.norm_dkm1) ** 2))
        beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)), beta_k)
        beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k), TT.isinf(beta_k)),
                           TT.constant(numpy.float32(0.)), beta_k)

        nwds = [-r + beta_k * d for r, d in zip(nw_rs, self.ds)]
        self.nwds = nwds
        nw_normd = TT.sqrt(sum([(d*d).sum() for d in nwds])) + \
                numpy.float32(1e-25)

        updates.update(dict(zip(self.rs, nw_rs)))
        updates.update(dict(zip(self.ds, nwds)))
        updates[self.norm_km1km1] = norm_kk
        updates[self.norm_dkm1] = norm_dk
        updates[self.norm_d] = nw_normd
        print 'Compiling riemannian gradient function'
        cst = time.time()
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.compute_riemannian_gradients = theano.function(
            [reset, rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0, beta_k
            ],
            updates=updates,
            allow_input_downcast=True,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
        nw_ds = [-r for r in self.rs]
        nw_normd = TT.sqrt(sum([(r * r).sum() for r in self.rs]))
        self.update_params = theano.function([lr],
                                             updates=dict(
                                                 zip(model.params, newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=gpu_mode,
                                             profile=options['profile'])
        self.reset_directions = theano.function(
            [],
            updates=dict(zip(self.ds + [self.norm_d], nw_ds + [nw_normd])),
            name='reset_dirs',
            on_unused_input='warn',
            mode=cpu_mode,
            allow_input_downcast=True,
            profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']

        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_cost_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_cost_step',
                        profile=options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        ebdx = TT.iscalar('ebdx')
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.ls_cost_fn = theano.function([lr, ebdx],
                                          fcost,
                                          givens=grad_inps,
                                          allow_input_downcast=True,
                                          name='ls_cost_fn',
                                          mode=gpu_mode,
                                          profile=options['profile'])

        self.approx_change = theano.function(
            [lr],
            -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.ds)]),
            allow_input_downcast=True,
            name='approx_change',
            mode=gpu_mode,
            profile=options['profile'])

        self.ls_grad_fn = theano.function([lr, ebdx],
                                          fgrad,
                                          allow_input_downcast=True,
                                          givens=grad_inps,
                                          name='ls_grad_fn',
                                          mode=gpu_mode,
                                          profile=options['profile'])

        self.old_score = 50000
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=cpu_mode,
                                             allow_input_downcast=True,
                                             on_unused_input='warn',
                                             profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)
        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
Example #8
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Maximum number of iterations for inverting the metric
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag indicating whether profiling should be enabled
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model: the model
        """
        n_params = len(model.params)
        self.data = data

        eps = numpy.float32(1e-24)
        xdata = theano.shared(data['train_x'],
                              name='xdata')
        ydata = theano.shared(data['train_y'],
                              name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # Store euclidean gradients
        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Riemannian gradients
        self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Jacobi diagonal
        self.js = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'

        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            # Compute the Jacobi (diagonal) estimate of the metric
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout * (numpy.float32(1) -
                                               tnwout))*factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in model.params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1+n_params:1+2*n_params], model.params)]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              mode=gpu_mode,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        nw_js = [x[0] for x in rvals[1+n_params:1+2*n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, shared_data)]


        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        #theano.printing.pydotprint(self.compute_eucledian_gradients,
        #        'eucledian_grad', scan_graphs=True)

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        rbpos = rbdx * options['mbs']
        self.damping = theano.shared(numpy.float32(options['mreg']))
        mode = gpu_mode
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates


        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
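        # The MINRES call below solves the (damped) system G x = g / ||g||,
        # preconditioned by the Jacobi estimate self.js; the solution is
        # rescaled by ||g|| afterwards to obtain the Riemannian gradient.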

        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms = self.js,
            rtol=options['mrtol'],
            shift= self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))


        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:
                           (rbdx + 1) * options['mbs']])
                     for x,y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc0, acc1):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_cost2 = safe_clone(model.train_cost, replace =
                                  dict(zip(model.inputs, nw_inps)))
            return [_idx + const(1),
                    acc0 + nw_cost,
                    acc1 + nw_cost2]

        acc0 = const([0])
        acc1 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0, acc1],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1].sum() / const(n_steps)
        cost0 = rvals[2].sum() / const(n_steps)
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, shared_data)]
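        # Reduction ratio: rho compares the actual change in cost with the
        # first-order predicted change -lr * <g, r>, presumably used to adapt
        # the damping in a Levenberg-Marquardt fashion.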

        denom = -lr*sum([TT.sum(g*r) for g,r in zip(self.gs, self.rs)])
        rho = (final_cost - cost0) / denom
        print 'compiling evaluation function'
        self.eval_fn = theano.function(
            [ebdx, lr],
            [final_cost, rho],
            givens=dict(grad_inps),
            on_unused_input='warn',
            updates = updates,
            name='eval_fn',
            mode=gpu_mode,
            profile=options['profile'])


        update_dict = dict(zip(model.params, nw_ps))
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_dict,
            name='update_params',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])
        self.options = options
        self.old_cost = numpy.inf
        n_steps = options['ebs'] // options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                           ferr,
                           givens=dict(grad_inps),
                           name='compute_err',
                           mode=gpu_mode,
                           on_unused_input='warn',
                           profile=options['profile'])
Example #9
0
    def init_gpu(self, options, channel, data, model):
        # Step 1. Compile function for computing eucledian gradients
        eps = numpy.float32(1e-24)
        gbdx = TT.iscalar('grad_batch_idx')
        n_params = len(self.model.params)
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))

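        # grad_step accumulates, per chunk of `cbs` samples, the Euclidean
        # gradients and a randomized estimate of the squared Jacobian diagonal
        # (used later as a Jacobi preconditioner for MINRES).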
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            # Compute jacobi
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out+eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout * (numpy.float32(1) -
                                               tnwout))*factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in model.params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1+n_params:1+2*n_params], model.params)]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        nw_js = [x[0] for x in rvals[1+n_params:1+2*n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            allow_input_downcast=True,
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for Computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs'])# * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates
        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        self.damping = theano.shared(numpy.float32(options['mreg']))
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))

        reset = TT.scalar(dtype='int8', name='reset')
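        # Inner products between residuals/directions and the gradient, used to
        # form the nonlinear conjugate-gradient coefficient beta_k below;
        # beta_k is reset to 0 (steepest descent) when `reset` is set or when
        # it evaluates to NaN/inf.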

        norm_kkm1 = sum([(r*g).sum() for r,g in zip(self.rs, self.gs)])
        norm_kk = sum([(r*g).sum() for r,g in zip(nw_rs, self.gs)])
        norm_dk = sum([(d*g).sum() for d,g in zip(self.ds, self.gs)])

        norm_y = norm_kk - 2*norm_kkm1 + self.norm_km1km1
        beta_k = (norm_kk - norm_kkm1)/(norm_dk - self.norm_dkm1) - \
                2 * norm_y * (norm_dk/((norm_dk - self.norm_dkm1) **2))
        beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)),
                           beta_k)
        beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k),
                                         TT.isinf(beta_k)),
                           TT.constant(numpy.float32(0.)),
                           beta_k)

        nwds = [-r + beta_k*d for r,d in zip(nw_rs, self.ds)]
        self.nwds = nwds
        nw_normd = TT.sqrt(sum([(d*d).sum() for d in nwds])) + \
                numpy.float32(1e-25)

        updates.update(dict(zip(self.rs, nw_rs)))
        updates.update(dict(zip(self.ds, nwds)))
        updates[self.norm_km1km1] = norm_kk
        updates[self.norm_dkm1] = norm_dk
        updates[self.norm_d] = nw_normd
        print 'Compiling riemannian gradient function'
        cst = time.time()
        grad_inps = [(x, y[rbdx*options['mbs']:(rbdx+1)*options['mbs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        self.compute_riemannian_gradients = theano.function(
            [reset, rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0,
             beta_k],
            updates=updates,
            allow_input_downcast = True,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
        nw_ds = [ -r for r in self.rs]
        nw_normd = TT.sqrt(sum([(r*r).sum() for r in self.rs]))
        self.update_params = theano.function([lr],
                                             updates = dict(zip(model.params,
                                                                newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=gpu_mode,
                                             profile=options['profile'])
        self.reset_directions = theano.function([],
                                                updates=dict(zip(self.ds +
                                                                 [self.norm_d],
                                                                 nw_ds +
                                                                 [nw_normd])),
                                                name='reset_dirs',
                                                on_unused_input='warn',
                                                mode=cpu_mode,
                                                allow_input_downcast=True,
                                                profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']
        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_cost_step,
                        states = states,
                        n_steps = n_steps,
                        name='ls_cost_step',
                        profile = options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_grad_step,
                        states = states,
                        n_steps = n_steps,
                        name = 'ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        ebdx = TT.iscalar('ebdx')
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        self.ls_cost_fn = theano.function(
            [lr, ebdx],
            fcost,
            givens = grad_inps,
            allow_input_downcast=True,
            name='ls_cost_fn',
            mode=gpu_mode,
            profile=options['profile'])

        self.approx_change = theano.function(
                [lr],
                -lr*sum([TT.sum(g*r) for g,r in zip(self.gs, self.ds)]),
                allow_input_downcast=True,
                name='approx_change',
                mode=gpu_mode,
                profile=options['profile'])


        self.ls_grad_fn = theano.function(
            [lr, ebdx],
            fgrad,
            allow_input_downcast=True,
            givens = grad_inps,
            name='ls_grad_fn',
            mode=gpu_mode,
            profile=options['profile'])

        self.old_score = 50000
        n_steps = options['ebs']// options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                           ferr,
                           givens=dict(grad_inps),
                           name='compute_err',
                           mode=cpu_mode,
                           allow_input_downcast=True,
                           on_unused_input='warn',
                           profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)
        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
Example #10
0
    def init_cpu(self, options, channel, data, model):
        n_params = len(self.model.params)
        # Step 1. Compile function for computing eucledian gradients
        self.reset_gradients = theano.function(
            [],
            [],
            updates = zip(self.gs, [TT.zeros_like(g) for g in self.gs]),
            on_unused_input='warn',
            mode=cpu_mode,
            name='reset_gradients',
            profile=options['profile'])

        gbdx = TT.iscalar('grad_batch_idx')
        comp_grad = TT.iscalar('comp_grad')
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))
        cst = time.time()
        def grad_step(*args):

            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[2: 2 + n_params], gs)]
            _gs = [x for x in gs]
            _nw_gs = [gpu_from_host(g) for g in nw_gs]
            nw_gs = ifelse(comp_grad, _nw_gs, _gs, gpu=True)
            nw_gs = [x.type.filter_variable(y) for x,y in zip(args[2:],nw_gs)]
            return [args[0] + const(1), args[1] + nw_cost] + nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        cost0 = TT.unbroadcast(const([0]),0)
        n_steps = TT.iscalar('nsteps')
        rvals, updates = scan(grad_step,
                              states=[idx0, cost0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / TT.cast(n_steps, 'float32') for x in rvals[2: 2 + n_params]]
        nw_gs = [og + nwg for og, nwg in zip(self.gs, nw_gs)]
        fcost = rvals[1][0] / TT.cast(n_steps, 'float32')
        updates.update(dict(zip(self.gs, nw_gs)))

        grad_inps = zip(loc_inputs, self.shared_data)
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx, comp_grad, n_steps],
            fcost,
            updates=updates,
            on_unused_input='warn',
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])
        print 'Time to compile grad', print_time(time.time() - cst)
        cst = time.time()
        def jacob_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            replace.update(dict(zip(model.params, model.cpu_params)))
            mode=cpu_mode
            params = model.cpu_params
            # Compute jacobi
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):

                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    denom *= nw_out
                    denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= nw_out
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1:1+n_params], params)]
            return [args[0] + const(1)] + nw_js

        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['mbs'] // options['cbs']
        mode = cpu_mode
        rvals, updates = scan(jacob_step,
                              states=[idx0] + ij,
                              n_steps=n_steps,
                              name='jacob_loop',
                              mode=mode,
                              profile=options['profile'])

        nw_js = [x[0] for x in rvals[1:1+n_params]]
        updates.update(dict(zip(self.js, nw_js)))
        grad_inps = [(x, y[gbdx*options['mbs']:(gbdx+1)*options['mbs']])
                     for x,y in zip(loc_inputs[:1], self.cpu_shared_data[:1])]

        print 'Compiling grad function'
        self.compute_jacobi_preconditioner = theano.function(
            [gbdx],
            [],
            updates=updates,
            on_unused_input='warn',
            givens=dict(grad_inps),
            name='jacobi_preconditioner_gradients',
            mode=mode,
            profile=options['profile'])
        print 'Time compile jacobi ', print_time(time.time() - cst)
        cst = time.time()
        # Step 2. Compile function for Computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')

        mode = cpu_mode
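        # Sketch of the CPU variant: compute_Gv compiles the per-minibatch Gv
        # product as a separate GPU function (loc_fn) and appears to expose it
        # to the CPU-mode MINRES graph through the FakeGPUShell op, with the
        # cgv shared variables holding the vectors the product is applied to.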
        def compute_Gv(*args):
            cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                 name ='cgv%d'%idx)
                       for idx, shp in enumerate(model.params_shape)]
            print_mem('allocated mem for cgv')
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(cgv, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=gpu_mode,
                                  name='Gv_step',
                                  profile=options['profile'])
            final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
            grad_inps = zip(loc_inputs, self.shared_data)
            loc_fn = theano.function([],
                                     final_Gvs,
                                     updates = updates,
                                     givens = dict(grad_inps),
                                     on_unused_input='warn',
                                     mode=gpu_mode,
                                     name='loc_fn',
                                     profile = options['profile'])
            fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

            return fake_op(*args), {}

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        mreg = TT.scalar('mreg')
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms = self.js,
            rtol=options['mrtol'],
            shift = - mreg,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [mreg],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        nw_ps = [p - lr * r for p, r in zip(model.cpu_params, self.rs)]
        nw_ds = [ -r for r in self.rs]

        self.update_cparams = theano.function(
            [lr], updates = dict(zip(model.cpu_params, nw_ps)),
            name='update_cparam',
            allow_input_downcast=True,
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        newparams = [y.type.filter_variable(x) for x,y in zip(nw_ps,
                                                              model.params)]
        self.update_params = theano.function([lr],
                                             updates = dict(zip(model.params,
                                                                newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=cpu_mode,
                                             profile=options['profile'])
        self.scalar_grad = theano.function(
            [],
            sum(TT.sum(x*y) for x,y in zip(self.gs, self.ds)),
            name='scalar_grad',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        nsteps = options['ebs'] // options['cbs']
        self.current_alpha = numpy.inf
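        # Line-search helpers: ls_cost / ls_grad move the parameters to step
        # size alpha (only when alpha changes) and reuse
        # compute_eucledian_gradients to return the cost, respectively the
        # directional derivative along self.ds via scalar_grad.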
        def ls_cost(alpha, pos):
            if alpha != self.current_alpha:
                self.current_alpha = alpha
                self.update_params(alpha)
            return self.compute_eucledian_gradients(pos, 0, nsteps)
        self.ls_cost_fn = ls_cost

        def ls_grad(alpha, pos):
            if alpha != self.current_alpha:
                self.current_alpha = alpha
                self.update_params(alpha)
            self.reset_gradients()
            self.compute_eucledian_gradients(pos, 1, nsteps)
            return self.scalar_grad()
        self.ls_grad_fn = ls_grad

        self.old_score = 50000
        n_steps = options['ebs']// options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            replace.update(dict(zip(model.params, model.cpu_params)))
            nw_cost = \
                  TT.cast(safe_clone(model.err, replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([],
                           ferr,
                           givens=dict(zip(loc_inputs, self.cpu_shared_data)),
                           name='compute_err',
                           mode=cpu_mode,
                           on_unused_input='warn',
                           profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)

        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
Example #11
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Maximum number of iterations for the MINRES solver
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag indicating whether Theano profiling should be enabled
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load, containing
                the training data
            model: the model instance to be trained
        """
        n_params = len(model.params)
        self.data = data

        eps = numpy.float32(1e-24)
        xdata = theano.shared(data['train_x'], name='xdata')
        ydata = theano.shared(data['train_y'], name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # Store eucledian gradients
        self.gs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store riemannian gradients
        self.rs1 = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        self.rs2 = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store jacobi diagonal
        self.js = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing eucledian gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'

        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            # Compute jacobi
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout *
                                     (numpy.float32(1) - tnwout)) * factor
                r = TT.sgn(srng.normal(nw_out.shape, nstreams=128))
                r = r * factor
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [
                oj + final_results[p]
                for oj, p in zip(args[1 + n_params:1 +
                                      2 * n_params], model.params)
            ]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        ij = [
            TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              mode=gpu_mode,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        #theano.printing.pydotprint(self.compute_eucledian_gradients,
        #        'eucledian_grad', scan_graphs=True)

        self.damping = theano.shared(numpy.float32(options['mreg']))
        # Step 2.1 Compile function for Computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        rbpos = rbdx * options['mbs']
        mode = gpu_mode

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.gf_outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.gf_outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs
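                # NOTE: the code below is unreachable (it follows the return
                # above); it looks like an alternative Gauss-Newton Gv
                # implementation that was left in place.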

                nw_cost, nw_preactiv_out = safe_clone(
                    [model.train_cost, model.preactiv_out], replace)
                nw_gvs = TT.Lop(
                    nw_preactiv_out, model.params,
                    TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                           args))

                Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            #_final_Gvs = [x + self.damping * y
            #        for x,y in zip(final_Gvs, args)]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))

        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            #Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs1, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients1 = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 2.2 Compile function for Computing Riemannian gradients
        rbpos = rbdx * options['mbs']
        mode = gpu_mode

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.gc_outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.gc_outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs
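                # NOTE: as above, the block below is unreachable dead code
                # following the return statement.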

                nw_cost, nw_preactiv_out = safe_clone(
                    [model.train_cost, model.preactiv_out], replace)
                nw_gvs = TT.Lop(
                    nw_preactiv_out, model.params,
                    TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                           args))

                Gvs = [ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            #_final_Gvs = [x + self.damping * y
            #        for x,y in zip(final_Gvs, args)]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))

        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            #Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs2, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients2 = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        if options['rsch'] == 1:
            self.rs = self.rs1
        else:
            self.rs = self.rs2

        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1].sum() / const(n_steps)
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, shared_data)]
        denom = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
        self.approx_change = theano.function([lr],
                                             denom,
                                             name='approx_change',
                                             mode=gpu_mode,
                                             allow_input_downcast=True,
                                             profile=options['profile'])

        print 'compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       allow_input_downcast=True,
                                       mode=gpu_mode,
                                       profile=options['profile'])

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        self.grad_lr_fn = theano.function([ebdx, lr],
                                          fgrad,
                                          givens=grad_inps,
                                          name='ls_grad_fn',
                                          on_unused_input='warn',
                                          mode=gpu_mode,
                                          allow_input_downcast=True,
                                          profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=mode,
                                             profile=options['profile'])

        self.options = options
        self.old_cost = numpy.inf
        n_steps = options['ebs'] // options['cbs']

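        # `compute_error` scans the evaluation batch in chunks of `cbs`
        # samples and returns `model.err` averaged over the chunks; note it
        # uses the current parameters, not the candidate updates `nw_ps`.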
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
Beispiel #12
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Maximum number of iterations for the MINRES solver
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag indicating whether profiling should be enabled
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
                `device` -> str
                    Either 'gpu' or 'cpu'; decides whether the dataset and
                    the gradient buffers live on the GPU or on the host
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model : model instance exposing, among others, `params`,
                `inputs`, `train_cost` and `err`
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store euclidean gradients
            self.gs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store riemannian gradients
            self.rs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        else:
            # Store euclidean gradients
            self.gs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store riemannian gradients
            self.rs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

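        # scan body: walk over the gradient batch in chunks of `cbs` samples,
        # adding each chunk's parameter gradients to the running sums; the
        # sums are averaged over `n_steps` chunks further below.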
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']

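        # `compute_Gv` is the linear operator handed to MINRES below: it
        # returns the metric-vector products G*v, accumulated over the metric
        # batch in chunks of `cbs` samples via an L-op of an R-op of the
        # model outputs, with a scaling factor that depends on the output
        # nonlinearity (softmax / sigmoid / linear).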
        if options['device'] == 'gpu':
            mode = gpu_mode

            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                    # NOTE: the block below is unreachable, since it follows
                    # the `return` above. It looks like an older Gauss-Newton
                    # product written in terms of `model.preactiv_out`; it is
                    # kept here, commented out, for reference only.
                    # nw_cost, nw_preactiv_out = safe_clone(
                    #     [model.train_cost, model.preactiv_out], replace)
                    # nw_gvs = TT.Lop(
                    #     nw_preactiv_out, model.params,
                    #     TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                    #            model.params, args))
                    # Gvs = [
                    #     ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    # ]
                    # return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode

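            # CPU path: the MINRES iterates live in CPU memory, but the
            # metric-vector product itself is computed by a separately
            # compiled function (`loc_fn`). `FakeGPUShell` wraps `loc_fn` as
            # a single op, presumably copying the incoming vectors into the
            # `cgv` shared buffers before calling it, so that the outer
            # CPU-mode graph can invoke it.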
            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(cgv, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}

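        # Solve G r = g with MINRES: the euclidean gradient is rescaled to
        # unit norm before the solve and the solution is scaled back by
        # `norm_grads` afterwards. Under the usual MINRES shift convention
        # ((A - shift*I) x = b), `shift=-mreg` amounts to damping the metric
        # with mreg * I. `norm_ord0` below is the infinity norm of the
        # resulting Riemannian direction.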
        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              rtol=options['mrtol'],
                              shift=-options['mreg'],
                              maxit=options['miters'],
                              mode=mode,
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs[:1], shared_data[:1])]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
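        # Candidate parameters after a step of size `lr` along the
        # Riemannian directions stored in self.rs.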
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       mode=gpu_mode,
                                       profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             mode=mode,
                                             profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])