Example #1
    def e_step(self, n_steps=100, eps=1e-5):
        """
        Performs `n_steps` of mean-field inference (used to compute positive phase statistics).
        :param psamples: list of tensor-like objects, representing the state of each layer of
        the DBM (during the inference process). psamples[0] points to self.input.
        :param n_steps: number of iterations of mean-field to perform.
        """
        new_psamples = [T.unbroadcast(T.shape_padleft(psample), 0)
                        for psample in self.psamples]

        # now alternate mean-field inference for even/odd layers
        def mf_iteration(*psamples):
            new_psamples = [p for p in psamples]
            for i in xrange(1, self.depth, 2):
                new_psamples[i] = self.hi_given(psamples, i)
            for i in xrange(2, self.depth, 2):
                new_psamples[i] = self.hi_given(psamples, i)

            score = 0.
            for i in xrange(1, self.depth):
                score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])), score)

            return new_psamples, theano.scan_module.until(score < eps)

        new_psamples, updates = scan(mf_iteration,
                                     states=new_psamples,
                                     n_steps=n_steps)

        return [x[0] for x in new_psamples]
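For intuition, here is a minimal NumPy sketch of the same alternating even/odd mean-field schedule, assuming sigmoid units and a hypothetical list of weight matrices W (W[i] connecting layer i-1 to layer i; biases omitted). It is a reference for the fixed-point iteration above, not the original implementation.

# NumPy sketch of e_step: alternate even/odd mean-field updates until
# the largest mean absolute change falls below eps.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def e_step_numpy(psamples, W, n_steps=100, eps=1e-5):
    depth = len(psamples)
    for _ in range(n_steps):
        new = [p.copy() for p in psamples]
        # odd layers first, then even layers, both computed from the old values
        for i in list(range(1, depth, 2)) + list(range(2, depth, 2)):
            inp = psamples[i - 1].dot(W[i])             # bottom-up input
            if i + 1 < depth:
                inp += psamples[i + 1].dot(W[i + 1].T)  # top-down input
            new[i] = sigmoid(inp)
        score = max(np.mean(np.abs(new[i] - psamples[i]))
                    for i in range(1, depth))
        psamples = new
        if score < eps:
            break
    return psamples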
Example #2
    def pos_sampling(self, n_steps=50):
        """
        Performs `n_steps` of alternating Gibbs sampling (used to compute positive phase statistics).
        :param psamples: list of tensor-like objects, representing the state of each layer of
        the DBM (during the inference process). psamples[0] points to self.input.
        :param n_steps: number of sampling iterations to perform.
        """
        new_psamples = [
            T.unbroadcast(T.shape_padleft(psample), 0)
            for psample in self.psamples
        ]

        # now alternate sampling of even/odd layers
        def sample_iteration(*psamples):
            new_psamples = [p for p in psamples]
            for i in xrange(1, self.depth, 2):
                new_psamples[i] = self.sample_hi_given(psamples, i)
            for i in xrange(2, self.depth, 2):
                new_psamples[i] = self.sample_hi_given(psamples, i)
            return new_psamples

        new_psamples, updates = scan(sample_iteration,
                                     states=new_psamples,
                                     n_steps=n_steps)

        return [x[0] for x in new_psamples]
Example #3
def _e_step(psamples, W_list, b_list, n_steps=100, eps=1e-5):
    """
    Performs 'n_steps' of mean-field inference (used to compute positive phase
    statistics)

    Parameters
    ----------
    psamples : array-like object of theano shared variables
        State of each layer of the DBM (during the inference process).
        psamples[0] points to the input
    n_steps :  integer
        Number of iterations of mean-field to perform
    """
    depth = len(psamples)

    new_psamples = [T.unbroadcast(T.shape_padleft(psample), 0)
                    for psample in psamples]

    # now alternate mean-field inference for even/odd layers
    def mf_iteration(*psamples):
        new_psamples = [p for p in psamples]
        for i in xrange(1, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)
        for i in xrange(2, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)

        score = 0.0
        for i in xrange(1, depth):
            score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])), score)

        return new_psamples, theano.scan_module.until(score < eps)

    new_psamples, updates = scan(mf_iteration, states=new_psamples, n_steps=n_steps)

    return [x[0] for x in new_psamples]
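The helper `hi_given` is not shown on this page; for a standard sigmoid DBM it would plausibly combine bottom-up and top-down contributions as in the sketch below. The body, and the W_list/b_list indexing in particular, is an assumption kept only to make the mean-field update concrete.

# Hypothetical `hi_given`: mean-field update of layer i given its
# neighbours, assuming W_list[i] connects layer i-1 to layer i.
import theano.tensor as T

def hi_given(psamples, i, W_list, b_list):
    pre = T.dot(psamples[i - 1], W_list[i]) + b_list[i]
    if i + 1 < len(psamples):
        pre += T.dot(psamples[i + 1], W_list[i + 1].T)
    return T.nnet.sigmoid(pre)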
Example #4
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(args, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
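The Lop(Rop(...) / factor) pattern in Gv_step computes a Gauss-Newton (Fisher) vector product in a single pass: Rop yields the Jacobian-vector product J v of the network output, and after the elementwise division by `factor` (cbs times the output nonlinearity's derivative), Lop applies J^T. A NumPy check for the sigmoid/cross-entropy case on a hypothetical one-layer model (the names N, D, w, v are illustrative only):

# Check that Lop(out, w, Rop(out, w, v) / (N * s)) equals the
# Gauss-Newton product X^T diag(s) X v / N for a sigmoid output.
import numpy as np

rng = np.random.RandomState(0)
N, D = 64, 5                         # minibatch size (cbs) and input dim
X = rng.randn(N, D)
w = rng.randn(D)
v = rng.randn(D)                     # vector to multiply by G

p = 1.0 / (1.0 + np.exp(-X.dot(w)))  # sigmoid output
s = p * (1.0 - p)                    # its derivative w.r.t. the preactivation
J = s[:, None] * X                   # Jacobian of the output w.r.t. w

Gv = J.T.dot(J.dot(v) / (N * s))     # what the Lop/Rop graph computes
Gv_ref = X.T.dot(s * X.dot(v)) / N   # the Gauss-Newton product, directly
assert np.allclose(Gv, Gv_ref)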
Example #5
    def e_step(self, n_steps=100, eps=1e-5):
        """
        Performs `n_steps` of mean-field inference (used to compute positive phase statistics).
        :param psamples: list of tensor-like objects, representing the state of each layer of
        the DBM (during the inference process). psamples[0] points to self.input.
        :param n_steps: number of iterations of mean-field to perform.
        """
        new_psamples = [
            T.unbroadcast(T.shape_padleft(psample), 0)
            for psample in self.psamples
        ]

        # now alternate mean-field inference for even/odd layers
        def mf_iteration(*psamples):
            new_psamples = [p for p in psamples]
            for i in xrange(1, self.depth, 2):
                new_psamples[i] = self.hi_given(psamples, i)
            for i in xrange(2, self.depth, 2):
                new_psamples[i] = self.hi_given(psamples, i)

            score = 0.
            for i in xrange(1, self.depth):
                score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])),
                                  score)

            return new_psamples, theano.scan_module.until(score < eps)

        new_psamples, updates = scan(mf_iteration,
                                     states=new_psamples,
                                     n_steps=n_steps)

        return [x[0] for x in new_psamples]
Example #6
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone([model.train_cost,
                                                           model.preactiv_out],
                                                          replace)
                    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                  TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                         model.params, args))

                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
Example #7
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               args))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
Example #8
    def pos_phase(self, v, init_state, n_steps=1, eps=1e-3):
        """
        Mixed mean-field + sampling inference in positive phase.
        :param v: input being conditioned on
        :param init_state: dictionary of initial values for the `g` and `h` layers
        :param n_steps: maximum number of mean-field updates to perform.
        """
        def pos_mf_iteration(g1, h1, v, pos_counter):
            h2 = self.h_hat(g1, v)
            s2_1 = self.s1_hat(g1, v)
            s2_0 = self.s0_hat(g1, v)
            g2 = self.g_hat(h2, s2_1, s2_0)
            # stopping criterion
            dl_dghat = T.max(abs(self.dlbound_dg(g2, h2, s2_1, s2_0, v)))
            dl_dhhat = T.max(abs(self.dlbound_dh(g2, h2, s2_1, s2_0, v)))
            stop = T.maximum(dl_dghat, dl_dhhat)
            return [g2, h2, s2_1, s2_0, v, pos_counter + 1], theano.scan_module.until(stop < eps)

        states = [T.unbroadcast(T.shape_padleft(init_state['g']), 0),
                  T.unbroadcast(T.shape_padleft(init_state['h']), 0),
                  {'steps': 1},
                  {'steps': 1},
                  T.unbroadcast(T.shape_padleft(v), 0),
                  T.unbroadcast(T.shape_padleft(0.), 0)]

        rvals, updates = scan(pos_mf_iteration,
                              states=states,
                              n_steps=n_steps)

        return [rval[0] for rval in rvals]
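pos_phase relies on theano.scan_module.until to stop the fixed-point iteration as soon as the bound's gradients fall below eps; n_steps is only an upper cap. A minimal sketch of the same early-stopping pattern, written against the standard theano.scan interface:

# Iterate x -> x/2, stopping early once x < 1e-3 (after ~10 steps,
# well before the 100-step cap).
import theano
import theano.tensor as T

x0 = T.scalar('x0')

def step(x):
    new_x = x / 2
    return new_x, theano.scan_module.until(new_x < 1e-3)

vals, _ = theano.scan(step, outputs_info=x0, n_steps=100)
f = theano.function([x0], vals[-1])
print(f(1.0))  # ~0.0009765625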
Example #9
def scalar_armijo_search(phi,
                         phi0,
                         derphi0,
                         c1=constant(1e-4),
                         n_iters=10,
                         profile=0):
    """
    .. todo::

        WRITEME
    """
    alpha0 = one
    phi_a0 = phi(alpha0)
    alpha1 = -(derphi0) * alpha0 ** 2 / 2.0 /\
            (phi_a0 - phi0 - derphi0 * alpha0)
    phi_a1 = phi(alpha1)

    csol1 = phi_a0 <= phi0 + c1 * derphi0
    csol2 = phi_a1 <= phi0 + c1 * alpha1 * derphi0

    def armijo(alpha0, alpha1, phi_a0, phi_a1):
        factor = alpha0**2 * alpha1**2 * (alpha1 - alpha0)
        a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \
            alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)
        a = a / factor
        b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \
            alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)
        b = b / factor

        alpha2 = (-b + TT.sqrt(abs(b**2 - 3 * a * derphi0))) / (3.0 * a)
        phi_a2 = phi(alpha2)

        end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0
        end_condition = TT.bitwise_or(TT.isnan(alpha2), end_condition)
        end_condition = TT.bitwise_or(TT.isinf(alpha2), end_condition)
        alpha2 = TT.switch(
            TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.),
                          one - alpha2 / alpha1 < 0.96), alpha1 / constant(2.),
            alpha2)
        return [alpha1, alpha2, phi_a1, phi_a2], \
                theano.scan_module.until(end_condition)

    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
    # print 'armijo'
    rvals, _ = scan(armijo,
                    states=states,
                    n_steps=n_iters,
                    name='armijo',
                    mode=theano.Mode(linker='cvm'),
                    profile=profile)

    sol_scan = rvals[1][0]
    a_opt = ifelse(csol1, one, ifelse(csol2, alpha1, sol_scan))
    score = ifelse(csol1, phi_a0, ifelse(csol2, phi_a1, rvals[2][0]))
    return a_opt, score
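The graph above mirrors the classic backtracking scheme: try alpha = 1, fall back to a quadratic-interpolation step, then iterate cubic interpolation (as in scipy's scalar_search_armijo). A plain NumPy reference, useful for checking the symbolic version; the quadratic test function at the end is illustrative only.

# NumPy reference for the Armijo backtracking line search above.
import numpy as np

def armijo_numpy(phi, phi0, derphi0, c1=1e-4, n_iters=10):
    alpha0 = 1.0
    phi_a0 = phi(alpha0)
    if phi_a0 <= phi0 + c1 * alpha0 * derphi0:
        return alpha0, phi_a0
    # quadratic interpolation through phi(0), phi'(0), phi(alpha0)
    alpha1 = -derphi0 * alpha0 ** 2 / (2.0 * (phi_a0 - phi0 - derphi0 * alpha0))
    phi_a1 = phi(alpha1)
    for _ in range(n_iters):
        if phi_a1 <= phi0 + c1 * alpha1 * derphi0:
            return alpha1, phi_a1
        # cubic interpolation through the last two trial points
        factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0)
        a = (alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1)
             - alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)) / factor
        b = (-alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1)
             + alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)) / factor
        alpha2 = (-b + np.sqrt(abs(b ** 2 - 3 * a * derphi0))) / (3.0 * a)
        alpha0, alpha1 = alpha1, alpha2
        phi_a0, phi_a1 = phi_a1, phi(alpha2)
    return alpha1, phi_a1

# e.g. phi(a) = (a - 0.3)**2 with phi0 = 0.09, derphi0 = -0.6:
a_opt, score = armijo_numpy(lambda a: (a - 0.3) ** 2, 0.09, -0.6)
assert abs(a_opt - 0.3) < 1e-8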
Example #10
def scalar_armijo_search(phi, phi0, derphi0, c1=constant(1e-4),
                         n_iters=10, profile=0):
    alpha0 = one
    phi_a0 = phi(alpha0)
    alpha1 = -(derphi0) * alpha0 ** 2 / 2.0 /\
            (phi_a0 - phi0 - derphi0 * alpha0)
    phi_a1 = phi(alpha1)

    csol1 = phi_a0 <= phi0 + c1 * derphi0
    csol2 = phi_a1 <= phi0 + c1 * alpha1 * derphi0

    def armijo(alpha0, alpha1, phi_a0, phi_a1):
        factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0)
        a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \
            alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)
        a = a / factor
        b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \
            alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)
        b = b / factor

        alpha2 = (-b + TT.sqrt(abs(b ** 2 - 3 * a * derphi0))) / (3.0 * a)
        phi_a2 = phi(alpha2)

        end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0
        end_condition = TT.bitwise_or(
            TT.isnan(alpha2), end_condition)
        end_condition = TT.bitwise_or(
            TT.isinf(alpha2), end_condition)
        alpha2 = TT.switch(
            TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.),
                  one - alpha2 / alpha1 < 0.96),
            alpha1 / constant(2.),
            alpha2)
        return [alpha1, alpha2, phi_a1, phi_a2], \
                theano.scan_module.until(end_condition)

    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
    # print 'armijo'
    rvals, _ = scan(
                armijo,
                states=states,
                n_steps=n_iters,
                name='armijo',
                mode=theano.Mode(linker='cvm'),
                profile=profile)

    sol_scan = rvals[1][0]
    a_opt = ifelse(csol1, one,
                ifelse(csol2, alpha1,
                    sol_scan))
    score = ifelse(csol1, phi_a0,
                   ifelse(csol2, phi_a1,
                          rvals[2][0]))
    return a_opt, score
Example #11
def test_005():
    sq = theano.tensor.fvector('sq')
    nst = theano.tensor.iscalar('nst')
    out, _ = scan.scan(lambda s: s+numpy.float32(1),
                       sequences=sq,
                       states=[None],
                       n_steps=nst)
    fn = theano.function([sq, nst], out)
    val_sq = numpy.float32([1, 2, 3, 4, 5])
    assert numpy.all(fn(val_sq, 5) == val_sq + 1)
Example #12
def test_001():
    x0 = theano.tensor.fvector('x0')
    state = theano.tensor.unbroadcast(
        theano.tensor.shape_padleft(x0), 0)
    out, _ = scan.scan(lambda x: x+numpy.float32(1),
                           states=state,
                           n_steps=5)
    fn = theano.function([x0], out[0])
    val_x0 = numpy.float32([1, 2, 3])
    assert numpy.all(fn(val_x0) == val_x0 + 5)
Example #13
            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               cgv))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}
Example #14
def linear_cg_fletcher_reeves(compute_Ax,
                              bs,
                              xinit=None,
                              rtol=1e-6,
                              maxiter=1000,
                              damp=0,
                              floatX=None,
                              profile=0):
    """
    assume all are lists all the time
    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rz_old, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        _Aps = compute_Ax(*ps)
        Aps = [x + damp * y for x, y in zip(_Aps, ps)]
        alpha = rz_old / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        rz_new = sum((r * r).sum() for r in rs)
        ps = [r + rz_new / rz_old * p for r, p in zip(rs, ps)]
        return [rz_new]+ps+rs+xs, \
                theano.scan_module.until(abs(rz_new) < rtol)

    if xinit is None:
        r0s = bs
        _x0s = [
            tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
            for x in bs
        ]
    else:
        init_Ax = compute_Ax(*xinit)
        r0s = [bs[i] - init_Ax[i] for i in xrange(len(bs))]
        _x0s = [tensor.unbroadcast(tensor.shape_padleft(xi), 0) for xi in xinit]

    _p0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    rz_old = sum((r * r).sum() for r in r0s)
    _rz_old = tensor.unbroadcast(tensor.shape_padleft(rz_old), 0)
    outs, updates = scan(loop,
                         states=[_rz_old] + _p0s + _r0s + _x0s,
                         n_steps=maxiter,
                         mode=theano.Mode(linker='cvm'),
                         name='linear_conjugate_gradient',
                         profile=profile)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
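Stripped of the list-of-tensors bookkeeping, the loop above is plain conjugate gradient on (A + damp*I) x = b. A NumPy reference for a single right-hand side (the 2x2 system at the end is illustrative only):

# NumPy reference for the conjugate-gradient recurrence used above.
import numpy as np

def linear_cg_numpy(A, b, damp=0.0, rtol=1e-6, maxiter=1000):
    x = np.zeros_like(b)
    r = b.copy()                  # residual of the initial guess x = 0
    p = r.copy()                  # first search direction
    rz_old = r.dot(r)
    for _ in range(maxiter):
        Ap = A.dot(p) + damp * p
        alpha = rz_old / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rz_new = r.dot(r)
        if abs(rz_new) < rtol:
            break
        p = r + (rz_new / rz_old) * p
        rz_old = rz_new
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
assert np.allclose(A.dot(linear_cg_numpy(A, b)), b, atol=1e-3)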
Example #15
def test_002():
    x0 = theano.tensor.fvector('x0')
    state = theano.tensor.alloc(
        theano.tensor.constant(numpy.float32(0)),
        6,
        x0.shape[0])
    state = theano.tensor.set_subtensor(state[0], x0)

    out, _ = scan.scan(lambda x: x+numpy.float32(1),
                           states=state,
                           n_steps=5)
    fn = theano.function([x0], out)
    val_x0 = numpy.float32([1, 2, 3])
    assert numpy.all(fn(val_x0)[-1] == val_x0 + 5)
    assert numpy.all(fn(val_x0)[0] == val_x0)
Example #16
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates
Example #17
def linear_cg_precond(compute_Gv,
                      bs,
                      Msz,
                      rtol=1e-16,
                      maxit=100000,
                      floatX=None):
    """
    assume all are lists all the time
    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rsold, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        Aps = compute_Gv(*ps)
        alpha = rsold / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        zs = [r / z for r, z in zip(rs, Msz)]
        rsnew = sum((r * z).sum() for r, z in zip(rs, zs))
        ps = [z + rsnew / rsold * p for z, p in zip(zs, ps)]
        return [rsnew] + ps + rs + xs, \
                theano.scan_module.until(abs(rsnew) < rtol)

    r0s = bs
    _p0s = [
        tensor.unbroadcast(tensor.shape_padleft(x / z), 0)
        for x, z in zip(r0s, Msz)
    ]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _x0s = [
        tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
        for x in bs
    ]
    rsold = sum((r * r / z).sum() for r, z in zip(r0s, Msz))
    _rsold = tensor.unbroadcast(tensor.shape_padleft(rsold), 0)
    outs, updates = scan(loop,
                         states=[_rsold] + _p0s + _r0s + _x0s,
                         n_steps=maxit,
                         mode=theano.Mode(linker='c|py'),
                         name='linear_conjugate_gradient',
                         profile=0)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
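Here `Msz` acts as an elementwise (Jacobi) preconditioner: residuals are divided by it before building search directions, so choosing Msz close to diag(G) improves conditioning. A NumPy sketch of the same recurrence, with `M` playing the role of Msz (the 2x2 system is illustrative only):

# NumPy reference for the diagonally preconditioned CG recurrence above.
import numpy as np

def linear_cg_precond_numpy(A, b, M, rtol=1e-16, maxit=1000):
    x = np.zeros_like(b)
    r = b.copy()
    z = r / M                     # preconditioned residual
    p = z.copy()
    rs_old = r.dot(z)
    for _ in range(maxit):
        Ap = A.dot(p)
        alpha = rs_old / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        z = r / M
        rs_new = r.dot(z)
        if abs(rs_new) < rtol:
            break
        p = z + (rs_new / rs_old) * p
        rs_old = rs_new
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = linear_cg_precond_numpy(A, b, M=np.diag(A))
assert np.allclose(A.dot(x), b, atol=1e-6)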
Example #18
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name='cgv%d' % idx)
                       for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone([model.train_cost,
                                                           model.preactiv_out],
                                                          replace)
                    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                  TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                         model.params, cgv))

                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps)
                             for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}
Example #19
def linear_cg_fletcher_reeves(compute_Ax, bs, xinit=None,
                              rtol=1e-6, maxiter=1000, damp=0,
                              floatX=None, profile=0):
    """
    Linear conjugate gradient: solves (A + damp*I) x = b for each b in `bs`.
    All arguments (and results) are lists of tensors.
    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rz_old, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        _Aps = compute_Ax(*ps)
        Aps = [x + damp * y for x, y in zip(_Aps, ps)]
        alpha = rz_old / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        rz_new = sum((r * r).sum() for r in rs)
        ps = [r + rz_new / rz_old * p for r, p in zip(rs, ps)]
        return [rz_new] + ps + rs + xs, \
                theano.scan_module.until(abs(rz_new) < rtol)

    if xinit is None:
        r0s = bs
        _x0s = [tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
                for x in bs]
    else:
        init_Ax = compute_Ax(*xinit)
        r0s = [bs[i] - init_Ax[i] for i in xrange(len(bs))]
        _x0s = [tensor.unbroadcast(tensor.shape_padleft(xi), 0) for xi in xinit]

    _p0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    rz_old = sum((r * r).sum() for r in r0s)
    _rz_old = tensor.unbroadcast(tensor.shape_padleft(rz_old), 0)
    outs, updates = scan(loop,
                         states=[_rz_old] + _p0s + _r0s + _x0s,
                         n_steps=maxiter,
                         mode=theano.Mode(linker='cvm'),
                         name='linear_conjugate_gradient',
                         profile=profile)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
Example #20
def test_003():
    x0 = theano.tensor.fvector('x0')
    sq = theano.tensor.fvector('sq')
    state = theano.tensor.alloc(
        theano.tensor.constant(numpy.float32(0)),
        6,
        x0.shape[0])
    state = theano.tensor.set_subtensor(state[0], x0)

    out, _ = scan.scan(lambda s, x: x+s,
                           sequences=sq,
                           states=state,
                           n_steps=5)
    fn = theano.function([sq, x0], out)
    val_x0 = numpy.float32([1, 2, 3])
    val_sq = numpy.float32([1, 2, 3, 4, 5])
    assert numpy.all(fn(val_sq, val_x0)[-1] == val_x0 + 15)
    assert numpy.all(fn(val_sq, val_x0)[0] == val_x0)
Example #21
def _e_step(psamples, W_list, b_list, n_steps=100, eps=1e-5):
    """
    Performs 'n_steps' of mean-field inference (used to compute positive phase
    statistics)

    Parameters
    ----------
    psamples : array-like object of theano shared variables
        State of each layer of the DBM (during the inference process).
        psamples[0] points to the input
    n_steps :  integer
        Number of iterations of mean-field to perform
    """
    depth = len(psamples)

    new_psamples = [T.unbroadcast(T.shape_padleft(psample), 0)
                    for psample in psamples]

    # now alternate mean-field inference for even/odd layers
    def mf_iteration(*psamples):
        new_psamples = [p for p in psamples]
        for i in xrange(1, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)
        for i in xrange(2, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)

        score = 0.
        for i in xrange(1, depth):
            score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])),
                              score)

        return new_psamples, theano.scan_module.until(score < eps)

    new_psamples, updates = scan(
        mf_iteration,
        states=new_psamples,
        n_steps=n_steps
    )

    return [x[0] for x in new_psamples]
Example #22
    def pos_sampling(self, n_steps=50):
        """
        Performs `n_steps` of alternating Gibbs sampling (used to compute positive phase statistics).
        :param psamples: list of tensor-like objects, representing the state of each layer of
        the DBM (during the inference process). psamples[0] points to self.input.
        :param n_steps: number of sampling iterations to perform.
        """
        new_psamples = [T.unbroadcast(T.shape_padleft(psample), 0)
                        for psample in self.psamples]

        # now alternate sampling of even/odd layers
        def sample_iteration(*psamples):
            new_psamples = [p for p in psamples]
            for i in xrange(1, self.depth, 2):
                new_psamples[i] = self.sample_hi_given(psamples, i)
            for i in xrange(2, self.depth, 2):
                new_psamples[i] = self.sample_hi_given(psamples, i)
            return new_psamples

        new_psamples, updates = scan(sample_iteration,
                                     states=new_psamples,
                                     n_steps=n_steps)

        return [x[0] for x in new_psamples]
Example #23
def linear_cg_precond(compute_Gv, bs, Msz, rtol=1e-16, maxit=100000, floatX=None):
    """
    Preconditioned linear conjugate gradient: solves G x = b for each b in
    `bs`, with `Msz` an elementwise (diagonal) preconditioner.
    All arguments are lists of tensors.
    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rsold, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        Aps = compute_Gv(*ps)
        alpha = rsold / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        zs = [r / z for r, z in zip(rs, Msz)]
        rsnew = sum((r * z).sum() for r, z in zip(rs, zs))
        ps = [z + rsnew / rsold * p for z, p in zip(zs, ps)]
        return [rsnew] + ps + rs + xs, \
                theano.scan_module.until(abs(rsnew) < rtol)

    r0s = bs
    _p0s = [tensor.unbroadcast(tensor.shape_padleft(x / z), 0)
            for x, z in zip(r0s, Msz)]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _x0s = [tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
            for x in bs]
    rsold = sum((r * r / z).sum() for r, z in zip(r0s, Msz))
    _rsold = tensor.unbroadcast(tensor.shape_padleft(rsold), 0)
    outs, updates = scan(loop,
                         states=[_rsold] + _p0s + _r0s + _x0s,
                         n_steps=maxit,
                         mode=theano.Mode(linker='c|py'),
                         name='linear_conjugate_gradient',
                         profile=0)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000 
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1

    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']
    rng = numpy.random.RandomState(state['seed'])
    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']

    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(
        rng,
        n_in=state['nins'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_%d'%si))
        if state['rec_gating']:
            gater_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias = False,
                name='gater_words_%d'%si))
        if state['rec_reseting']:
            reseter_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias = False,
                name='reseter_words_%d'%si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d'%si))
            if state['rec_gating']:
                rec_proj_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias = False, 
                    name='rec_proj_gater_%d'%si))
            if state['rec_reseting']:
                rec_proj_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias = False, 
                    name='rec_proj_reseter_%d'%si))

        add_rec_step.append(eval(state['rec_layer'])(
                rng,
                n_hids=state['dim'],
                activation = state['activ'],
                bias_scale = state['bias'],
                scale=state['rec_weight_scale'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise_rec'],
                dropout=state['dropout_rec'],
                gating=state['rec_gating'],
                gater_activation=state['rec_gater'],
                reseting=state['rec_reseting'],
                reseter_activation=state['rec_reseter'],
                name='add_h_%d'%si))

    def _add_op(words_embeddings, 
                words_mask=None,
                prev_val=None,
                si = 0, 
                state_below = None,
                gater_below = None,
                reseter_below = None,
                one_step=False, 
                bs=1, 
                init_state=None, 
                use_noise=True):
        seqlen = words_embeddings.out.shape[0]//bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si-1](state_below, one_step=one_step, 
                    use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if reseter: reseter += projg
                else: reseter = projg
            
        if not one_step:
            rval= add_rec_step[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                gater_below = gater,
                reseter_below = reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise = use_noise)
        else:
            rval= add_rec_step[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                gater_below = gater,
                reseter_below = reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise = use_noise)
        return rval
    add_op = Operator(_add_op)

    # Target Sentence
    emb_t = MultiLayer(
        rng,
        n_in=state['nouts'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_t_%d'%si))
        if state['rec_gating']:
            gater_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='gater_words_t_%d'%si))
        if state['rec_reseting']:
            reseter_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='reseter_words_t_%d'%si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='proj_everything_t_%d'%si,
            learn_bias = False))
        if state['rec_gating']:
            gater_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='gater_everything_t_%d'%si,
                learn_bias = False))
        if state['rec_reseting']:
            reseter_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='reseter_everything_t_%d'%si,
                learn_bias = False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d'%si))
            if state['rec_gating']:
                rec_proj_t_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_gater_%d'%si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_reseter_%d'%si))

        add_rec_step_t.append(eval(state['rec_layer'])(
                rng,
                n_hids=state['dim'],
                activation = state['activ'],
                bias_scale = state['bias'],
                scale=state['rec_weight_scale'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise_rec'],
                dropout=state['dropout_rec'],
                gating=state['rec_gating'],
                gater_activation=state['rec_gater'],
                reseting=state['rec_reseting'],
                reseter_activation=state['rec_reseter'],
                name='add_h_t_%d'%si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim'] * state['maxout_part']],
                activation=['lambda x: x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='encoder_proj_%d'%si,
                learn_bias = (si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']), 
                indim = indim, pieces = pieces, rng=rng)

    def _add_t_op(words_embeddings, everything = None, words_mask=None,
                prev_val=None,one_step=False, bs=1, 
                init_state=None, use_noise=True,
                gater_below = None,
                reseter_below = None,
                si = 0, state_below = None):
        seqlen = words_embeddings.out.shape[0]//bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si-1](state_below, 
                    one_step=one_step, use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if reseter: reseter += projg
                else: reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise)
                if gater: gater += everyg
                else: gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise)
                if reseter: reseter += everyg
                else: reseter = everyg

        if not one_step:
            rval = add_rec_step_t[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                one_step=one_step,
                init_state=init_state,
                gater_below = gater,
                reseter_below = reseter,
                use_noise = use_noise)
        else:
            rval = add_rec_step_t[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                one_step=one_step,
                gater_below = gater,
                reseter_below = reseter,
                use_noise = use_noise)
        return rval
    add_t_op = Operator(_add_t_op)

    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation = [state['activ']],
                bias_scale = [state['bias']],
                scale=state['weight_scale'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                name='bias_code_%d'%si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(
            rng,
            n_in=word_code_nin,
            n_hids=[outdim],
            activation = 'lambda x:x',
            bias_scale = [state['bias_mlp']/3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            learn_bias = False,
            name='word_code')

    proj_code = MultiLayer(
        rng,
        n_in=state['dim'],
        n_hids=[outdim],
        activation = 'lambda x: x',
        bias_scale = [state['bias_mlp']/3],
        scale=state['weight_scale'],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        learn_bias = False,
        name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[outdim],
            activation = 'lambda x: x',
            bias_scale = [state['bias_mlp']/3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            name='proj_h_%d'%si))

    if state['bigram']:
        proj_word = MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[outdim],
            activation=['lambda x:x'],
            bias_scale = [state['bias_mlp']/3],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            learn_bias = False,
            name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(
            rng,
            indim, 
            state['nouts'],
            state['weight_scale'],
            -1, 
            rank_n_approx = rank_n_approx,
            rank_n_activ = rank_n_activ,
            weight_noise=state['weight_noise'],
            init_fn=state['weight_init_fn'],
            name='out')

    def _pop_op(everything, accum, everything_max = None,
            everything_min = None, word = None, aword = None,
            one_step=False, use_noise=True):

        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1,state['decoder_stack']):
            rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape([rshape[0]/shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise), 
                            one_step=one_step, use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise), 
                            one_step=one_step, use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.out.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise), 
                        one_step=one_step, use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1], outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    pop_op = Operator(_pop_op)

    # 3. Constructing the model
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x)), x_mask, 
        bs=x_mask.shape[1], 
        si=0, gater_below=gater_below, reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1,state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x)), 
                x_mask, bs=x_mask.shape[1], 
                si=si, state_below=encoder_acts[-1], 
                gater_below=gater_below,
                reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True,n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape([1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True,n=y.shape[0])(everything)

    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        init_state = [None for bc in bias_code]

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape([shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [add_t_op(emb_words_t[0](emb_t(y0)), 
            everything,
            y_mask, bs=y_mask.shape[1], 
            gater_below = gater_below,
            reseter_below = reseter_below,
            init_state=init_state[0], 
            si=0)]
    for si in xrange(1,state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(add_t_op(emb_words_t[si](emb_t(y0)), 
                everything,
                y_mask, bs=y_mask.shape[1], 
                state_below = has_said[-1],
                gater_below = gater_below,
                reseter_below = reseter_below,
                init_state=init_state[si], 
                si=si))

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape([shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape([shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword = aword)

    nll = output_layer.train(state_below=model, target=y0,
                 mask=y_mask, reg=None) / TT.cast(y.shape[0]*y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x),use_noise=False), 
            si=0, 
            use_noise=False, 
            gater_below=gater_below,
            reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1,state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x),use_noise=False), 
                si=si, 
                state_below=encoder_acts[-1], use_noise=False,
                gater_below = gater_below, 
                reseter_below = reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(TT.reshape(bias_code[si](everything, 
                use_noise=False), [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x,use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    def sample_fn(*args):
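        # `scan` passes the previous step's states positionally: the last
        # sampled word, its log-probability, one hidden state per decoder
        # layer, and then the non-sequence parameters (the encoder context
        # and, when state['avg_word'] is set, the average word code).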
        aidx = 0; word_tm1 = args[aidx]
        aidx += 1; prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1; has_said_tm1.append(args[aidx])
        aidx += 1; ctx = args[aidx]
        if state['avg_word']:
            aidx += 1; awrd = args[aidx]
        else:
            awrd = None

        val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1,
                aword=awrd, one_step=True, use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(
                state_below=val.out.reshape([1, TT.cast(output_layer.n_in, 'int64')]), 
                temp=temp, target=sample.reshape([1,1]), use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [add_t_op(emb_words_t[0](emb_t(sample)), 
                ctx,
                prev_val=has_said_tm1[0], 
                gater_below=gater_below,
                reseter_below=reseter_below,
                one_step=True, use_noise=True,
                si=0)]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(add_t_op(emb_words_t[si](emb_t(sample)), 
                    ctx,
                    prev_val=has_said_tm1[si], 
                    gater_below=gater_below,
                    reseter_below=reseter_below,
                    one_step=True, use_noise=True,
                    si=si, state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
            states = states,
            params = sampler_params,
            n_steps= n_steps,
            name='sampler_scan'
            )
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function(
        [n_steps, temp, x], [samples, probs.sum()],
        updates=updates,
        profile=False, name='sample_fn')

    model = LM_Model(
        cost_layer = nll,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn = valid_fn,
        sample_fn  = sample_fn,
        clean_before_noise_fn = False,
        noise_fn = noise_fn,
        indx_word=state['indx_word_target'],
        indx_word_src=state['indx_word'],
        character_level = False,
        rng = rng)

    if state['loopIters'] > 0: algo = SGD(model, state, train_data)
    else: algo = None

    def hook_fn():
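        # Training-time hook: print a few (input, target) pairs from
        # `train_data` together with samples drawn from the current model,
        # up to state['sample_max'] sentences, then restore the data offset.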
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:,idx].shape[0]):
                        print model.word_indxs_src[x[:,idx][k]],
                        if model.word_indxs_src[x[:,idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:,idx].shape[0]):
                        print model.word_indxs[y[:,idx][k]],
                        if model.word_indxs[y[:,idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:,idx])
                    if len(numpy.where(masks[:,idx]==0)[0]) > 0:
                        senlen = numpy.where(masks[:,idx]==0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen']+1,  1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen']+1,  1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data, valid_data, None, model, algo, state, channel,
            reset = state['reset'], hooks = hook_fn)
    if state['reload']: main.load()
    if state['loopIters'] > 0: main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word=pkl.load(open(state['word_indx'],'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen+1, dtype='int64')
                    for idx,sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except KeyError:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx,"(%f):"%(-all_probs[pidx]),sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'
            pass
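
A note on the `scan` used throughout these examples: it is a project-specific
wrapper whose `states=` keyword carries one-tap recurrent states (each padded
with a leading broadcastable dimension and read back as `rvals[i][0]`).
Stock `theano.scan` expresses the same accumulation/fixed-point pattern with
`outputs_info`; a minimal, self-contained sketch (the names and the toy
update rule below are illustrative, not taken from the examples):

import numpy
import theano
import theano.tensor as TT

n_steps = TT.iscalar('n_steps')
x0 = TT.vector('x0')

def step(x_tm1):
    # one damped fixed-point iteration, standing in for e.g. a mean-field
    # or gradient-accumulation step
    x_t = 0.5 * (x_tm1 + TT.tanh(x_tm1))
    # stop early once the update has (numerically) converged
    delta = TT.max(abs(x_t - x_tm1))
    return x_t, theano.scan_module.until(delta < 1e-5)

xs, updates = theano.scan(step, outputs_info=x0, n_steps=n_steps)
final = xs[-1]  # plays the role of the wrapper's `rvals[i][0]`
f = theano.function([x0, n_steps], final, updates=updates)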
Example #25
0
    def init_cpu(self, options, channel, data, model):
        n_params = len(self.model.params)
        # Step 1. Compile function for computing euclidean gradients
        self.reset_gradients = theano.function(
            [],
            [],
            updates = zip(self.gs, [TT.zeros_like(g) for g in self.gs]),
            on_unused_input='warn',
            mode=cpu_mode,
            name='reset_gradients',
            profile=options['profile'])

        gbdx = TT.iscalar('grad_batch_idx')
        comp_grad = TT.iscalar('comp_grad')
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))
        cst = time.time()
        def grad_step(*args):
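            # One scan step: slice chunk `idx` (of size options['cbs']) out
            # of the full minibatch, clone the training cost on that chunk,
            # and accumulate its gradients into the running sums carried in
            # args[2:]; `comp_grad` switches the accumulation on or off.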

            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[2: 2 + n_params], gs)]
            _gs = [x for x in gs]
            _nw_gs = [gpu_from_host(g) for g in nw_gs]
            nw_gs = ifelse(comp_grad, _nw_gs, _gs, gpu=True)
            nw_gs = [x.type.filter_variable(y) for x,y in zip(args[2:],nw_gs)]
            return [args[0] + const(1), args[1] + nw_cost] + nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        cost0 = TT.unbroadcast(const([0]),0)
        n_steps = TT.iscalar('nsteps')
        rvals, updates = scan(grad_step,
                              states=[idx0, cost0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / TT.cast(n_steps, 'float32') for x in rvals[2: 2 + n_params]]
        nw_gs = [og + nwg for og, nwg in zip(self.gs, nw_gs)]
        fcost = rvals[1][0] / TT.cast(n_steps, 'float32')
        updates.update(dict(zip(self.gs, nw_gs)))

        grad_inps = zip(loc_inputs, self.shared_data)
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx, comp_grad, n_steps],
            fcost,
            updates=updates,
            on_unused_input='warn',
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])
        print 'Time to compile grad', print_time(time.time() - cst)
        cst = time.time()
        def jacob_step(*args):
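            # Jacobi preconditioner: probe with random +/-1 vectors scaled
            # by sqrt(1/denom), where `denom` encodes the output metric;
            # the squared Lop products (J^T r) give a stochastic estimate
            # of the Gauss-Newton diagonal, accumulated in args[1:].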
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            replace.update(dict(zip(model.params, model.cpu_params)))
            mode=cpu_mode
            params = model.cpu_params
            # Compute jacobi
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):

                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    denom *= nw_out
                    denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= nw_out
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1:1+n_params], params)]
            return [args[0] + const(1)] + nw_js

        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['mbs'] // options['cbs']
        mode = cpu_mode
        rvals, updates = scan(jacob_step,
                              states=[idx0] + ij,
                              n_steps=n_steps,
                              name='jacob_loop',
                              mode=mode,
                              profile=options['profile'])

        nw_js = [x[0] for x in rvals[1:1+n_params]]
        updates.update(dict(zip(self.js, nw_js)))
        grad_inps = [(x, y[gbdx*options['mbs']:(gbdx+1)*options['mbs']])
                     for x,y in zip(loc_inputs[:1], self.cpu_shared_data[:1])]

        print 'Compiling jacobi preconditioner function'
        self.compute_jacobi_preconditioner = theano.function(
            [gbdx],
            [],
            updates=updates,
            on_unused_input='warn',
            givens=dict(grad_inps),
            name='jacobi_preconditioner_gradients',
            mode=mode,
            profile=options['profile'])
        print 'Time to compile jacobi', print_time(time.time() - cst)
        cst = time.time()
        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')

        mode = cpu_mode
        def compute_Gv(*args):
            cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                 name ='cgv%d'%idx)
                       for idx, shp in enumerate(model.params_shape)]
            print_mem('allocated mem for cgv')
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
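                # One chunk of the metric (Gauss-Newton) vector product:
                # Rop gives J*v for the current output, dividing by `factor`
                # applies the output-distribution metric, and Lop maps the
                # result back through J^T; chunks are summed in gv_args[1:].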
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(cgv, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=gpu_mode,
                                  name='Gv_step',
                                  profile=options['profile'])
            final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
            grad_inps = zip(loc_inputs, self.shared_data)
            loc_fn = theano.function([],
                                     final_Gvs,
                                     updates = updates,
                                     givens = dict(grad_inps),
                                     on_unused_input='warn',
                                     mode=gpu_mode,
                                     name='loc_fn',
                                     profile = options['profile'])
            fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

            return fake_op(*args), {}

        print 'Constructing Riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        mreg = TT.scalar('mreg')
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms = self.js,
            rtol=options['mrtol'],
            shift = - mreg,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        print 'Compiling Riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [mreg],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        nw_ps = [p - lr * r for p, r in zip(model.cpu_params, self.rs)]
        nw_ds = [ -r for r in self.rs]

        self.update_cparams = theano.function(
            [lr], updates = dict(zip(model.cpu_params, nw_ps)),
            name='update_cparam',
            allow_input_downcast=True,
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        newparams = [y.type.filter_variable(x) for x,y in zip(nw_ps,
                                                              model.params)]
        self.update_params = theano.function([lr],
                                             updates = dict(zip(model.params,
                                                                newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=cpu_mode,
                                             profile=options['profile'])
        self.scalar_grad = theano.function(
            [],
            sum(TT.sum(x*y) for x,y in zip(self.gs, self.ds)),
            name='scalar_grad',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        nsteps = options['ebs'] // options['cbs']
        self.current_alpha = numpy.inf
        def ls_cost(alpha, pos):
            if alpha != self.current_alpha:
                self.current_alpha = alpha
                self.update_params(alpha)
            return self.compute_eucledian_gradients(pos, 0, nsteps)
        self.ls_cost_fn = ls_cost

        def ls_grad(alpha, pos):
            if alpha != self.current_alpha:
                self.current_alpha = alpha
                self.update_params(alpha)
            self.reset_gradients()
            self.compute_eucledian_gradients(pos, 1, nsteps)
            return self.scalar_grad()
        self.ls_grad_fn = ls_grad

        self.old_score = 50000
        n_steps = options['ebs']// options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            replace.update(dict(zip(model.params, model.cpu_params)))
            nw_cost = \
                  TT.cast(safe_clone(model.err, replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([],
                           ferr,
                           givens=dict(zip(loc_inputs, self.cpu_shared_data)),
                           name='compute_err',
                           mode=cpu_mode,
                           on_unused_input='warn',
                           profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)

        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
Example #26
0
def minres(compute_Av,
           bs,
           rtol=npy_floatX(1e-6),
           maxiter=20,
           Ms=None,
           damp=npy_floatX(0.),
           maxxnorm=npy_floatX(1e15),
           Acondlim=npy_floatX(1e16),
           mode = None,
           xinit = None,
           profile=0):
    """
     DESCRIPTION:
         minres attempts to find the minimum-length, minimum-residual-norm
         solution x to the system of linear equations A*x = b, or to the
         least-squares problem min ||Ax - b||.  The n-by-n coefficient
         matrix A must be symmetric (but need not be positive definite or
         invertible).  The right-hand-side column vector b must have
         length n.

     INPUTS:
        :param compute_Av: callable returning the symbolic expression for
            `Av`. `v` can be a set of parameters
        :param bs: list of Theano expressions. We are looking to compute
            A^-1 * bs
        :param rtol: Optional, real, specifies the tolerance of the method.
            Default is 1e-6
        :param maxiter: Optional, positive integer, specifies the maximum number of
            iterations. Default is 20
        :param Ms: List of Theano expressions of the same shape as `bs`. The
            method uses these to precondition with diag(Ms)
        :param damp: Optional, scalar, real or complex.  Default is 0.
                   Effectively solve the system (A + damp I) * x = b.
        :param maxxnorm: real positive, maximum bound on NORM(x). Default is 1e15.
        :param Acondlim: real positive, maximum bound on COND(A). Default is 1e16.
        :param xinit: None, or list of ndarrays (of same length as bs)
            containing the initial guess for x[i].

     OUTPUTS:
        x       n-vector, estimated solution
        flag    integer, convergence flag
               -1  beta2 = 0.  If M = I, b and x are eigenvectors.
                0 beta1 = 0.  The exact solution is  x = 0.
                1 A solution to (poss. singular) Ax = b found, given rtol.
                2 Pseudoinverse solution for singular LS problem, given rtol.
                3 A solution to (poss. singular) Ax = b found, given eps.
                4 Pseudoinverse solution for singular LS problem, given eps.
                5 x has converged to an eigenvector.
                6 xnorm has exceeded maxxnorm.
                7 Acond has exceeded Acondlim.
                8 The iteration limit was reached.
                9 It is a least squares problem but no converged solution yet.
               10 relAres has dropped below rtol (Ar-based convergence).
               11 NORM(A*x) is negligible relative to rtol*NORM(A)*NORM(x).
               12 NORM(A) is negligible (A is effectively zero).
        iter    integer, iteration number at which x was computed: 0 <= iter <= maxiter.
        relres  real positive, the relative residual is defined as
                     NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)),
                computed recurrently here.  If flag is 1 or 3,  relres <= TOL.
        relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) ---
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of matrix 2-norm of A.
        Acond   real positive, estimate of condition number of A with
                respect to 2-norm.
        xnorm   non-negative real, recurrently computed NORM(x)
        Axnorm  non-negative real, recurrently computed NORM(A * x).

    EXAMPLE 1:
         n = 100; on = ones(n,1); A = spdiags([-2*on 4*on -2*on],-1:1,n,n);
         b = sum(A,2); rtol = 1e-10; maxiter = 50; M = spdiags(4*on,0,n,n);
         x = minresSOL69(A, b, rtol, maxiter, M);

         Use this matrix-vector product function
            function y = afun(x,n)
            y = 4 * x;
            y(2:n) = y(2:n) - 2 * x(1:n-1);
            y(1:n-1) = y(1:n-1) - 2 * x(2:n);
         as input to minresSOL69
            x1 = minresSOL69(@afun, b, rtol, maxiter, M);

     EXAMPLE 2: A is Laplacian on a 50 by 50 grid, singular and indefinite.
          n = 50; N = n^2; on=ones(n,1);   B = spdiags([on on on], -1:1, n, n);
          A = sparse([],[],[],N,N,(3*n-2)^2);
          for i=1:n
              A((i-1)*n+1:i*n,(i-1)*n+1:i*n) = B;
              if i*n+1 < n*n, A(i*n+1:(i+1)*n,(i-1)*n+1:i*n)=B; end;
              if (i-2)*n+1 > 0  A((i-2)*n+1:(i-1)*n,(i-1)*n+1:i*n)=B;  end;
          end
          b = sum(A,2);   rtol = 1e-5;   maxxnorm = 1e2;
          damp = 0;   Acondlim = [];   show = 1;   M = [];
          x = minresSOL69( A, b, rtol, N, M, damp, maxxnorm, Acondlim, show);

     EXAMPLE 3: A is diagonal, singular and indefinite.
          h = 1;  a = -10; b = -a; n = 2*b/h + 1;
          A = spdiags((a:h:b)', 0, n, n);
          b = ones(n,1);   rtol = 1e-6;   maxxnorm = 1e2;
          damp = 0;   Acondlim = [];   show = 1;   M = [];
          x = minresSOL69( A, b, rtol, N, M, damp, maxxnorm, Acondlim, show);



     REFERENCES:
        Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006.
             http://www.stanford.edu/group/SOL/software.html

    """

    if not isinstance(bs, (tuple, list)):
        bs = [bs]
        return_as_list = False
    else:
        bs = list(bs)
        return_as_list = True

    eps = npy_floatX(1e-23)

    # Initialise
    flag = theano.shared(npy_floatX(0.))

    #------------------------------------------------------------------
    # Set up p and v for the first Lanczos vector v1.
    # p  =  beta1 P' v1,  where  P = C**(-1).
    # v is really P' v1.
    #------------------------------------------------------------------
    if xinit is None:
        xinit = [TT.zeros_like(b) for b in bs]
        r3s = [b for b in bs]
        r2s = [b for b in bs]
        r1s = [b for b in bs]
        beta1 = norm(bs)
        if Ms is not None:
            r3s = [b/m for b,m in zip(bs,Ms)]
            beta1 = norm(r3s, bs)
    else:
        init_Ax = compute_Av(*xinit)
        res = [bs[i] - init_Ax[i] for i in xrange(len(bs))]
        r3s = copy.copy(res)
        r2s = copy.copy(res)
        r1s = copy.copy(res)
        beta1 = norm(res)
        if Ms is not None:
            r3s = [r/m for r,m in zip(r3s, Ms)]
            beta1 = norm(r3s, res)

    #------------------------------------------------------------------
    ## Initialize other quantities.
    # Note that Anorm has been initialized by IsOpSym6.
    # ------------------------------------------------------------------
    bnorm = beta1
    n_params = len(bs)

    def loop(niter,
             beta,
             betan,
             phi,
             Acond,
             cs,
             dbarn,
             eplnn,
             rnorm,
             sn,
             Tnorm,
             rnorml,
             xnorm,
             Dnorm,
             gamma,
             pnorm,
             gammal,
             Axnorm,
             relrnorm,
             relArnorml,
             Anorm,
             flag,
             *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params: 1 * n_params]
        r1s = args[1 * n_params: 2 * n_params]
        r2s = args[2 * n_params: 3 * n_params]
        r3s = args[3 * n_params: 4 * n_params]
        dls = args[4 * n_params: 5 * n_params]
        ds = args[5 * n_params: 6 * n_params]
        betal = beta
        beta = betan
        vs = [r3/beta for r3 in r3s]
        r3s = compute_Av(*vs)
        r3s = [r3 + damp*v for r3,v in zip(r3s, vs)]
        r3s = [TT.switch(TT.ge(niter, numpy.float64(1.)),
                         r3 - (beta/betal)*r1,
                         r3) for r3, r1 in zip(r3s, r1s)]

        alpha = sqnorm(r3s, vs)
        r3s = [r3 - (alpha/beta)*r2 for r3,r2 in zip(r3s,r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3/M for r3, M in zip(r3s, Ms)]
            betan = norm(r2s, r3s)
        else:
            betan = norm(r3s)
        pnorml = pnorm
        pnorm = TT.switch(TT.eq(niter, npy_floatX(0.)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) +
                                  TT.sqr(beta)))


        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs*dbar + sn*alpha
        gbar = sn*dbar - cs*alpha

        eplnn = sn*betan
        dbarn = - cs*betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal  = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs*phi
        phi = sn*phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [TT.switch(TT.neq(gamma, npy_floatX(0.)),
                        (v - epln*dl2 - dlta*dl)/gamma,
                        v)
              for v,dl2,dl in zip(vs,dl2s, dls)]
        d_norm = TT.switch(TT.neq(gamma,npy_floatX(0.)),
                           norm(ds),
                           TT.constant((npy_floatX(numpy.inf))))


        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau*d for x,d in zip(xs,ds)]

        xnorm = norm(xs)
        xs = [TT.switch(TT.ge(xnorm, maxxnorm),
                        dl2,
                        x) for dl2,x in zip(dl2s,xs)]

        flag = TT.switch(TT.ge(xnorm, maxxnorm),
                         npy_floatX(6.), flag)
        # Estimate various norms
        rnorml      = rnorm # ||r_{k-1}||
        Anorml      = Anorm
        Acondl      = Acond
        relrnorml   = relrnorm
        flag_no_6 = TT.neq(flag, npy_floatX(6.))
        Dnorm = TT.switch(flag_no_6,
                          TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, norm(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6,
                             rnorm / (Anorm*xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(flag_no_6,
                          TT.switch(TT.eq(niter, npy_floatX(0.)),
                                    TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                                    TT.sqrt(TT.sqr(Tnorm) +
                                            TT.sqr(beta) +
                                            TT.sqr(alpha) +
                                            TT.sqr(betan))),
                          Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml*rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        #Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = npy_floatX(1) + relrnorm
        t2 = npy_floatX(1) + relArnorml
        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, npy_floatX(0.)),
                          TT.eq(flag, npy_floatX(6.))),
                      TT.switch(TT.le(t1, npy_floatX(1.)),
                                npy_floatX(3.),
                      TT.switch(TT.le(t2, npy_floatX(1.)),
                                npy_floatX(4.),
                      TT.switch(TT.le(relrnorm, rtol),
                                npy_floatX(1.),
                      TT.switch(TT.le(Anorm, npy_floatX(1e-20)),
                                npy_floatX(12),
                      TT.switch(TT.le(relArnorml, rtol),
                                npy_floatX(10.),
                      TT.switch(TT.ge(epsx, beta1),
                                npy_floatX(5.),
                      TT.switch(TT.ge(xnorm, maxxnorm),
                                npy_floatX(6.),
                      TT.switch(TT.ge(niter, TT.cast(maxiter,floatX)),
                                npy_floatX(8.),
                                flag)))))))),
            flag)

        flag = TT.switch(TT.lt(Axnorm, rtol*Anorm*xnorm),
                               npy_floatX(11.), flag)
        return [
            niter + npy_floatX(1.),
            beta,
            betan,
            phi,
            Acond,
            cs,
            dbarn,
            eplnn,
            rnorm,
            sn,
            Tnorm,
            rnorml,
            xnorm,
            Dnorm,
            gamma,
            pnorm,
            gammal,
            Axnorm,
            relrnorm,
            relArnorml,
            Anorm,
            flag] + xs + r1s + r2s + r3s + dls + ds, \
                theano.scan_module.scan_utils.until(TT.neq(flag,0))

    states = []
    # 0 niter
    states.append(TT.constant(npy_floatX([0])))
    # 1 beta
    states.append(TT.constant(npy_floatX([0])))
    # 2 betan
    states.append(TT.unbroadcast(TT.shape_padleft(beta1),0))
    # 3 phi
    states.append(TT.unbroadcast(TT.shape_padleft(beta1),0))
    # 4 Acond
    states.append(TT.constant(npy_floatX([1])))
    # 5 cs
    states.append(TT.constant(npy_floatX([-1])))
    # 6 dbarn
    states.append(TT.constant(npy_floatX([0])))
    # 7 eplnn
    states.append(TT.constant(npy_floatX([0])))
    # 8 rnorm
    states.append(TT.unbroadcast(TT.shape_padleft(beta1),0))
    # 9 sn
    states.append(TT.constant(npy_floatX([0])))
    # 10 Tnorm
    states.append(TT.constant(npy_floatX([0])))
    # 11 rnorml
    states.append(TT.unbroadcast(TT.shape_padleft(beta1),0))
    # 12 xnorm
    states.append(TT.constant(npy_floatX([0])))
    # 13 Dnorm
    states.append(TT.constant(npy_floatX([0])))
    # 14 gamma
    states.append(TT.constant(npy_floatX([0])))
    # 15 pnorm
    states.append(TT.constant(npy_floatX([0])))
    # 16 gammal
    states.append(TT.constant(npy_floatX([0])))
    # 17 Axnorm
    states.append(TT.constant(npy_floatX([0])))
    # 18 relrnorm
    states.append(TT.constant(npy_floatX([1])))
    # 19 relArnorml
    states.append(TT.constant(npy_floatX([1])))
    # 20 Anorm
    states.append(TT.constant(npy_floatX([0])))
    # 21 flag
    states.append(TT.constant(npy_floatX([0])))

    xs  = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit]
    ds  = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit]
    dls = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit]
    r1s = [TT.unbroadcast(TT.shape_padleft(r1),0) for r1 in r1s]
    r2s = [TT.unbroadcast(TT.shape_padleft(r2),0) for r2 in r2s]
    r3s = [TT.unbroadcast(TT.shape_padleft(r3),0) for r3 in r3s]

    rvals, lupds = scan(loop,
                    states = states + xs + r1s + r2s + r3s + dls + ds,
                    n_steps = maxiter + numpy.int32(1),
                    name='minres',
                    profile=profile,
                    mode=mode)

    niters = TT.cast(rvals[0][0], 'int32')
    flag = TT.cast(rvals[21][0], 'int32')
    relres = rvals[18][0]
    relAres = rvals[19][0]
    Anorm = rvals[20][0]
    Acond = rvals[4][0]
    xnorm = rvals[12][0]
    Axnorm = rvals[17][0]
    sol = [x[0] for x in rvals[22:22+n_params]]
    return sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm
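
The `symGivens2` helper used in the loop above is assumed to be the stable
symmetric orthogonal reflection from Choi's MINRES work: given (a, b) it
returns (c, s, r) with [c s; s -c][a; b] = [r; 0] and r = sqrt(a^2 + b^2).
A plain-numpy reference of what it is expected to compute (a hypothetical
reimplementation, not the function the example imports):

import numpy

def sym_givens(a, b):
    # handle the degenerate axes first
    if b == 0.0:
        return (1.0 if a >= 0 else -1.0), 0.0, abs(a)
    if a == 0.0:
        return 0.0, (1.0 if b >= 0 else -1.0), abs(b)
    # divide by the larger entry for numerical stability
    if abs(b) > abs(a):
        t = a / b
        s = (1.0 if b >= 0 else -1.0) / numpy.sqrt(1.0 + t * t)
        c = s * t
        r = b / s
    else:
        t = b / a
        c = (1.0 if a >= 0 else -1.0) / numpy.sqrt(1.0 + t * t)
        s = c * t
        r = a / c
    return c, s, r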
Example #27
0
        def compute_Gv(*args):
            cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                 name ='cgv%d'%idx)
                       for idx, shp in enumerate(model.params_shape)]
            print_mem('allocated mem for cgv')
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(cgv, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=gpu_mode,
                                  name='Gv_step',
                                  profile=options['profile'])
            final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
            grad_inps = zip(loc_inputs, self.shared_data)
            loc_fn = theano.function([],
                                     final_Gvs,
                                     updates = updates,
                                     givens = dict(grad_inps),
                                     on_unused_input='warn',
                                     mode=gpu_mode,
                                     name='loc_fn',
                                     profile = options['profile'])
            fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

            return fake_op(*args), {}
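
The Lop/Rop pattern in `Gv_step` computes a Gauss-Newton (metric) vector
product G*v = J^T (J*v / factor) without ever forming the Jacobian J. A toy,
self-contained sketch for a single sigmoid layer (all names below are
illustrative; `factor` matches the 'sigmoid' branch above):

import numpy
import theano
import theano.tensor as TT

rng = numpy.random.RandomState(0)
W = theano.shared(rng.randn(5, 3).astype(theano.config.floatX), name='W')
x = TT.matrix('x')
v = TT.matrix('v')                      # vector to multiply, same shape as W
out = TT.nnet.sigmoid(TT.dot(x, W))     # network output
cbs = numpy.asarray(4.0, dtype=theano.config.floatX)  # chunk size

factor = cbs * out * (1 - out)          # sigmoid output metric, as above
Jv = TT.Rop(out, W, v)                  # forward mode: J * v
Gv = TT.Lop(out, W, Jv / factor)        # reverse mode: J^T (J * v / factor)
gv_fn = theano.function([x, v], Gv)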
Example #28
0
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2,
          n_iters=10,
          profile = False,
          mode=theano.Mode(linker='cvm')):
    """
    TODO: re-write me

    Part of the optimization algorithm in `scalar_search_wolfe2`.
    a_lo : scalar (step size)
    a_hi : scalar (step size)
    phi_lo : scalar (value of f at a_lo)
    phi_hi : scalar ( value of f at a_hi)
    derphi_lo : scalar ( value of derivative at a_lo)
    phi : callable -> generates computational graph
    derphi: callable -> generates computational graph
    phi0 : scalar ( value of f at 0)
    derphi0 : scalar (value of the derivative at 0)
    c1 : scalar  (wolfe parameter)
    c2 : scalar  (wolfe parameter)
    profile: if you want printouts of profiling information
    """
    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # Interpolate to find a trial step length between a_lo and a_hi.
        # Use cubic interpolation first; if the result is within delta *
        # dalpha of the end points, or outside the interval bounded by
        # a_lo and a_hi, fall back to quadratic interpolation; if the
        # result is still too close, use bisection.
        dalpha = a_hi-a_lo
        a = TT.switch( dalpha < zero, a_hi, a_lo)
        b = TT.switch( dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1*dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2*dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',TT.isnan(a_j_quad), a_j_quad > b-qchk, a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo +
                             numpy.asarray(0.5, dtype=theano.config.floatX)*dalpha, a_j_quad)


        # pick between the two ..
        cond_c = lazy_or('condc',TT.isnan(a_j_cubic), TT.bitwise_or(a_j_cubic > b -
                                                            cchk, a_j_cubic
                                                            < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop', TT.bitwise_and(phi_aj <= phi0 + c1*a_j*derphi0,
                         phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2*derphi0)


        cond1 = TT.bitwise_or(phi_aj > phi0 + c1*a_j*derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj*(a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse( cond1, phi_hi,
                            TT.switch( cond2, phi_hi, phi_lo), name =
                         'phi_rec')
        a_rec   = ifelse( cond1, a_hi,
                            TT.switch( cond2, a_hi, a_lo), name='a_rec')
        a_hi    = ifelse( cond1, a_j,
                            TT.switch( cond2, a_lo, a_hi), name='a_hi')
        phi_hi  = ifelse( cond1, phi_aj,
                            TT.switch( cond2, phi_lo, phi_hi), name='phi_hi')

        a_lo      = TT.switch(cond1, a_lo, a_j)
        phi_lo    = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan, TT.switch(cond2, derphi_aj,
                                                  nan), name='valprime')

        return ( [ phi_rec,
                  a_rec,
                  a_lo,
                  a_hi,
                  phi_hi,
                  phi_lo,
                  derphi_lo,
                  a_star,
                  val_star,
                  valprime],
                theano.scan_module.scan_utils.until(stop) )

    maxiter = n_iters
    delta1 = TT.constant(numpy.asarray(0.2,
                                       dtype=theano.config.floatX))  # cubic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1,
                                       dtype=theano.config.floatX))  # quadratic interpolant check
    phi_rec = phi0
    a_rec = zero

    # Initial iteration

    dalpha = a_hi-a_lo
    a = TT.switch( dalpha < zero, a_hi, a_lo)
    b = TT.switch( dalpha < zero, a_lo, a_hi)
    #a = ifelse(dalpha < 0, a_hi, a_lo)
    #b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection


    # quadratic interpolation
    qchk = delta2*dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',TT.isnan(a_j), TT.bitwise_or( a_j > b-qchk, a_j < a +
                                                  qchk))

    a_j = TT.switch(cond_q, a_lo +
                    numpy.asarray(0.5, dtype=theano.config.floatX)*dalpha, a_j)


    # Check new value of a_j

    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)



    cond1 = TT.bitwise_or(phi_aj > phi0 + c1*a_j*derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj*(a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse( cond1, phi_hi,
                        TT.switch( cond2, phi_hi, phi_lo), name='mphirec')
    a_rec   = ifelse( cond1, a_hi,
                        TT.switch( cond2, a_hi, a_lo), name='marec')
    a_hi    = ifelse( cond1, a_j,
                        TT.switch( cond2, a_lo, a_hi), name='mahi')
    phi_hi  = ifelse( cond1, phi_aj,
                        TT.switch( cond2, phi_lo, phi_hi), name='mphihi')

    onlyif = lazy_and( 'only_if', TT.bitwise_and(phi_aj <= phi0 + c1*a_j*derphi0,
                       phi_aj < phi_lo),
                       abs(derphi_aj) <= -c2*derphi0)

    a_lo      = TT.switch(cond1, a_lo, a_j)
    phi_lo    = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name = 'derphi_lo_main')
    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(phi_rec),0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_rec),0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_lo),0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_hi),0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_hi),0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_lo),0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_lo),0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    print 'while_zoom'
    outs, updates = scan(while_zoom,
                         states = states,
                         n_steps = maxiter,
                         name = 'while_zoom',
                         mode = mode,
                         profile = profile)
    print 'done_while'
    a_star   = ifelse(onlyif, a_j   , outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
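
The `_quadmin` (and `_cubicmin`) helpers used above appear to be Theano ports
of SciPy's line-search interpolation routines. For reference, a plain-numpy
version of the quadratic one, the minimizer of the parabola matching phi(a),
derphi(a) and phi(b) (a hypothetical reimplementation; callers are expected
to guard against NaN and zero-curvature results, as the TT.isnan checks
above do):

import numpy

def quadmin(a, fa, fpa, b, fb):
    # minimize q(x) = B*(x - a)**2 + C*(x - a) + D, which interpolates
    # q(a) = fa, q'(a) = fpa, q(b) = fb
    D = fa
    C = fpa
    db = b - a
    B = (fb - D - C * db) / (db * db)
    return a - C / (2.0 * B)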
Example #29
0
def krylov_subspace(compute_Av,
                    bs,
                    old_dir,
                    iters=20,
                    param_shapes=None,
                    profile=0,
                    device='gpu'):
    eps = numpy.float32(1e-20)
    bs = [b / tensor.sqrt((b**2).sum() + eps) for b in bs]
    mem_bufs = [
        tensor.alloc(zero, iters, *param_sh) for param_sh in param_shapes
    ]
    mem_bufs = [
        tensor.set_subtensor(mem[0], b) for mem, b in zip(mem_bufs, bs)
    ]

    def construct_space(*args):
        vs, updates = compute_Av(*args)
        # I need to rescale at every point, otherwise if A is damping, these
        # vs go quickly to 0 and we lose the direction they represent
        norm = TT.sqrt(sum((v**2).sum() for v in vs)) + numpy.float32(1e-20)
        vs = [v / norm for v in vs]
        return vs, updates

    if device == 'gpu':
        mode = gpu_mode
    else:
        mode = cpu_mode
    outs, updates = scan(construct_space,
                         states=mem_bufs,
                         n_steps=iters - 2,
                         name='krylov_space',
                         mode=mode,
                         profile=profile)
    if not isinstance(outs, (list, tuple)):
        outs = [outs]
    outs = [
        tensor.set_subtensor(out[iters - 1], o)
        for out, o in zip(outs, old_dir)
    ]
    outs = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in outs]
    param_lengths = [numpy.prod(shp) for shp in param_shapes]

    def ortho(idx, *ortho_mats):
        new_ortho_mats = []
        for A, param_length in zip(ortho_mats, param_lengths):
            weight = tensor.dot(
                A[idx + 1:].reshape((iters - idx - 1, param_length)),
                A[idx].reshape((param_length, )))
            A_reshuffle = ['x'] + list(range(A[idx].ndim))
            W_reshuffle = [0] + ['x'] * A[idx].ndim
            to_remove = weight.dimshuffle(*W_reshuffle) *\
                        A[idx].dimshuffle(*A_reshuffle)
            new_A = tensor.set_subtensor(A[idx + 1:], A[idx + 1:] - to_remove)
            x_col = new_A[idx + 1]
            x_col = x_col / tensor.sqrt((x_col**2).sum() + eps)
            new_A = tensor.set_subtensor(new_A[idx + 1], x_col)
            new_ortho_mats.append(new_A)
        return new_ortho_mats

    rvals, _ = scan(ortho,
                    sequences=tensor.constant(numpy.arange(iters - 1)),
                    states=outs,
                    n_steps=iters - 1,
                    name='ortho',
                    profile=profile,
                    mode=mode)
    if not isinstance(rvals, (list, tuple)):
        rvals = [rvals]
    rvals = [rval[0] * .1 for rval in rvals]
    return rvals, updates
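
The `ortho` step above is a modified Gram-Schmidt pass over the rows of each
flattened subspace matrix. A minimal NumPy sketch of the same update,
assuming (as `krylov_subspace` guarantees on entry) that the first row is
already unit-norm; `modified_gram_schmidt` is an illustrative name.

import numpy

def modified_gram_schmidt(A, eps=1e-20):
    # A is (iters, param_length); orthonormalize its rows, mirroring
    # the set_subtensor updates in `ortho`.
    A = A.copy()
    for idx in range(A.shape[0] - 1):
        # Remove the component along A[idx] from all later rows ...
        weight = A[idx + 1:].dot(A[idx])
        A[idx + 1:] -= weight[:, None] * A[idx][None, :]
        # ... then renormalize the next row before it becomes the pivot.
        A[idx + 1] /= numpy.sqrt((A[idx + 1] ** 2).sum() + eps)
    return A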
Example #30
0
def scalar_search_wolfe2(phi,
                         derphi,
                         phi0=None,
                         old_phi0=None,
                         derphi0=None,
                         n_iters=20,
                         c1=1e-4,
                         c2=0.9,
                        profile=False):
    """
    Find alpha that satisfies strong Wolfe conditions.

    alpha > 0 is assumed to be a descent direction.

    Parameters
    ----------
    phi : callable f(x)
        Objective scalar function.
    derphi : callable f'(x)
        Objective function derivative (can be None)
    phi0 : float, optional
        Value of phi at s=0
    old_phi0 : float, optional
        Value of phi at previous point
    derphi0 : float, optional
        Value of derphi at s=0
    c1 : float
        Parameter for Armijo condition rule.
    c2 : float
        Parameter for curvature condition rule.
    profile : flag (boolean)
        True if you want printouts of profiling information

    Returns
    -------
    alpha_star : float
        Best alpha
    phi_star : float
        phi at alpha_star
    phi0 : float
        phi at 0
    derphi_star : float
        derphi at alpha_star

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions.  See Wright and Nocedal, 'Numerical Optimization',
    1999, pg. 59-60.

    For the zoom phase it uses an algorithm by [...].

    """

    if phi0 is None:
        phi0 = phi(zero)

    if derphi0 is None and derphi is not None:
        derphi0 = derphi(zero)

    alpha0 = zero
    alpha0.name = 'alpha0'
    if old_phi0 is not None:
        alpha1 = TT.minimum(one,
                            numpy.asarray(1.01, dtype=theano.config.floatX) *
                            numpy.asarray(2, dtype=theano.config.floatX) * \
                            (phi0 - old_phi0) / derphi0)
    else:
        old_phi0 = nan
        alpha1 = one

    alpha1 = TT.switch(alpha1 < zero, one, alpha1)
    alpha1.name = 'alpha1'

    # This shouldn't happen. Perhaps the increment has slipped below
    # machine precision?  For now, set the return variables, skip the
    # useless while loop, and raise warnflag=2 due to possible imprecision.
    phi0 = TT.switch(TT.eq(alpha1, zero), old_phi0, phi0)
    # I need a lazyif for alpha1 == 0 !!!
    phi_a1 = ifelse(TT.eq(alpha1, zero), phi0,
                    phi(alpha1), name='phi_a1')
    phi_a1.name = 'phi_a1'

    phi_a0 = phi0
    phi_a0.name = 'phi_a0'
    derphi_a0 = derphi0
    derphi_a0.name = 'derphi_a0'
    # Make sure variables are tensors otherwise strange things happen
    c1 = TT.as_tensor_variable(c1)
    c2 = TT.as_tensor_variable(c2)
    maxiter = n_iters

    def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                    alpha_star, phi_star, derphi_star):
        derphi_a1 = derphi(alpha1)
        cond1 = TT.bitwise_or(phi_a1 > phi0 + c1 * alpha1 * derphi0,
                              TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
        cond2 = abs(derphi_a1) <= -c2 * derphi0
        cond3 = derphi_a1 >= zero
        alpha_star_c1, phi_star_c1, derphi_star_c1 = \
                _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
                      phi, derphi, phi0, derphi0, c1, c2,
                     profile=profile)
        alpha_star_c3, phi_star_c3, derphi_star_c3 = \
                _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi,
                      derphi, phi0, derphi0, c1, c2,
                     profile=profile)
        nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
        nw_phi = phi(nw_alpha1)
        alpha_star, phi_star, derphi_star = \
                ifelse(cond1,
                          (alpha_star_c1, phi_star_c1, derphi_star_c1),
                ifelse(cond2,
                          (alpha1, phi_a1, derphi_a1),
                ifelse(cond3,
                          (alpha_star_c3, phi_star_c3, derphi_star_c3),
                           (nw_alpha1, nw_phi, nan),
                      name='alphastar_c3'),
                      name='alphastar_c2'),
                      name='alphastar_c1')

        return ([alpha1,
                 nw_alpha1,
                 phi_a1,
                 ifelse(lazy_or('allconds',
                                cond1,
                                cond2,
                                cond3),
                        phi_a1,
                        nw_phi,
                        name='nwphi1'),
                 ifelse(cond1, derphi_a0, derphi_a1, name='derphi'),
                 i_t + one,
                 alpha_star,
                 phi_star,
                 derphi_star],
                theano.scan_module.scan_utils.until(
                    lazy_or('until_cond_',
                            TT.eq(nw_alpha1, zero),
                            cond1,
                            cond2,
                            cond3)))
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_a0), 0)]
    # i_t
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # alpha_star
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # phi_star
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # derphi_star
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # print 'while_search'
    outs, updates = scan(while_search,
                         states=states,
                         n_steps=maxiter,
                         name='while_search',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while_search'
    out3 = outs[-3][0]
    out2 = outs[-2][0]
    out1 = outs[-1][0]
    alpha_star, phi_star, derphi_star = \
            ifelse(TT.eq(alpha1, zero),
                        (nan, phi0, nan),
                        (out3, out2, out1), name='main_alphastar')
    return alpha_star, phi_star,  phi0, derphi_star
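
As a plain-Python restatement, the two tests that `while_search` and `_zoom`
enforce are the strong Wolfe conditions (Nocedal & Wright, p. 59-60). A
sketch, assuming `phi` and `derphi` are ordinary Python callables and that
`derphi0 < 0` (a descent direction); `satisfies_strong_wolfe` is an
illustrative name, not part of the source.

def satisfies_strong_wolfe(phi, derphi, alpha, phi0, derphi0,
                           c1=1e-4, c2=0.9):
    # Armijo / sufficient-decrease condition.
    armijo = phi(alpha) <= phi0 + c1 * alpha * derphi0
    # Strong curvature condition.
    curvature = abs(derphi(alpha)) <= -c2 * derphi0
    return armijo and curvature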
Example #31
0
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name ='cgv%d'%idx)
                           for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(cgv, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates = updates,
                                         givens = dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile = options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}
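
For intuition, the Lop/Rop composition in `Gv_step` realizes a Gauss-Newton
(Fisher-like) metric-vector product without ever materializing the Jacobian.
With an explicit Jacobian J of one output with respect to a single flattened
parameter vector, the same product reduces to the sketch below; `factor`
stands in for the softmax/sigmoid/linear scaling chosen above, and the
explicit-J form is an illustration only.

import numpy

def gauss_newton_Gv(J, v, factor):
    # Rop(nw_out, params, v) is the forward product J.dot(v);
    # Lop(nw_out, params, .) is the reverse product J.T.dot(.);
    # dividing by `factor` applies the output-distribution scaling.
    return J.T.dot(J.dot(v) / factor)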
Example #32
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load, containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']],
                               name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store Euclidean gradients
            self.gs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
            # Store Riemannian gradients
            self.rs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
        else:
            # Store Euclidean gradients
            self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
            # Store Riemannian gradients
            self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        rbpos = rbdx * options['mbs']

        if options['device'] == 'gpu':
            mode=gpu_mode
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(args, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name ='cgv%d'%idx)
                           for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(cgv, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates = updates,
                                         givens = dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile = options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}



        print 'Constructing Riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            rtol=options['mrtol'],
            shift= -options['mreg'],
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))


        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:
                           (rbdx + 1) * options['mbs']])
                     for x,y in zip(loc_inputs[:1], shared_data[:1])]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'compiling evaluation function'
        self.eval_fn = theano.function(
            [ebdx, lr],
            final_cost,
            givens=dict(grad_inps),
            on_unused_input='warn',
            updates = updates,
            name='eval_fn',
            mode=gpu_mode,
            profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_dict,
            name='update_params',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                           ferr,
                           givens=dict(grad_inps),
                           name='compute_err',
                           mode=gpu_mode,
                           on_unused_input='warn',
                           profile=options['profile'])
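
Stripped of batching and compilation, the Riemannian update that Steps 1-3
implement is: rescale the gradient, solve G r = g with an iterative solver
(minres above), undo the rescaling, and take a descent step. A NumPy-level
sketch, with the hypothetical `metric_solve` standing in for
`minres.minres(compute_Gv, ...)`.

import numpy

def natural_gradient_step(params, grads, metric_solve, lr):
    # Normalize the gradient for numerical stability, as done with
    # norm_grads above, then solve the metric system and rescale back.
    norm_g = numpy.sqrt(sum((g ** 2).sum() for g in grads))
    rs = [r * norm_g for r in metric_solve([g / norm_g for g in grads])]
    # Gradient descent along the Riemannian direction.
    return [p - lr * r for p, r in zip(params, rs)]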
Example #33
0
def linear_cg(compute_Ax, b, M=None, xinit=None, rtol=1e-16, maxiter=100000, damp=0.0, floatX=None):
    """
    Solves the system A x[i] = b[i], for all i.
    
    When used as part of a Newton-CG method, b is a list of gradients, where each element of
    this list represents a gradient for a given parameter type (i.e. weight or bias of a given
    layer). This method will return a list whose elements approximate A^{-1} b[i], with the
    precision determined by maxiter or the specified tolerance level. This particular
    version implements the Polak-Ribiere flavor of CG.

    Parameters:
    :param compute_Ax: python function which symbolically computes the matrix-vector product.
    :param b: list of T.vector, corresponding to A x[i] = b[i]
    :param M: list of T.vector (same length as b). Each element is used to precondition its
    corresponding element of the A-diagonal. If [Mi for Mi in M] contains the diagonal elements
    of A, this will implement Jacobi preconditioning.
    :param xinit: list of T.vector (same length as b). x[i] is initial guess for A^{-1} b[i].
    :param rtol: float. CG will stop when the squared norm of the residual < rtol.
    :param maxiter: int. Maximum allowable iterations for CG.
    :param damp: float. Damping factor, equivalent to adding a term along the diagonal of A.
    :param floatX: 'float32' or 'float64'.

    Return values:
    rval[0]: list of approximate values for A^{-1} b[i].
    rval[1]: niter, number of iterations run by CG.
    rval[2]: residual error norm.

    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(b)

    def loop(niter, rkp_norm, *args):
        pk = args[:n_params]
        rk = args[n_params : 2 * n_params]
        zk = args[2 * n_params : 3 * n_params]
        xk = args[-n_params:]
        A_pk_temp = compute_Ax(*pk)
        A_pk = [A_pk_temp_ + damp * pk_ for A_pk_temp_, pk_ in zip(A_pk_temp, pk)]
        alphak_num = sum((rk_ * zk_).sum() for rk_, zk_ in zip(rk, zk))
        alphak_denum = sum((A_pk_ * pk_).sum() for A_pk_, pk_ in zip(A_pk, pk))
        alphak = alphak_num / alphak_denum
        xkp1 = [xk_ + alphak * pk_ for xk_, pk_ in zip(xk, pk)]
        rkp1 = [rk_ - alphak * A_pk_ for rk_, A_pk_, in zip(rk, A_pk)]
        if M:
            zkp1 = [rkp1_ / m_ for rkp1_, m_ in zip(rkp1, M)]
        else:
            zkp1 = rkp1
        # compute beta_k using Polak-Ribiere
        betak_num = sum((zkp1_ * (rkp1_ - rk_)).sum() for rkp1_, rk_, zkp1_ in zip(rkp1, rk, zkp1))
        betak_denum = alphak_num
        betak = betak_num / betak_denum
        pkp1 = [zkp1_ + betak * pk_ for zkp1_, pk_ in zip(zkp1, pk)]
        # compute termination criteria
        rkp1_norm = sum((rkp1_ ** 2).sum() for rkp1_ in rkp1)
        return [niter + 1, rkp1_norm] + pkp1 + rkp1 + zkp1 + xkp1, theano.scan_module.until(abs(rkp1_norm) < rtol)

    # Initialize residual based on xinit
    if xinit is None:
        r0_temp = b
        x0 = [tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(b_))) for b_ in b]
    else:
        init_Ax = compute_Ax(*xinit)
        r0_temp = [b[i] - init_Ax[i] for i in xrange(len(b))]
        x0 = [tensor.unbroadcast(tensor.shape_padleft(xinit_)) for xinit_ in xinit]

    # Leftpad r0, z0 and p0 for scan.
    r0 = [tensor.unbroadcast(tensor.shape_padleft(r0_temp_)) for r0_temp_ in r0_temp]
    if M:
        z0 = [tensor.unbroadcast(tensor.shape_padleft(r0_temp_ / m_)) for r0_temp_, m_ in zip(r0_temp, M)]
    else:
        z0 = r0
    p0 = z0

    states = []
    # 0 niter
    states.append(tensor.constant(npy_floatX([0])))
    # 1 residual error norm
    states.append(tensor.constant(npy_floatX([0])))

    outs, updates = scan(
        loop,
        states=states + p0 + r0 + z0 + x0,
        n_steps=maxiter,
        mode=theano.Mode(linker="c|py"),
        name="linear_conjugate_gradient",
        profile=0,
    )
    sol = [x[0] for x in outs[-n_params:]]
    niter = outs[0][0]
    rerr = outs[1][0]
    return [sol, niter, rerr]
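
The same recursion in plain NumPy, for a single dense system, may help check
the algebra. This is a sketch, not the Theano graph: A is an explicit matrix
standing in for the `compute_Ax` callable, and M (if given) holds the
diagonal entries used for Jacobi preconditioning.

import numpy

def linear_cg_np(A, b, x0=None, M=None, rtol=1e-16, maxiter=1000):
    x = numpy.zeros_like(b) if x0 is None else x0.copy()
    r = b - A.dot(x)
    z = r / M if M is not None else r
    p = z.copy()
    niter = 0
    for niter in range(maxiter):
        Ap = A.dot(p)
        alpha = r.dot(z) / p.dot(Ap)
        x = x + alpha * p
        r_new = r - alpha * Ap
        z_new = r_new / M if M is not None else r_new
        # Polak-Ribiere beta, matching betak_num / betak_denum above.
        beta = z_new.dot(r_new - r) / r.dot(z)
        p = z_new + beta * p
        r, z = r_new, z_new
        # Same stopping rule as the scan: squared residual norm < rtol.
        if (r ** 2).sum() < rtol:
            break
    return x, niter, (r ** 2).sum()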
Example #34
0
    def __init__(self,
                 options,
                 channel,
                 data,
                 model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the krylov
                    subspace
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lbfgsIters` -> int
                    Number of L-BFGS iterations
                `krylovDim` -> int
                    Dimension of the Krylov subspace
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load, containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data
        xdata = theano.shared(data['train_x'],
                              name='xdata')
        ydata = theano.shared(data['train_y'],
                              name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]
        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        rng = numpy.random.RandomState(options['seed'])
        self.rng = rng
        self.options = options
        self.channel = channel
        self.model = model
        n_dimensions = options['krylovDim']
        self.n_dimensions = n_dimensions
        if options['device']=='gpu':
            cfn_subspaces = \
                [theano.shared(numpy.zeros(
                                (n_dimensions,) + shp, dtype='float32'),
                               name='cfn{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in enumerate(zip(model.params_shape,
                                                      model.params))]
            old_deltas = \
                [theano.shared(numpy.zeros(shp, dtype='float32'),
                               name='delta{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in
                            enumerate(zip(model.params_shape, model.params))]
            self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        else:
            cfn_subspaces = \
                [TT._shared(numpy.zeros(
                                (n_dimensions,) + shp, dtype='float32'),
                               name='cfn{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in enumerate(zip(model.params_shape,
                                                      model.params))]
            old_deltas = \
                [TT._shared(numpy.zeros(shp, dtype='float32'),
                               name='delta{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in
                            enumerate(zip(model.params_shape, model.params))]
            self.gs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        self.cfn_subspaces = cfn_subspaces
        self.old_deltas = old_deltas

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        print 'Constructing grad function'
        loc_inputs = [x.type(name='locx') for x in model.inputs]
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        updates.update(dict(zip(self.gs, nw_gs)))
        gdx = TT.iscalar('gdx')
        grad_inps = zip(loc_inputs,
                        [x[gdx*options['gbs']:(gdx+1)*options['gbs']] for x
                         in shared_data])
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        if options['device'] == 'gpu':
            mode=gpu_mode
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone([model.train_cost,
                                                           model.preactiv_out],
                                                          replace)
                    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                  TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                         model.params, args))

                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name ='cgv%d'%idx)
                           for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone([model.train_cost,
                                                           model.preactiv_out],
                                                          replace)
                    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                  TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                         model.params, cgv))

                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates = updates,
                                         givens = dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile = options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}



        rvals, updates = krylov_subspace(
            compute_Gv,
            self.gs,
            old_deltas,
            n_dimensions,
            model.params_shape,
            profile=options['profile'],
            device=options['device'])

        gdx = TT.iscalar('gdx')
        grad_inps = zip(loc_inputs,
                        [x[gdx*options['mbs']:(gdx+1)*options['mbs']] for x
                         in shared_data])
        updates.update(dict(zip(cfn_subspaces, rvals)))
        self.update_krylov_subspace = theano.function(
            [gdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            profile=options['profile'],
            on_unused_input='warn',
            name='update_krylov_subspace',
            mode=mode)

        alphas = tensor.vector('alphas')
        deltas = []
        nw_params = []
        if options['device'] == 'gpu':
            params = model.params
        else:
            params = model.cpu_params

        for param, subspace in zip(params, cfn_subspaces):
            alpha_reshuffle = [0] + ['x'] * param.ndim
            delta = (alphas.dimshuffle(*alpha_reshuffle) * \
                        subspace).sum(axis=0)
            nw_param = param + delta
            nw_params.append(nw_param)
            deltas.append(delta)
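
        # The dimshuffle pair above broadcasts alphas[i] against the i-th
        # basis tensor, so each `delta` is the linear combination
        # sum_i alphas[i] * subspace[i], and `nw_param` is the parameter
        # moved along that combination of Krylov directions.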

        print 'constructing evaluation function'
        ebdx = TT.iscalar('ebdx')

        updates_dict = dict(zip(model.params + old_deltas,
                                nw_params + deltas))
        if options['device'] != 'gpu':
            updates_dict.update(dict(zip(model.cpu_params, nw_params)))

        self.update_params = theano.function([alphas],
                                             updates = updates_dict,
                                             name='update_params',
                                             allow_input_downcast=True,
                                             mode=mode,
                                             profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']
        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps +
                               nw_params))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_cost_step,
                        states = states,
                        n_steps = n_steps,
                        name='ls_cost_step',
                        mode=gpu_mode,
                        profile = options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps +
                               nw_params))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, alphas)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.zeros((1, n_dimensions),dtype='float32'))]
        rvals, _ = scan(ls_grad_step,
                        states = states,
                        n_steps = n_steps,
                        name = 'ls_grad_step',
                        mode = gpu_mode,
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)

        grad_inps = zip(loc_inputs,
                        [x[ebdx*options['ebs']:(ebdx+1)*options['ebs']] for x
                         in shared_data])
        self.lbfgs_fn = theano.function([alphas, ebdx],
                                   #theano.printing.Print('fcost')(fcost),
                                    fcost,
                                   givens=grad_inps,
                                   allow_input_downcast=True,
                                   on_unused_input='warn',
                                   name='lbfgs_fn',
                                   profile=options['profile'],
                                   mode=gpu_mode)
        self.lbfgs_grad = theano.function([alphas, ebdx],
                                     fgrad,
                                     givens=grad_inps,
                                     on_unused_input='warn',
                                     allow_input_downcast=True,
                                     name='lbfgs_grad',
                                     profile=options['profile'],
                                     mode=gpu_mode)

        n_steps = options['ebs'] // options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([],
                           ferr,
                           givens=dict(zip(loc_inputs, shared_data)),
                           name='compute_err',
                           mode=gpu_mode,
                           on_unused_input='warn',
                           profile=options['profile'])
Example #35
0
    def init_gpu(self, options, channel, data, model):
        # Step 1. Compile function for computing Euclidean gradients
        eps = numpy.float32(1e-24)
        gbdx = TT.iscalar('grad_batch_idx')
        n_params = len(self.model.params)
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))


        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            # Compute Jacobi preconditioner terms (stochastic diagonal estimate)
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout *
                                     (numpy.float32(1) - tnwout)) * factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [
                oj + final_results[p]
                for oj, p in zip(args[1 + n_params:1 +
                                      2 * n_params], model.params)
            ]
            return [args[0] + const(1)] + nw_gs + nw_js
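
        # Note: nw_js appears to accumulate a Hutchinson-style stochastic
        # estimate of the diagonal of the Gauss-Newton matrix: for a random
        # sign vector r, E[(J^T r)_i ** 2] equals the i-th diagonal entry of
        # J^T diag(factor ** 2) J, with `factor` folding in the
        # output-distribution scaling chosen above.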

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        ij = [
            TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            allow_input_downcast=True,
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates

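        # compute_Gv builds a matrix-free metric (Gauss-Newton-like) vector
        # product: per output, Rop pushes the vector through the Jacobian,
        # the result is rescaled by the output nonlinearity (the `factor`
        # terms), and Lop pulls it back through the transposed Jacobian;
        # chunk results are accumulated and averaged over `mbs // cbs` steps.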
        print 'Constructing Riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        self.damping = theano.shared(numpy.float32(options['mreg']))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              Ms=self.js,
                              rtol=options['mrtol'],
                              shift=self.damping,
                              maxit=options['miters'],
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        reset = TT.scalar(dtype='int8', name='reset')

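        # Nonlinear conjugate-gradient direction update: the inner products
        # below form a beta in the spirit of the Hager-Zhang (CG_DESCENT)
        # rule, and beta is forced to 0 when a reset is requested or the
        # computation yields NaN/Inf.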
        norm_kkm1 = sum([(r * g).sum() for r, g in zip(self.rs, self.gs)])
        norm_kk = sum([(r * g).sum() for r, g in zip(nw_rs, self.gs)])
        norm_dk = sum([(d * g).sum() for d, g in zip(self.ds, self.gs)])

        norm_y = norm_kk - 2 * norm_kkm1 + self.norm_km1km1
        beta_k = ((norm_kk - norm_kkm1) / (norm_dk - self.norm_dkm1) -
                  2 * norm_y * (norm_dk / ((norm_dk - self.norm_dkm1) ** 2)))
        beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)), beta_k)
        beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k), TT.isinf(beta_k)),
                           TT.constant(numpy.float32(0.)), beta_k)

        nwds = [-r + beta_k * d for r, d in zip(nw_rs, self.ds)]
        self.nwds = nwds
        nw_normd = (TT.sqrt(sum([(d * d).sum() for d in nwds])) +
                    numpy.float32(1e-25))

        updates.update(dict(zip(self.rs, nw_rs)))
        updates.update(dict(zip(self.ds, nwds)))
        updates[self.norm_km1km1] = norm_kk
        updates[self.norm_dkm1] = norm_dk
        updates[self.norm_d] = nw_normd
        print 'Compiling riemannian gradient function'
        cst = time.time()
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.compute_riemannian_gradients = theano.function(
            [reset, rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0, beta_k
            ],
            updates=updates,
            allow_input_downcast=True,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'Constructing evaluation function'
        lr = TT.scalar('lr')
        newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
        nw_ds = [-r for r in self.rs]
        nw_normd = TT.sqrt(sum([(r * r).sum() for r in self.rs]))
        self.update_params = theano.function([lr],
                                             updates=dict(
                                                 zip(model.params, newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=gpu_mode,
                                             profile=options['profile'])
        self.reset_directions = theano.function(
            [],
            updates=dict(zip(self.ds + [self.norm_d], nw_ds + [nw_normd])),
            name='reset_dirs',
            on_unused_input='warn',
            mode=cpu_mode,
            allow_input_downcast=True,
            profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']

        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_cost_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_cost_step',
                        profile=options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        ebdx = TT.iscalar('ebdx')
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.ls_cost_fn = theano.function([lr, ebdx],
                                          fcost,
                                          givens=grad_inps,
                                          allow_input_downcast=True,
                                          name='ls_cost_fn',
                                          mode=gpu_mode,
                                          profile=options['profile'])

        self.approx_change = theano.function(
            [lr],
            -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.ds)]),
            allow_input_downcast=True,
            name='approx_change',
            mode=gpu_mode,
            profile=options['profile'])

        self.ls_grad_fn = theano.function([lr, ebdx],
                                          fgrad,
                                          allow_input_downcast=True,
                                          givens=grad_inps,
                                          name='ls_grad_fn',
                                          mode=gpu_mode,
                                          profile=options['profile'])

        self.old_score = 50000
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=cpu_mode,
                                             allow_input_downcast=True,
                                             on_unused_input='warn',
                                             profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)
        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
Example #36
0
def krylov_subspace(compute_Av,
                    bs,
                    old_dir,
                    iters=20,
                    param_shapes=None,
                    profile=0,
                    device='gpu'):
    eps = numpy.float32(1e-20)
    bs = [b / tensor.sqrt((b ** 2).sum() + eps) for b in bs]
    mem_bufs = [tensor.alloc(zero, iters, *param_sh)
                for param_sh in param_shapes]
    mem_bufs = [tensor.set_subtensor(mem[0], b)
                for mem, b in zip(mem_bufs, bs)]

    def construct_space(*args):
        vs, updates = compute_Av(*args)
        # We need to rescale at every step; otherwise, if A is damping, these
        # vs quickly go to 0 and we lose the direction they represent
        norm = TT.sqrt(sum((v**2).sum() for v in vs)) + numpy.float32(1e-20)
        vs = [v / norm for v in vs]
        return vs, updates
    if device == 'gpu':
        mode = gpu_mode
    else:
        mode = cpu_mode
    outs, updates = scan(construct_space,
                         states=mem_bufs,
                         n_steps=iters - 2,
                         name='krylov_space',
                         mode=mode,
                         profile=profile)
    if not isinstance(outs, (list, tuple)):
        outs = [outs]
    outs = [tensor.set_subtensor(out[iters - 1], o)
            for out, o in zip(outs, old_dir)]
    outs = [tensor.unbroadcast(tensor.shape_padleft(x), 0)
            for x in outs]
    param_lengths = [numpy.prod(shp) for shp in param_shapes]

    def ortho(idx, *ortho_mats):
        new_ortho_mats = []
        for A, param_length in zip(ortho_mats, param_lengths):
            weight = tensor.dot(A[idx + 1:].reshape(
                (iters - idx - 1, param_length)),
                A[idx].reshape((param_length,)))
            A_reshuffle = ['x'] + list(range(A[idx].ndim))
            W_reshuffle = [0] + ['x'] * A[idx].ndim
            to_remove = weight.dimshuffle(*W_reshuffle) *\
                        A[idx].dimshuffle(*A_reshuffle)
            new_A = tensor.set_subtensor(A[idx + 1:],
                                         A[idx + 1:] - to_remove)
            x_col = new_A[idx + 1]
            x_col = x_col / tensor.sqrt((x_col ** 2).sum()+eps)
            new_A = tensor.set_subtensor(new_A[idx + 1], x_col)
            new_ortho_mats.append(new_A)
        return new_ortho_mats
    rvals, _ = scan(ortho,
                    sequences=tensor.constant(numpy.arange(iters - 1)),
                    states=outs,
                    n_steps=iters - 1,
                    name='ortho',
                    profile=profile,
                    mode=mode)
    if not isinstance(rvals, (list, tuple)):
        rvals = [rvals]
    rvals = [rval[0] * .1 for rval in rvals]
    return rvals, updates
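The `ortho` scan above performs a modified Gram-Schmidt pass over the stacked
Krylov vectors: step `idx` subtracts the component along row `idx` from all
later rows, then renormalizes row `idx + 1`. Below is a minimal,
self-contained NumPy sketch of the same pass on a single, already-flattened
buffer (illustrative only; `gram_schmidt_rows` and its argument names are
made up here):

import numpy

def gram_schmidt_rows(A, eps=1e-20):
    # A: (iters, param_length) matrix whose rows span the Krylov space;
    # row 0 is assumed to already have unit norm.
    A = A.copy()
    for idx in range(A.shape[0] - 1):
        # projection of every later row onto row idx
        weight = A[idx + 1:].dot(A[idx])
        A[idx + 1:] -= weight[:, None] * A[idx][None, :]
        # renormalize the next row, as the scan does with x_col
        A[idx + 1] /= numpy.sqrt((A[idx + 1] ** 2).sum() + eps)
    return A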
Example #37
0
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1

    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']
    rng = numpy.random.RandomState(state['seed'])
    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']

    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(rng,
                     n_in=state['nins'],
                     n_hids=[state['rank_n_approx']],
                     activation=[state['rank_n_activ']],
                     init_fn=state['weight_init_fn'],
                     weight_noise=state['weight_noise'],
                     scale=state['weight_scale'],
                     name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_%d' % si))
        if state['rec_gating']:
            gater_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_%d' % si))
        if state['rec_reseting']:
            reseter_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_%d' % si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_reseter_%d' % si))

        add_rec_step.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_%d' % si))

    def _add_op(words_embeddings,
                words_mask=None,
                prev_val=None,
                si=0,
                state_below=None,
                gater_below=None,
                reseter_below=None,
                one_step=False,
                bs=1,
                init_state=None,
                use_noise=True):
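        # Applies encoder recurrent layer `si` to the embedded words. Layers
        # above the first also receive a projection of the layer below
        # (`rec_proj`), plus gater/reseter contributions when gating or
        # reseting is enabled; `one_step` switches between full-sequence and
        # single-step application.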
        seqlen = words_embeddings.out.shape[0] // bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si - 1](state_below,
                                     one_step=one_step,
                                     use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si - 1](state_below,
                                               one_step=one_step,
                                               use_noise=use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if reseter: reseter += projg
                else: reseter = projg

        if not one_step:
            rval = add_rec_step[si](rval,
                                    nsteps=seqlen,
                                    batch_size=bs,
                                    mask=words_mask,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        else:
            rval = add_rec_step[si](rval,
                                    mask=words_mask,
                                    state_before=prev_val,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        return rval

    add_op = Operator(_add_op)

    # Target Sentence
    emb_t = MultiLayer(rng,
                       n_in=state['nouts'],
                       n_hids=[state['rank_n_approx']],
                       activation=[state['rank_n_activ']],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_t_%d' % si))
        if state['rec_gating']:
            gater_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_t_%d' % si))
        if state['rec_reseting']:
            reseter_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_t_%d' % si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='proj_everything_t_%d' % si,
                       learn_bias=False))
        if state['rec_gating']:
            gater_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='gater_everything_t_%d' % si,
                           learn_bias=False))
        if state['rec_reseting']:
            reseter_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='reseter_everything_t_%d' % si,
                           learn_bias=False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_t_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_reseter_%d' % si))

        add_rec_step_t.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_t_%d' % si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim'] * state['maxout_part']],
                           activation=['lambda x: x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='encoder_proj_%d' % si,
                           learn_bias=(si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']),
                                    indim=indim,
                                    pieces=pieces,
                                    rng=rng)

    def _add_t_op(words_embeddings,
                  everything=None,
                  words_mask=None,
                  prev_val=None,
                  one_step=False,
                  bs=1,
                  init_state=None,
                  use_noise=True,
                  gater_below=None,
                  reseter_below=None,
                  si=0,
                  state_below=None):
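        # Decoder counterpart of _add_op: same stacking and gating logic,
        # but additionally mixes the encoder context (`everything`) into the
        # input, gater and reseter of each decoder layer.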
        seqlen = words_embeddings.out.shape[0] // bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si - 1](state_below,
                                       one_step=one_step,
                                       use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si - 1](state_below,
                                                   one_step=one_step,
                                                   use_noise=use_noise)
                if reseter: reseter += projg
                else: reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything,
                                                one_step=one_step,
                                                use_noise=use_noise)
                if gater: gater += everyg
                else: gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything,
                                                  one_step=one_step,
                                                  use_noise=use_noise)
                if reseter: reseter += everyg
                else: reseter = everyg

        if not one_step:
            rval = add_rec_step_t[si](rval,
                                      nsteps=seqlen,
                                      batch_size=bs,
                                      mask=words_mask,
                                      one_step=one_step,
                                      init_state=init_state,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        else:
            rval = add_rec_step_t[si](rval,
                                      mask=words_mask,
                                      state_before=prev_val,
                                      one_step=one_step,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        return rval

    add_t_op = Operator(_add_t_op)

    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=[state['activ']],
                           bias_scale=[state['bias']],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           name='bias_code_%d' % si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(rng,
                               n_in=word_code_nin,
                               n_hids=[outdim],
                               activation='lambda x:x',
                               bias_scale=[state['bias_mlp'] / 3],
                               scale=state['weight_scale'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               learn_bias=False,
                               name='word_code')

    proj_code = MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[outdim],
                           activation='lambda x: x',
                           bias_scale=[state['bias_mlp'] / 3],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           learn_bias=False,
                           name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[outdim],
                       activation='lambda x: x',
                       bias_scale=[state['bias_mlp'] / 3],
                       scale=state['weight_scale'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       name='proj_h_%d' % si))

    if state['bigram']:
        proj_word = MultiLayer(rng,
                               n_in=state['rank_n_approx'],
                               n_hids=[outdim],
                               activation=['lambda x:x'],
                               bias_scale=[state['bias_mlp'] / 3],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(rng,
                                indim,
                                state['nouts'],
                                state['weight_scale'],
                                -1,
                                rank_n_approx=rank_n_approx,
                                rank_n_activ=rank_n_activ,
                                weight_noise=state['weight_noise'],
                                init_fn=state['weight_init_fn'],
                                name='out')

    def _pop_op(everything,
                accum,
                everything_max=None,
                everything_min=None,
                word=None,
                aword=None,
                one_step=False,
                use_noise=True):

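        # Output read-out: combines the projected decoder states with the
        # encoder context (multiplicatively or additively, per `mult_out`),
        # optionally adds average-word and bigram (previous-word) features,
        # and finishes with the deep-output activation/dropout stack when
        # `deep_out` is enabled.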
        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1, state['decoder_stack']):
            rval += proj_h[si](accum[si],
                               one_step=one_step,
                               use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape(
                    [rshape[0] / shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise),
                                  one_step=one_step,
                                  use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1],
                                                   outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    pop_op = Operator(_pop_op)

    # 3. Constructing the model
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x)),
               x_mask,
               bs=x_mask.shape[1],
               si=0,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x)),
                   x_mask,
                   bs=x_mask.shape[1],
                   si=si,
                   state_below=encoder_acts[-1],
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True, n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape(
            [1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True, n=y.shape[0])(everything)

    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        init_state = [None] * state['decoder_stack']

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape(
            [shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [
        add_t_op(emb_words_t[0](emb_t(y0)),
                 everything,
                 y_mask,
                 bs=y_mask.shape[1],
                 gater_below=gater_below,
                 reseter_below=reseter_below,
                 init_state=init_state[0],
                 si=0)
    ]
    for si in xrange(1, state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(
            add_t_op(emb_words_t[si](emb_t(y0)),
                     everything,
                     y_mask,
                     bs=y_mask.shape[1],
                     state_below=has_said[-1],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     init_state=init_state[si],
                     si=si))

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword=aword)

    nll = output_layer.train(
        state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast(
            y.shape[0] * y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x), use_noise=False),
               si=0,
               use_noise=False,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x), use_noise=False),
                   si=si,
                   state_below=encoder_acts[-1],
                   use_noise=False,
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]),
                                           use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(
                TT.reshape(bias_code[si](everything, use_noise=False),
                           [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x, use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

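    # One step of ancestral sampling: unpack the previous word, the
    # per-layer decoder states and the context from `args`, draw a sample
    # and its log-probability from the output layer, then advance every
    # decoder layer by one step.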
    def sample_fn(*args):
        aidx = 0
        word_tm1 = args[aidx]
        aidx += 1
        prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1
            has_said_tm1.append(args[aidx])
        aidx += 1
        ctx = args[aidx]
        awrd = None
        if state['avg_word']:
            aidx += 1
            awrd = args[aidx]

        val = pop_op(proj_code(ctx),
                     has_said_tm1,
                     word=word_tm1,
                     aword=awrd,
                     one_step=True,
                     use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(
            state_below=val.out.reshape(
                [1, TT.cast(output_layer.n_in, 'int64')]),
            temp=temp,
            target=sample.reshape([1, 1]),
            use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [
            add_t_op(emb_words_t[0](emb_t(sample)),
                     ctx,
                     prev_val=has_said_tm1[0],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     one_step=True,
                     use_noise=True,
                     si=0)
        ]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(
                add_t_op(emb_words_t[si](emb_t(sample)),
                         ctx,
                         prev_val=has_said_tm1[si],
                         gater_below=gater_below,
                         reseter_below=reseter_below,
                         one_step=True,
                         use_noise=True,
                         si=si,
                         state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
                            states=states,
                            params=sampler_params,
                            n_steps=n_steps,
                            name='sampler_scan')
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function([n_steps, temp, x],
                                [samples, probs.sum()],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    model = LM_Model(cost_layer=nll,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     sample_fn=sample_fn,
                     clean_before_noise_fn=False,
                     noise_fn=noise_fn,
                     indx_word=state['indx_word_target'],
                     indx_word_src=state['indx_word'],
                     character_level=False,
                     rng=rng)

    if state['loopIters'] > 0: algo = SGD(model, state, train_data)
    else: algo = None

    def hook_fn():
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:, idx].shape[0]):
                        print model.word_indxs_src[x[:, idx][k]],
                        if model.word_indxs_src[x[:, idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:, idx].shape[0]):
                        print model.word_indxs[y[:, idx][k]],
                        if model.word_indxs[y[:, idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:, idx])
                    if len(numpy.where(masks[:, idx] == 0)[0]) > 0:
                        senlen = numpy.where(masks[:, idx] == 0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen'] + 1, 1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen'] + 1, 1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data,
                    valid_data,
                    None,
                    model,
                    algo,
                    state,
                    channel,
                    reset=state['reset'],
                    hooks=hook_fn)
    if state['reload']: main.load()
    if state['loopIters'] > 0: main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word = pkl.load(open(state['word_indx'], 'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen + 1, dtype='int64')
                    for idx, sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except KeyError:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    #import ipdb; ipdb.set_trace()
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'
            pass
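For reference, the sampler compiled above takes `(n_steps, temp, x)` and
returns the sampled target indices together with the summed log-probability,
which is exactly how the test loop calls it. A hypothetical standalone
invocation (the token ids here are made up for illustration):

seq = numpy.asarray([42, 7, state['null_sym_source']], dtype='int64')
values, logp = model.sample_fn(3 * len(seq), 1.0, seq)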
Example #38
0
def linear_cg(compute_Ax,
              b,
              M=None,
              xinit=None,
              rtol=1e-16,
              maxiter=100000,
              damp=0.,
              floatX=None):
    """
    Solves the system A x[i] = b[i], for all i.
    
    When used as part of a Newton-CG method, b is a list of gradients, where each element of
    this list represents a gradient for a given parameter type (i.e. weight or bias of a given
    layer). This method will return a list whose elements approximates A^{-1} b[i], with the
    precision determined by maxiter or the specified tolerance level. This particular
    version implements the Polak-Ribiere flavor of CG.

    Parameters:
    :param compute_Ax: python function which symbolically computes the matrix-vector product.
    :param b: list of T.vector, corresponding to A x[i] = b[i]
    :param M: list of T.vector (same length as b). Each element is used to precondition its
    corresponding element of the A-diagonal. If [Mi for Mi in M] contains the diagonal elements
    of A, this will implement Jacobi preconditioning.
    :param xinit: list of T.vector (same length as b). x[i] is initial guess for A^{-1} b[i].
    :param rtol: float. CG will stop when the norm of the residual error < rtol.
    :param maxiter: int. Maximum allowable iterations for CG.
    :param damp: float. Damping factor, equivalent to adding a term along the diagonal of A.
    :param floatX: 'float32' or 'float64'.

    Return values:
    rval[0]: niter, number of iterations run by CG
    rval[1]: residual error norm.
    rval[2+i]: approximate value for A^{-1} b[i].

    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(b)

    def loop(niter, rkp_norm, *args):
        pk = args[:n_params]
        rk = args[n_params:2 * n_params]
        zk = args[2 * n_params:3 * n_params]
        xk = args[-n_params:]
        A_pk_temp = compute_Ax(*pk)
        A_pk = [
            A_pk_temp_ + damp * pk_ for A_pk_temp_, pk_ in zip(A_pk_temp, pk)
        ]
        alphak_num = sum((rk_ * zk_).sum() for rk_, zk_ in zip(rk, zk))
        alphak_denum = sum((A_pk_ * pk_).sum() for A_pk_, pk_ in zip(A_pk, pk))
        alphak = alphak_num / alphak_denum
        xkp1 = [xk_ + alphak * pk_ for xk_, pk_ in zip(xk, pk)]
        rkp1 = [rk_ - alphak * A_pk_ for rk_, A_pk_, in zip(rk, A_pk)]
        if M:
            zkp1 = [rkp1_ / m_ for rkp1_, m_ in zip(rkp1, M)]
        else:
            zkp1 = rkp1
        # compute beta_k using Polak-Ribiere
        betak_num = sum((zkp1_ * (rkp1_ - rk_)).sum()
                        for rkp1_, rk_, zkp1_ in zip(rkp1, rk, zkp1))
        betak_denum = alphak_num
        betak = betak_num / betak_denum
        pkp1 = [zkp1_ + betak * pk_ for zkp1_, pk_ in zip(zkp1, pk)]
        # compute termination criterion
        rkp1_norm = sum((rkp1_**2).sum() for rkp1_ in rkp1)
        return [niter + 1, rkp1_norm] + pkp1 + rkp1 + zkp1 + xkp1,\
               theano.scan_module.until(abs(rkp1_norm) < rtol)

    # Initialize residual based on xinit
    if xinit is None:
        r0_temp = b
        x0 = [
            tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(b_)))
            for b_ in b
        ]
    else:
        init_Ax = compute_Ax(*xinit)
        r0_temp = [b[i] - init_Ax[i] for i in xrange(len(b))]
        x0 = [
            tensor.unbroadcast(tensor.shape_padleft(xinit_))
            for xinit_ in xinit
        ]

    # Leftpad r0, z0 and p0 for scan.
    r0 = [
        tensor.unbroadcast(tensor.shape_padleft(r0_temp_))
        for r0_temp_ in r0_temp
    ]
    if M:
        z0 = [
            tensor.unbroadcast(tensor.shape_padleft(r0_temp_ / m_))
            for r0_temp_, m_ in zip(r0_temp, M)
        ]
    else:
        z0 = r0
    p0 = z0

    states = []
    # 0 niter
    states.append(tensor.constant(npy_floatX([0])))
    # 1 residual error norm
    states.append(tensor.constant(npy_floatX([0])))

    outs, updates = scan(loop,
                         states=states + p0 + r0 + z0 + x0,
                         n_steps=maxiter,
                         mode=theano.Mode(linker='c|py'),
                         name='linear_conjugate_gradient',
                         profile=0)
    sol = [x[0] for x in outs[-n_params:]]
    niter = outs[0][0]
    rerr = outs[1][0]
    return [sol, niter, rerr]
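A minimal NumPy sketch of the same Jacobi-preconditioned Polak-Ribiere CG
recurrences, checked on a small SPD system. `cg_sketch`, `A`, `b` and `M` are
illustrative names, not part of the code above.

import numpy as np

def cg_sketch(A, b, M=None, rtol=1e-10, maxiter=100):
    # mirrors the scan loop: alpha from the preconditioned residual,
    # beta via Polak-Ribiere, termination on the squared residual norm
    x = np.zeros_like(b)
    r = b - A.dot(x)
    z = r / M if M is not None else r
    p = z.copy()
    niter = 0
    for niter in range(maxiter):
        Ap = A.dot(p)
        rz = r.dot(z)                     # alphak_num above
        alpha = rz / p.dot(Ap)
        x = x + alpha * p
        r_new = r - alpha * Ap
        z_new = r_new / M if M is not None else r_new
        beta = z_new.dot(r_new - r) / rz  # Polak-Ribiere
        p = z_new + beta * p
        r, z = r_new, z_new
        if r.dot(r) < rtol:
            break
    return x, niter + 1, r.dot(r)

rng = np.random.RandomState(0)
Q = rng.randn(5, 5)
A = Q.dot(Q.T) + 5 * np.eye(5)            # small SPD test matrix
b = rng.randn(5)
x, niter, rerr = cg_sketch(A, b, M=np.diag(A).copy())  # Jacobi preconditioning
assert np.allclose(A.dot(x), b, atol=1e-4)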
Example #39
0
    def __init__(
            self,
            nhids=50,
            nouts=8,
            nins=2,
            activ=TT.nnet.sigmoid,
            seed=234,
            bs=16,  # batchsize
            seqlen=3  # sequence length - fixed during training
    ):
        # 0. Keep track of arguments
        self.bs = bs
        self.nhids = nhids
        self.nouts = nouts
        self.nins = nins
        self.activ = activ
        self.seed = seed
        self.seqlen = seqlen
        floatX = theano.config.floatX
        self.rng = numpy.random.RandomState(seed)

        # 1. Generating Theano variables
        # DenseSequence space
        # We store data as 3D tensor with (time, batch-size, nfeatures)
        self.x = TT.tensor3('x')
        # IndexSequence space
        # We store data as a 1D tensor whose single dimension goes over the
        # batch (i.e. the target of each sequence in the batch)
        self.t = TT.ivector('t')  # target index for each element of batchsize
        self.inputs = [self.x, self.t]
        # Naming convention for letters after the `_`:
        # u - input
        # h - hidden
        # y - output
        # f - forward
        # b - backwards

        self.W_uhf = numpy.asarray(self.rng.normal(size=(self.nins,
                                                         self.nhids),
                                                   loc=0,
                                                   scale=.01),
                                   dtype=floatX)
        self.W_uhb = numpy.asarray(self.rng.normal(size=(self.nins,
                                                         self.nhids),
                                                   loc=0,
                                                   scale=.01),
                                   dtype=floatX)
        self.W_hhf = numpy.asarray(self.rng.normal(size=(self.nhids,
                                                         self.nhids),
                                                   loc=0,
                                                   scale=1),
                                   dtype=floatX)
        self.W_hhb = numpy.asarray(self.rng.normal(size=(self.nhids,
                                                         self.nhids),
                                                   loc=0,
                                                   scale=1),
                                   dtype=floatX)
        self.W_hyf = numpy.asarray(self.rng.normal(size=(self.nhids,
                                                         self.nouts),
                                                   loc=0,
                                                   scale=.1),
                                   dtype=floatX)
        self.W_hyb = numpy.asarray(self.rng.normal(size=(self.nhids,
                                                         self.nouts),
                                                   loc=0,
                                                   scale=.1),
                                   dtype=floatX)
        # sparsifying hidden weights (Ilya&Martens formula == ESN style
        # init)
        for dx in xrange(self.nhids):
            psng = self.rng.permutation(nhids)
            self.W_hhf[dx][psng[15:]] = 0.
            psng = self.rng.permutation(nhids)
            self.W_hhb[dx][psng[15:]] = 0.

        # Any spectral radius larger than .9 and smaller than 1.1 should be fine
        sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhf)))
        self.W_hhf = numpy.float32(.97 * self.W_hhf / sr)
        sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhb)))
        self.W_hhb = numpy.float32(.97 * self.W_hhb / sr)
        self.b_hhf = numpy.zeros((nhids, ), dtype=floatX)
        self.b_hhb = numpy.zeros((nhids, ), dtype=floatX)
        self.b_hy = numpy.zeros((nouts, ), dtype=floatX)

        self.W_uhf = theano.shared(self.W_uhf, name='W_uhf')
        self.W_uhb = theano.shared(self.W_uhb, name='W_uhb')
        self.W_hhf = theano.shared(self.W_hhf, name='W_hhf')
        self.W_hhb = theano.shared(self.W_hhb, name='W_hhb')
        self.W_hyf = theano.shared(self.W_hyf, name='W_hyf')
        self.W_hyb = theano.shared(self.W_hyb, name='W_hyb')
        self.b_hhf = theano.shared(self.b_hhf, name='b_hhf')
        self.b_hhb = theano.shared(self.b_hhb, name='b_hhb')
        self.b_hy = theano.shared(self.b_hy, name='b_hy')

        self.params = [
            self.W_uhf, self.W_uhb, self.W_hhf, self.W_hhb, self.W_hyf,
            self.W_hyb, self.b_hhf, self.b_hhb, self.b_hy
        ]
        self.best_params = [(x.name, x.get_value()) for x in self.params]
        self.params_shape = [
            x.get_value(borrow=True).shape for x in self.params
        ]

        # 2. Constructing Theano graph
        # Note: new interface of scan asks the user to provide a memory
        # buffer that contains the initial state but which is also used
        # internally by scan to store the intermediate values of its
        # computations - hence the initial state is a 3D tensor
        h0_f = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1, self.bs,
                        self.nhids)
        h0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1, self.bs,
                        self.nhids)

        # Do we use too much memory!?
        p_hf = TT.dot(self.x.reshape(
            (self.seqlen * self.bs, self.nins)), self.W_uhf) + self.b_hhf
        p_hb = TT.dot(self.x[::-1].reshape(
            (self.seqlen * self.bs, self.nins)), self.W_uhb) + self.b_hhb

        def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1):
            hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t)
            hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t)
            return hf_t, hb_t

        # provide sequence length !? is better on GPU
        [h_f,
         h_b], _ = scan(recurrent_fn,
                        sequences=[
                            p_hf.reshape((self.seqlen, self.bs, self.nhids)),
                            p_hb.reshape((self.seqlen, self.bs, self.nhids))
                        ],
                        states=[h0_f, h0_b],
                        n_steps=self.seqlen,
                        name='bi-RNN',
                        profile=0)
        h_b = h_b[::-1]
        # Optionally do the max over hidden layer !?
        # I'm afraid the semantics for RNN are somewhat different than MLP
        y = TT.nnet.softmax(
            TT.dot(h_f.reshape((self.seqlen * self.bs + self.bs, self.nhids
                                )), self.W_hyf) +  # Check doc flatten
            TT.dot(h_b.reshape((self.seqlen * self.bs + self.bs,
                                self.nhids)), self.W_hyb) + self.b_hy)
        my = y.reshape((self.seqlen + 1, self.bs, self.nouts)).max(axis=0)
        nll = -TT.log(my[TT.constant(numpy.arange(self.bs)), self.t])
        self.train_cost = nll.mean()
        self.error = TT.mean(TT.neq(my.argmax(axis=1), self.t) * 100.)
        ## |-----------------------------
        # - Computing metric times a vector efficiently for p(y|x)
        # Assume softmax .. we might want sigmoids though
        self.Gyvs = lambda *args:\
            TT.Lop(y, self.params,
                   TT.Rop(y, self.params, args) /\
                   (y*numpy.array(self.bs, dtype=floatX)))
        # Computing metric times a vector efficiently for p(h|x)
        if activ == TT.nnet.sigmoid:
            fn = lambda x: (1 - x) * x * numpy.array(self.bs, dtype=floatX)
        elif activ == TT.tanh:
            # derivative of tanh expressed via its output: 1 - x**2
            fn = lambda x: (1 - x ** 2) * numpy.array(self.bs, dtype=floatX)
        else:  # Assume linear or piece-wise linear activation
            fn = lambda x: numpy.array(self.bs, dtype=floatX)
        self.Ghfvs = lambda *args:\
                TT.Lop(h_f, self.params,
                       TT.Rop(h_f, self.params, args) / fn(h_f))
        self.Ghbvs = lambda *args:\
                TT.Lop(h_b, self.params,
                       TT.Rop(h_b, self.params, args) / fn(h_b))
        ## ------------------ |

        vx = TT.matrix('vx')
        vt = TT.iscalar('vt')
        vh0_f = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1,
                         self.nhids)
        vh0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1,
                         self.nhids)

        # Do we use too much memory!?
        vp_hf = TT.dot(vx, self.W_uhf) + self.b_hhf
        vp_hb = TT.dot(vx[::-1], self.W_uhb) + self.b_hhb

        def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1):
            hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t)
            hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t)
            return hf_t, hb_t

        # provide sequence length !? is better on GPU
        [vh_f, vh_b], _ = scan(recurrent_fn,
                               sequences=[vp_hf, vp_hb],
                               states=[vh0_f, vh0_b],
                               name='valid bi-RNN',
                               n_steps=vp_hf.shape[0],
                               profile=0)
        vh_b = vh_b[::-1]
        # Optionally do the max over hidden layer !?
        # I'm afraid the semantics for RNN are somewhat different than MLP
        vy = TT.nnet.softmax(
            TT.dot(vh_f, self.W_hyf) + TT.dot(vh_b, self.W_hyb) + self.b_hy)
        my = TT.neq(vy.max(axis=0).argmax(), vt)
        self.validate = theano.function([vx, vt],
                                        my,
                                        name='validation',
                                        profile=0)
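The sparsify-then-rescale initialization above (15 incoming weights per hidden
unit, spectral radius pushed just below 1, in the ESN style attributed to
Sutskever & Martens) can be checked in isolation; a standalone sketch with an
assumed `nhids`:

import numpy as np

rng = np.random.RandomState(234)
nhids = 50
W = rng.normal(size=(nhids, nhids), loc=0, scale=1).astype('float32')
for dx in range(nhids):
    # keep only 15 nonzero entries per row
    W[dx][rng.permutation(nhids)[15:]] = 0.
sr = np.max(abs(np.linalg.eigvals(W)))    # spectral radius
W = np.float32(.97 * W / sr)
assert abs(np.max(abs(np.linalg.eigvals(W))) - .97) < 1e-2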
Example #40
0
    def init_gpu(self, options, channel, data, model):
        # Step 1. Compile function for computing Euclidean gradients
        eps = numpy.float32(1e-24)
        gbdx = TT.iscalar('grad_batch_idx')
        n_params = len(self.model.params)
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            # Compute the Jacobi preconditioner (sum of squared Jacobian-vector products)
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out+eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout * (numpy.float32(1) -
                                               tnwout))*factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in model.params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1+n_params:1+2*n_params], model.params)]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        nw_js = [x[0] for x in rvals[1+n_params:1+2*n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            allow_input_downcast=True,
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for Computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs'])# * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates
        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        self.damping = theano.shared(numpy.float32(options['mreg']))
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))

        reset = TT.scalar(dtype='int8', name='reset')

        norm_kkm1 = sum([(r*g).sum() for r,g in zip(self.rs, self.gs)])
        norm_kk = sum([(r*g).sum() for r,g in zip(nw_rs, self.gs)])
        norm_dk = sum([(d*g).sum() for d,g in zip(self.ds, self.gs)])

        norm_y = norm_kk - 2*norm_kkm1 + self.norm_km1km1
        beta_k = (norm_kk - norm_kkm1)/(norm_dk - self.norm_dkm1) - \
                2 * norm_y * (norm_dk/((norm_dk - self.norm_dkm1) **2))
        beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)),
                           beta_k)
        beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k),
                                         TT.isinf(beta_k)),
                           TT.constant(numpy.float32(0.)),
                           beta_k)

        nwds = [-r + beta_k*d for r,d in zip(nw_rs, self.ds)]
        self.nwds = nwds
        nw_normd = TT.sqrt(sum([(d*d).sum() for d in nwds])) + \
                numpy.float32(1e-25)

        updates.update(dict(zip(self.rs, nw_rs)))
        updates.update(dict(zip(self.ds, nwds)))
        updates[self.norm_km1km1] = norm_kk
        updates[self.norm_dkm1] = norm_dk
        updates[self.norm_d] = nw_normd
        print 'Compiling riemannian gradient function'
        cst = time.time()
        grad_inps = [(x, y[rbdx*options['mbs']:(rbdx+1)*options['mbs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        self.compute_riemannian_gradients = theano.function(
            [reset, rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0,
             beta_k],
            updates=updates,
            allow_input_downcast = True,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
        nw_ds = [ -r for r in self.rs]
        nw_normd = TT.sqrt(sum([(r*r).sum() for r in self.rs]))
        self.update_params = theano.function([lr],
                                             updates = dict(zip(model.params,
                                                                newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=gpu_mode,
                                             profile=options['profile'])
        self.reset_directions = theano.function([],
                                                updates=dict(zip(self.ds +
                                                                 [self.norm_d],
                                                                 nw_ds +
                                                                 [nw_normd])),
                                                name='reset_dirs',
                                                on_unused_input='warn',
                                                mode=cpu_mode,
                                                allow_input_downcast=True,
                                                profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']
        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_cost_step,
                        states = states,
                        n_steps = n_steps,
                        name='ls_cost_step',
                        profile = options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_grad_step,
                        states = states,
                        n_steps = n_steps,
                        name = 'ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        ebdx = TT.iscalar('ebdx')
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        self.ls_cost_fn = theano.function(
            [lr, ebdx],
            fcost,
            givens = grad_inps,
            allow_input_downcast=True,
            name='ls_cost_fn',
            mode=gpu_mode,
            profile=options['profile'])

        self.approx_change = theano.function(
                [lr],
                -lr*sum([TT.sum(g*r) for g,r in zip(self.gs, self.ds)]),
                allow_input_downcast=True,
                name='approx_change',
                mode=gpu_mode,
                profile=options['profile'])


        self.ls_grad_fn = theano.function(
            [lr, ebdx],
            fgrad,
            allow_input_downcast=True,
            givens = grad_inps,
            name='ls_grad_fn',
            mode=gpu_mode,
            profile=options['profile'])

        self.old_score = 50000
        n_steps = options['ebs']// options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                           ferr,
                           givens=dict(grad_inps),
                           name='compute_err',
                           mode=cpu_mode,
                           allow_input_downcast=True,
                           on_unused_input='warn',
                           profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)
        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
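Both `Gyvs` in the bi-RNN model and `compute_Gv` above realize the same
Gauss-Newton/Fisher metric-vector product G v = J^T diag(1 / (N * y)) J v
through Rop/Lop. A dense NumPy rendering with an explicit Jacobian
(`fisher_vec` and all shapes are illustrative, not part of the code above):

import numpy as np

def fisher_vec(J, y, v, N):
    # J: (n_out, n_params) Jacobian of the softmax output w.r.t. params
    # y: (n_out,) softmax output; v: (n_params,) vector; N: batch size
    Jv = J.dot(v)                  # R-operator: directional derivative
    return J.T.dot(Jv / (y * N))   # L-operator: weighted pullback

rng = np.random.RandomState(0)
J = rng.randn(8, 20)
y = rng.dirichlet(np.ones(8))      # a valid softmax output
v = rng.randn(20)
G = J.T.dot(np.diag(1. / (y * 16))).dot(J)   # the same metric, densely
assert np.allclose(fisher_vec(J, y, v, N=16), G.dot(v))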
Example #41
0
File: SGD.py Project: vd114/galatea
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object return by numpy.load containing the
                data
            model : model
        """
        self.model = model
        # push dataset into shared var
        n_params = len(model.params)
        xdata = theano.shared(data['train_x'].astype('float32'), name='xdata')
        # ! This works for 1 of k classification
        ydata = TT.cast(
            theano.shared(data['train_y'].astype('float32'), name='ydata'),
            'int32')

        shared_data = [xdata, ydata]
        self.xdata = xdata
        self.ydata = ydata
        # all sorts of indices
        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # vars for gradients
        # Store Euclidean gradients
        self.gs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store riemannian gradients (H^-1*g)
        self.rs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        updates.update(dict(zip(self.gs, nw_gs)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            on_unused_input='warn',
            name='compute_eucledian_gradients',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * g for p, g in zip(model.params, self.gs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              profile=options['profile'])

        final_cost = rvals[1][0] / const(n_steps)
        update_vals = dict(zip(model.params, nw_ps))
        #updates.update(dict(zip(model.params, nw_ps)))
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, shared_data)]

        print 'compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       updates=updates,
                                       on_unused_input='warn',
                                       name='eval_fn',
                                       mode=theano.Mode(linker='cvm'),
                                       profile=options['profile'])
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_vals,
            on_unused_input='warn',
            #givens=dict(grad_inps),
            name='update_params',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])
        self.options = options
        self.old_cost = 1e6

        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc, acc_train_cost):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            train_cost = TT.cast(safe_clone(model.train_cost, replace=replace),
                                 'float32')
            return [
                _idx + const(1), acc + nw_cost, acc_train_cost + train_cost
            ]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=theano.Mode(linker='cvm'),
                        profile=options['profile'])

        ferr = rvals[1][0] / const(n_steps)
        ftrain_cost = rvals[2][0] / const(n_steps)

        self.compute_error = theano.function([ebdx], [ferr, ftrain_cost],
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             on_unused_input='warn',
                                             mode=theano.Mode(linker='cvm'),
                                             profile=options['profile'])
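The `grad_step` scan above amounts to averaging per-chunk gradients over a
large batch, `cbs` samples at a time; the equivalent plain-Python pattern,
with hypothetical `grad_fn` and `inputs`:

def accumulate_gradients(grad_fn, inputs, gbs, cbs):
    # grad_fn(*chunk) -> list of per-parameter gradient arrays
    n_steps = gbs // cbs
    acc = None
    for idx in range(n_steps):
        chunk = [x[idx * cbs:(idx + 1) * cbs] for x in inputs]
        gs = grad_fn(*chunk)
        acc = gs if acc is None else [a + g for a, g in zip(acc, gs)]
    return [a / n_steps for a in acc]    # matches nw_gs = x[0] / const(n_steps)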
Example #42
0
    def __init__(self,
                 nhids =50,
                 nouts = 8,
                 nins = 2,
                 activ = TT.nnet.sigmoid,
                 seed = 234,
                 bs = 16, # batchsize
                 seqlen = 3 # sequence length - fixed during training
                ):
        # 0. Keep track of arguments
        self.bs = bs
        self.nhids = nhids
        self.nouts = nouts
        self.nins = nins
        self.activ = activ
        self.seed = seed
        self.seqlen = seqlen
        floatX = theano.config.floatX
        self.rng = numpy.random.RandomState(seed)

        # 1. Generating Theano variables
        # DenseSequence space
        # We store data as 3D tensor with (time, batch-size, nfeatures)
        self.x = TT.tensor3('x')
        # IndexSequence space
        # We store data as a 1D tensor whose single dimension goes over the
        # batch (i.e. the target of each sequence in the batch)
        self.t = TT.ivector('t') # target index for each element of batchsize
        self.inputs = [self.x, self.t]
        # Naming convention for letters after the `_`:
        # u - input
        # h - hidden
        # y - output
        # f - forward
        # b - backwards

        self.W_uhf = numpy.asarray(
            self.rng.normal(size=(self.nins, self.nhids), loc=0, scale=.01),
            dtype=floatX)
        self.W_uhb = numpy.asarray(
            self.rng.normal(size=(self.nins, self.nhids), loc=0, scale=.01),
            dtype=floatX)
        self.W_hhf = numpy.asarray(
            self.rng.normal(size=(self.nhids, self.nhids), loc=0, scale=1),
            dtype=floatX)
        self.W_hhb = numpy.asarray(
            self.rng.normal(size=(self.nhids, self.nhids), loc=0, scale=1),
            dtype=floatX)
        self.W_hyf = numpy.asarray(
            self.rng.normal(size=(self.nhids, self.nouts), loc=0, scale=.1),
            dtype=floatX)
        self.W_hyb = numpy.asarray(
            self.rng.normal(size=(self.nhids, self.nouts), loc=0, scale=.1),
            dtype=floatX)
        # sparsifying hidden weights (Ilya&Martens formula == ESN style
        # init)
        for dx in xrange(self.nhids):
            psng = self.rng.permutation(nhids)
            self.W_hhf[dx][psng[15:]] = 0.
            psng = self.rng.permutation(nhids)
            self.W_hhb[dx][psng[15:]] = 0.

        # Any spectral radius larger than .9 and smaller than 1.1 should be fine
        sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhf)))
        self.W_hhf = numpy.float32(.97*self.W_hhf/sr)
        sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhb)))
        self.W_hhb = numpy.float32(.97*self.W_hhb/sr)
        self.b_hhf = numpy.zeros((nhids,), dtype=floatX)
        self.b_hhb = numpy.zeros((nhids,), dtype=floatX)
        self.b_hy = numpy.zeros((nouts,), dtype=floatX)

        self.W_uhf = theano.shared(self.W_uhf, name='W_uhf')
        self.W_uhb = theano.shared(self.W_uhb, name='W_uhb')
        self.W_hhf = theano.shared(self.W_hhf, name='W_hhf')
        self.W_hhb = theano.shared(self.W_hhb, name='W_hhb')
        self.W_hyf = theano.shared(self.W_hyf, name='W_hyf')
        self.W_hyb = theano.shared(self.W_hyb, name='W_hyb')
        self.b_hhf = theano.shared(self.b_hhf, name='b_hhf')
        self.b_hhb = theano.shared(self.b_hhb, name='b_hhb')
        self.b_hy = theano.shared(self.b_hy, name='b_hy')

        self.params = [self.W_uhf, self.W_uhb, self.W_hhf, self.W_hhb,
                       self.W_hyf, self.W_hyb, self.b_hhf, self.b_hhb,
                       self.b_hy]
        self.best_params = [(x.name, x.get_value()) for x in self.params]
        self.params_shape = [x.get_value(borrow=True).shape for x in
                             self.params]

        # 2. Constructing Theano graph
        # Note: new interface of scan asks the user to provide a memory
        # buffer that contains the initial state but which is also used
        # internally by scan to store the intermediate values of its
        # computations - hence the initial state is a 3D tensor
        h0_f = TT.alloc(numpy.array(0,dtype=floatX), self.seqlen+1, self.bs,
                              self.nhids)
        h0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen+1, self.bs,
                               self.nhids)

        # Do we use too much memory!?
        p_hf = TT.dot(self.x.reshape((self.seqlen*self.bs, self.nins)), self.W_uhf) + self.b_hhf
        p_hb = TT.dot(self.x[::-1].reshape((self.seqlen*self.bs, self.nins)), self.W_uhb) + self.b_hhb

        def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1):
            hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t)
            hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t)
            return hf_t, hb_t
        # provide sequence length !? is better on GPU
        [h_f, h_b], _ = scan(
            recurrent_fn,
            sequences = [
                p_hf.reshape((self.seqlen, self.bs, self.nhids)),
                p_hb.reshape((self.seqlen, self.bs, self.nhids))],
            states = [h0_f, h0_b],
            n_steps = self.seqlen,
            name = 'bi-RNN',
            profile = 0)
        h_b = h_b[::-1]
        # Optionally do the max over hidden layer !?
        # I'm afraid the semantics for RNN are somewhat different than MLP
        y = TT.nnet.softmax(
            TT.dot(h_f.reshape((self.seqlen * self.bs+self.bs, self.nhids)), self.W_hyf) + # Check doc flatten
            TT.dot(h_b.reshape((self.seqlen * self.bs+self.bs, self.nhids)), self.W_hyb) +
            self.b_hy)
        my = y.reshape((self.seqlen+1, self.bs, self.nouts)).max(axis=0)
        nll = -TT.log(
            my[TT.constant(numpy.arange(self.bs)), self.t])
        self.train_cost = nll.mean()
        self.error = TT.mean(TT.neq(my.argmax(axis=1), self.t) * 100.)
        ## |-----------------------------
        # - Computing metric times a vector efficiently for p(y|x)
        # Assume softmax .. we might want sigmoids though
        self.Gyvs = lambda *args:\
            TT.Lop(y, self.params,
                   TT.Rop(y, self.params, args) /\
                   (y*numpy.array(self.bs, dtype=floatX)))
        # Computing metric times a vector efficiently for p(h|x)
        if activ == TT.nnet.sigmoid:
            fn = lambda x : (1-x)*x*numpy.array(self.bs, dtype=floatX)
        elif activ == TT.tanh:
            # derivative of tanh expressed via its output: 1 - x**2
            fn = lambda x: (1 - x ** 2) * numpy.array(self.bs, dtype=floatX)
        else: # Assume linear or piece-wise linear activation
            fn = lambda x: numpy.array(self.bs, dtype=floatX)
        self.Ghfvs = lambda *args:\
                TT.Lop(h_f, self.params,
                       TT.Rop(h_f, self.params, args) / fn(h_f))
        self.Ghbvs = lambda *args:\
                TT.Lop(h_b, self.params,
                       TT.Rop(h_b, self.params, args) / fn(h_b))
        ## ------------------ |

        vx = TT.matrix('vx')
        vt = TT.iscalar('vt')
        vh0_f = TT.alloc(numpy.array(0,dtype=floatX), self.seqlen+1, self.nhids)
        vh0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen+1, self.nhids)

        # Do we use too much memory!?
        vp_hf = TT.dot(vx, self.W_uhf) + self.b_hhf
        vp_hb = TT.dot(vx[::-1], self.W_uhb) + self.b_hhb

        def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1):
            hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t)
            hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t)
            return hf_t, hb_t
        # provide sequence length !? is better on GPU
        [vh_f, vh_b], _ = scan(
            recurrent_fn,
            sequences = [vp_hf, vp_hb],
            states = [vh0_f, vh0_b],
            name = 'valid bi-RNN',
            n_steps = vp_hf.shape[0],
            profile = 0)
        vh_b = vh_b[::-1]
        # Optionally do the max over hidden layer !?
        # I'm afraid the semantics for RNN are somewhat different than MLP
        vy = TT.nnet.softmax(
            TT.dot(vh_f, self.W_hyf) +
            TT.dot(vh_b, self.W_hyb) +
            self.b_hy)
        my = TT.neq(vy.max(axis=0).argmax(), vt)
        self.validate = theano.function([vx, vt], my,
                                        name='validation',
                                        profile=0)
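The readout used by both bi-RNN examples takes, per class, the maximum softmax
probability over time and trains the negative log-likelihood of the target
class on that maximum; a NumPy rendering with assumed shapes:

import numpy as np

rng = np.random.RandomState(0)
seqlen, bs, nouts = 4, 3, 8
# (time, batch, class) probabilities, each row a valid softmax output
y = rng.dirichlet(np.ones(nouts), size=(seqlen, bs))
t = rng.randint(nouts, size=bs)           # target class per sequence
my = y.max(axis=0)                        # max over time -> (bs, nouts)
nll = -np.log(my[np.arange(bs), t])
train_cost = nll.mean()
error = np.mean(my.argmax(axis=1) != t) * 100.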
Example #43
0
def minres(compute_Av,
           bs,
           rtol=numpy.float32(1e-6),
           maxit=20,
           Ms=None,
           shift=numpy.float32(0.),
           maxxnorm=numpy.float32(1e15),
           Acondlim=numpy.float32(1e16),
           mode=None,
           profile=0):
    """
     DESCRIPTION:
         minres attempts to find the minimum-length and minimum-residual-norm
         solution x to the system of linear equations A*x = b or
         least squares problem min||Ax-b||.  The n-by-n coefficient matrix A
         must be symmetric (but need not be positive definite or invertible).
         The right-hand-side column vector b must have length n.

     INPUTS:
        :param compute_Av: callable returning the symbolic expression for
            `Av`. `v` can be a set of parameters
        :param bs: list of Theano expressions. We want to compute
            A^{-1} \cdot bs
        :param rtol: Optional, real, specifies the tolerance of the method.
            Default is 1e-6
        :param maxit: Optional, positive integer, specifies the maximum number of
            iterations. Default is 20
        :param Ms: List of theano expression of same shape as `bs`. The
            method uses these to precondition with diag(Ms)
        :param shift: Optional, scalar, real or complex. Default is 0.
            Effectively solve the system (A + shift I) * x = b.
        :param maxxnorm: real positive, maximum bound on NORM(x).
            Default is 1e15.
        :param Acondlim: real positive, maximum bound on COND(A).
            Default is 1e16.

     OUTPUTS:
        x       n-vector, estimated solution
        flag    integer, convergence flag
               -1  beta2 = 0.  If M = I, b and x are eigenvectors.
                0 beta1 = 0.  The exact solution is  x = 0.
                1 A solution to (poss. singular) Ax = b found, given rtol.
                2 Pseudoinverse solution for singular LS problem, given rtol.
                3 A solution to (poss. singular) Ax = b found, given eps.
                4 Pseudoinverse solution for singular LS problem, given eps.
                5 x has converged to an eigenvector.
                6 xnorm has exceeded maxxnorm.
                7 Acond has exceeded Acondlim.
                8 The iteration limit was reached.
                9 It is a least squares problem but no converged solution yet.
        iter    integer, iteration number at which x was computed: 0 <= iter <= maxit.
        relres  real positive, the relative residual is defined as
                     NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)),
                computed recurrently here.  If flag is 1 or 3,  relres <= TOL.
        relAres real positive, the relative NORM(Ar) := NORM(Ar) / NORM(A),
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of matrix 2-norm of A.
        Acond   real positive, estimate of condition number of A with
                respect to 2-norm.
        xnorm   real non-negative, recurrently computed NORM(x).
        Axnorm  real non-negative, recurrently computed NORM(A * x).

    EXAMPLE 1:
         n = 100; on = ones(n,1); A = spdiags([-2*on 4*on -2*on],-1:1,n,n);
         b = sum(A,2); rtol = 1e-10; maxit = 50; M = spdiags(4*on,0,n,n);
         x = minresSOL69(A, b, rtol, maxit, M);

         Use this matrix-vector product function
            function y = afun(x,n)
            y = 4 * x;
            y(2:n) = y(2:n) - 2 * x(1:n-1);
            y(1:n-1) = y(1:n-1) - 2 * x(2:n);
         as input to minresSOL69
            x1 = minresSOL69(@afun, b, rtol, maxit, M);

     EXAMPLE 2: A is Laplacian on a 50 by 50 grid, singular and indefinite.
          n = 50; N = n^2; on=ones(n,1);   B = spdiags([on on on], -1:1, n, n);
          A = sparse([],[],[],N,N,(3*n-2)^2);
          for i=1:n
              A((i-1)*n+1:i*n,(i-1)*n+1:i*n) = B;
              if i*n+1 < n*n, A(i*n+1:(i+1)*n,(i-1)*n+1:i*n)=B; end;
              if (i-2)*n+1 > 0  A((i-2)*n+1:(i-1)*n,(i-1)*n+1:i*n)=B;  end;
          end
          b = sum(A,2);   rtol = 1e-5;   maxxnorm = 1e2;
          shift = 0;   Acondlim = [];   show = 1;   M = [];
          x = minresSOL69( A, b, rtol, N, M, shift, maxxnorm, Acondlim, show);

     EXAMPLE 3: A is diagonal, singular and indefinite.
          h = 1;  a = -10; b = -a; n = 2*b/h + 1;
          A = spdiags((a:h:b)', 0, n, n);
          b = ones(n,1);   rtol = 1e-6;   maxxnorm = 1e2;
          shift = 0;   Acondlim = [];   show = 1;   M = [];
          x = minresSOL69( A, b, rtol, N, M, shift, maxxnorm, Acondlim, show);



     REFERENCES:
        Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006.
             http://www.stanford.edu/group/SOL/software.html

    """

    if not isinstance(bs, (tuple, list)):
        bs = [bs]
        return_as_list = False
    else:
        bs = list(bs)
        return_as_list = True

    eps = numpy.float32(1e-23)

    # Initialise
    flag = theano.shared(numpy.float32(0.))
    beta1 = norm(bs)

    #------------------------------------------------------------------
    # Set up p and v for the first Lanczos vector v1.
    # p  =  beta1 P' v1,  where  P = C**(-1).
    # v is really P' v1.
    #------------------------------------------------------------------
    r3s = [b for b in bs]
    r2s = [b for b in bs]
    r1s = [b for b in bs]
    if Ms is not None:
        r3s = [b / m for b, m in zip(bs, Ms)]
        beta1 = norm(r3s, bs)
    #------------------------------------------------------------------
    ## Initialize other quantities.
    # Note that Anorm has been initialized by IsOpSym6.
    # ------------------------------------------------------------------
    bnorm = beta1
    n_params = len(bs)

    def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn,
             Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm,
             relrnorm, relArnorml, Anorm, flag, *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params:1 * n_params]
        r1s = args[1 * n_params:2 * n_params]
        r2s = args[2 * n_params:3 * n_params]
        r3s = args[3 * n_params:4 * n_params]
        dls = args[4 * n_params:5 * n_params]
        ds = args[5 * n_params:6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)
        r3s = [r3 + shift * v for r3, v in zip(r3s, vs)]
        r3s = [
            TT.switch(TT.ge(niter, numpy.float32(1.)),
                      r3 - (beta / betal) * r1, r3)
            for r3, r1 in zip(r3s, r1s)
        ]

        alpha = sqnorm(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = norm(r2s, r3s)
        else:
            betan = norm(r3s)
        pnorml = pnorm
        pnorm = TT.switch(
            TT.eq(niter, numpy.float32(0.)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [
            TT.switch(TT.neq(gamma, numpy.float32(0.)),
                      (v - epln * dl2 - dlta * dl) / gamma, v)
            for v, dl2, dl in zip(vs, dl2s, dls)
        ]
        d_norm = TT.switch(TT.neq(gamma, numpy.float32(0.)), norm(ds),
                           TT.constant((numpy.float32(numpy.inf))))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = norm(xs)
        xs = [
            TT.switch(TT.ge(xnorm, maxxnorm), dl2, x)
            for dl2, x in zip(dl2s, xs)
        ]

        flag = TT.switch(TT.ge(xnorm, maxxnorm), numpy.float32(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, numpy.float32(6.))
        Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, norm(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(
            flag_no_6,
            TT.switch(
                TT.eq(niter, numpy.float32(0.)),
                TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                TT.sqrt(
                    TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) +
                    TT.sqr(betan))), Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        #Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = numpy.float32(1) + relrnorm
        t2 = numpy.float32(1) + relArnorml
        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, numpy.float32(0.)),
                          TT.eq(flag, numpy.float32(6.))),
            TT.switch(
                TT.le(t1, numpy.float32(1.)), numpy.float32(3.),
                TT.switch(
                    TT.le(t2, numpy.float32(1.)), numpy.float32(4.),
                    TT.switch(
                        TT.le(relrnorm, rtol), numpy.float32(1.),
                        TT.switch(
                            TT.le(Anorm, numpy.float32(1e-20)),
                            numpy.float32(12),
                            TT.switch(
                                TT.le(relArnorml, rtol), numpy.float32(10.),
                                TT.switch(
                                    TT.ge(epsx, beta1), numpy.float32(5.),
                                    TT.switch(
                                        TT.ge(xnorm, maxxnorm),
                                        numpy.float32(6.),
                                        TT.switch(
                                            TT.ge(niter,
                                                  TT.cast(maxit, 'float32')),
                                            numpy.float32(8.), flag)))))))),
            flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm),
                         numpy.float32(11.), flag)
        return [
            niter + numpy.float32(1.),
            beta,
            betan,
            phi,
            Acond,
            cs,
            dbarn,
            eplnn,
            rnorm,
            sn,
            Tnorm,
            rnorml,
            xnorm,
            Dnorm,
            gamma,
            pnorm,
            gammal,
            Axnorm,
            relrnorm,
            relArnorml,
            Anorm,
            flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag,0))

    states = []
    # 0 niter
    states.append(TT.constant(numpy.float32([0])))
    # 1 beta
    states.append(TT.constant(numpy.float32([0])))
    # 2 betan
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 3 phi
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 4 Acond
    states.append(TT.constant(numpy.float32([1])))
    # 5 cs
    states.append(TT.constant(numpy.float32([-1])))
    # 6 dbarn
    states.append(TT.constant(numpy.float32([0])))
    # 7 eplnn
    states.append(TT.constant(numpy.float32([0])))
    # 8 rnorm
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 9 sn
    states.append(TT.constant(numpy.float32([0])))
    # 10 Tnorm
    states.append(TT.constant(numpy.float32([0])))
    # 11 rnorml
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 12 xnorm
    states.append(TT.constant(numpy.float32([0])))
    # 13 Dnorm
    states.append(TT.constant(numpy.float32([0])))
    # 14 gamma
    states.append(TT.constant(numpy.float32([0])))
    # 15 pnorm
    states.append(TT.constant(numpy.float32([0])))
    # 16 gammal
    states.append(TT.constant(numpy.float32([0])))
    # 17 Axnorm
    states.append(TT.constant(numpy.float32([0])))
    # 18 relrnorm
    states.append(TT.constant(numpy.float32([1])))
    # 19 relArnorml
    states.append(TT.constant(numpy.float32([1])))
    # 20 Anorm
    states.append(TT.constant(numpy.float32([0])))
    # 21 flag
    states.append(TT.constant(numpy.float32([0])))
    xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s]
    r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s]
    r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s]

    rvals, lupds = scan(loop,
                        states=states + xs + r1s + r2s + r3s + dls + ds,
                        n_steps=maxit + numpy.int32(1),
                        name='minres',
                        profile=profile,
                        mode=mode)

    niters = TT.cast(rvals[0][0], 'int32')
    flag = TT.cast(rvals[21][0], 'int32')
    relres = rvals[18][0]
    relAres = rvals[19][0]
    Anorm = rvals[20][0]
    Acond = rvals[4][0]
    xnorm = rvals[12][0]
    Axnorm = rvals[17][0]
    sol = [x[0] for x in rvals[22:22 + n_params]]
    return sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm, lupds
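A minimal usage sketch for the solver above, assuming the surrounding module exposes `minres` together with its helpers (the custom `scan`, `sqrt_inner_product`, `inner_product`, `symGivens2`); the toy matrix and every other name below are made up for illustration. Note that `compute_Av` must return a pair of (list of A*v products, updates dict), which is why an empty OrderedDict is passed along.

import numpy
import theano
import theano.tensor as TT
from collections import OrderedDict

rng = numpy.random.RandomState(0)
M = rng.randn(5, 5).astype('float32')
A = theano.shared(numpy.dot(M, M.T), name='A')  # symmetric by construction
b = theano.shared(rng.randn(5).astype('float32'), name='b')

def compute_Av(v):
    # minres expects (list of A*v products, updates dictionary);
    # `bs` has a single element here, so `v` is a single tensor
    return [TT.dot(A, v)], OrderedDict()

outs = minres(compute_Av, [b])
sol, flag, niters = outs[0], outs[1], outs[2]
f = theano.function([], [sol[0], flag, niters], updates=outs[-1])
x, flag_val, n_iter = f()
print numpy.allclose(numpy.dot(A.get_value(), x), b.get_value(), atol=1e-3)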
Example #44
0
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2,
          n_iters=10,
          profile=False):
    """
    Zoom stage of a strong Wolfe line search: find a step length between
    `a_lo` and `a_hi` that satisfies the strong Wolfe conditions.

    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Parameters
    ----------
    a_lo : float
        Step size
    a_hi : float
        Step size
    phi_lo : float
        Value of f at a_lo
    phi_hi : float
        Value of f at a_hi
    derphi_lo : float
        Value of derivative at a_lo
    phi : callable
        Generates computational graph
    derphi : callable
        Generates computational graph
    phi0 : float
        Value of f at 0
    derphi0 : float
        Value of the derivative at 0
    c1 : float
        Wolfe parameter
    c2 : float
        Wolfe parameter
    n_iters : int
        Maximum number of iterations of the zoom loop
    profile : bool
        True if you want printouts of profiling information
    """
    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # Interpolate to find a trial step length between a_lo and a_hi.
        # We need to choose an interpolation scheme here: use cubic
        # interpolation first; if the result is within delta * dalpha of
        # the end points, or outside the interval bounded by a_lo and
        # a_hi, fall back to quadratic interpolation; if that is still
        # too close, use bisection
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                              a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',
                         TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo +
                             numpy.asarray(0.5, dtype=theano.config.floatX) * \
                             dalpha, a_j_quad)

        # pick between the two ..
        cond_c = lazy_or('condc',
                         TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1,
                         phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo),
                         name='phi_rec')
        a_rec = ifelse(cond1,
                       a_hi,
                       TT.switch(cond2, a_hi, a_lo),
                       name='a_rec')
        a_hi = ifelse(cond1, a_j,
                      TT.switch(cond2, a_lo, a_hi),
                      name='a_hi')
        phi_hi = ifelse(cond1, phi_aj,
                        TT.switch(cond2, phi_lo, phi_hi),
                        name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan,
                          TT.switch(cond2, derphi_aj, nan), name='valprime')

        return ([phi_rec,
                 a_rec,
                 a_lo,
                 a_hi,
                 phi_hi,
                 phi_lo,
                 derphi_lo,
                 a_star,
                 val_star,
                 valprime],
                theano.scan_module.scan_utils.until(stop))

    maxiter = n_iters
    # cubic interpolant check
    delta1 = TT.constant(numpy.asarray(0.2,
                                       dtype=theano.config.floatX))
    # quadratic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1,
                                       dtype=theano.config.floatX))
    phi_rec = phi0
    a_rec = zero

    # Initial iteration

    dalpha = a_hi - a_lo
    a = TT.switch(dalpha < zero, a_hi, a_lo)
    b = TT.switch(dalpha < zero, a_lo, a_hi)
    #a = ifelse(dalpha < 0, a_hi, a_lo)
    #b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection

    # quadratic interpolation
    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',
                     TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk,
                                   a_j < a + qchk))

    a_j = TT.switch(cond_q, a_lo +
                    numpy.asarray(0.5, dtype=theano.config.floatX) * \
                    dalpha, a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse(cond1,
                     phi_hi,
                     TT.switch(cond2, phi_hi, phi_lo),
                     name='mphirec')
    a_rec = ifelse(cond1,
                   a_hi,
                   TT.switch(cond2, a_hi, a_lo),
                   name='marec')
    a_hi = ifelse(cond1,
                  a_j,
                  TT.switch(cond2, a_lo, a_hi),
                  name='mahi')
    phi_hi = ifelse(cond1,
                    phi_aj,
                    TT.switch(cond2, phi_lo, phi_hi),
                    name='mphihi')

    onlyif = lazy_and('only_if',
                      TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                     phi_aj < phi_lo),
                      abs(derphi_aj) <= -c2 * derphi0)

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main')
    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(phi_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # print'while_zoom'
    outs, updates = scan(while_zoom,
                         states=states,
                         n_steps=maxiter,
                         name='while_zoom',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while'
    a_star = ifelse(onlyif, a_j, outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
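For reference, a sketch of the quadratic fall-back used above, assuming `_quadmin` follows the standard formulation (as in scipy.optimize's line search): fit a parabola through phi(a_lo), derphi(a_lo) and phi(a_hi), and return its vertex,

$$B = \frac{\phi(a_{hi}) - \phi(a_{lo}) - \phi'(a_{lo})\,(a_{hi} - a_{lo})}{(a_{hi} - a_{lo})^{2}}, \qquad a_j = a_{lo} - \frac{\phi'(a_{lo})}{2B}.$$

The cchk/qchk guards then reject a candidate that lands within delta1*dalpha (cubic) or delta2*dalpha (quadratic) of the interval's end points, or outside the interval, finally falling back to bisection (a_lo + 0.5*dalpha).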
Example #45
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object return by numpy.load containing the
                data
            model : model
        """
        self.model = model
        # push dataset into shared var
        n_params = len(model.params)
        xdata = theano.shared(data['train_x'].astype('float32'),
                              name='xdata')
        # ! This works for 1 of k classification
        ydata = TT.cast(
            theano.shared(data['train_y'].astype('float32'),
                          name='ydata'), 'int32')

        shared_data = [xdata, ydata]
        self.xdata = xdata
        self.ydata = ydata
        # all sorts of indices
        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # vars for gradients
        # Store Euclidean gradients
        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Riemannian gradients (H^-1 * g)
        self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        updates.update(dict(zip(self.gs, nw_gs)))
        grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            on_unused_input='warn',
            name='compute_eucledian_gradients',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * g for p, g in zip(model.params, self.gs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        update_vals = dict(zip(model.params, nw_ps))
        #updates.update(dict(zip(model.params, nw_ps)))
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, shared_data)]

        print 'compiling evaluation function'
        self.eval_fn = theano.function(
            [ebdx, lr],
            final_cost,
            givens=dict(grad_inps),
            updates= updates,
            on_unused_input='warn',
            name='eval_fn',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_vals,
            on_unused_input='warn',
            #givens=dict(grad_inps),
            name='update_params',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])
        self.options = options
        self.old_cost = 1e6

        n_steps = options['ebs'] // options['cbs']
        def ls_error(_idx, acc, acc_train_cost):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err,
                                         replace=replace),'float32')
            train_cost = TT.cast(safe_clone(model.train_cost,
                                          replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost,
                    acc_train_cost + train_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=theano.Mode(linker='cvm'),
                        profile = options['profile'])

        ferr = rvals[1][0] / const(n_steps)
        ftrain_cost = rvals[2][0] / const(n_steps)

        self.compute_error = theano.function(
            [ebdx],
            [ferr, ftrain_cost],
            givens=dict(grad_inps),
            name='compute_err',
            on_unused_input='warn',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])
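The grad_step/cost_step loops above rely on a non-standard `scan` that accepts a `states=` keyword; below is a hedged sketch of the same accumulate-over-chunks pattern using stock theano.scan with `outputs_info` (toy data, made-up names).

import numpy
import theano
import theano.tensor as TT

data = theano.shared(numpy.arange(12, dtype='float32'), name='data')
cbs = 3  # chunk size, analogous to options['cbs']

def acc_step(idx, acc):
    # accumulate one chunk per scan step, as cost_step does per minibatch
    chunk = data[idx * cbs:(idx + 1) * cbs]
    return acc + chunk.sum()

rvals, updates = theano.scan(acc_step,
                             sequences=TT.arange(12 // cbs),
                             outputs_info=TT.constant(numpy.float32(0)))
total = rvals[-1]  # running sum after the last chunk
print theano.function([], total, updates=updates)()  # -> 66.0

Dividing `total` by the number of steps recovers the averaging done for `final_cost` above.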
Example #46
0
def scalar_search_wolfe2(phi, derphi, phi0=None,
                         old_phi0=None, derphi0=None,
                         n_iters=20,
                         c1=1e-4, c2=0.9,
                         mode=theano.Mode(linker='cvm'),
                         profile=False):
    """Find alpha that satisfies strong Wolfe conditions.

    alpha > 0 is assumed to be a descent direction.

    Parameters
    ----------
    phi : callable f(x)
        Objective scalar function.

    derphi : callable f'(x)
        Objective function derivative (can be None)
    phi0 : float, optional
        Value of phi at s=0
    old_phi0 : float, optional
        Value of phi at previous point
    derphi0 : float, optional
        Value of derphi at s=0
    c1 : float
        Parameter for Armijo condition rule.
    c2 : float
        Parameter for curvature condition rule.
    n_iters : int
        Maximum number of iterations of the search loop.
    mode : theano.Mode
        Compilation mode passed to the inner scan loops.
    profile : flag (boolean)
        True if you want printouts of profiling information

    Returns
    -------
    alpha_star : float
        Best alpha
    phi_star
        phi at alpha_star
    phi0
        phi at 0
    derphi_star
        derphi at alpha_star

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions.  See Wright and Nocedal, 'Numerical Optimization',
    1999, pg. 59-60.

    For the zoom phase it uses an algorithm by [...].

    """

    if phi0 is None:
        phi0 = phi(zero)
    else:
        phi0 = phi0
    if derphi0 is None and derphi is not None:
        derphi0 = derphi(zero)
    else:
        derphi0 = derphi0

    alpha0 = zero
    alpha0.name = 'alpha0'
    if old_phi0 is not None:
        alpha1 = TT.minimum(one, numpy.asarray(1.01,
                                               dtype=theano.config.floatX)* \
                            numpy.asarray(2, dtype=theano.config.floatX)*(phi0 - old_phi0)/derphi0)
    else:
        old_phi0 = nan
        alpha1   = one

    alpha1 = TT.switch(alpha1 < zero, one, alpha1)
    alpha1.name = 'alpha1'

    # This shouldn't happen. Perhaps the increment has slipped below
    # machine precision? For now, set the return variables, skip the
    # useless while loop, and raise warnflag=2 due to possible imprecision.
    phi0 = TT.switch(TT.eq(alpha1, zero), old_phi0, phi0)
    # I need a lazyif for alpha1 == 0 !!!
    phi_a1 = ifelse(TT.eq(alpha1,zero), phi0,
                    phi(alpha1), name='phi_a1')
    phi_a1.name = 'phi_a1'
    phi_a0 = phi0
    phi_a0.name = 'phi_a0'
    derphi_a0 = derphi0
    derphi_a0.name = 'derphi_a0'
    # Make sure variables are tensors otherwise strange things happen
    c1 = TT.as_tensor_variable(c1)
    c2 = TT.as_tensor_variable(c2)
    maxiter = n_iters
    def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                    alpha_star, phi_star, derphi_star):
        derphi_a1 = derphi(alpha1)
        cond1 = TT.bitwise_or(phi_a1 > phi0 + c1*alpha1*derphi0,
                              TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
        cond2 = abs(derphi_a1) <= -c2*derphi0
        cond3 = derphi_a1 >= zero
        alpha_star_c1, phi_star_c1, derphi_star_c1 = \
                _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
                      phi, derphi, phi0, derphi0, c1,c2,
                     profile = profile, mode=mode)
        alpha_star_c3, phi_star_c3, derphi_star_c3 = \
                _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi,
                      derphi, phi0, derphi0, c1,c2,
                     profile = profile, mode=mode)
        nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
        nw_phi = phi(nw_alpha1)
        alpha_star, phi_star, derphi_star = \
                ifelse(cond1,
                          (alpha_star_c1, phi_star_c1, derphi_star_c1),
                ifelse(cond2,
                          (alpha1, phi_a1, derphi_a1),
                ifelse(cond3,
                          (alpha_star_c3, phi_star_c3, derphi_star_c3),
                           (nw_alpha1, nw_phi, nan),
                      name = 'alphastar_c3'),
                      name = 'alphastar_c2'),
                      name ='alphastar_c1')

        return ( [alpha1,
                  nw_alpha1,
                  phi_a1,
                  ifelse(lazy_or('allconds',cond1, cond2, cond3),
                         phi_a1, nw_phi, name='nwphi1'),
                  ifelse(cond1, derphi_a0, derphi_a1, name='derphi'),
                  i_t + one,
                  alpha_star,
                  phi_star,
                  derphi_star],
                theano.scan_module.scan_utils.until(
                    lazy_or('until_cond_',TT.eq(nw_alpha1,zero), cond1, cond2, cond3)))
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0),0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1),0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0),0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1),0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_a0), 0)]
    # i_t
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    # alpha_star
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    # phi_star
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    # derphi_star
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    print 'while_search'
    outs, updates = scan(while_search,
                         states = states,
                         n_steps = maxiter,
                         name = 'while_search',
                         mode = mode,
                         profile = profile)
    print 'done_while_search'
    out3 = outs[-3][0]
    out2 = outs[-2][0]
    out1 = outs[-1][0]
    alpha_star, phi_star, derphi_star = \
            ifelse(TT.eq(alpha1, zero),
                   (nan, phi0, nan),
                   (out3, out2, out1), name='main_alphastar')
    return alpha_star, phi_star, phi0, derphi_star
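The loop above runs until the strong Wolfe conditions hold for the step size $\alpha$ (Wright and Nocedal, 1999):

$$\phi(\alpha) \le \phi(0) + c_1\,\alpha\,\phi'(0) \qquad\text{and}\qquad |\phi'(\alpha)| \le c_2\,|\phi'(0)|,$$

with $0 < c_1 < c_2 < 1$ (defaults here: $c_1 = 10^{-4}$, $c_2 = 0.9$). The code spells the curvature test as abs(derphi_a1) <= -c2 * derphi0, which is equivalent because $\phi'(0) < 0$ along a descent direction.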
Example #47
0
def minres(compute_Av,
           bs,
           rtol=constantX(1e-6),
           maxit=20,
           Ms=None,
           shift=constantX(0.),
           maxxnorm=constantX(1e15),
           Acondlim=constantX(1e16),
           profile=0):
    """
     minres attempts to find the minimum-length and minimum-residual-norm
     solution x to the system of linear equations A*x = b or
     least squares problem min||Ax-b||.  The n-by-n coefficient matrix A
     must be symmetric (but need not be positive definite or invertible).
     The right-hand-side column vector b must have length n.

     Parameters:

        compute_Av: callable returning the symbolic expression for
            `Av` (the product of the matrix A with some vector v).
            `v` should be a list of tensors, where the vector v means
            the vector obtained by concatenating and flattening all
            tensors in v
        bs: list of Theano expressions. We are looking to compute
            `A^-1\dot bs`.
        rtol: Optional, real, specifies the tolerance of the method.
            Default is 1e-6
        maxit: Optional, positive integer, specifies the maximum number
            of iterations. Default is 20
        Ms: List of theano expression of same shape as `bs`. The
            method uses these to precondition with diag(Ms)
        shift: Optional, scalar, real or complex.  Default is 0.
                   Effectively solve the system (A - shift I) * x = b.
        maxxnorm   real positive, maximum bound on NORM(x). Default is 1e15.
        Acondlim   real positive, maximum bound on COND(A). Default is 1e16.

     OUTPUTS:
        x       list of Theano tensor representing the solution
        flag    theano int scalar - convergence flag
                0 beta1 = 0.  The exact solution is  x = 0.
                1 A solution to (poss. singular) Ax = b found, given rtol.
                2 Pseudoinverse solution for singular LS problem, given rtol.
                3 A solution to (poss. singular) Ax = b found, given eps.
                4 Pseudoinverse solution for singular LS problem, given eps.
                5 x has converged to an eigenvector.
                6 xnorm has exceeded maxxnorm.
                7 Acond has exceeded Acondlim.
                8 The iteration limit was reached.
                9/10 It is a least squares problem but no converged
                 solution yet.
        iter    integer, iteration number at which x was computed:
                0 <= iter <= maxit.
        relres  real positive, the relative residual is defined as
                     NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)),
                computed recurrently here.  If flag is 1 or 3,  relres <= TOL.
        relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) ---
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of matrix 2-norm of A.
        Acond   real positive, estimate of condition number of A with
                respect to 2-norm.
        xnorm   non-negative real, recurrently computed NORM(x)
        Axnorm  non-negative real, recurrently computed NORM(A * x).

     REFERENCES:
        Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006.
             http://www.stanford.edu/group/SOL/software.html

    """

    if not isinstance(bs, (tuple, list)):
        bs = [bs]
        return_as_list = False
    else:
        bs = list(bs)
        return_as_list = True

    eps = constantX(1e-23)

    # Initialise
    flag = theano.shared(constantX(0.))
    beta1 = sqrt_inner_product(bs)

    #------------------------------------------------------------------
    # Set up p and v for the first Lanczos vector v1.
    # p  =  beta1 P' v1,  where  P = C**(-1).
    # v is really P' v1.
    #------------------------------------------------------------------
    r3s = [b for b in bs]
    r2s = [b for b in bs]
    r1s = [b for b in bs]
    if Ms is not None:
        r3s = [b / m for b, m in zip(bs, Ms)]
        beta1 = sqrt_inner_product(r3s, bs)
    #------------------------------------------------------------------
    ## Initialize other quantities.
    # Note that Anorm has been initialized by IsOpSym6.
    # ------------------------------------------------------------------
    bnorm = beta1
    n_params = len(bs)

    def loop(niter,
             beta,
             betan,
             phi,
             Acond,
             cs,
             dbarn,
             eplnn,
             rnorm,
             sn,
             Tnorm,
             rnorml,
             xnorm,
             Dnorm,
             gamma,
             pnorm,
             gammal,
             Axnorm,
             relrnorm,
             relArnorml,
             Anorm,
             flag,
             *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params: 1 * n_params]
        r1s = args[1 * n_params: 2 * n_params]
        r2s = args[2 * n_params: 3 * n_params]
        r3s = args[3 * n_params: 4 * n_params]
        dls = args[4 * n_params: 5 * n_params]
        ds = args[5 * n_params: 6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)

        r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
        r3s = [TT.switch(TT.ge(niter, constantX(1.)),
                         r3 - (beta / betal) * r1,
                         r3) for r3, r1 in zip(r3s, r1s)]

        alpha = inner_product(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = sqrt_inner_product(r2s, r3s)
        else:
            betan = sqrt_inner_product(r3s)
        pnorml = pnorm
        pnorm = TT.switch(TT.eq(niter, constantX(0.)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) +
                                  TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [TT.switch(TT.neq(gamma, constantX(0.)),
                        (v - epln * dl2 - dlta * dl) / gamma,
                        v)
              for v, dl2, dl in zip(vs, dl2s, dls)]
        d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                           sqrt_inner_product(ds),
                           constantX(numpy.inf))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = sqrt_inner_product(xs)
        xs = [TT.switch(TT.ge(xnorm, maxxnorm),
                        dl2, x)
              for dl2, x in zip(dl2s, xs)]

        flag = TT.switch(TT.ge(xnorm, maxxnorm),
                         constantX(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, constantX(6.))
        Dnorm = TT.switch(flag_no_6,
                          TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6,
                             rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(flag_no_6,
                          TT.switch(TT.eq(niter, constantX(0.)),
                                    TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                                    TT.sqrt(TT.sqr(Tnorm) +
                                            TT.sqr(beta) +
                                            TT.sqr(alpha) +
                                            TT.sqr(betan))),
                          Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        # Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = constantX(1) + relrnorm
        t2 = constantX(1) + relArnorml

        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, constantX(0)),
                          TT.eq(flag, constantX(6))),
            multiple_switch(TT.le(t1, constantX(1)),
                            constantX(3),
                            TT.le(t2, constantX(1)),
                            constantX(4),
                            TT.le(relrnorm, rtol),
                            constantX(1),
                            TT.le(Anorm, constantX(1e-20)),
                            constantX(12),
                            TT.le(relArnorml, rtol),
                            constantX(10),
                            TT.ge(epsx, beta1),
                            constantX(5),
                            TT.ge(xnorm, maxxnorm),
                            constantX(6),
                            TT.ge(niter, TT.cast(maxit,
                                                 theano.config.floatX)),
                            constantX(8),
                            flag),
            flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm),
                         constantX(11.),
                         flag)
        return [niter + constantX(1.),
                beta,
                betan,
                phi,
                Acond,
                cs,
                dbarn,
                eplnn,
                rnorm,
                sn,
                Tnorm,
                rnorml,
                xnorm,
                Dnorm,
                gamma,
                pnorm,
                gammal,
                Axnorm,
                relrnorm,
                relArnorml,
                Anorm,
                flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag, 0))

    states = []
    # 0 niter
    states.append(constantX([0]))
    # 1 beta
    states.append(constantX([0]))
    # 2 betan
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 3 phi
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 4 Acond
    states.append(constantX([1]))
    # 5 cs
    states.append(constantX([-1]))
    # 6 dbarn
    states.append(constantX([0]))
    # 7 eplnn
    states.append(constantX([0]))
    # 8 rnorm
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 9 sn
    states.append(constantX([0]))
    # 10 Tnorm
    states.append(constantX([0]))
    # 11 rnorml
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 12 xnorm
    states.append(constantX([0]))
    # 13 Dnorm
    states.append(constantX([0]))
    # 14 gamma
    states.append(constantX([0]))
    # 15 pnorm
    states.append(constantX([0]))
    # 16 gammal
    states.append(constantX([0]))
    # 17 Axnorm
    states.append(constantX([0]))
    # 18 relrnorm
    states.append(constantX([1]))
    # 19 relArnorml
    states.append(constantX([1]))
    # 20 Anorm
    states.append(constantX([0]))
    # 21 flag
    states.append(constantX([0]))
    xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s]
    r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s]
    r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s]

    rvals, loc_updates = scan(
        loop,
        states=states + xs + r1s + r2s + r3s + dls + ds,
        n_steps=maxit + numpy.int32(1),
        name='minres',
        profile=profile,
        mode=theano.Mode(linker='cvm'))
    assert isinstance(loc_updates, dict) and 'Ordered' in str(type(loc_updates))

    niters = TT.cast(rvals[0][0], 'int32')
    flag = TT.cast(rvals[21][0], 'int32')
    relres = rvals[18][0]
    relAres = rvals[19][0]
    Anorm = rvals[20][0]
    Acond = rvals[4][0]
    xnorm = rvals[12][0]
    Axnorm = rvals[17][0]
    sol = [x[0] for x in rvals[22: 22 + n_params]]
    return (sol,
            flag,
            niters,
            relres,
            relAres,
            Anorm,
            Acond,
            xnorm,
            Axnorm,
            loc_updates)
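The r1s/r2s/r3s updates inside `loop` implement the (shifted, optionally preconditioned) Lanczos recurrence that the comments sketch; in its textbook unpreconditioned form, with $\bar{A} = A - \mathrm{shift}\,I$ and $v_0 = 0$:

$$\alpha_k = v_k^{\top}\bar{A} v_k, \qquad q_{k+1} = \bar{A} v_k - \alpha_k v_k - \beta_k v_{k-1}, \qquad \beta_{k+1} = \|q_{k+1}\|, \qquad v_{k+1} = q_{k+1} / \beta_{k+1}.$$

Each of the flat lists xs/r1s/r2s/r3s stands in for one of these concatenated-and-flattened vectors.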
Example #48
0
def jobman(state, channel):
    # load dataset
    rng = numpy.random.RandomState(state['seed'])

    # declare the dimensionalities of the input and output
    if state['chunks'] == 'words':
        state['n_in'] = 10000
        state['n_out'] = 10000
    else:
        state['n_in'] = 50
        state['n_out'] = 50
    train_data, valid_data, test_data = get_text_data(state)

    ## BEGIN Tutorial
    ### Define Theano Input Variables
    x = TT.lvector('x')
    y = TT.lvector('y')
    h0 = theano.shared(numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))

    ### Neural Implementation of the Operators: \oplus
    #### Word Embedding
    emb_words = MultiLayer(
        rng,
        n_in=state['n_in'],
        n_hids=eval(state['inp_nhids']),
        activation=eval(state['inp_activ']),
        init_fn='sample_weights_classic',
        weight_noise=state['weight_noise'],
        rank_n_approx = state['rank_n_approx'],
        scale=state['inp_scale'],
        sparsity=state['inp_sparse'],
        learn_bias = True,
        bias_scale=eval(state['inp_bias']),
        name='emb_words')

    #### Deep Transition Recurrent Layer
    rec = eval(state['rec_layer'])(
            rng,
            eval(state['nhids']),
            activation = eval(state['rec_activ']),
            #activation = 'TT.nnet.sigmoid',
            bias_scale = eval(state['rec_bias']),
            scale=eval(state['rec_scale']),
            sparsity=eval(state['rec_sparse']),
            init_fn=eval(state['rec_init']),
            weight_noise=state['weight_noise'],
            name='rec')

    #### Stitching them together
    ##### (1) Get the embedding of a word
    x_emb = emb_words(x, no_noise_bias=state['no_noise_bias'])
    ##### (2) Embedding + Hidden State via DT Recurrent Layer
    reset = TT.scalar('reset')
    rec_layer = rec(x_emb, n_steps=x.shape[0],
                    init_state=h0*reset,
                    no_noise_bias=state['no_noise_bias'],
                    truncate_gradient=state['truncate_gradient'],
                    batch_size=1)

    ## BEGIN Exercise: DOT-RNN
    ### Neural Implementation of the Operators: \lhd

    #### Exercise (1)
    #### TODO: Define a layer from the hidden state to the intermediate layer

    #### Exercise (1)
    #### TODO: Define a layer from the input to the intermediate Layer

    #### Hidden State: Combine emb_state and emb_words_out
    #### Exercise (1)
    #### TODO: Define an activation layer

    #### Exercise (2)
    #### TODO: Define a dropout layer

    #### Softmax Layer
    output_layer = SoftmaxLayer(
        rng,
        eval(state['dout_nhid']),
        state['n_out'],
        scale=state['out_scale'],
        bias_scale=state['out_bias_scale'],
        init_fn="sample_weights_classic",
        weight_noise=state['weight_noise'],
        sparsity=state['out_sparse'],
        sum_over_time=True,
        name='out')

    ### Few Optional Things
    #### Direct shortcut from x to y
    if state['shortcut_inpout']:
        shortcut = MultiLayer(
            rng,
            n_in=state['n_in'],
            n_hids=eval(state['inpout_nhids']),
            activations=eval(state['inpout_activ']),
            init_fn='sample_weights_classic',
            weight_noise = state['weight_noise'],
            scale=eval(state['inpout_scale']),
            sparsity=eval(state['inpout_sparse']),
            learn_bias=eval(state['inpout_learn_bias']),
            bias_scale=eval(state['inpout_bias']),
            name='shortcut')

    #### Learning rate scheduling (1/(1+n/beta))
    state['clr'] = state['lr']
    def update_lr(obj, cost):
        stp = obj.step
        if isinstance(obj.state['lr_start'], int) and stp > obj.state['lr_start']:
            time = float(stp - obj.state['lr_start'])
            new_lr = obj.state['clr']/(1+time/obj.state['lr_beta'])
            obj.lr = new_lr
    if state['lr_adapt']:
        rec.add_schedule(update_lr)

    ### Neural Implementations of the Language Model
    #### Training
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x)]
    else:
        additional_inputs = [rec_layer]

    ##### Exercise (1): Compute the output intermediate layer
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer

    train_model = output_layer(
        outhid,
        no_noise_bias=state['no_noise_bias'],
        additional_inputs=additional_inputs).train(
            target=y,
            scale=numpy.float32(1. / state['seqlen']))

    nw_h0 = rec_layer.out[rec_layer.out.shape[0]-1]
    if state['carry_h0']:
        train_model.updates += [(h0, nw_h0)]

    #### Validation
    h0val = theano.shared(numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))
    rec_layer = rec(emb_words(x, use_noise=False),
                    n_steps = x.shape[0],
                    batch_size=1,
                    init_state=h0val*reset,
                    use_noise=False)
    nw_h0 = rec_layer.out[rec_layer.out.shape[0]-1]

    ##### Exercise (1):
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer without noise

    if state['shortcut_inpout']:
        additional_inputs=[rec_layer, shortcut(x, use_noise=False)]
    else:
        additional_inputs=[rec_layer]
    valid_model = output_layer(outhid,
            additional_inputs=additional_inputs,
            use_noise=False).validate(target=y, sum_over_time=True)

    valid_updates = []
    if state['carry_h0']:
        valid_updates = [(h0val, nw_h0)]

    valid_fn = theano.function([x, y, reset], valid_model.out,
                               name='valid_fn', updates=valid_updates)

    #### Sampling
    ##### single-step sampling
    def sample_fn(word_tm1, h_tm1):
        x_emb = emb_words(word_tm1, use_noise = False, one_step=True)
        h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1]
        outhid = outhid_dropout(
            outhid_activ(
                emb_state(h0, use_noise=False, one_step=True) +
                emb_words_out(word_tm1, use_noise=False, one_step=True),
                one_step=True),
            use_noise=False, one_step=True)
        word = output_layer.get_sample(state_below=outhid, additional_inputs=[h0], temp=1.)
        return word, h0

    ##### scan for iterating the single-step sampling multiple times
    [samples, summaries], updates = scan(
        sample_fn,
        states=[
            TT.alloc(numpy.int64(0), state['sample_steps']),
            TT.alloc(numpy.float32(0), 1, eval(state['nhids'])[-1])],
        n_steps=state['sample_steps'],
        name='sampler_scan')

    ##### build a Theano function for sampling
    sample_fn = theano.function([], [samples],
        updates=updates, profile=False, name='sample_fn')

    ##### Load a dictionary
    dictionary = numpy.load(state['dictionary'])
    if state['chunks'] == 'chars':
        dictionary = dictionary['unique_chars']
    else:
        dictionary = dictionary['unique_words']
    def hook_fn():
        sample = sample_fn()[0]
        print 'Sample:',
        if state['chunks'] == 'chars':
            print "".join(dictionary[sample])
        else:
            for si in sample:
                print dictionary[si],
            print

    ### Build and Train a Model
    #### Define a model
    model = LM_Model(
        cost_layer = train_model,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn = valid_fn,
        clean_before_noise_fn = False,
        noise_fn = None,
        rng = rng)

    if state['reload']:
        model.load(state['prefix']+'model.npz')

    #### Define a trainer
    ##### Training algorithm (SGD)
    if state['moment'] < 0:
        algo = SGD(model, state, train_data)
    else:
        algo = SGD_m(model, state, train_data)
    ##### Main loop of the trainer
    main = MainLoop(train_data,
                    valid_data,
                    test_data,
                    model,
                    algo,
                    state,
                    channel,
                    train_cost = False,
                    hooks = hook_fn,
                    validate_postprocess =  eval(state['validate_postprocess']))
    ## Run!
    main.main()
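The schedule installed by update_lr above is plain inverse-time decay: once the step counter $t$ passes lr_start, the learning rate becomes

$$\eta_t = \frac{\eta_0}{1 + (t - t_{\mathrm{start}})/\beta},$$

with $\eta_0$ = state['lr'], $t_{\mathrm{start}}$ = state['lr_start'], and $\beta$ = state['lr_beta'].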
Example #49
0
def minres(compute_Av,
           bs,
           rtol=constantX(1e-6),
           maxit=20,
           Ms=None,
           shift=constantX(0.),
           maxxnorm=constantX(1e15),
           Acondlim=constantX(1e16),
           profile=0):
    """
     minres attempts to find the minimum-length and minimum-residual-norm
     solution x to the system of linear equations A*x = b or
     least squares problem min||Ax-b||.  The n-by-n coefficient matrix A
     must be symmetric (but need not be positive definite or invertible).
     The right-hand-side column vector b must have length n.

     Parameters:

        compute_Av: callable returning the symbolic expression for
            `Av` (the product of the matrix A with some vector v).
            `v` should be a list of tensors, where the vector v means
            the vector obtained by concatenating and flattening all
            tensors in v
        bs: list of Theano expressions. We are looking to compute
            `A^-1\dot bs`.
        rtol: Optional, real, specifies the tolerance of the method.
            Default is 1e-6
        maxit: Optional, positive integer, specifies the maximum number
            of iterations. Default is 20
        Ms: List of theano expression of same shape as `bs`. The
            method uses these to precondition with diag(Ms)
        shift: Optional, scalar, real or complex.  Default is 0.
                   Effectively solve the system (A - shift I) * x = b.
        maxxnorm   real positive, maximum bound on NORM(x). Default is 1e15.
        Acondlim   real positive, maximum bound on COND(A). Default is 1e16.

     OUTPUTS:
        x       list of Theano tensor representing the solution
        flag    theano int scalar - convergence flag
                0 beta1 = 0.  The exact solution is  x = 0.
                1 A solution to (poss. singular) Ax = b found, given rtol.
                2 Pseudoinverse solution for singular LS problem, given rtol.
                3 A solution to (poss. singular) Ax = b found, given eps.
                4 Pseudoinverse solution for singular LS problem, given eps.
                5 x has converged to an eigenvector.
                6 xnorm has exceeded maxxnorm.
                7 Acond has exceeded Acondlim.
                8 The iteration limit was reached.
                9/10 It is a least squares problem but no converged
                 solution yet.
        iter    integer, iteration number at which x was computed:
                0 <= iter <= maxit.
        relres  real positive, the relative residual is defined as
                     NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)),
                computed recurrently here.  If flag is 1 or 3,  relres <= TOL.
        relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) ---
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of matrix 2-norm of A.
        Acond   real positive, estimate of condition number of A with
                respect to 2-norm.
        xnorm   non-negative real, recurrently computed NORM(x)
        Axnorm  non-negative real, recurrently computed NORM(A * x).

     REFERENCES:
        Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006.
             http://www.stanford.edu/group/SOL/software.html

    """

    if not isinstance(bs, (tuple, list)):
        bs = [bs]
        return_as_list = False
    else:
        bs = list(bs)
        return_as_list = True

    eps = constantX(1e-23)

    # Initialise
    beta1 = sqrt_inner_product(bs)

    #------------------------------------------------------------------
    # Set up p and v for the first Lanczos vector v1.
    # p  =  beta1 P' v1,  where  P = C**(-1).
    # v is really P' v1.
    #------------------------------------------------------------------
    r3s = [b for b in bs]
    r2s = [b for b in bs]
    r1s = [b for b in bs]
    if Ms is not None:
        r3s = [b / m for b, m in zip(bs, Ms)]
        beta1 = sqrt_inner_product(r3s, bs)
    #------------------------------------------------------------------
    ## Initialize other quantities.
    # Note that Anorm has been initialized by IsOpSym6.
    # ------------------------------------------------------------------
    bnorm = beta1
    n_params = len(bs)

    def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn,
             Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm,
             relrnorm, relArnorml, Anorm, flag, *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params:1 * n_params]
        r1s = args[1 * n_params:2 * n_params]
        r2s = args[2 * n_params:3 * n_params]
        r3s = args[3 * n_params:4 * n_params]
        dls = args[4 * n_params:5 * n_params]
        ds = args[5 * n_params:6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)

        r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
        r3s = [
            TT.switch(TT.ge(niter, constantX(1.)), r3 - (beta / betal) * r1,
                      r3) for r3, r1 in zip(r3s, r1s)
        ]

        alpha = inner_product(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = sqrt_inner_product(r2s, r3s)
        else:
            betan = sqrt_inner_product(r3s)
        pnorml = pnorm
        pnorm = TT.switch(
            TT.eq(niter, constantX(0.)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [
            TT.switch(TT.neq(gamma, constantX(0.)),
                      (v - epln * dl2 - dlta * dl) / gamma, v)
            for v, dl2, dl in zip(vs, dl2s, dls)
        ]
        d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                           sqrt_inner_product(ds), constantX(numpy.inf))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = sqrt_inner_product(xs)
        xs = [
            TT.switch(TT.ge(xnorm, maxxnorm), dl2, x)
            for dl2, x in zip(dl2s, xs)
        ]

        flag = TT.switch(TT.ge(xnorm, maxxnorm), constantX(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, constantX(6.))
        Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(
            flag_no_6,
            TT.switch(
                TT.eq(niter, constantX(0.)),
                TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                TT.sqrt(
                    TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) +
                    TT.sqr(betan))), Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        # Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = constantX(1) + relrnorm
        t2 = constantX(1) + relArnorml

        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, constantX(0)), TT.eq(flag,
                                                           constantX(6))),
            multiple_switch(TT.le(t1, constantX(1)), constantX(3),
                            TT.le(t2, constantX(1)), constantX(4),
                            TT.le(relrnorm, rtol), constantX(1),
                            TT.le(Anorm, constantX(1e-20)), constantX(12),
                            TT.le(relArnorml, rtol), constantX(10),
                            TT.ge(epsx, beta1), constantX(5),
                            TT.ge(xnorm, maxxnorm), constantX(6),
                            TT.ge(niter, TT.cast(maxit, theano.config.floatX)),
                            constantX(8), flag), flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm), constantX(11.),
                         flag)
        return [niter + constantX(1.),
                beta,
                betan,
                phi,
                Acond,
                cs,
                dbarn,
                eplnn,
                rnorm,
                sn,
                Tnorm,
                rnorml,
                xnorm,
                Dnorm,
                gamma,
                pnorm,
                gammal,
                Axnorm,
                relrnorm,
                relArnorml,
                Anorm,
                flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag, 0))
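
    # The scan below runs for at most maxit + 1 steps and terminates
    # early once `flag` becomes non-zero (the `until` condition above).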

    states = []
    # 0 niter
    states.append(constantX([0]))
    # 1 beta
    states.append(constantX([0]))
    # 2 betan
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 3 phi
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 4 Acond
    states.append(constantX([1]))
    # 5 cs
    states.append(constantX([-1]))
    # 6 dbarn
    states.append(constantX([0]))
    # 7 eplnn
    states.append(constantX([0]))
    # 8 rnorm
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 9 sn
    states.append(constantX([0]))
    # 10 Tnorm
    states.append(constantX([0]))
    # 11 rnorml
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 12 xnorm
    states.append(constantX([0]))
    # 13 Dnorm
    states.append(constantX([0]))
    # 14 gamma
    states.append(constantX([0]))
    # 15 pnorm
    states.append(constantX([0]))
    # 16 gammal
    states.append(constantX([0]))
    # 17 Axnorm
    states.append(constantX([0]))
    # 18 relrnorm
    states.append(constantX([1]))
    # 19 relArnorml
    states.append(constantX([1]))
    # 20 Anorm
    states.append(constantX([0]))
    # 21 flag
    states.append(constantX([0]))
    xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s]
    r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s]
    r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s]

    rvals, loc_updates = scan(loop,
                              states=states + xs + r1s + r2s + r3s + dls + ds,
                              n_steps=maxit + numpy.int32(1),
                              name='minres',
                              profile=profile,
                              mode=theano.Mode(linker='cvm'))
    assert isinstance(loc_updates, dict) and 'Ordered' in str(
        type(loc_updates))

    niters = TT.cast(rvals[0][0], 'int32')
    flag = TT.cast(rvals[21][0], 'int32')
    relres = rvals[18][0]
    relAres = rvals[19][0]
    Anorm = rvals[20][0]
    Acond = rvals[4][0]
    xnorm = rvals[12][0]
    Axnorm = rvals[17][0]
    sol = [x[0] for x in rvals[22:22 + n_params]]
    return (sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm,
            loc_updates)
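
# A hedged usage sketch for the solver above (names are illustrative;
# the real call sites appear in the examples below as minres.minres(...)):
#
#   sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm, upds = \
#       minres(compute_Av, bs, rtol=1e-6, maxit=100)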
Example #50
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
                `jreg` -> float
                    Initial value of the Jacobi diagonal used to
                    regularize the preconditioner (our reading of the
                    code)
                `rsch` -> int
                    Which Riemannian gradient to use (1 selects `rs1`,
                    anything else `rs2`)
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        eps = numpy.float32(1e-24)
        xdata = theano.shared(data['train_x'], name='xdata')
        ydata = theano.shared(data['train_y'], name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # Store Euclidean gradients
        self.gs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store Riemannian gradients
        self.rs1 = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        self.rs2 = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store the Jacobi diagonal
        self.js = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'

        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            # Compute jacobi
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout *
                                     (numpy.float32(1) - tnwout)) * factor
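                # Random +/-1 probe (the sign of a Gaussian, i.e.
                # Rademacher noise); squaring the Jacobian L-op of this
                # probe below gives a stochastic estimate of the diagonal
                # of the Gauss-Newton matrix.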
                r = TT.sgn(srng.normal(nw_out.shape, nstreams=128))
                r = r * factor
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [
                oj + final_results[p]
                for oj, p in zip(args[1 + n_params:1 +
                                      2 * n_params], model.params)
            ]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        ij = [
            TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              mode=gpu_mode,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        #theano.printing.pydotprint(self.compute_eucledian_gradients,
        #        'eucledian_grad', scan_graphs=True)

        self.damping = theano.shared(numpy.float32(options['mreg']))
        # Step 2.1 Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']
        mode = gpu_mode

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
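                # One accumulation step of a Gauss-Newton/Fisher-style
                # matrix-vector product: Rop pushes the input vectors
                # through the output Jacobian, `factor` rescales per
                # output non-linearity, and Lop pulls the result back,
                # i.e. roughly J^T (J v / factor) summed over outputs.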
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.gf_outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.gf_outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

                # Unreachable alternative implementation (kept for
                # reference): the same Gv product computed through the
                # cost's pre-activations.
                #
                # nw_cost, nw_preactiv_out = safe_clone(
                #     [model.train_cost, model.preactiv_out], replace)
                # nw_gvs = TT.Lop(
                #     nw_preactiv_out, model.params,
                #     TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                #            model.params, args))
                #
                # Gvs = [ogv + ngv
                #        for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                # return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            #_final_Gvs = [x + self.damping * y
            #        for x,y in zip(final_Gvs, args)]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))

        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            #Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
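        # (norm_ord0 is the max-abs, i.e. infinity, norm across all
        # parameter blocks, despite its name.)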

        updates.update(dict(zip(self.rs1, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients1 = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 2.2 Compile a second function for computing Riemannian
        # gradients (identical to Step 2.1 except that it uses
        # model.gc_outs / model.gc_outs_operator)
        rbpos = rbdx * options['mbs']
        mode = gpu_mode

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.gc_outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.gc_outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

                # Unreachable alternative implementation (kept for
                # reference): the same Gv product computed through the
                # cost's pre-activations.
                #
                # nw_cost, nw_preactiv_out = safe_clone(
                #     [model.train_cost, model.preactiv_out], replace)
                # nw_gvs = TT.Lop(
                #     nw_preactiv_out, model.params,
                #     TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                #            model.params, args))
                #
                # Gvs = [ogv + ngv
                #        for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                # return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            #_final_Gvs = [x + self.damping * y
            #        for x,y in zip(final_Gvs, args)]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))

        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            #Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs2, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients2 = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        if options['rsch'] == 1:
            self.rs = self.rs1
        else:
            self.rs = self.rs2

        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1].sum() / const(n_steps)
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, shared_data)]
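        # First-order prediction of the change in cost for a step of size
        # lr along the Riemannian direction: -lr * <g, r>.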
        denom = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
        self.approx_change = theano.function([lr],
                                             denom,
                                             name='approx_change',
                                             mode=gpu_mode,
                                             allow_input_downcast=True,
                                             profile=options['profile'])

        print 'compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       allow_input_downcast=True,
                                       mode=gpu_mode,
                                       profile=options['profile'])

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        self.grad_lr_fn = theano.function([ebdx, lr],
                                          fgrad,
                                          givens=grad_inps,
                                          name='ls_grad_fn',
                                          on_unused_input='warn',
                                          mode=gpu_mode,
                                          allow_input_downcast=True,
                                          profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=mode,
                                             profile=options['profile'])

        self.options = options
        self.old_cost = numpy.inf
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
Example #51
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the krylov
                    subspace
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lbfgsIters` -> int
                    Number of L-BFGS iterations
                `krylovDim` -> int
                    Dimension of the Krylov subspace
                `device` -> str
                    Either 'gpu' or 'cpu'
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data
        xdata = theano.shared(data['train_x'], name='xdata')
        ydata = theano.shared(data['train_y'], name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]
        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        rng = self.rng  # already seeded above
        self.options = options
        self.channel = channel
        self.model = model
        n_dimensions = options['krylovDim']
        self.n_dimensions = n_dimensions
        if options['device'] == 'gpu':
            cfn_subspaces = \
                [theano.shared(numpy.zeros(
                                (n_dimensions,) + shp, dtype='float32'),
                               name='cfn{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in enumerate(zip(model.params_shape,
                                                      model.params))]
            old_deltas = \
                [theano.shared(numpy.zeros(shp, dtype='float32'),
                               name='delta{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in
                            enumerate(zip(model.params_shape, model.params))]
            self.gs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        else:
            cfn_subspaces = \
                [TT._shared(numpy.zeros(
                                (n_dimensions,) + shp, dtype='float32'),
                               name='cfn{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in enumerate(zip(model.params_shape,
                                                      model.params))]
            old_deltas = \
                [TT._shared(numpy.zeros(shp, dtype='float32'),
                               name='delta{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in
                            enumerate(zip(model.params_shape, model.params))]
            self.gs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        self.cfn_subspaces = cfn_subspaces
        self.old_deltas = old_deltas

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        print 'Constructing grad function'
        loc_inputs = [x.type(name='locx') for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        updates.update(dict(zip(self.gs, nw_gs)))
        gdx = TT.iscalar('gdx')
        grad_inps = zip(loc_inputs, [
            x[gdx * options['gbs']:(gdx + 1) * options['gbs']]
            for x in shared_data
        ])
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        if options['device'] == 'gpu':
            mode = gpu_mode

            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               args))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode

            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               cgv))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}

        rvals, updates = krylov_subspace(compute_Gv,
                                         self.gs,
                                         old_deltas,
                                         n_dimensions,
                                         model.params_shape,
                                         profile=options['profile'],
                                         device=options['device'])
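        # `rvals` holds, for each parameter, an (n_dimensions,) + shape
        # array: the Krylov subspace basis built from the current
        # gradient and the previous update direction (our reading of
        # `krylov_subspace`).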

        gdx = TT.iscalar('gdx')
        grad_inps = zip(loc_inputs, [
            x[gdx * options['mbs']:(gdx + 1) * options['mbs']]
            for x in shared_data
        ])
        updates.update(dict(zip(cfn_subspaces, rvals)))
        self.update_krylov_subspace = theano.function(
            [gdx], [],
            updates=updates,
            givens=dict(grad_inps),
            profile=options['profile'],
            on_unused_input='warn',
            name='update_krylov_subspace',
            mode=mode)

        alphas = TT.vector('alphas')
        deltas = []
        nw_params = []
        if options['device'] == 'gpu':
            params = model.params
        else:
            params = model.cpu_params

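        # Each update direction `delta` is a linear combination of the
        # subspace vectors weighted by the coefficients `alphas`; the
        # dimshuffle broadcasts alphas across the parameter axes.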
        for param, subspace in zip(params, cfn_subspaces):
            alpha_reshuffle = [0] + ['x'] * param.ndim
            delta = (alphas.dimshuffle(*alpha_reshuffle) * \
                        subspace).sum(axis=0)
            nw_param = param + delta
            nw_params.append(nw_param)
            deltas.append(delta)

        print 'constructing evaluation function'
        ebdx = TT.iscalar('ebdx')

        updates_dict = dict(zip(model.params + old_deltas, nw_params + deltas))
        if options['device'] != 'gpu':
            updates_dict.update(dict(zip(model.cpu_params, nw_params)))

        self.update_params = theano.function([alphas],
                                             updates=updates_dict,
                                             name='update_params',
                                             allow_input_downcast=True,
                                             mode=mode,
                                             profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']

        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + nw_params))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_cost_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_cost_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + nw_params))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, alphas)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.zeros((1, n_dimensions), dtype='float32'))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        mode=gpu_mode,
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)

        grad_inps = zip(loc_inputs, [
            x[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]
            for x in shared_data
        ])
        self.lbfgs_fn = theano.function(
            [alphas, ebdx],
            #theano.printing.Print('fcost')(fcost),
            fcost,
            givens=grad_inps,
            allow_input_downcast=True,
            on_unused_input='warn',
            name='lbfgs_fn',
            profile=options['profile'],
            mode=gpu_mode)
        self.lbfgs_grad = theano.function([alphas, ebdx],
                                          fgrad,
                                          givens=grad_inps,
                                          on_unused_input='warn',
                                          allow_input_downcast=True,
                                          name='lbfgs_grad',
                                          profile=options['profile'],
                                          mode=gpu_mode)

        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([],
                                             ferr,
                                             givens=dict(
                                                 zip(loc_inputs, shared_data)),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
Example #52
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
                `device` -> str
                    Either 'gpu' or 'cpu'
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
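        # Note: off-GPU only the first `gbs` training samples are kept in
        # shared storage (the givens later feed whole blocks), while on
        # GPU the entire training set is uploaded and sliced per batch
        # index.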

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store Euclidean gradients
            self.gs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store Riemannian gradients
            self.rs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        else:
            # Store Euclidean gradients
            self.gs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store Riemannian gradients
            self.rs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']

        if options['device'] == 'gpu':
            mode = gpu_mode

            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])
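                        # (No `eps` is added to the softmax factor here,
                        # unlike the earlier examples, so an exactly-zero
                        # softmax output would divide by zero.)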

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                    # Unreachable alternative implementation (kept for
                    # reference): the same Gv product computed through
                    # the cost's pre-activations.
                    #
                    # nw_cost, nw_preactiv_out = safe_clone(
                    #     [model.train_cost, model.preactiv_out], replace)
                    # nw_gvs = TT.Lop(
                    #     nw_preactiv_out, model.params,
                    #     TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                    #            model.params, args))
                    #
                    # Gvs = [ogv + ngv
                    #        for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    # return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode

            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(cgv, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              rtol=options['mrtol'],
                              shift=-options['mreg'],
                              maxit=options['miters'],
                              mode=mode,
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        # Infinity norm (largest absolute entry) of the Riemannian direction
        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs[:1], shared_data[:1])]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])
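        # The outputs above are MinRes diagnostics (termination flag,
        # iteration count, residual norms, condition-number estimates) plus
        # the two gradient norms; the Riemannian direction itself is written
        # into self.rs through `updates`.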

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'Constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       mode=gpu_mode,
                                       profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             mode=mode,
                                             profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
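Both Gv_step closures above assemble the Gauss-Newton metric-vector product
from Theano's Rop (Jacobian-times-vector) and Lop (vector-times-Jacobian). A
minimal numpy sketch of the same product for a toy model with an explicit
Jacobian (all names below are illustrative, not taken from the code above):

import numpy as np

rng = np.random.RandomState(0)
n_out, n_params = 3, 5
J = rng.randn(n_out, n_params)   # Jacobian of the outputs w.r.t. parameters
v = rng.randn(n_params)          # vector to multiply the metric with
factor = np.float32(n_out)       # plays the role of const(options['cbs'])

Jv = J.dot(v)                    # what TT.Rop(nw_out, params, v) computes
Gv = J.T.dot(Jv / factor)        # what TT.Lop(nw_out, params, Jv / factor) computes
assert np.allclose(Gv, J.T.dot(J).dot(v) / factor)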
Example #53
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Maximum number of iterations for the MinRes solver
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag indicating whether Theano profiling should be on
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
                `jreg` -> float
                    Value used to seed the Jacobi diagonal accumulator
                    (regularizes the preconditioner)
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load, containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        eps = numpy.float32(1e-24)
        xdata = theano.shared(data['train_x'],
                              name='xdata')
        ydata = theano.shared(data['train_y'],
                              name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # Store Euclidean gradients
        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Riemannian gradients
        self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Jacobi diagonal (used to precondition MinRes)
        self.js = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'

        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            # Estimate the diagonal of the Gauss-Newton metric (the Jacobi
            # preconditioner) with a random sign probe: for r with
            # independent +-1 entries, E[(J^T r)^2] = diag(J^T J), and the
            # sqrt factors below fold in the output-space metric.
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout * (numpy.float32(1) -
                                               tnwout))*factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in model.params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1+n_params:1+2*n_params], model.params)]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
              for shp in model.params_shape]
        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              mode=gpu_mode,
                              name='grad_loop',
                              profile=options['profile'])

        # Average the accumulated gradients over the n_steps chunks; the
        # Jacobi accumulator keeps its running sum (it was seeded with
        # const(options['jreg']) above).
        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params: 1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
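        # The function returns nothing; it writes the averaged gradients and
        # the Jacobi accumulator into self.gs / self.js through `updates`.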
        #theano.printing.pydotprint(self.compute_eucledian_gradients,
        #        'eucledian_grad', scan_graphs=True)

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']
        self.damping = theano.shared(numpy.float32(options['mreg']))
        mode = gpu_mode
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
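                    # `factor` encodes the output-space metric: the softmax
                    # branch divides by the predicted probabilities (the
                    # categorical Fisher), while for sigmoid outputs nw_out
                    # is the pre-activation, so the s(1 - s) term is applied
                    # explicitly in the else branch below.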
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *
                                         tnwout * (1 - tnwout) / factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates


        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))

        # MinRes approximately solves the damped, Jacobi-preconditioned
        # metric system for the natural-gradient direction; the right-hand
        # side is normalized by ||g|| and the solution rescaled below.
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms=self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:
                           (rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'Constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc0, acc1):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_cost2 = safe_clone(model.train_cost,
                                  replace=dict(zip(model.inputs, nw_inps)))
            return [_idx + const(1),
                    acc0 + nw_cost,
                    acc1 + nw_cost2]

        acc0 = const([0])
        acc1 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0, acc1],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1].sum() / const(n_steps)
        cost0 = rvals[2].sum() / const(n_steps)
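        # final_cost is the average cost at the tentative parameters nw_ps;
        # cost0 is the average cost at the current parameters, so their
        # difference is the actual improvement achieved by this step.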
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, shared_data)]

        denom = -lr * sum(TT.sum(g * r) for g, r in zip(self.gs, self.rs))
        rho = (final_cost - cost0) / denom
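        # rho compares the actual cost change with the first-order
        # prediction -lr * <g, r>, a Levenberg-Marquardt style reduction
        # ratio commonly used to adapt the damping term.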
        print 'Compiling evaluation function'
        self.eval_fn = theano.function(
            [ebdx, lr],
            [final_cost, rho],
            givens=dict(grad_inps),
            on_unused_input='warn',
            updates=updates,
            name='eval_fn',
            mode=gpu_mode,
            profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_dict,
            name='update_params',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])
        self.options = options
        self.old_cost = numpy.inf
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
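A hypothetical driver loop for the class above (the original training script
is not shown here; the class name `Trainer` and the loop itself are
assumptions, while the compiled functions, `lr`, and the batch permutations
all come from `__init__` above):

trainer = Trainer(options, None, data, model)   # channel=None for a local run
for step in xrange(100):                        # number of steps is arbitrary
    gbdx = trainer.permg[step % trainer.grad_batches]
    rbdx = trainer.permr[step % trainer.metric_batches]
    ebdx = trainer.perme[step % trainer.eval_batches]
    trainer.compute_eucledian_gradients(gbdx)           # fills self.gs, self.js
    stats = trainer.compute_riemannian_gradients(rbdx)  # fills self.rs
    cost, rho = trainer.eval_fn(ebdx, trainer.lr)
    trainer.update_params(trainer.lr)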