Example #1
    def e_step(self, n_steps=100, eps=1e-5):
        """
        Performs `n_steps` of mean-field inference (used to compute positive phase statistics).
        :param psamples: list of tensor-like objects, representing the state of each layer of
        the DBM (during the inference process). psamples[0] points to self.input.
        :param n_steps: number of iterations of mean-field to perform.
        """
        new_psamples = [T.unbroadcast(T.shape_padleft(psample), 0)
                        for psample in self.psamples]

        # now alternate mean-field inference for even/odd layers
        def mf_iteration(*psamples):
            new_psamples = [p for p in psamples]
            for i in xrange(1, self.depth, 2):
                new_psamples[i] = self.hi_given(psamples, i)
            for i in xrange(2, self.depth, 2):
                new_psamples[i] = self.hi_given(psamples, i)

            score = 0.
            for i in xrange(1, self.depth):
                score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])), score)

            return new_psamples, theano.scan_module.until(score < eps)

        new_psamples, updates = scan(mf_iteration,
                                     states=new_psamples,
                                     n_steps=n_steps)

        return [x[0] for x in new_psamples]
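For intuition, here is a minimal NumPy sketch of the same alternating even/odd mean-field schedule, assuming sigmoid units and a hypothetical list of weight matrices W (W[i] connecting layer i-1 to layer i; biases omitted). It is a reference for the fixed-point iteration above, not the original implementation.

# NumPy sketch of e_step: alternate even/odd mean-field updates until
# the largest mean absolute change falls below eps.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def e_step_numpy(psamples, W, n_steps=100, eps=1e-5):
    depth = len(psamples)
    for _ in range(n_steps):
        new = [p.copy() for p in psamples]
        # odd layers first, then even layers, both computed from the old values
        for i in list(range(1, depth, 2)) + list(range(2, depth, 2)):
            inp = psamples[i - 1].dot(W[i])             # bottom-up input
            if i + 1 < depth:
                inp += psamples[i + 1].dot(W[i + 1].T)  # top-down input
            new[i] = sigmoid(inp)
        score = max(np.mean(np.abs(new[i] - psamples[i]))
                    for i in range(1, depth))
        psamples = new
        if score < eps:
            break
    return psamples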
Example #2
    def pos_sampling(self, n_steps=50):
        """
        Performs `n_steps` of alternating Gibbs sampling (used to compute positive phase statistics).
        :param psamples: list of tensor-like objects, representing the state of each layer of
        the DBM (during the inference process). psamples[0] points to self.input.
        :param n_steps: number of sampling iterations to perform.
        """
        new_psamples = [
            T.unbroadcast(T.shape_padleft(psample), 0)
            for psample in self.psamples
        ]

        # now alternate sampling of even/odd layers
        def sample_iteration(*psamples):
            new_psamples = [p for p in psamples]
            for i in xrange(1, self.depth, 2):
                new_psamples[i] = self.sample_hi_given(psamples, i)
            for i in xrange(2, self.depth, 2):
                new_psamples[i] = self.sample_hi_given(psamples, i)
            return new_psamples

        new_psamples, updates = scan(sample_iteration,
                                     states=new_psamples,
                                     n_steps=n_steps)

        return [x[0] for x in new_psamples]
Example #3
def _e_step(psamples, W_list, b_list, n_steps=100, eps=1e-5):
    """
    Performs 'n_steps' of mean-field inference (used to compute positive phase
    statistics)

    Parameters
    ----------
    psamples : array-like object of theano shared variables
        State of each layer of the DBM (during the inference process).
        psamples[0] points to the input
    n_steps :  integer
        Number of iterations of mean-field to perform
    """
    depth = len(psamples)

    new_psamples = [T.unbroadcast(T.shape_padleft(psample), 0)
                    for psample in psamples]

    # now alternate mean-field inference for even/odd layers
    def mf_iteration(*psamples):
        new_psamples = [p for p in psamples]
        for i in xrange(1, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)
        for i in xrange(2, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)

        score = 0.0
        for i in xrange(1, depth):
            score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])), score)

        return new_psamples, theano.scan_module.until(score < eps)

    new_psamples, updates = scan(mf_iteration, states=new_psamples, n_steps=n_steps)

    return [x[0] for x in new_psamples]
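The helper `hi_given` is not shown on this page; for a standard sigmoid DBM it would plausibly combine bottom-up and top-down contributions as in the sketch below. The body, and the W_list/b_list indexing in particular, is an assumption kept only to make the mean-field update concrete.

# Hypothetical `hi_given`: mean-field update of layer i given its
# neighbours, assuming W_list[i] connects layer i-1 to layer i.
import theano.tensor as T

def hi_given(psamples, i, W_list, b_list):
    pre = T.dot(psamples[i - 1], W_list[i]) + b_list[i]
    if i + 1 < len(psamples):
        pre += T.dot(psamples[i + 1], W_list[i + 1].T)
    return T.nnet.sigmoid(pre)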
Example #4
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(args, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
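The Lop(Rop(...) / factor) pattern in Gv_step computes a Gauss-Newton (Fisher) vector product in a single pass: Rop yields the Jacobian-vector product J v of the network output, and after the elementwise division by `factor` (cbs times the output nonlinearity's derivative), Lop applies J^T. A NumPy check for the sigmoid/cross-entropy case on a hypothetical one-layer model (the names N, D, w, v are illustrative only):

# Check that Lop(out, w, Rop(out, w, v) / (N * s)) equals the
# Gauss-Newton product X^T diag(s) X v / N for a sigmoid output.
import numpy as np

rng = np.random.RandomState(0)
N, D = 64, 5                         # minibatch size (cbs) and input dim
X = rng.randn(N, D)
w = rng.randn(D)
v = rng.randn(D)                     # vector to multiply by G

p = 1.0 / (1.0 + np.exp(-X.dot(w)))  # sigmoid output
s = p * (1.0 - p)                    # its derivative w.r.t. the preactivation
J = s[:, None] * X                   # Jacobian of the output w.r.t. w

Gv = J.T.dot(J.dot(v) / (N * s))     # what the Lop/Rop graph computes
Gv_ref = X.T.dot(s * X.dot(v)) / N   # the Gauss-Newton product, directly
assert np.allclose(Gv, Gv_ref)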
Example #5
    def e_step(self, n_steps=100, eps=1e-5):
        """
        Performs `n_steps` of mean-field inference (used to compute positive phase statistics).
        :param psamples: list of tensor-like objects, representing the state of each layer of
        the DBM (during the inference process). psamples[0] points to self.input.
        :param n_steps: number of iterations of mean-field to perform.
        """
        new_psamples = [
            T.unbroadcast(T.shape_padleft(psample), 0)
            for psample in self.psamples
        ]

        # now alternate mean-field inference for even/odd layers
        def mf_iteration(*psamples):
            new_psamples = [p for p in psamples]
            for i in xrange(1, self.depth, 2):
                new_psamples[i] = self.hi_given(psamples, i)
            for i in xrange(2, self.depth, 2):
                new_psamples[i] = self.hi_given(psamples, i)

            score = 0.
            for i in xrange(1, self.depth):
                score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])),
                                  score)

            return new_psamples, theano.scan_module.until(score < eps)

        new_psamples, updates = scan(mf_iteration,
                                     states=new_psamples,
                                     n_steps=n_steps)

        return [x[0] for x in new_psamples]
Example #6
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone([model.train_cost,
                                                           model.preactiv_out],
                                                          replace)
                    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                  TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                         model.params, args))

                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
Example #7
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               args))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
Example #8
    def pos_phase(self, v, init_state, n_steps=1, eps=1e-3):
        """
        Mixed mean-field + sampling inference in positive phase.
        :param v: input being conditioned on
        :param init_state: dictionary of initial values for the `g` and `h` layers
        :param n_steps: maximum number of mean-field updates to perform.
        """
        def pos_mf_iteration(g1, h1, v, pos_counter):
            h2 = self.h_hat(g1, v)
            s2_1 = self.s1_hat(g1, v)
            s2_0 = self.s0_hat(g1, v)
            g2 = self.g_hat(h2, s2_1, s2_0)
            # stopping criterion
            dl_dghat = T.max(abs(self.dlbound_dg(g2, h2, s2_1, s2_0, v)))
            dl_dhhat = T.max(abs(self.dlbound_dh(g2, h2, s2_1, s2_0, v)))
            stop = T.maximum(dl_dghat, dl_dhhat)
            return [g2, h2, s2_1, s2_0, v, pos_counter + 1], theano.scan_module.until(stop < eps)

        states = [T.unbroadcast(T.shape_padleft(init_state['g']), 0),
                  T.unbroadcast(T.shape_padleft(init_state['h']), 0),
                  {'steps': 1},
                  {'steps': 1},
                  T.unbroadcast(T.shape_padleft(v), 0),
                  T.unbroadcast(T.shape_padleft(0.), 0)]

        rvals, updates = scan(pos_mf_iteration,
                              states=states,
                              n_steps=n_steps)

        return [rval[0] for rval in rvals]
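pos_phase relies on theano.scan_module.until to stop the fixed-point iteration as soon as the bound's gradients fall below eps; n_steps is only an upper cap. A minimal sketch of the same early-stopping pattern, written against the standard theano.scan interface:

# Iterate x -> x/2, stopping early once x < 1e-3 (after ~10 steps,
# well before the 100-step cap).
import theano
import theano.tensor as T

x0 = T.scalar('x0')

def step(x):
    new_x = x / 2
    return new_x, theano.scan_module.until(new_x < 1e-3)

vals, _ = theano.scan(step, outputs_info=x0, n_steps=100)
f = theano.function([x0], vals[-1])
print(f(1.0))  # ~0.0009765625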
Example #9
def scalar_armijo_search(phi,
                         phi0,
                         derphi0,
                         c1=constant(1e-4),
                         n_iters=10,
                         profile=0):
    """
    .. todo::

        WRITEME
    """
    alpha0 = one
    phi_a0 = phi(alpha0)
    alpha1 = -(derphi0) * alpha0 ** 2 / 2.0 /\
            (phi_a0 - phi0 - derphi0 * alpha0)
    phi_a1 = phi(alpha1)

    csol1 = phi_a0 <= phi0 + c1 * derphi0
    csol2 = phi_a1 <= phi0 + c1 * alpha1 * derphi0

    def armijo(alpha0, alpha1, phi_a0, phi_a1):
        factor = alpha0**2 * alpha1**2 * (alpha1 - alpha0)
        a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \
            alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)
        a = a / factor
        b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \
            alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)
        b = b / factor

        alpha2 = (-b + TT.sqrt(abs(b**2 - 3 * a * derphi0))) / (3.0 * a)
        phi_a2 = phi(alpha2)

        end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0
        end_condition = TT.bitwise_or(TT.isnan(alpha2), end_condition)
        end_condition = TT.bitwise_or(TT.isinf(alpha2), end_condition)
        alpha2 = TT.switch(
            TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.),
                          one - alpha2 / alpha1 < 0.96), alpha1 / constant(2.),
            alpha2)
        return [alpha1, alpha2, phi_a1, phi_a2], \
                theano.scan_module.until(end_condition)

    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
    # print 'armijo'
    rvals, _ = scan(armijo,
                    states=states,
                    n_steps=n_iters,
                    name='armijo',
                    mode=theano.Mode(linker='cvm'),
                    profile=profile)

    sol_scan = rvals[1][0]
    a_opt = ifelse(csol1, one, ifelse(csol2, alpha1, sol_scan))
    score = ifelse(csol1, phi_a0, ifelse(csol2, phi_a1, rvals[2][0]))
    return a_opt, score
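The graph above mirrors the classic backtracking scheme: try alpha = 1, fall back to a quadratic-interpolation step, then iterate cubic interpolation (as in scipy's scalar_search_armijo). A plain NumPy reference, useful for checking the symbolic version; the quadratic test function at the end is illustrative only.

# NumPy reference for the Armijo backtracking line search above.
import numpy as np

def armijo_numpy(phi, phi0, derphi0, c1=1e-4, n_iters=10):
    alpha0 = 1.0
    phi_a0 = phi(alpha0)
    if phi_a0 <= phi0 + c1 * alpha0 * derphi0:
        return alpha0, phi_a0
    # quadratic interpolation through phi(0), phi'(0), phi(alpha0)
    alpha1 = -derphi0 * alpha0 ** 2 / (2.0 * (phi_a0 - phi0 - derphi0 * alpha0))
    phi_a1 = phi(alpha1)
    for _ in range(n_iters):
        if phi_a1 <= phi0 + c1 * alpha1 * derphi0:
            return alpha1, phi_a1
        # cubic interpolation through the last two trial points
        factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0)
        a = (alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1)
             - alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)) / factor
        b = (-alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1)
             + alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)) / factor
        alpha2 = (-b + np.sqrt(abs(b ** 2 - 3 * a * derphi0))) / (3.0 * a)
        alpha0, alpha1 = alpha1, alpha2
        phi_a0, phi_a1 = phi_a1, phi(alpha2)
    return alpha1, phi_a1

# e.g. phi(a) = (a - 0.3)**2 with phi0 = 0.09, derphi0 = -0.6:
a_opt, score = armijo_numpy(lambda a: (a - 0.3) ** 2, 0.09, -0.6)
assert abs(a_opt - 0.3) < 1e-8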
Example #10
def scalar_armijo_search(phi, phi0, derphi0, c1=constant(1e-4),
                         n_iters=10, profile=0):
    alpha0 = one
    phi_a0 = phi(alpha0)
    alpha1 = -(derphi0) * alpha0 ** 2 / 2.0 /\
            (phi_a0 - phi0 - derphi0 * alpha0)
    phi_a1 = phi(alpha1)

    csol1 = phi_a0 <= phi0 + c1 * derphi0
    csol2 = phi_a1 <= phi0 + c1 * alpha1 * derphi0

    def armijo(alpha0, alpha1, phi_a0, phi_a1):
        factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0)
        a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \
            alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)
        a = a / factor
        b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \
            alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)
        b = b / factor

        alpha2 = (-b + TT.sqrt(abs(b ** 2 - 3 * a * derphi0))) / (3.0 * a)
        phi_a2 = phi(alpha2)

        end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0
        end_condition = TT.bitwise_or(
            TT.isnan(alpha2), end_condition)
        end_condition = TT.bitwise_or(
            TT.isinf(alpha2), end_condition)
        alpha2 = TT.switch(
            TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.),
                  one - alpha2 / alpha1 < 0.96),
            alpha1 / constant(2.),
            alpha2)
        return [alpha1, alpha2, phi_a1, phi_a2], \
                theano.scan_module.until(end_condition)

    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
    # print 'armijo'
    rvals, _ = scan(
                armijo,
                states=states,
                n_steps=n_iters,
                name='armijo',
                mode=theano.Mode(linker='cvm'),
                profile=profile)

    sol_scan = rvals[1][0]
    a_opt = ifelse(csol1, one,
                ifelse(csol2, alpha1,
                    sol_scan))
    score = ifelse(csol1, phi_a0,
                   ifelse(csol2, phi_a1,
                          rvals[2][0]))
    return a_opt, score
Example #11
def test_005():
    sq = theano.tensor.fvector('sq')
    nst = theano.tensor.iscalar('nst')
    out, _ = scan.scan(lambda s: s+numpy.float32(1),
                       sequences=sq,
                       states=[None],
                       n_steps=nst)
    fn = theano.function([sq, nst], out)
    val_sq = numpy.float32([1, 2, 3, 4, 5])
    assert numpy.all(fn(val_sq, 5) == val_sq + 1)
Example #12
def test_001():
    x0 = theano.tensor.fvector('x0')
    state = theano.tensor.unbroadcast(
        theano.tensor.shape_padleft(x0), 0)
    out, _ = scan.scan(lambda x: x+numpy.float32(1),
                           states=state,
                           n_steps=5)
    fn = theano.function([x0], out[0])
    val_x0 = numpy.float32([1, 2, 3])
    assert numpy.all(fn(val_x0) == val_x0 + 5)
Example #13
            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               cgv))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}
Example #14
def linear_cg_fletcher_reeves(compute_Ax,
                              bs,
                              xinit=None,
                              rtol=1e-6,
                              maxiter=1000,
                              damp=0,
                              floatX=None,
                              profile=0):
    """
    assume all are lists all the time
    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rz_old, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        _Aps = compute_Ax(*ps)
        Aps = [x + damp * y for x, y in zip(_Aps, ps)]
        alpha = rz_old / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        rz_new = sum((r * r).sum() for r in rs)
        ps = [r + rz_new / rz_old * p for r, p in zip(rs, ps)]
        return [rz_new]+ps+rs+xs, \
                theano.scan_module.until(abs(rz_new) < rtol)

    if xinit is None:
        r0s = bs
        _x0s = [
            tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
            for x in bs
        ]
    else:
        init_Ax = compute_Ax(*xinit)
        r0s = [bs[i] - init_Ax[i] for i in xrange(len(bs))]
        _x0s = [tensor.unbroadcast(tensor.shape_padleft(xi), 0) for xi in xinit]

    _p0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    rz_old = sum((r * r).sum() for r in r0s)
    _rz_old = tensor.unbroadcast(tensor.shape_padleft(rz_old), 0)
    outs, updates = scan(loop,
                         states=[_rz_old] + _p0s + _r0s + _x0s,
                         n_steps=maxiter,
                         mode=theano.Mode(linker='cvm'),
                         name='linear_conjugate_gradient',
                         profile=profile)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
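Stripped of the list-of-tensors bookkeeping, the loop above is plain conjugate gradient on (A + damp*I) x = b. A NumPy reference for a single right-hand side (the 2x2 system at the end is illustrative only):

# NumPy reference for the conjugate-gradient recurrence used above.
import numpy as np

def linear_cg_numpy(A, b, damp=0.0, rtol=1e-6, maxiter=1000):
    x = np.zeros_like(b)
    r = b.copy()                  # residual of the initial guess x = 0
    p = r.copy()                  # first search direction
    rz_old = r.dot(r)
    for _ in range(maxiter):
        Ap = A.dot(p) + damp * p
        alpha = rz_old / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rz_new = r.dot(r)
        if abs(rz_new) < rtol:
            break
        p = r + (rz_new / rz_old) * p
        rz_old = rz_new
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
assert np.allclose(A.dot(linear_cg_numpy(A, b)), b, atol=1e-3)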
Example #15
def test_002():
    x0 = theano.tensor.fvector('x0')
    state = theano.tensor.alloc(
        theano.tensor.constant(numpy.float32(0)),
        6,
        x0.shape[0])
    state = theano.tensor.set_subtensor(state[0], x0)

    out, _ = scan.scan(lambda x: x+numpy.float32(1),
                           states=state,
                           n_steps=5)
    fn = theano.function([x0], out)
    val_x0 = numpy.float32([1, 2, 3])
    assert numpy.all(fn(val_x0)[-1] == val_x0 + 5)
    assert numpy.all(fn(val_x0)[0] == val_x0)
Example #16
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates
Example #17
def linear_cg_precond(compute_Gv,
                      bs,
                      Msz,
                      rtol=1e-16,
                      maxit=100000,
                      floatX=None):
    """
    assume all are lists all the time
    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rsold, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        Aps = compute_Gv(*ps)
        alpha = rsold / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        zs = [r / z for r, z in zip(rs, Msz)]
        rsnew = sum((r * z).sum() for r, z in zip(rs, zs))
        ps = [z + rsnew / rsold * p for z, p in zip(zs, ps)]
        return [rsnew] + ps + rs + xs, \
                theano.scan_module.until(abs(rsnew) < rtol)

    r0s = bs
    _p0s = [
        tensor.unbroadcast(tensor.shape_padleft(x / z), 0)
        for x, z in zip(r0s, Msz)
    ]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _x0s = [
        tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
        for x in bs
    ]
    rsold = sum((r * r / z).sum() for r, z in zip(r0s, Msz))
    _rsold = tensor.unbroadcast(tensor.shape_padleft(rsold), 0)
    outs, updates = scan(loop,
                         states=[_rsold] + _p0s + _r0s + _x0s,
                         n_steps=maxit,
                         mode=theano.Mode(linker='c|py'),
                         name='linear_conjugate_gradient',
                         profile=0)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
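Here `Msz` acts as an elementwise (Jacobi) preconditioner: residuals are divided by it before building search directions, so choosing Msz close to diag(G) improves conditioning. A NumPy sketch of the same recurrence, with `M` playing the role of Msz (the 2x2 system is illustrative only):

# NumPy reference for the diagonally preconditioned CG recurrence above.
import numpy as np

def linear_cg_precond_numpy(A, b, M, rtol=1e-16, maxit=1000):
    x = np.zeros_like(b)
    r = b.copy()
    z = r / M                     # preconditioned residual
    p = z.copy()
    rs_old = r.dot(z)
    for _ in range(maxit):
        Ap = A.dot(p)
        alpha = rs_old / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        z = r / M
        rs_new = r.dot(z)
        if abs(rs_new) < rtol:
            break
        p = z + (rs_new / rs_old) * p
        rs_old = rs_new
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = linear_cg_precond_numpy(A, b, M=np.diag(A))
assert np.allclose(A.dot(x), b, atol=1e-6)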
Example #18
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name='cgv%d' % idx)
                       for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone([model.train_cost,
                                                           model.preactiv_out],
                                                          replace)
                    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                  TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                         model.params, cgv))

                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps)
                             for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}
Example #19
def linear_cg_fletcher_reeves(compute_Ax, bs, xinit=None,
                              rtol=1e-6, maxiter=1000, damp=0,
                              floatX=None, profile=0):
    """
    Linear conjugate gradient: solves (A + damp*I) x = b for each b in `bs`.
    All arguments (and results) are lists of tensors.
    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rz_old, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        _Aps = compute_Ax(*ps)
        Aps = [x + damp * y for x, y in zip(_Aps, ps)]
        alpha = rz_old / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        rz_new = sum((r * r).sum() for r in rs)
        ps = [r + rz_new / rz_old * p for r, p in zip(rs, ps)]
        return [rz_new] + ps + rs + xs, \
                theano.scan_module.until(abs(rz_new) < rtol)

    if xinit is None:
        r0s = bs
        _x0s = [tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
                for x in bs]
    else:
        init_Ax = compute_Ax(*xinit)
        r0s = [bs[i] - init_Ax[i] for i in xrange(len(bs))]
        _x0s = [tensor.unbroadcast(tensor.shape_padleft(xi), 0) for xi in xinit]

    _p0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    rz_old = sum((r * r).sum() for r in r0s)
    _rz_old = tensor.unbroadcast(tensor.shape_padleft(rz_old), 0)
    outs, updates = scan(loop,
                         states=[_rz_old] + _p0s + _r0s + _x0s,
                         n_steps=maxiter,
                         mode=theano.Mode(linker='cvm'),
                         name='linear_conjugate_gradient',
                         profile=profile)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
Example #20
def test_003():
    x0 = theano.tensor.fvector('x0')
    sq = theano.tensor.fvector('sq')
    state = theano.tensor.alloc(
        theano.tensor.constant(numpy.float32(0)),
        6,
        x0.shape[0])
    state = theano.tensor.set_subtensor(state[0], x0)

    out, _ = scan.scan(lambda s, x: x+s,
                           sequences=sq,
                           states=state,
                           n_steps=5)
    fn = theano.function([sq, x0], out)
    val_x0 = numpy.float32([1, 2, 3])
    val_sq = numpy.float32([1, 2, 3, 4, 5])
    assert numpy.all(fn(val_sq, val_x0)[-1] == val_x0 + 15)
    assert numpy.all(fn(val_sq, val_x0)[0] == val_x0)
Example #21
def _e_step(psamples, W_list, b_list, n_steps=100, eps=1e-5):
    """
    Performs 'n_steps' of mean-field inference (used to compute positive phase
    statistics)

    Parameters
    ----------
    psamples : array-like object of theano shared variables
        State of each layer of the DBM (during the inference process).
        psamples[0] points to the input
    n_steps :  integer
        Number of iterations of mean-field to perform
    """
    depth = len(psamples)

    new_psamples = [T.unbroadcast(T.shape_padleft(psample), 0)
                    for psample in psamples]

    # now alternate mean-field inference for even/odd layers
    def mf_iteration(*psamples):
        new_psamples = [p for p in psamples]
        for i in xrange(1, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)
        for i in xrange(2, depth, 2):
            new_psamples[i] = hi_given(psamples, i, W_list, b_list)

        score = 0.
        for i in xrange(1, depth):
            score = T.maximum(T.mean(abs(new_psamples[i] - psamples[i])),
                              score)

        return new_psamples, theano.scan_module.until(score < eps)

    new_psamples, updates = scan(
        mf_iteration,
        states=new_psamples,
        n_steps=n_steps
    )

    return [x[0] for x in new_psamples]
Example #22
    def pos_sampling(self, n_steps=50):
        """
        Performs `n_steps` of alternating Gibbs sampling (used to compute positive phase statistics).
        :param psamples: list of tensor-like objects, representing the state of each layer of
        the DBM (during the inference process). psamples[0] points to self.input.
        :param n_steps: number of sampling iterations to perform.
        """
        new_psamples = [T.unbroadcast(T.shape_padleft(psample), 0)
                        for psample in self.psamples]

        # now alternate sampling of even/odd layers
        def sample_iteration(*psamples):
            new_psamples = [p for p in psamples]
            for i in xrange(1, self.depth, 2):
                new_psamples[i] = self.sample_hi_given(psamples, i)
            for i in xrange(2, self.depth, 2):
                new_psamples[i] = self.sample_hi_given(psamples, i)
            return new_psamples

        new_psamples, updates = scan(sample_iteration,
                                     states=new_psamples,
                                     n_steps=n_steps)

        return [x[0] for x in new_psamples]
Example #23
def linear_cg_precond(compute_Gv, bs, Msz, rtol=1e-16, maxit=100000, floatX=None):
    """
    Preconditioned linear conjugate gradient: solves G x = b for each b in
    `bs`, with `Msz` an elementwise (diagonal) preconditioner.
    All arguments are lists of tensors.
    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(bs)

    def loop(rsold, *args):
        ps = args[:n_params]
        rs = args[n_params:2 * n_params]
        xs = args[2 * n_params:]
        Aps = compute_Gv(*ps)
        alpha = rsold / sum((x * y).sum() for x, y in zip(Aps, ps))
        xs = [x + alpha * p for x, p in zip(xs, ps)]
        rs = [r - alpha * Ap for r, Ap in zip(rs, Aps)]
        zs = [r / z for r, z in zip(rs, Msz)]
        rsnew = sum((r * z).sum() for r, z in zip(rs, zs))
        ps = [z + rsnew / rsold * p for z, p in zip(zs, ps)]
        return [rsnew] + ps + rs + xs, \
                theano.scan_module.until(abs(rsnew) < rtol)

    r0s = bs
    _p0s = [tensor.unbroadcast(tensor.shape_padleft(x / z), 0)
            for x, z in zip(r0s, Msz)]
    _r0s = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in r0s]
    _x0s = [tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(x)), 0)
            for x in bs]
    rsold = sum((r * r / z).sum() for r, z in zip(r0s, Msz))
    _rsold = tensor.unbroadcast(tensor.shape_padleft(rsold), 0)
    outs, updates = scan(loop,
                         states=[_rsold] + _p0s + _r0s + _x0s,
                         n_steps=maxit,
                         mode=theano.Mode(linker='c|py'),
                         name='linear_conjugate_gradient',
                         profile=0)
    fxs = outs[1 + 2 * n_params:]
    return [x[0] for x in fxs]
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000 
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1

    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']
    rng = numpy.random.RandomState(state['seed'])
    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']

    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(
        rng,
        n_in=state['nins'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_%d'%si))
        if state['rec_gating']:
            gater_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias = False,
                name='gater_words_%d'%si))
        if state['rec_reseting']:
            reseter_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias = False,
                name='reseter_words_%d'%si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d'%si))
            if state['rec_gating']:
                rec_proj_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias = False, 
                    name='rec_proj_gater_%d'%si))
            if state['rec_reseting']:
                rec_proj_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias = False, 
                    name='rec_proj_reseter_%d'%si))

        add_rec_step.append(eval(state['rec_layer'])(
                rng,
                n_hids=state['dim'],
                activation = state['activ'],
                bias_scale = state['bias'],
                scale=state['rec_weight_scale'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise_rec'],
                dropout=state['dropout_rec'],
                gating=state['rec_gating'],
                gater_activation=state['rec_gater'],
                reseting=state['rec_reseting'],
                reseter_activation=state['rec_reseter'],
                name='add_h_%d'%si))

    def _add_op(words_embeddings, 
                words_mask=None,
                prev_val=None,
                si = 0, 
                state_below = None,
                gater_below = None,
                reseter_below = None,
                one_step=False, 
                bs=1, 
                init_state=None, 
                use_noise=True):
        seqlen = words_embeddings.out.shape[0]//bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si-1](state_below, one_step=one_step, 
                    use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if reseter: reseter += projg
                else: reseter = projg
            
        if not one_step:
            rval= add_rec_step[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                gater_below = gater,
                reseter_below = reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise = use_noise)
        else:
            rval= add_rec_step[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                gater_below = gater,
                reseter_below = reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise = use_noise)
        return rval
    add_op = Operator(_add_op)

    # Target Sentence
    emb_t = MultiLayer(
        rng,
        n_in=state['nouts'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_t_%d'%si))
        if state['rec_gating']:
            gater_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='gater_words_t_%d'%si))
        if state['rec_reseting']:
            reseter_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='reseter_words_t_%d'%si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='proj_everything_t_%d'%si,
            learn_bias = False))
        if state['rec_gating']:
            gater_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='gater_everything_t_%d'%si,
                learn_bias = False))
        if state['rec_reseting']:
            reseter_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='reseter_everything_t_%d'%si,
                learn_bias = False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d'%si))
            if state['rec_gating']:
                rec_proj_t_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_gater_%d'%si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_reseter_%d'%si))

        add_rec_step_t.append(eval(state['rec_layer'])(
                rng,
                n_hids=state['dim'],
                activation = state['activ'],
                bias_scale = state['bias'],
                scale=state['rec_weight_scale'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise_rec'],
                dropout=state['dropout_rec'],
                gating=state['rec_gating'],
                gater_activation=state['rec_gater'],
                reseting=state['rec_reseting'],
                reseter_activation=state['rec_reseter'],
                name='add_h_t_%d'%si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim'] * state['maxout_part']],
                activation=['lambda x: x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='encoder_proj_%d'%si,
                learn_bias = (si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']), 
                indim = indim, pieces = pieces, rng=rng)

    def _add_t_op(words_embeddings, everything = None, words_mask=None,
                prev_val=None,one_step=False, bs=1, 
                init_state=None, use_noise=True,
                gater_below = None,
                reseter_below = None,
                si = 0, state_below = None):
        seqlen = words_embeddings.out.shape[0]//bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si-1](state_below, 
                    one_step=one_step, use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si-1](state_below, one_step=one_step, 
                        use_noise = use_noise)
                if reseter: reseter += projg
                else: reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise)
                if gater: gater += everyg
                else: gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise)
                if reseter: reseter += everyg
                else: reseter = everyg

        if not one_step:
            rval = add_rec_step_t[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                one_step=one_step,
                init_state=init_state,
                gater_below = gater,
                reseter_below = reseter,
                use_noise = use_noise)
        else:
            rval = add_rec_step_t[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                one_step=one_step,
                gater_below = gater,
                reseter_below = reseter,
                use_noise = use_noise)
        return rval
    add_t_op = Operator(_add_t_op)

    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation = [state['activ']],
                bias_scale = [state['bias']],
                scale=state['weight_scale'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                name='bias_code_%d'%si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(
            rng,
            n_in=word_code_nin,
            n_hids=[outdim],
            activation = 'lambda x:x',
            bias_scale = [state['bias_mlp']/3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            learn_bias = False,
            name='word_code')

    proj_code = MultiLayer(
        rng,
        n_in=state['dim'],
        n_hids=[outdim],
        activation = 'lambda x: x',
        bias_scale = [state['bias_mlp']/3],
        scale=state['weight_scale'],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        learn_bias = False,
        name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[outdim],
            activation = 'lambda x: x',
            bias_scale = [state['bias_mlp']/3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            name='proj_h_%d'%si))

    if state['bigram']:
        proj_word = MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[outdim],
            activation=['lambda x:x'],
            bias_scale = [state['bias_mlp']/3],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            learn_bias = False,
            name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(
            rng,
            indim, 
            state['nouts'],
            state['weight_scale'],
            -1, 
            rank_n_approx = rank_n_approx,
            rank_n_activ = rank_n_activ,
            weight_noise=state['weight_noise'],
            init_fn=state['weight_init_fn'],
            name='out')

    def _pop_op(everything, accum, everything_max = None,
            everything_min = None, word = None, aword = None,
            one_step=False, use_noise=True):

        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1,state['decoder_stack']):
            rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape([rshape[0]/shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise), 
                            one_step=one_step, use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise), 
                            one_step=one_step, use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.out.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise), 
                        one_step=one_step, use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1], outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    pop_op = Operator(_pop_op)

    # 3. Constructing the model
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x)), x_mask, 
        bs=x_mask.shape[1], 
        si=0, gater_below=gater_below, reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1,state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x)), 
                x_mask, bs=x_mask.shape[1], 
                si=si, state_below=encoder_acts[-1], 
                gater_below=gater_below,
                reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True,n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape([1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True,n=y.shape[0])(everything)

    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        init_state = [None for bc in bias_code]

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape([shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [add_t_op(emb_words_t[0](emb_t(y0)), 
            everything,
            y_mask, bs=y_mask.shape[1], 
            gater_below = gater_below,
            reseter_below = reseter_below,
            init_state=init_state[0], 
            si=0)]
    for si in xrange(1,state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(add_t_op(emb_words_t[si](emb_t(y0)), 
                everything,
                y_mask, bs=y_mask.shape[1], 
                state_below = has_said[-1],
                gater_below = gater_below,
                reseter_below = reseter_below,
                init_state=init_state[si], 
                si=si))

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape([shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape([shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword = aword)

    nll = output_layer.train(state_below=model, target=y0,
                 mask=y_mask, reg=None) / TT.cast(y.shape[0]*y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x),use_noise=False), 
            si=0, 
            use_noise=False, 
            gater_below=gater_below,
            reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1,state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x),use_noise=False), 
                si=si, 
                state_below=encoder_acts[-1], use_noise=False,
                gater_below = gater_below, 
                reseter_below = reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(TT.reshape(bias_code[si](everything, 
                use_noise=False), [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x,use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    def sample_fn(*args):
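        # `scan` passes the previous step's states positionally: the last
        # sampled word, its log-probability, one hidden state per decoder
        # layer, and then the non-sequence parameters (the encoder context
        # and, when state['avg_word'] is set, the average word code).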
        aidx = 0; word_tm1 = args[aidx]
        aidx += 1; prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1; has_said_tm1.append(args[aidx])
        aidx += 1; ctx = args[aidx]
        if state['avg_word']:
            aidx += 1; awrd = args[aidx]
        else:
            awrd = None

        val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1,
                aword=awrd, one_step=True, use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(
                state_below=val.out.reshape([1, TT.cast(output_layer.n_in, 'int64')]), 
                temp=temp, target=sample.reshape([1,1]), use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [add_t_op(emb_words_t[0](emb_t(sample)), 
                ctx,
                prev_val=has_said_tm1[0], 
                gater_below=gater_below,
                reseter_below=reseter_below,
                one_step=True, use_noise=True,
                si=0)]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(add_t_op(emb_words_t[si](emb_t(sample)), 
                    ctx,
                    prev_val=has_said_tm1[si], 
                    gater_below=gater_below,
                    reseter_below=reseter_below,
                    one_step=True, use_noise=True,
                    si=si, state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
            states = states,
            params = sampler_params,
            n_steps= n_steps,
            name='sampler_scan'
            )
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function(
        [n_steps, temp, x], [samples, probs.sum()],
        updates=updates,
        profile=False, name='sample_fn')

    model = LM_Model(
        cost_layer = nll,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn = valid_fn,
        sample_fn  = sample_fn,
        clean_before_noise_fn = False,
        noise_fn = noise_fn,
        indx_word=state['indx_word_target'],
        indx_word_src=state['indx_word'],
        character_level = False,
        rng = rng)

    if state['loopIters'] > 0: algo = SGD(model, state, train_data)
    else: algo = None

    def hook_fn():
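        # Training-time hook: print a few (input, target) pairs from
        # `train_data` together with samples drawn from the current model,
        # up to state['sample_max'] sentences, then restore the data offset.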
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:,idx].shape[0]):
                        print model.word_indxs_src[x[:,idx][k]],
                        if model.word_indxs_src[x[:,idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:,idx].shape[0]):
                        print model.word_indxs[y[:,idx][k]],
                        if model.word_indxs[y[:,idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:,idx])
                    if len(numpy.where(masks[:,idx]==0)[0]) > 0:
                        senlen = numpy.where(masks[:,idx]==0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen']+1,  1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen']+1,  1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data, valid_data, None, model, algo, state, channel,
            reset = state['reset'], hooks = hook_fn)
    if state['reload']: main.load()
    if state['loopIters'] > 0: main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word=pkl.load(open(state['word_indx'],'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen+1, dtype='int64')
                    for idx,sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except KeyError:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx,"(%f):"%(-all_probs[pidx]),sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'
            pass
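
A note on the `scan` used throughout these examples: it is a project-specific
wrapper whose `states=` keyword carries one-tap recurrent states (each padded
with a leading broadcastable dimension and read back as `rvals[i][0]`).
Stock `theano.scan` expresses the same accumulation/fixed-point pattern with
`outputs_info`; a minimal, self-contained sketch (the names and the toy
update rule below are illustrative, not taken from the examples):

import numpy
import theano
import theano.tensor as TT

n_steps = TT.iscalar('n_steps')
x0 = TT.vector('x0')

def step(x_tm1):
    # one damped fixed-point iteration, standing in for e.g. a mean-field
    # or gradient-accumulation step
    x_t = 0.5 * (x_tm1 + TT.tanh(x_tm1))
    # stop early once the update has (numerically) converged
    delta = TT.max(abs(x_t - x_tm1))
    return x_t, theano.scan_module.until(delta < 1e-5)

xs, updates = theano.scan(step, outputs_info=x0, n_steps=n_steps)
final = xs[-1]  # plays the role of the wrapper's `rvals[i][0]`
f = theano.function([x0, n_steps], final, updates=updates)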
Example #25
0
    def init_cpu(self, options, channel, data, model):
        n_params = len(self.model.params)
        # Step 1. Compile function for computing euclidean gradients
        self.reset_gradients = theano.function(
            [],
            [],
            updates = zip(self.gs, [TT.zeros_like(g) for g in self.gs]),
            on_unused_input='warn',
            mode=cpu_mode,
            name='reset_gradients',
            profile=options['profile'])

        gbdx = TT.iscalar('grad_batch_idx')
        comp_grad = TT.iscalar('comp_grad')
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))
        cst = time.time()
        def grad_step(*args):
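            # One scan step: slice chunk `idx` (of size options['cbs']) out
            # of the full minibatch, clone the training cost on that chunk,
            # and accumulate its gradients into the running sums carried in
            # args[2:]; `comp_grad` switches the accumulation on or off.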

            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[2: 2 + n_params], gs)]
            _gs = [x for x in gs]
            _nw_gs = [gpu_from_host(g) for g in nw_gs]
            nw_gs = ifelse(comp_grad, _nw_gs, _gs, gpu=True)
            nw_gs = [x.type.filter_variable(y) for x,y in zip(args[2:],nw_gs)]
            return [args[0] + const(1), args[1] + nw_cost] + nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        cost0 = TT.unbroadcast(const([0]),0)
        n_steps = TT.iscalar('nsteps')
        rvals, updates = scan(grad_step,
                              states=[idx0, cost0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / TT.cast(n_steps, 'float32') for x in rvals[2: 2 + n_params]]
        nw_gs = [og + nwg for og, nwg in zip(self.gs, nw_gs)]
        fcost = rvals[1][0] / TT.cast(n_steps, 'float32')
        updates.update(dict(zip(self.gs, nw_gs)))

        grad_inps = zip(loc_inputs, self.shared_data)
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx, comp_grad, n_steps],
            fcost,
            updates=updates,
            on_unused_input='warn',
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])
        print 'Time to compile grad', print_time(time.time() - cst)
        cst = time.time()
        def jacob_step(*args):
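            # Jacobi preconditioner: probe with random +/-1 vectors scaled
            # by sqrt(1/denom), where `denom` encodes the output metric;
            # the squared Lop products (J^T r) give a stochastic estimate
            # of the Gauss-Newton diagonal, accumulated in args[1:].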
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            replace.update(dict(zip(model.params, model.cpu_params)))
            mode=cpu_mode
            params = model.cpu_params
            # Compute jacobi
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):

                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    denom *= nw_out
                    denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= nw_out
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1:1+n_params], params)]
            return [args[0] + const(1)] + nw_js

        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['mbs'] // options['cbs']
        mode = cpu_mode
        rvals, updates = scan(jacob_step,
                              states=[idx0] + ij,
                              n_steps=n_steps,
                              name='jacob_loop',
                              mode=mode,
                              profile=options['profile'])

        nw_js = [x[0] for x in rvals[1:1+n_params]]
        updates.update(dict(zip(self.js, nw_js)))
        grad_inps = [(x, y[gbdx*options['mbs']:(gbdx+1)*options['mbs']])
                     for x,y in zip(loc_inputs[:1], self.cpu_shared_data[:1])]

        print 'Compiling jacobi preconditioner function'
        self.compute_jacobi_preconditioner = theano.function(
            [gbdx],
            [],
            updates=updates,
            on_unused_input='warn',
            givens=dict(grad_inps),
            name='jacobi_preconditioner_gradients',
            mode=mode,
            profile=options['profile'])
        print 'Time to compile jacobi', print_time(time.time() - cst)
        cst = time.time()
        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')

        mode = cpu_mode
        def compute_Gv(*args):
            cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                 name ='cgv%d'%idx)
                       for idx, shp in enumerate(model.params_shape)]
            print_mem('allocated mem for cgv')
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
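                # One chunk of the metric (Gauss-Newton) vector product:
                # Rop gives J*v for the current output, dividing by `factor`
                # applies the output-distribution metric, and Lop maps the
                # result back through J^T; chunks are summed in gv_args[1:].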
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(cgv, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=gpu_mode,
                                  name='Gv_step',
                                  profile=options['profile'])
            final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
            grad_inps = zip(loc_inputs, self.shared_data)
            loc_fn = theano.function([],
                                     final_Gvs,
                                     updates = updates,
                                     givens = dict(grad_inps),
                                     on_unused_input='warn',
                                     mode=gpu_mode,
                                     name='loc_fn',
                                     profile = options['profile'])
            fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

            return fake_op(*args), {}

        print 'Constructing Riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        mreg = TT.scalar('mreg')
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms = self.js,
            rtol=options['mrtol'],
            shift = - mreg,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        print 'Compiling Riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [mreg],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        nw_ps = [p - lr * r for p, r in zip(model.cpu_params, self.rs)]
        nw_ds = [ -r for r in self.rs]

        self.update_cparams = theano.function(
            [lr], updates = dict(zip(model.cpu_params, nw_ps)),
            name='update_cparam',
            allow_input_downcast=True,
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        newparams = [y.type.filter_variable(x) for x,y in zip(nw_ps,
                                                              model.params)]
        self.update_params = theano.function([lr],
                                             updates = dict(zip(model.params,
                                                                newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=cpu_mode,
                                             profile=options['profile'])
        self.scalar_grad = theano.function(
            [],
            sum(TT.sum(x*y) for x,y in zip(self.gs, self.ds)),
            name='scalar_grad',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        nsteps = options['ebs'] // options['cbs']
        self.current_alpha = numpy.inf
        def ls_cost(alpha, pos):
            if alpha != self.current_alpha:
                self.current_alpha = alpha
                self.update_params(alpha)
            return self.compute_eucledian_gradients(pos, 0, nsteps)
        self.ls_cost_fn = ls_cost

        def ls_grad(alpha, pos):
            if alpha != self.current_alpha:
                self.current_alpha = alpha
                self.update_params(alpha)
            self.reset_gradients()
            self.compute_eucledian_gradients(pos, 1, nsteps)
            return self.scalar_grad()
        self.ls_grad_fn = ls_grad

        self.old_score = 50000
        n_steps = options['ebs']// options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            replace.update(dict(zip(model.params, model.cpu_params)))
            nw_cost = \
                  TT.cast(safe_clone(model.err, replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([],
                           ferr,
                           givens=dict(zip(loc_inputs, self.cpu_shared_data)),
                           name='compute_err',
                           mode=cpu_mode,
                           on_unused_input='warn',
                           profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)

        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
Example #26
0
def minres(compute_Av,
           bs,
           rtol=npy_floatX(1e-6),
           maxiter=20,
           Ms=None,
           damp=npy_floatX(0.),
           maxxnorm=npy_floatX(1e15),
           Acondlim=npy_floatX(1e16),
           mode = None,
           xinit = None,
           profile=0):
    """
     DESCRIPTION:
         minres attempts to find the minimum-length, minimum-residual-norm
         solution x to the system of linear equations A*x = b, or to the
         least-squares problem min ||Ax - b||.  The n-by-n coefficient
         matrix A must be symmetric (but need not be positive definite or
         invertible).  The right-hand-side column vector b must have
         length n.

     INPUTS:
        :param compute_Av: callable returning the symbolic expression for
            `Av`. `v` can be a set of parameters
        :param bs: list of Theano expressions. We are looking to compute
            A^-1 * bs
        :param rtol: Optional, real, specifies the tolerance of the method.
            Default is 1e-6
        :param maxiter: Optional, positive integer, specifies the maximum number of
            iterations. Default is 20
        :param Ms: List of Theano expressions of the same shape as `bs`. The
            method uses these to precondition with diag(Ms)
        :param damp: Optional, scalar, real or complex.  Default is 0.
                   Effectively solve the system (A + damp I) * x = b.
        :param maxxnorm: real positive, maximum bound on NORM(x). Default is 1e15.
        :param Acondlim: real positive, maximum bound on COND(A). Default is 1e16.
        :param xinit: None, or list of ndarrays (of same length as bs)
            containing the initial guess for x[i].

     OUTPUTS:
        x       n-vector, estimated solution
        flag    integer, convergence flag
               -1  beta2 = 0.  If M = I, b and x are eigenvectors.
                0 beta1 = 0.  The exact solution is  x = 0.
                1 A solution to (poss. singular) Ax = b found, given rtol.
                2 Pseudoinverse solution for singular LS problem, given rtol.
                3 A solution to (poss. singular) Ax = b found, given eps.
                4 Pseudoinverse solution for singular LS problem, given eps.
                5 x has converged to an eigenvector.
                6 xnorm has exceeded maxxnorm.
                7 Acond has exceeded Acondlim.
                8 The iteration limit was reached.
                9 It is a least squares problem but no converged solution yet.
               10 relAres has dropped below rtol (Ar-based convergence).
               11 NORM(A*x) is negligible relative to rtol*NORM(A)*NORM(x).
               12 NORM(A) is negligible (A is effectively zero).
        iter    integer, iteration number at which x was computed: 0 <= iter <= maxiter.
        relres  real positive, the relative residual is defined as
                     NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)),
                computed recurrently here.  If flag is 1 or 3,  relres <= TOL.
        relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) ---
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of matrix 2-norm of A.
        Acond   real positive, estimate of condition number of A with
                respect to 2-norm.
        xnorm   non-negative real, recurrently computed NORM(x)
        Axnorm  non-negative real, recurrently computed NORM(A * x).

    EXAMPLE 1:
         n = 100; on = ones(n,1); A = spdiags([-2*on 4*on -2*on],-1:1,n,n);
         b = sum(A,2); rtol = 1e-10; maxiter = 50; M = spdiags(4*on,0,n,n);
         x = minresSOL69(A, b, rtol, maxiter, M);

         Use this matrix-vector product function
            function y = afun(x,n)
            y = 4 * x;
            y(2:n) = y(2:n) - 2 * x(1:n-1);
            y(1:n-1) = y(1:n-1) - 2 * x(2:n);
         as input to minresSOL69
            x1 = minresSOL69(@afun, b, rtol, maxiter, M);

     EXAMPLE 2: A is Laplacian on a 50 by 50 grid, singular and indefinite.
          n = 50; N = n^2; on=ones(n,1);   B = spdiags([on on on], -1:1, n, n);
          A = sparse([],[],[],N,N,(3*n-2)^2);
          for i=1:n
              A((i-1)*n+1:i*n,(i-1)*n+1:i*n) = B;
              if i*n+1 < n*n, A(i*n+1:(i+1)*n,(i-1)*n+1:i*n)=B; end;
              if (i-2)*n+1 > 0  A((i-2)*n+1:(i-1)*n,(i-1)*n+1:i*n)=B;  end;
          end
          b = sum(A,2);   rtol = 1e-5;   maxxnorm = 1e2;
          damp = 0;   Acondlim = [];   show = 1;   M = [];
          x = minresSOL69( A, b, rtol, N, M, damp, maxxnorm, Acondlim, show);

     EXAMPLE 3: A is diagonal, singular and indefinite.
          h = 1;  a = -10; b = -a; n = 2*b/h + 1;
          A = spdiags((a:h:b)', 0, n, n);
          b = ones(n,1);   rtol = 1e-6;   maxxnorm = 1e2;
          damp = 0;   Acondlim = [];   show = 1;   M = [];
          x = minresSOL69( A, b, rtol, N, M, damp, maxxnorm, Acondlim, show);



     REFERENCES:
        Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006.
             http://www.stanford.edu/group/SOL/software.html

    """

    if not isinstance(bs, (tuple, list)):
        bs = [bs]
        return_as_list = False
    else:
        bs = list(bs)
        return_as_list = True

    eps = npy_floatX(1e-23)

    # Initialise
    flag = theano.shared(npy_floatX(0.))

    #------------------------------------------------------------------
    # Set up p and v for the first Lanczos vector v1.
    # p  =  beta1 P' v1,  where  P = C**(-1).
    # v is really P' v1.
    #------------------------------------------------------------------
    if xinit is None:
        xinit = [TT.zeros_like(b) for b in bs]
        r3s = [b for b in bs]
        r2s = [b for b in bs]
        r1s = [b for b in bs]
        beta1 = norm(bs)
        if Ms is not None:
            r3s = [b/m for b,m in zip(bs,Ms)]
            beta1 = norm(r3s, bs)
    else:
        init_Ax = compute_Av(*xinit)
        res = [bs[i] - init_Ax[i] for i in xrange(len(bs))]
        r3s = copy.copy(res)
        r2s = copy.copy(res)
        r1s = copy.copy(res)
        beta1 = norm(res)
        if Ms is not None:
            r3s = [r/m for r,m in zip(r3s, Ms)]
            beta1 = norm(r3s, res)

    #------------------------------------------------------------------
    ## Initialize other quantities.
    # Note that Anorm has been initialized by IsOpSym6.
    # ------------------------------------------------------------------
    bnorm = beta1
    n_params = len(bs)

    def loop(niter,
             beta,
             betan,
             phi,
             Acond,
             cs,
             dbarn,
             eplnn,
             rnorm,
             sn,
             Tnorm,
             rnorml,
             xnorm,
             Dnorm,
             gamma,
             pnorm,
             gammal,
             Axnorm,
             relrnorm,
             relArnorml,
             Anorm,
             flag,
             *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params: 1 * n_params]
        r1s = args[1 * n_params: 2 * n_params]
        r2s = args[2 * n_params: 3 * n_params]
        r3s = args[3 * n_params: 4 * n_params]
        dls = args[4 * n_params: 5 * n_params]
        ds = args[5 * n_params: 6 * n_params]
        betal = beta
        beta = betan
        vs = [r3/beta for r3 in r3s]
        r3s = compute_Av(*vs)
        r3s = [r3 + damp*v for r3,v in zip(r3s, vs)]
        r3s = [TT.switch(TT.ge(niter, numpy.float64(1.)),
                         r3 - (beta/betal)*r1,
                         r3) for r3, r1 in zip(r3s, r1s)]

        alpha = sqnorm(r3s, vs)
        r3s = [r3 - (alpha/beta)*r2 for r3,r2 in zip(r3s,r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3/M for r3, M in zip(r3s, Ms)]
            betan = norm(r2s, r3s)
        else:
            betan = norm(r3s)
        pnorml = pnorm
        pnorm = TT.switch(TT.eq(niter, npy_floatX(0.)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) +
                                  TT.sqr(beta)))


        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs*dbar + sn*alpha
        gbar = sn*dbar - cs*alpha

        eplnn = sn*betan
        dbarn = - cs*betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal  = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs*phi
        phi = sn*phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [TT.switch(TT.neq(gamma, npy_floatX(0.)),
                        (v - epln*dl2 - dlta*dl)/gamma,
                        v)
              for v,dl2,dl in zip(vs,dl2s, dls)]
        d_norm = TT.switch(TT.neq(gamma,npy_floatX(0.)),
                           norm(ds),
                           TT.constant((npy_floatX(numpy.inf))))


        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau*d for x,d in zip(xs,ds)]

        xnorm = norm(xs)
        xs = [TT.switch(TT.ge(xnorm, maxxnorm),
                        dl2,
                        x) for dl2,x in zip(dl2s,xs)]

        flag = TT.switch(TT.ge(xnorm, maxxnorm),
                         npy_floatX(6.), flag)
        # Estimate various norms
        rnorml      = rnorm # ||r_{k-1}||
        Anorml      = Anorm
        Acondl      = Acond
        relrnorml   = relrnorm
        flag_no_6 = TT.neq(flag, npy_floatX(6.))
        Dnorm = TT.switch(flag_no_6,
                          TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, norm(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6,
                             rnorm / (Anorm*xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(flag_no_6,
                          TT.switch(TT.eq(niter, npy_floatX(0.)),
                                    TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                                    TT.sqrt(TT.sqr(Tnorm) +
                                            TT.sqr(beta) +
                                            TT.sqr(alpha) +
                                            TT.sqr(betan))),
                          Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml*rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        #Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = npy_floatX(1) + relrnorm
        t2 = npy_floatX(1) + relArnorml
        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, npy_floatX(0.)),
                          TT.eq(flag, npy_floatX(6.))),
                      TT.switch(TT.le(t1, npy_floatX(1.)),
                                npy_floatX(3.),
                      TT.switch(TT.le(t2, npy_floatX(1.)),
                                npy_floatX(4.),
                      TT.switch(TT.le(relrnorm, rtol),
                                npy_floatX(1.),
                      TT.switch(TT.le(Anorm, npy_floatX(1e-20)),
                                npy_floatX(12),
                      TT.switch(TT.le(relArnorml, rtol),
                                npy_floatX(10.),
                      TT.switch(TT.ge(epsx, beta1),
                                npy_floatX(5.),
                      TT.switch(TT.ge(xnorm, maxxnorm),
                                npy_floatX(6.),
                      TT.switch(TT.ge(niter, TT.cast(maxiter,floatX)),
                                npy_floatX(8.),
                                flag)))))))),
            flag)

        flag = TT.switch(TT.lt(Axnorm, rtol*Anorm*xnorm),
                               npy_floatX(11.), flag)
        return [
            niter + npy_floatX(1.),
            beta,
            betan,
            phi,
            Acond,
            cs,
            dbarn,
            eplnn,
            rnorm,
            sn,
            Tnorm,
            rnorml,
            xnorm,
            Dnorm,
            gamma,
            pnorm,
            gammal,
            Axnorm,
            relrnorm,
            relArnorml,
            Anorm,
            flag] + xs + r1s + r2s + r3s + dls + ds, \
                theano.scan_module.scan_utils.until(TT.neq(flag,0))

    states = []
    # 0 niter
    states.append(TT.constant(npy_floatX([0])))
    # 1 beta
    states.append(TT.constant(npy_floatX([0])))
    # 2 betan
    states.append(TT.unbroadcast(TT.shape_padleft(beta1),0))
    # 3 phi
    states.append(TT.unbroadcast(TT.shape_padleft(beta1),0))
    # 4 Acond
    states.append(TT.constant(npy_floatX([1])))
    # 5 cs
    states.append(TT.constant(npy_floatX([-1])))
    # 6 dbarn
    states.append(TT.constant(npy_floatX([0])))
    # 7 eplnn
    states.append(TT.constant(npy_floatX([0])))
    # 8 rnorm
    states.append(TT.unbroadcast(TT.shape_padleft(beta1),0))
    # 9 sn
    states.append(TT.constant(npy_floatX([0])))
    # 10 Tnorm
    states.append(TT.constant(npy_floatX([0])))
    # 11 rnorml
    states.append(TT.unbroadcast(TT.shape_padleft(beta1),0))
    # 12 xnorm
    states.append(TT.constant(npy_floatX([0])))
    # 13 Dnorm
    states.append(TT.constant(npy_floatX([0])))
    # 14 gamma
    states.append(TT.constant(npy_floatX([0])))
    # 15 pnorm
    states.append(TT.constant(npy_floatX([0])))
    # 16 gammal
    states.append(TT.constant(npy_floatX([0])))
    # 17 Axnorm
    states.append(TT.constant(npy_floatX([0])))
    # 18 relrnorm
    states.append(TT.constant(npy_floatX([1])))
    # 19 relArnorml
    states.append(TT.constant(npy_floatX([1])))
    # 20 Anorm
    states.append(TT.constant(npy_floatX([0])))
    # 21 flag
    states.append(TT.constant(npy_floatX([0])))

    xs  = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit]
    ds  = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit]
    dls = [TT.unbroadcast(TT.shape_padleft(xi),0) for xi in xinit]
    r1s = [TT.unbroadcast(TT.shape_padleft(r1),0) for r1 in r1s]
    r2s = [TT.unbroadcast(TT.shape_padleft(r2),0) for r2 in r2s]
    r3s = [TT.unbroadcast(TT.shape_padleft(r3),0) for r3 in r3s]

    rvals, lupds = scan(loop,
                    states = states + xs + r1s + r2s + r3s + dls + ds,
                    n_steps = maxiter + numpy.int32(1),
                    name='minres',
                    profile=profile,
                    mode=mode)

    niters = TT.cast(rvals[0][0], 'int32')
    flag = TT.cast(rvals[21][0], 'int32')
    relres = rvals[18][0]
    relAres = rvals[19][0]
    Anorm = rvals[20][0]
    Acond = rvals[4][0]
    xnorm = rvals[12][0]
    Axnorm = rvals[17][0]
    sol = [x[0] for x in rvals[22:22+n_params]]
    return sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm
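
The `symGivens2` helper used in the loop above is assumed to be the stable
symmetric orthogonal reflection from Choi's MINRES work: given (a, b) it
returns (c, s, r) with [c s; s -c][a; b] = [r; 0] and r = sqrt(a^2 + b^2).
A plain-numpy reference of what it is expected to compute (a hypothetical
reimplementation, not the function the example imports):

import numpy

def sym_givens(a, b):
    # handle the degenerate axes first
    if b == 0.0:
        return (1.0 if a >= 0 else -1.0), 0.0, abs(a)
    if a == 0.0:
        return 0.0, (1.0 if b >= 0 else -1.0), abs(b)
    # divide by the larger entry for numerical stability
    if abs(b) > abs(a):
        t = a / b
        s = (1.0 if b >= 0 else -1.0) / numpy.sqrt(1.0 + t * t)
        c = s * t
        r = b / s
    else:
        t = b / a
        c = (1.0 if a >= 0 else -1.0) / numpy.sqrt(1.0 + t * t)
        s = c * t
        r = a / c
    return c, s, r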
Example #27
0
        def compute_Gv(*args):
            cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                 name ='cgv%d'%idx)
                       for idx, shp in enumerate(model.params_shape)]
            print_mem('allocated mem for cgv')
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(cgv, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * nw_out
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs']) * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=gpu_mode,
                                  name='Gv_step',
                                  profile=options['profile'])
            final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
            grad_inps = zip(loc_inputs, self.shared_data)
            loc_fn = theano.function([],
                                     final_Gvs,
                                     updates = updates,
                                     givens = dict(grad_inps),
                                     on_unused_input='warn',
                                     mode=gpu_mode,
                                     name='loc_fn',
                                     profile = options['profile'])
            fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

            return fake_op(*args), {}
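
The Lop/Rop pattern in `Gv_step` computes a Gauss-Newton (metric) vector
product G*v = J^T (J*v / factor) without ever forming the Jacobian J. A toy,
self-contained sketch for a single sigmoid layer (all names below are
illustrative; `factor` matches the 'sigmoid' branch above):

import numpy
import theano
import theano.tensor as TT

rng = numpy.random.RandomState(0)
W = theano.shared(rng.randn(5, 3).astype(theano.config.floatX), name='W')
x = TT.matrix('x')
v = TT.matrix('v')                      # vector to multiply, same shape as W
out = TT.nnet.sigmoid(TT.dot(x, W))     # network output
cbs = numpy.asarray(4.0, dtype=theano.config.floatX)  # chunk size

factor = cbs * out * (1 - out)          # sigmoid output metric, as above
Jv = TT.Rop(out, W, v)                  # forward mode: J * v
Gv = TT.Lop(out, W, Jv / factor)        # reverse mode: J^T (J * v / factor)
gv_fn = theano.function([x, v], Gv)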
Example #28
0
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2,
          n_iters=10,
          profile = False,
          mode=theano.Mode(linker='cvm')):
    """
    TODO: re-write me

    Part of the optimization algorithm in `scalar_search_wolfe2`.
    a_lo : scalar (step size)
    a_hi : scalar (step size)
    phi_lo : scalar (value of f at a_lo)
    phi_hi : scalar ( value of f at a_hi)
    derphi_lo : scalar ( value of derivative at a_lo)
    phi : callable -> generates computational graph
    derphi: callable -> generates computational graph
    phi0 : scalar ( value of f at 0)
    derphi0 : scalar (value of the derivative at 0)
    c1 : scalar  (wolfe parameter)
    c2 : scalar  (wolfe parameter)
    profile: if you want printouts of profiling information
    """
    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # Interpolate to find a trial step length between a_lo and a_hi.
        # Use cubic interpolation first; if the result is within delta *
        # dalpha of the end points, or outside the interval bounded by
        # a_lo and a_hi, fall back to quadratic interpolation; if the
        # result is still too close, use bisection.
        dalpha = a_hi-a_lo
        a = TT.switch( dalpha < zero, a_hi, a_lo)
        b = TT.switch( dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1*dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2*dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',TT.isnan(a_j_quad), a_j_quad > b-qchk, a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo +
                             numpy.asarray(0.5, dtype=theano.config.floatX)*dalpha, a_j_quad)


        # pick between the two ..
        cond_c = lazy_or('condc',TT.isnan(a_j_cubic), TT.bitwise_or(a_j_cubic > b -
                                                            cchk, a_j_cubic
                                                            < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop', TT.bitwise_and(phi_aj <= phi0 + c1*a_j*derphi0,
                         phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2*derphi0)


        cond1 = TT.bitwise_or(phi_aj > phi0 + c1*a_j*derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj*(a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse( cond1, phi_hi,
                            TT.switch( cond2, phi_hi, phi_lo), name =
                         'phi_rec')
        a_rec   = ifelse( cond1, a_hi,
                            TT.switch( cond2, a_hi, a_lo), name='a_rec')
        a_hi    = ifelse( cond1, a_j,
                            TT.switch( cond2, a_lo, a_hi), name='a_hi')
        phi_hi  = ifelse( cond1, phi_aj,
                            TT.switch( cond2, phi_lo, phi_hi), name='phi_hi')

        a_lo      = TT.switch(cond1, a_lo, a_j)
        phi_lo    = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan, TT.switch(cond2, derphi_aj,
                                                  nan), name='valprime')

        return ( [ phi_rec,
                  a_rec,
                  a_lo,
                  a_hi,
                  phi_hi,
                  phi_lo,
                  derphi_lo,
                  a_star,
                  val_star,
                  valprime],
                theano.scan_module.scan_utils.until(stop) )

    maxiter = n_iters
    delta1 = TT.constant(numpy.asarray(0.2,
                                       dtype=theano.config.floatX))  # cubic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1,
                                       dtype=theano.config.floatX))  # quadratic interpolant check
    phi_rec = phi0
    a_rec = zero

    # Initial iteration

    dalpha = a_hi-a_lo
    a = TT.switch( dalpha < zero, a_hi, a_lo)
    b = TT.switch( dalpha < zero, a_lo, a_hi)
    #a = ifelse(dalpha < 0, a_hi, a_lo)
    #b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection


    # quadratic interpolation
    qchk = delta2*dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',TT.isnan(a_j), TT.bitwise_or( a_j > b-qchk, a_j < a +
                                                  qchk))

    a_j = TT.switch(cond_q, a_lo +
                    numpy.asarray(0.5, dtype=theano.config.floatX)*dalpha, a_j)


    # Check new value of a_j

    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)



    cond1 = TT.bitwise_or(phi_aj > phi0 + c1*a_j*derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj*(a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse( cond1, phi_hi,
                        TT.switch( cond2, phi_hi, phi_lo), name='mphirec')
    a_rec   = ifelse( cond1, a_hi,
                        TT.switch( cond2, a_hi, a_lo), name='marec')
    a_hi    = ifelse( cond1, a_j,
                        TT.switch( cond2, a_lo, a_hi), name='mahi')
    phi_hi  = ifelse( cond1, phi_aj,
                        TT.switch( cond2, phi_lo, phi_hi), name='mphihi')

    onlyif = lazy_and( 'only_if', TT.bitwise_and(phi_aj <= phi0 + c1*a_j*derphi0,
                       phi_aj < phi_lo),
                       abs(derphi_aj) <= -c2*derphi0)

    a_lo      = TT.switch(cond1, a_lo, a_j)
    phi_lo    = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name = 'derphi_lo_main')
    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(phi_rec),0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_rec),0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_lo),0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_hi),0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_hi),0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_lo),0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_lo),0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    print 'while_zoom'
    outs, updates = scan(while_zoom,
                         states = states,
                         n_steps = maxiter,
                         name = 'while_zoom',
                         mode = mode,
                         profile = profile)
    print 'done_while'
    a_star   = ifelse(onlyif, a_j   , outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
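
The `_quadmin` (and `_cubicmin`) helpers used above appear to be Theano ports
of SciPy's line-search interpolation routines. For reference, a plain-numpy
version of the quadratic one, the minimizer of the parabola matching phi(a),
derphi(a) and phi(b) (a hypothetical reimplementation; callers are expected
to guard against NaN and zero-curvature results, as the TT.isnan checks
above do):

import numpy

def quadmin(a, fa, fpa, b, fb):
    # minimize q(x) = B*(x - a)**2 + C*(x - a) + D, which interpolates
    # q(a) = fa, q'(a) = fpa, q(b) = fb
    D = fa
    C = fpa
    db = b - a
    B = (fb - D - C * db) / (db * db)
    return a - C / (2.0 * B)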
Example #29
0
def krylov_subspace(compute_Av,
                    bs,
                    old_dir,
                    iters=20,
                    param_shapes=None,
                    profile=0,
                    device='gpu'):
    eps = numpy.float32(1e-20)
    bs = [b / tensor.sqrt((b**2).sum() + eps) for b in bs]
    mem_bufs = [
        tensor.alloc(zero, iters, *param_sh) for param_sh in param_shapes
    ]
    mem_bufs = [
        tensor.set_subtensor(mem[0], b) for mem, b in zip(mem_bufs, bs)
    ]

    def construct_space(*args):
        vs, updates = compute_Av(*args)
        # I need to rescale at every point, otherwise if A is damping, these
        # vs go quickly to 0 and we lose the direction they represent
        norm = TT.sqrt(sum((v**2).sum() for v in vs)) + numpy.float32(1e-20)
        vs = [v / norm for v in vs]
        return vs, updates

    if device == 'gpu':
        mode = gpu_mode
    else:
        mode = cpu_mode
    outs, updates = scan(construct_space,
                         states=mem_bufs,
                         n_steps=iters - 2,
                         name='krylov_space',
                         mode=mode,
                         profile=profile)
    if not isinstance(outs, (list, tuple)):
        outs = [outs]
    outs = [
        tensor.set_subtensor(out[iters - 1], o)
        for out, o in zip(outs, old_dir)
    ]
    outs = [tensor.unbroadcast(tensor.shape_padleft(x), 0) for x in outs]
    param_lengths = [numpy.prod(shp) for shp in param_shapes]

    def ortho(idx, *ortho_mats):
        new_ortho_mats = []
        for A, param_length in zip(ortho_mats, param_lengths):
            weight = tensor.dot(
                A[idx + 1:].reshape((iters - idx - 1, param_length)),
                A[idx].reshape((param_length, )))
            A_reshuffle = ['x'] + list(range(A[idx].ndim))
            W_reshuffle = [0] + ['x'] * A[idx].ndim
            to_remove = weight.dimshuffle(*W_reshuffle) *\
                        A[idx].dimshuffle(*A_reshuffle)
            new_A = tensor.set_subtensor(A[idx + 1:], A[idx + 1:] - to_remove)
            x_col = new_A[idx + 1]
            x_col = x_col / tensor.sqrt((x_col**2).sum() + eps)
            new_A = tensor.set_subtensor(new_A[idx + 1], x_col)
            new_ortho_mats.append(new_A)
        return new_ortho_mats

    rvals, _ = scan(ortho,
                    sequences=tensor.constant(numpy.arange(iters - 1)),
                    states=outs,
                    n_steps=iters - 1,
                    name='ortho',
                    profile=profile,
                    mode=mode)
    if not isinstance(rvals, (list, tuple)):
        rvals = [rvals]
    rvals = [rval[0] * .1 for rval in rvals]
    return rvals, updates
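
The `ortho` step above is a modified Gram-Schmidt pass over the rows of each
flattened subspace matrix. A minimal NumPy sketch of the same update,
assuming (as `krylov_subspace` guarantees on entry) that the first row is
already unit-norm; `modified_gram_schmidt` is an illustrative name.

import numpy

def modified_gram_schmidt(A, eps=1e-20):
    # A is (iters, param_length); orthonormalize its rows, mirroring
    # the set_subtensor updates in `ortho`.
    A = A.copy()
    for idx in range(A.shape[0] - 1):
        # Remove the component along A[idx] from all later rows ...
        weight = A[idx + 1:].dot(A[idx])
        A[idx + 1:] -= weight[:, None] * A[idx][None, :]
        # ... then renormalize the next row before it becomes the pivot.
        A[idx + 1] /= numpy.sqrt((A[idx + 1] ** 2).sum() + eps)
    return A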
Example #30
0
def scalar_search_wolfe2(phi,
                         derphi,
                         phi0=None,
                         old_phi0=None,
                         derphi0=None,
                         n_iters=20,
                         c1=1e-4,
                         c2=0.9,
                        profile=False):
    """
    Find alpha that satisfies strong Wolfe conditions.

    alpha > 0 is assumed to be a descent direction.

    Parameters
    ----------
    phi : callable f(x)
        Objective scalar function.
    derphi : callable f'(x)
        Objective function derivative (can be None)
    phi0 : float, optional
        Value of phi at s=0
    old_phi0 : float, optional
        Value of phi at previous point
    derphi0 : float, optional
        Value of derphi at s=0
    c1 : float
        Parameter for Armijo condition rule.
    c2 : float
        Parameter for curvature condition rule.
    profile : flag (boolean)
        True if you want printouts of profiling information

    Returns
    -------
    alpha_star : float
        Best alpha
    phi_star : float
        phi at alpha_star
    phi0 : float
        phi at 0
    derphi_star : float
        derphi at alpha_star

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions.  See Wright and Nocedal, 'Numerical Optimization',
    1999, pg. 59-60.

    For the zoom phase it uses an algorithm by [...].

    """

    if phi0 is None:
        phi0 = phi(zero)

    if derphi0 is None and derphi is not None:
        derphi0 = derphi(zero)

    alpha0 = zero
    alpha0.name = 'alpha0'
    if old_phi0 is not None:
        alpha1 = TT.minimum(one,
                            numpy.asarray(1.01, dtype=theano.config.floatX) *
                            numpy.asarray(2, dtype=theano.config.floatX) * \
                            (phi0 - old_phi0) / derphi0)
    else:
        old_phi0 = nan
        alpha1 = one

    alpha1 = TT.switch(alpha1 < zero, one, alpha1)
    alpha1.name = 'alpha1'

    # This shouldn't happen. Perhaps the increment has slipped below
    # machine precision?  For now, set the return variables, skip the
    # useless while loop, and raise warnflag=2 due to possible imprecision.
    phi0 = TT.switch(TT.eq(alpha1, zero), old_phi0, phi0)
    # I need a lazyif for alpha1 == 0 !!!
    phi_a1 = ifelse(TT.eq(alpha1, zero), phi0,
                    phi(alpha1), name='phi_a1')
    phi_a1.name = 'phi_a1'

    phi_a0 = phi0
    phi_a0.name = 'phi_a0'
    derphi_a0 = derphi0
    derphi_a0.name = 'derphi_a0'
    # Make sure variables are tensors otherwise strange things happen
    c1 = TT.as_tensor_variable(c1)
    c2 = TT.as_tensor_variable(c2)
    maxiter = n_iters

    def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                    alpha_star, phi_star, derphi_star):
        derphi_a1 = derphi(alpha1)
        cond1 = TT.bitwise_or(phi_a1 > phi0 + c1 * alpha1 * derphi0,
                              TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
        cond2 = abs(derphi_a1) <= -c2 * derphi0
        cond3 = derphi_a1 >= zero
        alpha_star_c1, phi_star_c1, derphi_star_c1 = \
                _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
                      phi, derphi, phi0, derphi0, c1, c2,
                     profile=profile)
        alpha_star_c3, phi_star_c3, derphi_star_c3 = \
                _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi,
                      derphi, phi0, derphi0, c1, c2,
                     profile=profile)
        nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
        nw_phi = phi(nw_alpha1)
        alpha_star, phi_star, derphi_star = \
                ifelse(cond1,
                          (alpha_star_c1, phi_star_c1, derphi_star_c1),
                ifelse(cond2,
                          (alpha1, phi_a1, derphi_a1),
                ifelse(cond3,
                          (alpha_star_c3, phi_star_c3, derphi_star_c3),
                           (nw_alpha1, nw_phi, nan),
                      name='alphastar_c3'),
                      name='alphastar_c2'),
                      name='alphastar_c1')

        return ([alpha1,
                 nw_alpha1,
                 phi_a1,
                 ifelse(lazy_or('allconds',
                                cond1,
                                cond2,
                                cond3),
                        phi_a1,
                        nw_phi,
                        name='nwphi1'),
                 ifelse(cond1, derphi_a0, derphi_a1, name='derphi'),
                 i_t + one,
                 alpha_star,
                 phi_star,
                 derphi_star],
                theano.scan_module.scan_utils.until(
                    lazy_or('until_cond_',
                            TT.eq(nw_alpha1, zero),
                            cond1,
                            cond2,
                            cond3)))
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_a0), 0)]
    # i_t
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # alpha_star
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # phi_star
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # derphi_star
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # print 'while_search'
    outs, updates = scan(while_search,
                         states=states,
                         n_steps=maxiter,
                         name='while_search',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while_search'
    out3 = outs[-3][0]
    out2 = outs[-2][0]
    out1 = outs[-1][0]
    alpha_star, phi_star, derphi_star = \
            ifelse(TT.eq(alpha1, zero),
                        (nan, phi0, nan),
                        (out3, out2, out1), name='main_alphastar')
    return alpha_star, phi_star,  phi0, derphi_star
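
As a plain-Python restatement, the two tests that `while_search` and `_zoom`
enforce are the strong Wolfe conditions (Nocedal & Wright, p. 59-60). A
sketch, assuming `phi` and `derphi` are ordinary Python callables and that
`derphi0 < 0` (a descent direction); `satisfies_strong_wolfe` is an
illustrative name, not part of the source.

def satisfies_strong_wolfe(phi, derphi, alpha, phi0, derphi0,
                           c1=1e-4, c2=0.9):
    # Armijo / sufficient-decrease condition.
    armijo = phi(alpha) <= phi0 + c1 * alpha * derphi0
    # Strong curvature condition.
    curvature = abs(derphi(alpha)) <= -c2 * derphi0
    return armijo and curvature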
Example #31
0
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name ='cgv%d'%idx)
                           for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(cgv, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates = updates,
                                         givens = dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile = options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}
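
For intuition, the Lop/Rop composition in `Gv_step` realizes a Gauss-Newton
(Fisher-like) metric-vector product without ever materializing the Jacobian.
With an explicit Jacobian J of one output with respect to a single flattened
parameter vector, the same product reduces to the sketch below; `factor`
stands in for the softmax/sigmoid/linear scaling chosen above, and the
explicit-J form is an illustration only.

import numpy

def gauss_newton_Gv(J, v, factor):
    # Rop(nw_out, params, v) is the forward product J.dot(v);
    # Lop(nw_out, params, .) is the reverse product J.T.dot(.);
    # dividing by `factor` applies the output-distribution scaling.
    return J.T.dot(J.dot(v) / factor)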
Example #32
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load, containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']],
                               name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store Euclidean gradients
            self.gs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
            # Store Riemannian gradients
            self.rs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
        else:
            # Store Euclidean gradients
            self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]
            # Store Riemannian gradients
            self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                       for shp in model.params_shape]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        rbpos = rbdx * options['mbs']

        if options['device'] == 'gpu':
            mode=gpu_mode
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(args, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name ='cgv%d'%idx)
                           for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                        loc_params = [x for x in model.params
                                      if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [x for x, y in zip(cgv, model.params)
                                    if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:], model.params)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates = updates,
                                         givens = dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile = options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}



        print 'Constructing Riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            rtol=options['mrtol'],
            shift= -options['mreg'],
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))


        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:
                           (rbdx + 1) * options['mbs']])
                     for x,y in zip(loc_inputs[:1], shared_data[:1])]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'compiling evaluation function'
        self.eval_fn = theano.function(
            [ebdx, lr],
            final_cost,
            givens=dict(grad_inps),
            on_unused_input='warn',
            updates = updates,
            name='eval_fn',
            mode=gpu_mode,
            profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_dict,
            name='update_params',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                           ferr,
                           givens=dict(grad_inps),
                           name='compute_err',
                           mode=gpu_mode,
                           on_unused_input='warn',
                           profile=options['profile'])
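
Stripped of batching and compilation, the Riemannian update that Steps 1-3
implement is: rescale the gradient, solve G r = g with an iterative solver
(minres above), undo the rescaling, and take a descent step. A NumPy-level
sketch, with the hypothetical `metric_solve` standing in for
`minres.minres(compute_Gv, ...)`.

import numpy

def natural_gradient_step(params, grads, metric_solve, lr):
    # Normalize the gradient for numerical stability, as done with
    # norm_grads above, then solve the metric system and rescale back.
    norm_g = numpy.sqrt(sum((g ** 2).sum() for g in grads))
    rs = [r * norm_g for r in metric_solve([g / norm_g for g in grads])]
    # Gradient descent along the Riemannian direction.
    return [p - lr * r for p, r in zip(params, rs)]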
Example #33
0
def linear_cg(compute_Ax, b, M=None, xinit=None, rtol=1e-16, maxiter=100000, damp=0.0, floatX=None):
    """
    Solves the system A x[i] = b[i], for all i.
    
    When used as part of a Newton-CG method, b is a list of gradients, where each element of
    this list represents a gradient for a given parameter type (i.e. weight or bias of a given
    layer). This method will return a list whose elements approximate A^{-1} b[i], with the
    precision determined by maxiter or the specified tolerance level. This particular
    version implements the Polak-Ribiere flavor of CG.

    Parameters:
    :param compute_Ax: python function which symbolically computes the matrix-vector product.
    :param b: list of T.vector, corresponding to A x[i] = b[i]
    :param M: list of T.vector (same length as b). Each element is used to precondition its
    corresponding element of the A-diagonal. If [Mi for Mi in M] contains the diagonal elements
    of A, this will implement Jacobi preconditioning.
    :param xinit: list of T.vector (same length as b). x[i] is initial guess for A^{-1} b[i].
    :param rtol: float. CG will stop when the squared norm of the residual < rtol.
    :param maxiter: int. Maximum allowable iterations for CG.
    :param damp: float. Damping factor, equivalent to adding a term along the diagonal of A.
    :param floatX: 'float32' or 'float64'.

    Return values:
    rval[0]: list of approximate values for A^{-1} b[i].
    rval[1]: niter, number of iterations run by CG.
    rval[2]: residual error norm.

    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(b)

    def loop(niter, rkp_norm, *args):
        pk = args[:n_params]
        rk = args[n_params : 2 * n_params]
        zk = args[2 * n_params : 3 * n_params]
        xk = args[-n_params:]
        A_pk_temp = compute_Ax(*pk)
        A_pk = [A_pk_temp_ + damp * pk_ for A_pk_temp_, pk_ in zip(A_pk_temp, pk)]
        alphak_num = sum((rk_ * zk_).sum() for rk_, zk_ in zip(rk, zk))
        alphak_denum = sum((A_pk_ * pk_).sum() for A_pk_, pk_ in zip(A_pk, pk))
        alphak = alphak_num / alphak_denum
        xkp1 = [xk_ + alphak * pk_ for xk_, pk_ in zip(xk, pk)]
        rkp1 = [rk_ - alphak * A_pk_ for rk_, A_pk_, in zip(rk, A_pk)]
        if M:
            zkp1 = [rkp1_ / m_ for rkp1_, m_ in zip(rkp1, M)]
        else:
            zkp1 = rkp1
        # compute beta_k using Polak-Ribiere
        betak_num = sum((zkp1_ * (rkp1_ - rk_)).sum() for rkp1_, rk_, zkp1_ in zip(rkp1, rk, zkp1))
        betak_denum = alphak_num
        betak = betak_num / betak_denum
        pkp1 = [zkp1_ + betak * pk_ for zkp1_, pk_ in zip(zkp1, pk)]
        # compute termination criteria
        rkp1_norm = sum((rkp1_ ** 2).sum() for rkp1_ in rkp1)
        return [niter + 1, rkp1_norm] + pkp1 + rkp1 + zkp1 + xkp1, theano.scan_module.until(abs(rkp1_norm) < rtol)

    # Initialize residual based on xinit
    if xinit is None:
        r0_temp = b
        x0 = [tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(b_))) for b_ in b]
    else:
        init_Ax = compute_Ax(*xinit)
        r0_temp = [b[i] - init_Ax[i] for i in xrange(len(b))]
        x0 = [tensor.unbroadcast(tensor.shape_padleft(xinit_)) for xinit_ in xinit]

    # Leftpad r0, z0 and p0 for scan.
    r0 = [tensor.unbroadcast(tensor.shape_padleft(r0_temp_)) for r0_temp_ in r0_temp]
    if M:
        z0 = [tensor.unbroadcast(tensor.shape_padleft(r0_temp_ / m_)) for r0_temp_, m_ in zip(r0_temp, M)]
    else:
        z0 = r0
    p0 = z0

    states = []
    # 0 niter
    states.append(tensor.constant(npy_floatX([0])))
    # 1 residual error norm
    states.append(tensor.constant(npy_floatX([0])))

    outs, updates = scan(
        loop,
        states=states + p0 + r0 + z0 + x0,
        n_steps=maxiter,
        mode=theano.Mode(linker="c|py"),
        name="linear_conjugate_gradient",
        profile=0,
    )
    sol = [x[0] for x in outs[-n_params:]]
    niter = outs[0][0]
    rerr = outs[1][0]
    return [sol, niter, rerr]
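
The same recursion in plain NumPy, for a single dense system, may help check
the algebra. This is a sketch, not the Theano graph: A is an explicit matrix
standing in for the `compute_Ax` callable, and M (if given) holds the
diagonal entries used for Jacobi preconditioning.

import numpy

def linear_cg_np(A, b, x0=None, M=None, rtol=1e-16, maxiter=1000):
    x = numpy.zeros_like(b) if x0 is None else x0.copy()
    r = b - A.dot(x)
    z = r / M if M is not None else r
    p = z.copy()
    niter = 0
    for niter in range(maxiter):
        Ap = A.dot(p)
        alpha = r.dot(z) / p.dot(Ap)
        x = x + alpha * p
        r_new = r - alpha * Ap
        z_new = r_new / M if M is not None else r_new
        # Polak-Ribiere beta, matching betak_num / betak_denum above.
        beta = z_new.dot(r_new - r) / r.dot(z)
        p = z_new + beta * p
        r, z = r_new, z_new
        # Same stopping rule as the scan: squared residual norm < rtol.
        if (r ** 2).sum() < rtol:
            break
    return x, niter, (r ** 2).sum()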
Example #34
0
    def __init__(self,
                 options,
                 channel,
                 data,
                 model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the krylov
                    subspace
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lbfgsIters` -> int
                    Number of L-BFGS iterations
                `krylovDim` -> int
                    Dimension of the Krylov subspace
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load, containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data
        xdata = theano.shared(data['train_x'],
                              name='xdata')
        ydata = theano.shared(data['train_y'],
                              name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]
        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        rng = numpy.random.RandomState(options['seed'])
        self.rng = rng
        self.options = options
        self.channel = channel
        self.model = model
        n_dimensions = options['krylovDim']
        self.n_dimensions = n_dimensions
        if options['device']=='gpu':
            cfn_subspaces = \
                [theano.shared(numpy.zeros(
                                (n_dimensions,) + shp, dtype='float32'),
                               name='cfn{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in enumerate(zip(model.params_shape,
                                                      model.params))]
            old_deltas = \
                [theano.shared(numpy.zeros(shp, dtype='float32'),
                               name='delta{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in
                            enumerate(zip(model.params_shape, model.params))]
            self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        else:
            cfn_subspaces = \
                [TT._shared(numpy.zeros(
                                (n_dimensions,) + shp, dtype='float32'),
                               name='cfn{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in enumerate(zip(model.params_shape,
                                                      model.params))]
            old_deltas = \
                [TT._shared(numpy.zeros(shp, dtype='float32'),
                               name='delta{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in
                            enumerate(zip(model.params_shape, model.params))]
            self.gs = [TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        self.cfn_subspaces = cfn_subspaces
        self.old_deltas = old_deltas

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        print 'Constructing grad function'
        loc_inputs = [x.type(name='locx') for x in model.inputs]
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        updates.update(dict(zip(self.gs, nw_gs)))
        gdx = TT.iscalar('gdx')
        grad_inps = zip(loc_inputs,
                        [x[gdx*options['gbs']:(gdx+1)*options['gbs']] for x
                         in shared_data])
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        if options['device'] == 'gpu':
            mode=gpu_mode
            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone([model.train_cost,
                                                           model.preactiv_out],
                                                          replace)
                    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                  TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                         model.params, args))

                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode
            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                     name ='cgv%d'%idx)
                           for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone([model.train_cost,
                                                           model.preactiv_out],
                                                          replace)
                    nw_gvs = TT.Lop(nw_preactiv_out, model.params,
                                  TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                                         model.params, cgv))

                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs
                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps) for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates = updates,
                                         givens = dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile = options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}



        rvals, updates = krylov_subspace(
            compute_Gv,
            self.gs,
            old_deltas,
            n_dimensions,
            model.params_shape,
            profile=options['profile'],
            device=options['device'])

        gdx = TT.iscalar('gdx')
        grad_inps = zip(loc_inputs,
                        [x[gdx*options['mbs']:(gdx+1)*options['mbs']] for x
                         in shared_data])
        updates.update(dict(zip(cfn_subspaces, rvals)))
        self.update_krylov_subspace = theano.function(
            [gdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            profile=options['profile'],
            on_unused_input='warn',
            name='update_krylov_subspace',
            mode=mode)

        alphas = tensor.vector('alphas')
        deltas = []
        nw_params = []
        if options['device'] == 'gpu':
            params = model.params
        else:
            params = model.cpu_params

        for param, subspace in zip(params, cfn_subspaces):
            alpha_reshuffle = [0] + ['x'] * param.ndim
            delta = (alphas.dimshuffle(*alpha_reshuffle) * \
                        subspace).sum(axis=0)
            nw_param = param + delta
            nw_params.append(nw_param)
            deltas.append(delta)
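
        # The dimshuffle pair above broadcasts alphas[i] against the i-th
        # basis tensor, so each `delta` is the linear combination
        # sum_i alphas[i] * subspace[i], and `nw_param` is the parameter
        # moved along that combination of Krylov directions.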

        print 'constructing evaluation function'
        ebdx = TT.iscalar('ebdx')

        updates_dict = dict(zip(model.params + old_deltas,
                                nw_params + deltas))
        if options['device'] != 'gpu':
            updates_dict.update(dict(zip(model.cpu_params, nw_params)))

        self.update_params = theano.function([alphas],
                                             updates = updates_dict,
                                             name='update_params',
                                             allow_input_downcast=True,
                                             mode=mode,
                                             profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']
        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps +
                               nw_params))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_cost_step,
                        states = states,
                        n_steps = n_steps,
                        name='ls_cost_step',
                        mode=gpu_mode,
                        profile = options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps +
                               nw_params))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, alphas)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.zeros((1, n_dimensions),dtype='float32'))]
        rvals, _ = scan(ls_grad_step,
                        states = states,
                        n_steps = n_steps,
                        name = 'ls_grad_step',
                        mode = gpu_mode,
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)

        grad_inps = zip(loc_inputs,
                        [x[ebdx*options['ebs']:(ebdx+1)*options['ebs']] for x
                         in shared_data])
        self.lbfgs_fn = theano.function([alphas, ebdx],
                                   #theano.printing.Print('fcost')(fcost),
                                    fcost,
                                   givens=grad_inps,
                                   allow_input_downcast=True,
                                   on_unused_input='warn',
                                   name='lbfgs_fn',
                                   profile=options['profile'],
                                   mode=gpu_mode)
        self.lbfgs_grad = theano.function([alphas, ebdx],
                                     fgrad,
                                     givens=grad_inps,
                                     on_unused_input='warn',
                                     allow_input_downcast=True,
                                     name='lbfgs_grad',
                                     profile=options['profile'],
                                     mode=gpu_mode)

        n_steps = options['ebs'] // options['cbs']
        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([],
                           ferr,
                           givens=dict(zip(loc_inputs, shared_data)),
                           name='compute_err',
                           mode=gpu_mode,
                           on_unused_input='warn',
                           profile=options['profile'])
Example #35
0
    def init_gpu(self, options, channel, data, model):
        # Step 1. Compile function for computing Euclidean gradients
        eps = numpy.float32(1e-24)
        gbdx = TT.iscalar('grad_batch_idx')
        n_params = len(self.model.params)
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))


        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            # Compute Jacobi preconditioner terms (stochastic diagonal estimate)
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout *
                                     (numpy.float32(1) - tnwout)) * factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [
                oj + final_results[p]
                for oj, p in zip(args[1 + n_params:1 +
                                      2 * n_params], model.params)
            ]
            return [args[0] + const(1)] + nw_gs + nw_js
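
        # Note: nw_js appears to accumulate a Hutchinson-style stochastic
        # estimate of the diagonal of the Gauss-Newton matrix: for a random
        # sign vector r, E[(J^T r)_i ** 2] equals the i-th diagonal entry of
        # J^T diag(factor ** 2) J, with `factor` folding in the
        # output-distribution scaling chosen above.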

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        ij = [
            TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            allow_input_downcast=True,
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates

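        # compute_Gv builds a matrix-free metric (Gauss-Newton-like) vector
        # product: per output, Rop pushes the vector through the Jacobian,
        # the result is rescaled by the output nonlinearity (the `factor`
        # terms), and Lop pulls it back through the transposed Jacobian;
        # chunk results are accumulated and averaged over `mbs // cbs` steps.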
        print 'Constructing Riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        self.damping = theano.shared(numpy.float32(options['mreg']))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              Ms=self.js,
                              rtol=options['mrtol'],
                              shift=self.damping,
                              maxit=options['miters'],
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        reset = TT.scalar(dtype='int8', name='reset')

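        # Nonlinear conjugate-gradient direction update: the inner products
        # below form a beta in the spirit of the Hager-Zhang (CG_DESCENT)
        # rule, and beta is forced to 0 when a reset is requested or the
        # computation yields NaN/Inf.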
        norm_kkm1 = sum([(r * g).sum() for r, g in zip(self.rs, self.gs)])
        norm_kk = sum([(r * g).sum() for r, g in zip(nw_rs, self.gs)])
        norm_dk = sum([(d * g).sum() for d, g in zip(self.ds, self.gs)])

        norm_y = norm_kk - 2 * norm_kkm1 + self.norm_km1km1
        beta_k = ((norm_kk - norm_kkm1) / (norm_dk - self.norm_dkm1) -
                  2 * norm_y * (norm_dk / ((norm_dk - self.norm_dkm1) ** 2)))
        beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)), beta_k)
        beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k), TT.isinf(beta_k)),
                           TT.constant(numpy.float32(0.)), beta_k)

        nwds = [-r + beta_k * d for r, d in zip(nw_rs, self.ds)]
        self.nwds = nwds
        nw_normd = (TT.sqrt(sum([(d * d).sum() for d in nwds])) +
                    numpy.float32(1e-25))

        updates.update(dict(zip(self.rs, nw_rs)))
        updates.update(dict(zip(self.ds, nwds)))
        updates[self.norm_km1km1] = norm_kk
        updates[self.norm_dkm1] = norm_dk
        updates[self.norm_d] = nw_normd
        print 'Compiling riemannian gradient function'
        cst = time.time()
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.compute_riemannian_gradients = theano.function(
            [reset, rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0, beta_k
            ],
            updates=updates,
            allow_input_downcast=True,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'Constructing evaluation function'
        lr = TT.scalar('lr')
        newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
        nw_ds = [-r for r in self.rs]
        nw_normd = TT.sqrt(sum([(r * r).sum() for r in self.rs]))
        self.update_params = theano.function([lr],
                                             updates=dict(
                                                 zip(model.params, newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=gpu_mode,
                                             profile=options['profile'])
        self.reset_directions = theano.function(
            [],
            updates=dict(zip(self.ds + [self.norm_d], nw_ds + [nw_normd])),
            name='reset_dirs',
            on_unused_input='warn',
            mode=cpu_mode,
            allow_input_downcast=True,
            profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']

        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_cost_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_cost_step',
                        profile=options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        ebdx = TT.iscalar('ebdx')
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.ls_cost_fn = theano.function([lr, ebdx],
                                          fcost,
                                          givens=grad_inps,
                                          allow_input_downcast=True,
                                          name='ls_cost_fn',
                                          mode=gpu_mode,
                                          profile=options['profile'])

        self.approx_change = theano.function(
            [lr],
            -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.ds)]),
            allow_input_downcast=True,
            name='approx_change',
            mode=gpu_mode,
            profile=options['profile'])

        self.ls_grad_fn = theano.function([lr, ebdx],
                                          fgrad,
                                          allow_input_downcast=True,
                                          givens=grad_inps,
                                          name='ls_grad_fn',
                                          mode=gpu_mode,
                                          profile=options['profile'])

        self.old_score = 50000
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=cpu_mode,
                                             allow_input_downcast=True,
                                             on_unused_input='warn',
                                             profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)
        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
Example #36
0
def krylov_subspace(compute_Av,
                    bs,
                    old_dir,
                    iters=20,
                    param_shapes=None,
                    profile=0,
                    device='gpu'):
    eps = numpy.float32(1e-20)
    bs = [b / tensor.sqrt((b ** 2).sum() + eps) for b in bs]
    mem_bufs = [tensor.alloc(zero, iters, *param_sh)
                for param_sh in param_shapes]
    mem_bufs = [tensor.set_subtensor(mem[0], b)
                for mem, b in zip(mem_bufs, bs)]

    def construct_space(*args):
        vs, updates = compute_Av(*args)
        # We need to rescale at every step; otherwise, if A is damping, these
        # vs quickly go to 0 and we lose the direction they represent
        norm = TT.sqrt(sum((v**2).sum() for v in vs)) + numpy.float32(1e-20)
        vs = [v / norm for v in vs]
        return vs, updates
    if device == 'gpu':
        mode = gpu_mode
    else:
        mode = cpu_mode
    outs, updates = scan(construct_space,
                         states=mem_bufs,
                         n_steps=iters - 2,
                         name='krylov_space',
                         mode=mode,
                         profile=profile)
    if not isinstance(outs, (list, tuple)):
        outs = [outs]
    outs = [tensor.set_subtensor(out[iters - 1], o)
            for out, o in zip(outs, old_dir)]
    outs = [tensor.unbroadcast(tensor.shape_padleft(x), 0)
            for x in outs]
    param_lengths = [numpy.prod(shp) for shp in param_shapes]

    def ortho(idx, *ortho_mats):
        new_ortho_mats = []
        for A, param_length in zip(ortho_mats, param_lengths):
            weight = tensor.dot(A[idx + 1:].reshape(
                (iters - idx - 1, param_length)),
                A[idx].reshape((param_length,)))
            A_reshuffle = ['x'] + list(range(A[idx].ndim))
            W_reshuffle = [0] + ['x'] * A[idx].ndim
            to_remove = weight.dimshuffle(*W_reshuffle) *\
                        A[idx].dimshuffle(*A_reshuffle)
            new_A = tensor.set_subtensor(A[idx + 1:],
                                         A[idx + 1:] - to_remove)
            x_col = new_A[idx + 1]
            x_col = x_col / tensor.sqrt((x_col ** 2).sum()+eps)
            new_A = tensor.set_subtensor(new_A[idx + 1], x_col)
            new_ortho_mats.append(new_A)
        return new_ortho_mats
    rvals, _ = scan(ortho,
                    sequences=tensor.constant(numpy.arange(iters - 1)),
                    states=outs,
                    n_steps=iters - 1,
                    name='ortho',
                    profile=profile,
                    mode=mode)
    if not isinstance(rvals, (list, tuple)):
        rvals = [rvals]
    rvals = [rval[0] * .1 for rval in rvals]
    return rvals, updates
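The `ortho` scan above performs a modified Gram-Schmidt pass over the stacked
Krylov vectors: step `idx` subtracts the component along row `idx` from all
later rows, then renormalizes row `idx + 1`. Below is a minimal,
self-contained NumPy sketch of the same pass on a single, already-flattened
buffer (illustrative only; `gram_schmidt_rows` and its argument names are
made up here):

import numpy

def gram_schmidt_rows(A, eps=1e-20):
    # A: (iters, param_length) matrix whose rows span the Krylov space;
    # row 0 is assumed to already have unit norm.
    A = A.copy()
    for idx in range(A.shape[0] - 1):
        # projection of every later row onto row idx
        weight = A[idx + 1:].dot(A[idx])
        A[idx + 1:] -= weight[:, None] * A[idx][None, :]
        # renormalize the next row, as the scan does with x_col
        A[idx + 1] /= numpy.sqrt((A[idx + 1] ** 2).sum() + eps)
    return A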
Example #37
0
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1

    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']
    rng = numpy.random.RandomState(state['seed'])
    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']

    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(rng,
                     n_in=state['nins'],
                     n_hids=[state['rank_n_approx']],
                     activation=[state['rank_n_activ']],
                     init_fn=state['weight_init_fn'],
                     weight_noise=state['weight_noise'],
                     scale=state['weight_scale'],
                     name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_%d' % si))
        if state['rec_gating']:
            gater_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_%d' % si))
        if state['rec_reseting']:
            reseter_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_%d' % si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_reseter_%d' % si))

        add_rec_step.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_%d' % si))

    def _add_op(words_embeddings,
                words_mask=None,
                prev_val=None,
                si=0,
                state_below=None,
                gater_below=None,
                reseter_below=None,
                one_step=False,
                bs=1,
                init_state=None,
                use_noise=True):
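        # Applies encoder recurrent layer `si` to the embedded words. Layers
        # above the first also receive a projection of the layer below
        # (`rec_proj`), plus gater/reseter contributions when gating or
        # reseting is enabled; `one_step` switches between full-sequence and
        # single-step application.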
        seqlen = words_embeddings.out.shape[0] // bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si - 1](state_below,
                                     one_step=one_step,
                                     use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si - 1](state_below,
                                               one_step=one_step,
                                               use_noise=use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if reseter: reseter += projg
                else: reseter = projg

        if not one_step:
            rval = add_rec_step[si](rval,
                                    nsteps=seqlen,
                                    batch_size=bs,
                                    mask=words_mask,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        else:
            rval = add_rec_step[si](rval,
                                    mask=words_mask,
                                    state_before=prev_val,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        return rval

    add_op = Operator(_add_op)

    # Target Sentence
    emb_t = MultiLayer(rng,
                       n_in=state['nouts'],
                       n_hids=[state['rank_n_approx']],
                       activation=[state['rank_n_activ']],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_t_%d' % si))
        if state['rec_gating']:
            gater_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_t_%d' % si))
        if state['rec_reseting']:
            reseter_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_t_%d' % si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='proj_everything_t_%d' % si,
                       learn_bias=False))
        if state['rec_gating']:
            gater_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='gater_everything_t_%d' % si,
                           learn_bias=False))
        if state['rec_reseting']:
            reseter_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='reseter_everything_t_%d' % si,
                           learn_bias=False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_t_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_reseter_%d' % si))

        add_rec_step_t.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_t_%d' % si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim'] * state['maxout_part']],
                           activation=['lambda x: x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='encoder_proj_%d' % si,
                           learn_bias=(si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']),
                                    indim=indim,
                                    pieces=pieces,
                                    rng=rng)

    def _add_t_op(words_embeddings,
                  everything=None,
                  words_mask=None,
                  prev_val=None,
                  one_step=False,
                  bs=1,
                  init_state=None,
                  use_noise=True,
                  gater_below=None,
                  reseter_below=None,
                  si=0,
                  state_below=None):
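        # Decoder counterpart of _add_op: same stacking and gating logic,
        # but additionally mixes the encoder context (`everything`) into the
        # input, gater and reseter of each decoder layer.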
        seqlen = words_embeddings.out.shape[0] // bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si - 1](state_below,
                                       one_step=one_step,
                                       use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si - 1](state_below,
                                                   one_step=one_step,
                                                   use_noise=use_noise)
                if reseter: reseter += projg
                else: reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything,
                                                one_step=one_step,
                                                use_noise=use_noise)
                if gater: gater += everyg
                else: gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything,
                                                  one_step=one_step,
                                                  use_noise=use_noise)
                if reseter: reseter += everyg
                else: reseter = everyg

        if not one_step:
            rval = add_rec_step_t[si](rval,
                                      nsteps=seqlen,
                                      batch_size=bs,
                                      mask=words_mask,
                                      one_step=one_step,
                                      init_state=init_state,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        else:
            rval = add_rec_step_t[si](rval,
                                      mask=words_mask,
                                      state_before=prev_val,
                                      one_step=one_step,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        return rval

    add_t_op = Operator(_add_t_op)

    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=[state['activ']],
                           bias_scale=[state['bias']],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           name='bias_code_%d' % si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(rng,
                               n_in=word_code_nin,
                               n_hids=[outdim],
                               activation='lambda x:x',
                               bias_scale=[state['bias_mlp'] / 3],
                               scale=state['weight_scale'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               learn_bias=False,
                               name='word_code')

    proj_code = MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[outdim],
                           activation='lambda x: x',
                           bias_scale=[state['bias_mlp'] / 3],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           learn_bias=False,
                           name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[outdim],
                       activation='lambda x: x',
                       bias_scale=[state['bias_mlp'] / 3],
                       scale=state['weight_scale'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       name='proj_h_%d' % si))

    if state['bigram']:
        proj_word = MultiLayer(rng,
                               n_in=state['rank_n_approx'],
                               n_hids=[outdim],
                               activation=['lambda x:x'],
                               bias_scale=[state['bias_mlp'] / 3],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(rng,
                                indim,
                                state['nouts'],
                                state['weight_scale'],
                                -1,
                                rank_n_approx=rank_n_approx,
                                rank_n_activ=rank_n_activ,
                                weight_noise=state['weight_noise'],
                                init_fn=state['weight_init_fn'],
                                name='out')

    def _pop_op(everything,
                accum,
                everything_max=None,
                everything_min=None,
                word=None,
                aword=None,
                one_step=False,
                use_noise=True):

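        # Output read-out: combines the projected decoder states with the
        # encoder context (multiplicatively or additively, per `mult_out`),
        # optionally adds average-word and bigram (previous-word) features,
        # and finishes with the deep-output activation/dropout stack when
        # `deep_out` is enabled.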
        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1, state['decoder_stack']):
            rval += proj_h[si](accum[si],
                               one_step=one_step,
                               use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape(
                    [rshape[0] / shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise),
                                  one_step=one_step,
                                  use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1],
                                                   outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    pop_op = Operator(_pop_op)

    # 3. Constructing the model
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x)),
               x_mask,
               bs=x_mask.shape[1],
               si=0,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x)),
                   x_mask,
                   bs=x_mask.shape[1],
                   si=si,
                   state_below=encoder_acts[-1],
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True, n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape(
            [1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True, n=y.shape[0])(everything)

    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        init_state = [None] * state['decoder_stack']

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape(
            [shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [
        add_t_op(emb_words_t[0](emb_t(y0)),
                 everything,
                 y_mask,
                 bs=y_mask.shape[1],
                 gater_below=gater_below,
                 reseter_below=reseter_below,
                 init_state=init_state[0],
                 si=0)
    ]
    for si in xrange(1, state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(
            add_t_op(emb_words_t[si](emb_t(y0)),
                     everything,
                     y_mask,
                     bs=y_mask.shape[1],
                     state_below=has_said[-1],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     init_state=init_state[si],
                     si=si))

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword=aword)

    nll = output_layer.train(
        state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast(
            y.shape[0] * y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x), use_noise=False),
               si=0,
               use_noise=False,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x), use_noise=False),
                   si=si,
                   state_below=encoder_acts[-1],
                   use_noise=False,
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]),
                                           use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(
                TT.reshape(bias_code[si](everything, use_noise=False),
                           [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x, use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

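    # One step of ancestral sampling: unpack the previous word, the
    # per-layer decoder states and the context from `args`, draw a sample
    # and its log-probability from the output layer, then advance every
    # decoder layer by one step.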
    def sample_fn(*args):
        aidx = 0
        word_tm1 = args[aidx]
        aidx += 1
        prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1
            has_said_tm1.append(args[aidx])
        aidx += 1
        ctx = args[aidx]
        awrd = None
        if state['avg_word']:
            aidx += 1
            awrd = args[aidx]

        val = pop_op(proj_code(ctx),
                     has_said_tm1,
                     word=word_tm1,
                     aword=awrd,
                     one_step=True,
                     use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(
            state_below=val.out.reshape(
                [1, TT.cast(output_layer.n_in, 'int64')]),
            temp=temp,
            target=sample.reshape([1, 1]),
            use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [
            add_t_op(emb_words_t[0](emb_t(sample)),
                     ctx,
                     prev_val=has_said_tm1[0],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     one_step=True,
                     use_noise=True,
                     si=0)
        ]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(
                add_t_op(emb_words_t[si](emb_t(sample)),
                         ctx,
                         prev_val=has_said_tm1[si],
                         gater_below=gater_below,
                         reseter_below=reseter_below,
                         one_step=True,
                         use_noise=True,
                         si=si,
                         state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
                            states=states,
                            params=sampler_params,
                            n_steps=n_steps,
                            name='sampler_scan')
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function([n_steps, temp, x],
                                [samples, probs.sum()],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    model = LM_Model(cost_layer=nll,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     sample_fn=sample_fn,
                     clean_before_noise_fn=False,
                     noise_fn=noise_fn,
                     indx_word=state['indx_word_target'],
                     indx_word_src=state['indx_word'],
                     character_level=False,
                     rng=rng)

    if state['loopIters'] > 0: algo = SGD(model, state, train_data)
    else: algo = None

    def hook_fn():
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:, idx].shape[0]):
                        print model.word_indxs_src[x[:, idx][k]],
                        if model.word_indxs_src[x[:, idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:, idx].shape[0]):
                        print model.word_indxs[y[:, idx][k]],
                        if model.word_indxs[y[:, idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:, idx])
                    if len(numpy.where(masks[:, idx] == 0)[0]) > 0:
                        senlen = numpy.where(masks[:, idx] == 0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen'] + 1, 1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen'] + 1, 1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data,
                    valid_data,
                    None,
                    model,
                    algo,
                    state,
                    channel,
                    reset=state['reset'],
                    hooks=hook_fn)
    if state['reload']: main.load()
    if state['loopIters'] > 0: main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word = pkl.load(open(state['word_indx'], 'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen + 1, dtype='int64')
                    for idx, sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except KeyError:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    #import ipdb; ipdb.set_trace()
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'
            pass
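For reference, the sampler compiled above takes `(n_steps, temp, x)` and
returns the sampled target indices together with the summed log-probability,
which is exactly how the test loop calls it. A hypothetical standalone
invocation (the token ids here are made up for illustration):

seq = numpy.asarray([42, 7, state['null_sym_source']], dtype='int64')
values, logp = model.sample_fn(3 * len(seq), 1.0, seq)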
Example #38
0
def linear_cg(compute_Ax,
              b,
              M=None,
              xinit=None,
              rtol=1e-16,
              maxiter=100000,
              damp=0.,
              floatX=None):
    """
    Solves the system A x[i] = b[i], for all i.
    
    When used as part of a Newton-CG method, b is a list of gradients, where each element of
    this list represents a gradient for a given parameter type (i.e. weight or bias of a given
    layer). This method will return a list whose elements approximates A^{-1} b[i], with the
    precision determined by maxiter or the specified tolerance level. This particular
    version implements the Polak-Ribiere flavor of CG.

    Parameters:
    :param compute_Ax: python function which symbolically computes the matrix-vector product.
    :param b: list of T.vector, corresponding to A x[i] = b[i]
    :param M: list of T.vector (same length as b). Each element is used to precondition its
    corresponding element of the A-diagonal. If [Mi for Mi in M] contains the diagonal elements
    of A, this will implement Jacobi preconditioning.
    :param xinit: list of T.vector (same length as b). x[i] is initial guess for A^{-1} b[i].
    :param rtol: float. CG will stop when the norm of the residual error < rtol.
    :param maxiter: int. Maximum allowable iterations for CG.
    :param damp: float. Damping factor, equivalent to adding a term along the diagonal of A.
    :param floatX: 'float32' or 'float64'.

    Return values:
    rval[0]: niter, number of iterations run by CG
    rval[1]: residual error norm.
    rval[2+i]: approximate value for A^{-1} b[i].

    Reference:
        http://en.wikipedia.org/wiki/Conjugate_gradient_method
    """
    n_params = len(b)

    def loop(niter, rkp_norm, *args):
        pk = args[:n_params]
        rk = args[n_params:2 * n_params]
        zk = args[2 * n_params:3 * n_params]
        xk = args[-n_params:]
        A_pk_temp = compute_Ax(*pk)
        A_pk = [
            A_pk_temp_ + damp * pk_ for A_pk_temp_, pk_ in zip(A_pk_temp, pk)
        ]
        alphak_num = sum((rk_ * zk_).sum() for rk_, zk_ in zip(rk, zk))
        alphak_denum = sum((A_pk_ * pk_).sum() for A_pk_, pk_ in zip(A_pk, pk))
        alphak = alphak_num / alphak_denum
        xkp1 = [xk_ + alphak * pk_ for xk_, pk_ in zip(xk, pk)]
        rkp1 = [rk_ - alphak * A_pk_ for rk_, A_pk_, in zip(rk, A_pk)]
        if M:
            zkp1 = [rkp1_ / m_ for rkp1_, m_ in zip(rkp1, M)]
        else:
            zkp1 = rkp1
        # compute beta_k using Polak-Ribiere
        betak_num = sum((zkp1_ * (rkp1_ - rk_)).sum()
                        for rkp1_, rk_, zkp1_ in zip(rkp1, rk, zkp1))
        betak_denum = alphak_num
        betak = betak_num / betak_denum
        pkp1 = [zkp1_ + betak * pk_ for zkp1_, pk_ in zip(zkp1, pk)]
        # compute termination criterion
        rkp1_norm = sum((rkp1_**2).sum() for rkp1_ in rkp1)
        return [niter + 1, rkp1_norm] + pkp1 + rkp1 + zkp1 + xkp1,\
               theano.scan_module.until(abs(rkp1_norm) < rtol)

    # Initialize residual based on xinit
    if xinit is None:
        r0_temp = b
        x0 = [
            tensor.unbroadcast(tensor.shape_padleft(tensor.zeros_like(b_)))
            for b_ in b
        ]
    else:
        init_Ax = compute_Ax(*xinit)
        r0_temp = [b[i] - init_Ax[i] for i in xrange(len(b))]
        x0 = [
            tensor.unbroadcast(tensor.shape_padleft(xinit_))
            for xinit_ in xinit
        ]

    # Leftpad r0, z0 and p0 for scan.
    r0 = [
        tensor.unbroadcast(tensor.shape_padleft(r0_temp_))
        for r0_temp_ in r0_temp
    ]
    if M:
        z0 = [
            tensor.unbroadcast(tensor.shape_padleft(r0_temp_ / m_))
            for r0_temp_, m_ in zip(r0_temp, M)
        ]
    else:
        z0 = r0
    p0 = z0

    states = []
    # 0 niter
    states.append(tensor.constant(npy_floatX([0])))
    # 1 residual error norm
    states.append(tensor.constant(npy_floatX([0])))

    outs, updates = scan(loop,
                         states=states + p0 + r0 + z0 + x0,
                         n_steps=maxiter,
                         mode=theano.Mode(linker='c|py'),
                         name='linear_conjugate_gradient',
                         profile=0)
    sol = [x[0] for x in outs[-n_params:]]
    niter = outs[0][0]
    rerr = outs[1][0]
    return [sol, niter, rerr]
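A minimal NumPy sketch of the same Jacobi-preconditioned Polak-Ribiere CG
recurrences, checked on a small SPD system. `cg_sketch`, `A`, `b` and `M` are
illustrative names, not part of the code above.

import numpy as np

def cg_sketch(A, b, M=None, rtol=1e-10, maxiter=100):
    # mirrors the scan loop: alpha from the preconditioned residual,
    # beta via Polak-Ribiere, termination on the squared residual norm
    x = np.zeros_like(b)
    r = b - A.dot(x)
    z = r / M if M is not None else r
    p = z.copy()
    niter = 0
    for niter in range(maxiter):
        Ap = A.dot(p)
        rz = r.dot(z)                     # alphak_num above
        alpha = rz / p.dot(Ap)
        x = x + alpha * p
        r_new = r - alpha * Ap
        z_new = r_new / M if M is not None else r_new
        beta = z_new.dot(r_new - r) / rz  # Polak-Ribiere
        p = z_new + beta * p
        r, z = r_new, z_new
        if r.dot(r) < rtol:
            break
    return x, niter + 1, r.dot(r)

rng = np.random.RandomState(0)
Q = rng.randn(5, 5)
A = Q.dot(Q.T) + 5 * np.eye(5)            # small SPD test matrix
b = rng.randn(5)
x, niter, rerr = cg_sketch(A, b, M=np.diag(A).copy())  # Jacobi preconditioning
assert np.allclose(A.dot(x), b, atol=1e-4)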
Example #39
0
    def __init__(
            self,
            nhids=50,
            nouts=8,
            nins=2,
            activ=TT.nnet.sigmoid,
            seed=234,
            bs=16,  # batchsize
            seqlen=3  # sequence length - fixed during training
    ):
        # 0. Keep track of arguments
        self.bs = bs
        self.nhids = nhids
        self.nouts = nouts
        self.nins = nins
        self.activ = activ
        self.seed = seed
        self.seqlen = seqlen
        floatX = theano.config.floatX
        self.rng = numpy.random.RandomState(seed)

        # 1. Generating Theano variables
        # DenseSequence space
        # We store data as 3D tensor with (time, batch-size, nfeatures)
        self.x = TT.tensor3('x')
        # IndexSequence space
        # We store data as a 1D tensor whose single dimension goes over the
        # batch (i.e. the target of each sequence in the batch)
        self.t = TT.ivector('t')  # target index for each element of batchsize
        self.inputs = [self.x, self.t]
        # Naming convention for letters after the `_`:
        # u - input
        # h - hidden
        # y - output
        # f - forward
        # b - backwards

        self.W_uhf = numpy.asarray(self.rng.normal(size=(self.nins,
                                                         self.nhids),
                                                   loc=0,
                                                   scale=.01),
                                   dtype=floatX)
        self.W_uhb = numpy.asarray(self.rng.normal(size=(self.nins,
                                                         self.nhids),
                                                   loc=0,
                                                   scale=.01),
                                   dtype=floatX)
        self.W_hhf = numpy.asarray(self.rng.normal(size=(self.nhids,
                                                         self.nhids),
                                                   loc=0,
                                                   scale=1),
                                   dtype=floatX)
        self.W_hhb = numpy.asarray(self.rng.normal(size=(self.nhids,
                                                         self.nhids),
                                                   loc=0,
                                                   scale=1),
                                   dtype=floatX)
        self.W_hyf = numpy.asarray(self.rng.normal(size=(self.nhids,
                                                         self.nouts),
                                                   loc=0,
                                                   scale=.1),
                                   dtype=floatX)
        self.W_hyb = numpy.asarray(self.rng.normal(size=(self.nhids,
                                                         self.nouts),
                                                   loc=0,
                                                   scale=.1),
                                   dtype=floatX)
        # sparsifying hidden weights (Ilya&Martens formula == ESN style
        # init)
        for dx in xrange(self.nhids):
            psng = self.rng.permutation(nhids)
            self.W_hhf[dx][psng[15:]] = 0.
            psng = self.rng.permutation(nhids)
            self.W_hhb[dx][psng[15:]] = 0.

        # Any spectral radius larger than .9 and smaller than 1.1 should be fine
        sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhf)))
        self.W_hhf = numpy.float32(.97 * self.W_hhf / sr)
        sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhb)))
        self.W_hhb = numpy.float32(.97 * self.W_hhb / sr)
        self.b_hhf = numpy.zeros((nhids, ), dtype=floatX)
        self.b_hhb = numpy.zeros((nhids, ), dtype=floatX)
        self.b_hy = numpy.zeros((nouts, ), dtype=floatX)

        self.W_uhf = theano.shared(self.W_uhf, name='W_uhf')
        self.W_uhb = theano.shared(self.W_uhb, name='W_uhb')
        self.W_hhf = theano.shared(self.W_hhf, name='W_hhf')
        self.W_hhb = theano.shared(self.W_hhb, name='W_hhb')
        self.W_hyf = theano.shared(self.W_hyf, name='W_hyf')
        self.W_hyb = theano.shared(self.W_hyb, name='W_hyb')
        self.b_hhf = theano.shared(self.b_hhf, name='b_hhf')
        self.b_hhb = theano.shared(self.b_hhb, name='b_hhb')
        self.b_hy = theano.shared(self.b_hy, name='b_hy')

        self.params = [
            self.W_uhf, self.W_uhb, self.W_hhf, self.W_hhb, self.W_hyf,
            self.W_hyb, self.b_hhf, self.b_hhb, self.b_hy
        ]
        self.best_params = [(x.name, x.get_value()) for x in self.params]
        self.params_shape = [
            x.get_value(borrow=True).shape for x in self.params
        ]

        # 2. Constructing Theano graph
        # Note: new interface of scan asks the user to provide a memory
        # buffer that contains the initial state but which is also used
        # internally by scan to store the intermediate values of its
        # computations - hence the initial state is a 3D tensor
        h0_f = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1, self.bs,
                        self.nhids)
        h0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1, self.bs,
                        self.nhids)

        # Do we use too much memory!?
        p_hf = TT.dot(self.x.reshape(
            (self.seqlen * self.bs, self.nins)), self.W_uhf) + self.b_hhf
        p_hb = TT.dot(self.x[::-1].reshape(
            (self.seqlen * self.bs, self.nins)), self.W_uhb) + self.b_hhb

        def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1):
            hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t)
            hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t)
            return hf_t, hb_t

        # provide sequence length !? is better on GPU
        [h_f,
         h_b], _ = scan(recurrent_fn,
                        sequences=[
                            p_hf.reshape((self.seqlen, self.bs, self.nhids)),
                            p_hb.reshape((self.seqlen, self.bs, self.nhids))
                        ],
                        states=[h0_f, h0_b],
                        n_steps=self.seqlen,
                        name='bi-RNN',
                        profile=0)
        h_b = h_b[::-1]
        # Optionally do the max over hidden layer !?
        # I'm afraid the semantics for RNN are somewhat different than MLP
        y = TT.nnet.softmax(
            TT.dot(h_f.reshape((self.seqlen * self.bs + self.bs, self.nhids
                                )), self.W_hyf) +  # Check doc flatten
            TT.dot(h_b.reshape((self.seqlen * self.bs + self.bs,
                                self.nhids)), self.W_hyb) + self.b_hy)
        my = y.reshape((self.seqlen + 1, self.bs, self.nouts)).max(axis=0)
        nll = -TT.log(my[TT.constant(numpy.arange(self.bs)), self.t])
        self.train_cost = nll.mean()
        self.error = TT.mean(TT.neq(my.argmax(axis=1), self.t) * 100.)
        ## |-----------------------------
        # - Computing metric times a vector efficiently for p(y|x)
        # Assume softmax .. we might want sigmoids though
        self.Gyvs = lambda *args:\
            TT.Lop(y, self.params,
                   TT.Rop(y, self.params, args) /\
                   (y*numpy.array(self.bs, dtype=floatX)))
        # Computing metric times a vector efficiently for p(h|x)
        if activ == TT.nnet.sigmoid:
            fn = lambda x: (1 - x) * x * numpy.array(self.bs, dtype=floatX)
        elif activ == TT.tanh:
            # derivative of tanh expressed via its output: 1 - x**2
            fn = lambda x: (1 - x ** 2) * numpy.array(self.bs, dtype=floatX)
        else:  # Assume linear or piece-wise linear activation
            fn = lambda x: numpy.array(self.bs, dtype=floatX)
        self.Ghfvs = lambda *args:\
                TT.Lop(h_f, self.params,
                       TT.Rop(h_f, self.params, args) / fn(h_f))
        self.Ghbvs = lambda *args:\
                TT.Lop(h_b, self.params,
                       TT.Rop(h_b, self.params, args) / fn(h_b))
        ## ------------------ |

        vx = TT.matrix('vx')
        vt = TT.iscalar('vt')
        vh0_f = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1,
                         self.nhids)
        vh0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen + 1,
                         self.nhids)

        # Do we use too much memory!?
        vp_hf = TT.dot(vx, self.W_uhf) + self.b_hhf
        vp_hb = TT.dot(vx[::-1], self.W_uhb) + self.b_hhb

        def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1):
            hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t)
            hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t)
            return hf_t, hb_t

        # provide sequence length !? is better on GPU
        [vh_f, vh_b], _ = scan(recurrent_fn,
                               sequences=[vp_hf, vp_hb],
                               states=[vh0_f, vh0_b],
                               name='valid bi-RNN',
                               n_steps=vp_hf.shape[0],
                               profile=0)
        vh_b = vh_b[::-1]
        # Optionally do the max over hidden layer !?
        # I'm afraid the semantics for RNN are somewhat different than MLP
        vy = TT.nnet.softmax(
            TT.dot(vh_f, self.W_hyf) + TT.dot(vh_b, self.W_hyb) + self.b_hy)
        my = TT.neq(vy.max(axis=0).argmax(), vt)
        self.validate = theano.function([vx, vt],
                                        my,
                                        name='validation',
                                        profile=0)
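The sparsify-then-rescale initialization above (15 incoming weights per hidden
unit, spectral radius pushed just below 1, in the ESN style attributed to
Sutskever & Martens) can be checked in isolation; a standalone sketch with an
assumed `nhids`:

import numpy as np

rng = np.random.RandomState(234)
nhids = 50
W = rng.normal(size=(nhids, nhids), loc=0, scale=1).astype('float32')
for dx in range(nhids):
    # keep only 15 nonzero entries per row
    W[dx][rng.permutation(nhids)[15:]] = 0.
sr = np.max(abs(np.linalg.eigvals(W)))    # spectral radius
W = np.float32(.97 * W / sr)
assert abs(np.max(abs(np.linalg.eigvals(W))) - .97) < 1e-2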
Example #40
0
    def init_gpu(self, options, channel, data, model):
        # Step 1. Compile function for computing Euclidean gradients
        eps = numpy.float32(1e-24)
        gbdx = TT.iscalar('grad_batch_idx')
        n_params = len(self.model.params)
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            # Compute the Jacobi preconditioner (sum of squared Jacobian-vector products)
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out+eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout * (numpy.float32(1) -
                                               tnwout))*factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in model.params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1+n_params:1+2*n_params], model.params)]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        nw_js = [x[0] for x in rvals[1+n_params:1+2*n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            allow_input_downcast=True,
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for Computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs'])# * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates
        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        self.damping = theano.shared(numpy.float32(options['mreg']))
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))

        reset = TT.scalar(dtype='int8', name='reset')

        norm_kkm1 = sum([(r*g).sum() for r,g in zip(self.rs, self.gs)])
        norm_kk = sum([(r*g).sum() for r,g in zip(nw_rs, self.gs)])
        norm_dk = sum([(d*g).sum() for d,g in zip(self.ds, self.gs)])

        norm_y = norm_kk - 2*norm_kkm1 + self.norm_km1km1
        beta_k = (norm_kk - norm_kkm1)/(norm_dk - self.norm_dkm1) - \
                2 * norm_y * (norm_dk/((norm_dk - self.norm_dkm1) **2))
        beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)),
                           beta_k)
        beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k),
                                         TT.isinf(beta_k)),
                           TT.constant(numpy.float32(0.)),
                           beta_k)

        nwds = [-r + beta_k*d for r,d in zip(nw_rs, self.ds)]
        self.nwds = nwds
        nw_normd = TT.sqrt(sum([(d*d).sum() for d in nwds])) + \
                numpy.float32(1e-25)

        updates.update(dict(zip(self.rs, nw_rs)))
        updates.update(dict(zip(self.ds, nwds)))
        updates[self.norm_km1km1] = norm_kk
        updates[self.norm_dkm1] = norm_dk
        updates[self.norm_d] = nw_normd
        print 'Compiling riemannian gradient function'
        cst = time.time()
        grad_inps = [(x, y[rbdx*options['mbs']:(rbdx+1)*options['mbs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        self.compute_riemannian_gradients = theano.function(
            [reset, rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0,
             beta_k],
            updates=updates,
            allow_input_downcast = True,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
        nw_ds = [ -r for r in self.rs]
        nw_normd = TT.sqrt(sum([(r*r).sum() for r in self.rs]))
        self.update_params = theano.function([lr],
                                             updates = dict(zip(model.params,
                                                                newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=gpu_mode,
                                             profile=options['profile'])
        self.reset_directions = theano.function([],
                                                updates=dict(zip(self.ds +
                                                                 [self.norm_d],
                                                                 nw_ds +
                                                                 [nw_normd])),
                                                name='reset_dirs',
                                                on_unused_input='warn',
                                                mode=cpu_mode,
                                                allow_input_downcast=True,
                                                profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']
        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_cost_step,
                        states = states,
                        n_steps = n_steps,
                        name='ls_cost_step',
                        profile = options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_grad_step,
                        states = states,
                        n_steps = n_steps,
                        name = 'ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        ebdx = TT.iscalar('ebdx')
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, self.shared_data)]
        self.ls_cost_fn = theano.function(
            [lr, ebdx],
            fcost,
            givens = grad_inps,
            allow_input_downcast=True,
            name='ls_cost_fn',
            mode=gpu_mode,
            profile=options['profile'])

        self.approx_change = theano.function(
                [lr],
                -lr*sum([TT.sum(g*r) for g,r in zip(self.gs, self.ds)]),
                allow_input_downcast=True,
                name='approx_change',
                mode=gpu_mode,
                profile=options['profile'])


        self.ls_grad_fn = theano.function(
            [lr, ebdx],
            fgrad,
            allow_input_downcast=True,
            givens = grad_inps,
            name='ls_grad_fn',
            mode=gpu_mode,
            profile=options['profile'])

        self.old_score = 50000
        n_steps = options['ebs']// options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile = options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                           ferr,
                           givens=dict(grad_inps),
                           name='compute_err',
                           mode=cpu_mode,
                           allow_input_downcast=True,
                           on_unused_input='warn',
                           profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)
        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
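Both `Gyvs` in the bi-RNN model and `compute_Gv` above realize the same
Gauss-Newton/Fisher metric-vector product G v = J^T diag(1 / (N * y)) J v
through Rop/Lop. A dense NumPy rendering with an explicit Jacobian
(`fisher_vec` and all shapes are illustrative, not part of the code above):

import numpy as np

def fisher_vec(J, y, v, N):
    # J: (n_out, n_params) Jacobian of the softmax output w.r.t. params
    # y: (n_out,) softmax output; v: (n_params,) vector; N: batch size
    Jv = J.dot(v)                  # R-operator: directional derivative
    return J.T.dot(Jv / (y * N))   # L-operator: weighted pullback

rng = np.random.RandomState(0)
J = rng.randn(8, 20)
y = rng.dirichlet(np.ones(8))      # a valid softmax output
v = rng.randn(20)
G = J.T.dot(np.diag(1. / (y * 16))).dot(J)   # the same metric, densely
assert np.allclose(fisher_vec(J, y, v, N=16), G.dot(v))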
Example #41
0
File: SGD.py Project: vd114/galatea
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object return by numpy.load containing the
                data
            model : model
        """
        self.model = model
        # push dataset into shared var
        n_params = len(model.params)
        xdata = theano.shared(data['train_x'].astype('float32'), name='xdata')
        # ! This works for 1 of k classification
        ydata = TT.cast(
            theano.shared(data['train_y'].astype('float32'), name='ydata'),
            'int32')

        shared_data = [xdata, ydata]
        self.xdata = xdata
        self.ydata = ydata
        # all sorts of indices
        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # vars for gradients
        # Store Euclidean gradients
        self.gs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store riemannian gradients (H^-1*g)
        self.rs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        updates.update(dict(zip(self.gs, nw_gs)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            on_unused_input='warn',
            name='compute_eucledian_gradients',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * g for p, g in zip(model.params, self.gs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              profile=options['profile'])

        final_cost = rvals[1][0] / const(n_steps)
        update_vals = dict(zip(model.params, nw_ps))
        #updates.update(dict(zip(model.params, nw_ps)))
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, shared_data)]

        print 'compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       updates=updates,
                                       on_unused_input='warn',
                                       name='eval_fn',
                                       mode=theano.Mode(linker='cvm'),
                                       profile=options['profile'])
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_vals,
            on_unused_input='warn',
            #givens=dict(grad_inps),
            name='update_params',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])
        self.options = options
        self.old_cost = 1e6

        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc, acc_train_cost):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            train_cost = TT.cast(safe_clone(model.train_cost, replace=replace),
                                 'float32')
            return [
                _idx + const(1), acc + nw_cost, acc_train_cost + train_cost
            ]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=theano.Mode(linker='cvm'),
                        profile=options['profile'])

        ferr = rvals[1][0] / const(n_steps)
        ftrain_cost = rvals[2][0] / const(n_steps)

        self.compute_error = theano.function([ebdx], [ferr, ftrain_cost],
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             on_unused_input='warn',
                                             mode=theano.Mode(linker='cvm'),
                                             profile=options['profile'])
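The `grad_step` scan above amounts to averaging per-chunk gradients over a
large batch, `cbs` samples at a time; the equivalent plain-Python pattern,
with hypothetical `grad_fn` and `inputs`:

def accumulate_gradients(grad_fn, inputs, gbs, cbs):
    # grad_fn(*chunk) -> list of per-parameter gradient arrays
    n_steps = gbs // cbs
    acc = None
    for idx in range(n_steps):
        chunk = [x[idx * cbs:(idx + 1) * cbs] for x in inputs]
        gs = grad_fn(*chunk)
        acc = gs if acc is None else [a + g for a, g in zip(acc, gs)]
    return [a / n_steps for a in acc]    # matches nw_gs = x[0] / const(n_steps)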
Example #42
0
    def __init__(self,
                 nhids =50,
                 nouts = 8,
                 nins = 2,
                 activ = TT.nnet.sigmoid,
                 seed = 234,
                 bs = 16, # batchsize
                 seqlen = 3 # sequence length - fixed during training
                ):
        # 0. Keep track of arguments
        self.bs = bs
        self.nhids = nhids
        self.nouts = nouts
        self.nins = nins
        self.activ = activ
        self.seed = seed
        self.seqlen = seqlen
        floatX = theano.config.floatX
        self.rng = numpy.random.RandomState(seed)

        # 1. Generating Theano variables
        # DenseSequence space
        # We store data as 3D tensor with (time, batch-size, nfeatures)
        self.x = TT.tensor3('x')
        # IndexSequence space
        # We store data as a 1D tensor whose single dimension goes over the
        # batch (i.e. the target of each sequence in the batch)
        self.t = TT.ivector('t') # target index for each element of batchsize
        self.inputs = [self.x, self.t]
        # Naming convention for letters after the `_`:
        # u - input
        # h - hidden
        # y - output
        # f - forward
        # b - backwards

        self.W_uhf = numpy.asarray(
            self.rng.normal(size=(self.nins, self.nhids), loc=0, scale=.01),
            dtype=floatX)
        self.W_uhb = numpy.asarray(
            self.rng.normal(size=(self.nins, self.nhids), loc=0, scale=.01),
            dtype=floatX)
        self.W_hhf = numpy.asarray(
            self.rng.normal(size=(self.nhids, self.nhids), loc=0, scale=1),
            dtype=floatX)
        self.W_hhb = numpy.asarray(
            self.rng.normal(size=(self.nhids, self.nhids), loc=0, scale=1),
            dtype=floatX)
        self.W_hyf = numpy.asarray(
            self.rng.normal(size=(self.nhids, self.nouts), loc=0, scale=.1),
            dtype=floatX)
        self.W_hyb = numpy.asarray(
            self.rng.normal(size=(self.nhids, self.nouts), loc=0, scale=.1),
            dtype=floatX)
        # sparsifying hidden weights (Ilya&Martens formula == ESN style
        # init)
        for dx in xrange(self.nhids):
            psng = self.rng.permutation(nhids)
            self.W_hhf[dx][psng[15:]] = 0.
            psng = self.rng.permutation(nhids)
            self.W_hhb[dx][psng[15:]] = 0.

        # Any spectral radius larger than .9 and smaller than 1.1 should be fine
        sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhf)))
        self.W_hhf = numpy.float32(.97*self.W_hhf/sr)
        sr = numpy.max(abs(numpy.linalg.eigvals(self.W_hhb)))
        self.W_hhb = numpy.float32(.97*self.W_hhb/sr)
        self.b_hhf = numpy.zeros((nhids,), dtype=floatX)
        self.b_hhb = numpy.zeros((nhids,), dtype=floatX)
        self.b_hy = numpy.zeros((nouts,), dtype=floatX)

        self.W_uhf = theano.shared(self.W_uhf, name='W_uhf')
        self.W_uhb = theano.shared(self.W_uhb, name='W_uhb')
        self.W_hhf = theano.shared(self.W_hhf, name='W_hhf')
        self.W_hhb = theano.shared(self.W_hhb, name='W_hhb')
        self.W_hyf = theano.shared(self.W_hyf, name='W_hyf')
        self.W_hyb = theano.shared(self.W_hyb, name='W_hyb')
        self.b_hhf = theano.shared(self.b_hhf, name='b_hhf')
        self.b_hhb = theano.shared(self.b_hhb, name='b_hhb')
        self.b_hy = theano.shared(self.b_hy, name='b_hy')

        self.params = [self.W_uhf, self.W_uhb, self.W_hhf, self.W_hhb,
                       self.W_hyf, self.W_hyb, self.b_hhf, self.b_hhb,
                       self.b_hy]
        self.best_params = [(x.name, x.get_value()) for x in self.params]
        self.params_shape = [x.get_value(borrow=True).shape for x in
                             self.params]

        # 2. Constructing Theano graph
        # Note: new interface of scan asks the user to provide a memory
        # buffer that contains the initial state but which is also used
        # internally by scan to store the intermediate values of its
        # computations - hence the initial state is a 3D tensor
        h0_f = TT.alloc(numpy.array(0,dtype=floatX), self.seqlen+1, self.bs,
                              self.nhids)
        h0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen+1, self.bs,
                               self.nhids)

        # Do we use too much memory!?
        p_hf = TT.dot(self.x.reshape((self.seqlen*self.bs, self.nins)), self.W_uhf) + self.b_hhf
        p_hb = TT.dot(self.x[::-1].reshape((self.seqlen*self.bs, self.nins)), self.W_uhb) + self.b_hhb

        def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1):
            hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t)
            hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t)
            return hf_t, hb_t
        # provide sequence length !? is better on GPU
        [h_f, h_b], _ = scan(
            recurrent_fn,
            sequences = [
                p_hf.reshape((self.seqlen, self.bs, self.nhids)),
                p_hb.reshape((self.seqlen, self.bs, self.nhids))],
            states = [h0_f, h0_b],
            n_steps = self.seqlen,
            name = 'bi-RNN',
            profile = 0)
        h_b = h_b[::-1]
        # Optionally do the max over hidden layer !?
        # I'm afraid the semantics for RNN are somewhat different than MLP
        y = TT.nnet.softmax(
            TT.dot(h_f.reshape((self.seqlen * self.bs+self.bs, self.nhids)), self.W_hyf) + # Check doc flatten
            TT.dot(h_b.reshape((self.seqlen * self.bs+self.bs, self.nhids)), self.W_hyb) +
            self.b_hy)
        my = y.reshape((self.seqlen+1, self.bs, self.nouts)).max(axis=0)
        nll = -TT.log(
            my[TT.constant(numpy.arange(self.bs)), self.t])
        self.train_cost = nll.mean()
        self.error = TT.mean(TT.neq(my.argmax(axis=1), self.t) * 100.)
        ## |-----------------------------
        # - Computing metric times a vector efficiently for p(y|x)
        # Assume softmax .. we might want sigmoids though
        self.Gyvs = lambda *args:\
            TT.Lop(y, self.params,
                   TT.Rop(y, self.params, args) /\
                   (y*numpy.array(self.bs, dtype=floatX)))
        # Computing metric times a vector efficiently for p(h|x)
        if activ == TT.nnet.sigmoid:
            fn = lambda x : (1-x)*x*numpy.array(self.bs, dtype=floatX)
        elif activ == TT.tanh:
            # derivative of tanh expressed via its output: 1 - x**2
            fn = lambda x: (1 - x ** 2) * numpy.array(self.bs, dtype=floatX)
        else: # Assume linear or piece-wise linear activation
            fn = lambda x: numpy.array(self.bs, dtype=floatX)
        self.Ghfvs = lambda *args:\
                TT.Lop(h_f, self.params,
                       TT.Rop(h_f, self.params, args) / fn(h_f))
        self.Ghbvs = lambda *args:\
                TT.Lop(h_b, self.params,
                       TT.Rop(h_b, self.params, args) / fn(h_b))
        ## ------------------ |

        vx = TT.matrix('vx')
        vt = TT.iscalar('vt')
        vh0_f = TT.alloc(numpy.array(0,dtype=floatX), self.seqlen+1, self.nhids)
        vh0_b = TT.alloc(numpy.array(0, dtype=floatX), self.seqlen+1, self.nhids)

        # Do we use too much memory!?
        vp_hf = TT.dot(vx, self.W_uhf) + self.b_hhf
        vp_hb = TT.dot(vx[::-1], self.W_uhb) + self.b_hhb

        def recurrent_fn(pf_t, pb_t, hf_tm1, hb_tm1):
            hf_t = activ(TT.dot(hf_tm1, self.W_hhf) + pf_t)
            hb_t = activ(TT.dot(hb_tm1, self.W_hhb) + pb_t)
            return hf_t, hb_t
        # provide sequence length !? is better on GPU
        [vh_f, vh_b], _ = scan(
            recurrent_fn,
            sequences = [vp_hf, vp_hb],
            states = [vh0_f, vh0_b],
            name = 'valid bi-RNN',
            n_steps = vp_hf.shape[0],
            profile = 0)
        vh_b = vh_b[::-1]
        # Optionally do the max over hidden layer !?
        # I'm afraid the semantics for RNN are somewhat different than MLP
        vy = TT.nnet.softmax(
            TT.dot(vh_f, self.W_hyf) +
            TT.dot(vh_b, self.W_hyb) +
            self.b_hy)
        my = TT.neq(vy.max(axis=0).argmax(), vt)
        self.validate = theano.function([vx, vt], my,
                                        name='validation',
                                        profile=0)
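The readout used by both bi-RNN examples takes, per class, the maximum softmax
probability over time and trains the negative log-likelihood of the target
class on that maximum; a NumPy rendering with assumed shapes:

import numpy as np

rng = np.random.RandomState(0)
seqlen, bs, nouts = 4, 3, 8
# (time, batch, class) probabilities, each row a valid softmax output
y = rng.dirichlet(np.ones(nouts), size=(seqlen, bs))
t = rng.randint(nouts, size=bs)           # target class per sequence
my = y.max(axis=0)                        # max over time -> (bs, nouts)
nll = -np.log(my[np.arange(bs), t])
train_cost = nll.mean()
error = np.mean(my.argmax(axis=1) != t) * 100.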
Example #43
0
def minres(compute_Av,
           bs,
           rtol=numpy.float32(1e-6),
           maxit=20,
           Ms=None,
           shift=numpy.float32(0.),
           maxxnorm=numpy.float32(1e15),
           Acondlim=numpy.float32(1e16),
           mode=None,
           profile=0):
    """
     DESCRIPTION:
         minres attempts to find the minimum-length and minimum-residual-norm
         solution x to the system of linear equations A*x = b or
         least squares problem min||Ax-b||.  The n-by-n coefficient matrix A
         must be symmetric (but need not be positive definite or invertible).
         The right-hand-side column vector b must have length n.

     INPUTS:
        :param compute_Av: callable returning the symbolic expression for
            `Av`. `v` can be a set of parameters
        :param bs: list of Theano expressions. We want to compute
            A^{-1} \cdot bs
        :param rtol: Optional, real, specifies the tolerance of the method.
            Default is 1e-6
        :param maxit: Optional, positive integer, specifies the maximum number of
            iterations. Default is 20
        :param Ms: List of theano expression of same shape as `bs`. The
            method uses these to precondition with diag(Ms)
        :param shift: Optional, scalar, real or complex. Default is 0.
            Effectively solve the system (A + shift I) * x = b.
        :param maxxnorm: real positive, maximum bound on NORM(x).
            Default is 1e15.
        :param Acondlim: real positive, maximum bound on COND(A).
            Default is 1e16.

     OUTPUTS:
        x       n-vector, estimated solution
        flag    integer, convergence flag
               -1  beta2 = 0.  If M = I, b and x are eigenvectors.
                0 beta1 = 0.  The exact solution is  x = 0.
                1 A solution to (poss. singular) Ax = b found, given rtol.
                2 Pseudoinverse solution for singular LS problem, given rtol.
                3 A solution to (poss. singular) Ax = b found, given eps.
                4 Pseudoinverse solution for singular LS problem, given eps.
                5 x has converged to an eigenvector.
                6 xnorm has exceeded maxxnorm.
                7 Acond has exceeded Acondlim.
                8 The iteration limit was reached.
                9 It is a least squares problem but no converged solution yet.
        iter    integer, iteration number at which x was computed: 0 <= iter <= maxit.
        relres  real positive, the relative residual is defined as
                     NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)),
                computed recurrently here.  If flag is 1 or 3,  relres <= TOL.
        relAres real positive, the relative NORM(Ar) := NORM(Ar) / NORM(A),
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of matrix 2-norm of A.
        Acond   real positive, estimate of condition number of A with
                respect to 2-norm.
        xnorm   real non-negative, recurrently computed NORM(x).
        Axnorm  real non-negative, recurrently computed NORM(A * x).

    EXAMPLE 1:
         n = 100; on = ones(n,1); A = spdiags([-2*on 4*on -2*on],-1:1,n,n);
         b = sum(A,2); rtol = 1e-10; maxit = 50; M = spdiags(4*on,0,n,n);
         x = minresSOL69(A, b, rtol, maxit, M);

         Use this matrix-vector product function
            function y = afun(x,n)
            y = 4 * x;
            y(2:n) = y(2:n) - 2 * x(1:n-1);
            y(1:n-1) = y(1:n-1) - 2 * x(2:n);
         as input to minresSOL69
            x1 = minresSOL69(@afun, b, rtol, maxit, M);

     EXAMPLE 2: A is Laplacian on a 50 by 50 grid, singular and indefinite.
          n = 50; N = n^2; on=ones(n,1);   B = spdiags([on on on], -1:1, n, n);
          A = sparse([],[],[],N,N,(3*n-2)^2);
          for i=1:n
              A((i-1)*n+1:i*n,(i-1)*n+1:i*n) = B;
              if i*n+1 < n*n, A(i*n+1:(i+1)*n,(i-1)*n+1:i*n)=B; end;
              if (i-2)*n+1 > 0  A((i-2)*n+1:(i-1)*n,(i-1)*n+1:i*n)=B;  end;
          end
          b = sum(A,2);   rtol = 1e-5;   maxxnorm = 1e2;
          shift = 0;   Acondlim = [];   show = 1;   M = [];
          x = minresSOL69( A, b, rtol, N, M, shift, maxxnorm, Acondlim, show);

     EXAMPLE 3: A is diagonal, singular and indefinite.
          h = 1;  a = -10; b = -a; n = 2*b/h + 1;
          A = spdiags((a:h:b)', 0, n, n);
          b = ones(n,1);   rtol = 1e-6;   maxxnorm = 1e2;
          shift = 0;   Acondlim = [];   show = 1;   M = [];
          x = minresSOL69( A, b, rtol, N, M, shift, maxxnorm, Acondlim, show);



     REFERENCES:
        Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006.
             http://www.stanford.edu/group/SOL/software.html

    """

    if not isinstance(bs, (tuple, list)):
        bs = [bs]
        return_as_list = False
    else:
        bs = list(bs)
        return_as_list = True

    eps = numpy.float32(1e-23)

    # Initialise
    flag = theano.shared(numpy.float32(0.))
    beta1 = norm(bs)

    #------------------------------------------------------------------
    # Set up p and v for the first Lanczos vector v1.
    # p  =  beta1 P' v1,  where  P = C**(-1).
    # v is really P' v1.
    #------------------------------------------------------------------
    r3s = [b for b in bs]
    r2s = [b for b in bs]
    r1s = [b for b in bs]
    if Ms is not None:
        r3s = [b / m for b, m in zip(bs, Ms)]
        beta1 = norm(r3s, bs)
    #------------------------------------------------------------------
    ## Initialize other quantities.
    # Note that Anorm has been initialized by IsOpSym6.
    # ------------------------------------------------------------------
    bnorm = beta1
    n_params = len(bs)

    def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn,
             Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm,
             relrnorm, relArnorml, Anorm, flag, *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params:1 * n_params]
        r1s = args[1 * n_params:2 * n_params]
        r2s = args[2 * n_params:3 * n_params]
        r3s = args[3 * n_params:4 * n_params]
        dls = args[4 * n_params:5 * n_params]
        ds = args[5 * n_params:6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)
        r3s = [r3 + shift * v for r3, v in zip(r3s, vs)]
        r3s = [
            TT.switch(TT.ge(niter, numpy.float32(1.)),
                      r3 - (beta / betal) * r1, r3)
            for r3, r1 in zip(r3s, r1s)
        ]

        alpha = sqnorm(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = norm(r2s, r3s)
        else:
            betan = norm(r3s)
        pnorml = pnorm
        pnorm = TT.switch(
            TT.eq(niter, numpy.float32(0.)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [
            TT.switch(TT.neq(gamma, numpy.float32(0.)),
                      (v - epln * dl2 - dlta * dl) / gamma, v)
            for v, dl2, dl in zip(vs, dl2s, dls)
        ]
        d_norm = TT.switch(TT.neq(gamma, numpy.float32(0.)), norm(ds),
                           TT.constant((numpy.float32(numpy.inf))))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = norm(xs)
        xs = [
            TT.switch(TT.ge(xnorm, maxxnorm), dl2, x)
            for dl2, x in zip(dl2s, xs)
        ]

        flag = TT.switch(TT.ge(xnorm, maxxnorm), numpy.float32(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, numpy.float32(6.))
        Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, norm(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(
            flag_no_6,
            TT.switch(
                TT.eq(niter, numpy.float32(0.)),
                TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                TT.sqrt(
                    TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) +
                    TT.sqr(betan))), Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        #Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = numpy.float32(1) + relrnorm
        t2 = numpy.float32(1) + relArnorml
        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, numpy.float32(0.)),
                          TT.eq(flag, numpy.float32(6.))),
            TT.switch(
                TT.le(t1, numpy.float32(1.)), numpy.float32(3.),
                TT.switch(
                    TT.le(t2, numpy.float32(1.)), numpy.float32(4.),
                    TT.switch(
                        TT.le(relrnorm, rtol), numpy.float32(1.),
                        TT.switch(
                            TT.le(Anorm, numpy.float32(1e-20)),
                            numpy.float32(12),
                            TT.switch(
                                TT.le(relArnorml, rtol), numpy.float32(10.),
                                TT.switch(
                                    TT.ge(epsx, beta1), numpy.float32(5.),
                                    TT.switch(
                                        TT.ge(xnorm, maxxnorm),
                                        numpy.float32(6.),
                                        TT.switch(
                                            TT.ge(niter,
                                                  TT.cast(maxit, 'float32')),
                                            numpy.float32(8.), flag)))))))),
            flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm),
                         numpy.float32(11.), flag)
        return [
            niter + numpy.float32(1.),
            beta,
            betan,
            phi,
            Acond,
            cs,
            dbarn,
            eplnn,
            rnorm,
            sn,
            Tnorm,
            rnorml,
            xnorm,
            Dnorm,
            gamma,
            pnorm,
            gammal,
            Axnorm,
            relrnorm,
            relArnorml,
            Anorm,
            flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag,0))

    states = []
    # 0 niter
    states.append(TT.constant(numpy.float32([0])))
    # 1 beta
    states.append(TT.constant(numpy.float32([0])))
    # 2 betan
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 3 phi
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 4 Acond
    states.append(TT.constant(numpy.float32([1])))
    # 5 cs
    states.append(TT.constant(numpy.float32([-1])))
    # 6 dbarn
    states.append(TT.constant(numpy.float32([0])))
    # 7 eplnn
    states.append(TT.constant(numpy.float32([0])))
    # 8 rnorm
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 9 sn
    states.append(TT.constant(numpy.float32([0])))
    # 10 Tnorm
    states.append(TT.constant(numpy.float32([0])))
    # 11 rnorml
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 12 xnorm
    states.append(TT.constant(numpy.float32([0])))
    # 13 Dnorm
    states.append(TT.constant(numpy.float32([0])))
    # 14 gamma
    states.append(TT.constant(numpy.float32([0])))
    # 15 pnorm
    states.append(TT.constant(numpy.float32([0])))
    # 16 gammal
    states.append(TT.constant(numpy.float32([0])))
    # 17 Axnorm
    states.append(TT.constant(numpy.float32([0])))
    # 18 relrnorm
    states.append(TT.constant(numpy.float32([1])))
    # 19 relArnorml
    states.append(TT.constant(numpy.float32([1])))
    # 20 Anorm
    states.append(TT.constant(numpy.float32([0])))
    # 21 flag
    states.append(TT.constant(numpy.float32([0])))
    xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s]
    r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s]
    r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s]

    rvals, lupds = scan(loop,
                        states=states + xs + r1s + r2s + r3s + dls + ds,
                        n_steps=maxit + numpy.int32(1),
                        name='minres',
                        profile=profile,
                        mode=mode)

    niters = TT.cast(rvals[0][0], 'int32')
    flag = TT.cast(rvals[21][0], 'int32')
    relres = rvals[18][0]
    relAres = rvals[19][0]
    Anorm = rvals[20][0]
    Acond = rvals[4][0]
    xnorm = rvals[12][0]
    Axnorm = rvals[17][0]
    sol = [x[0] for x in rvals[22:22 + n_params]]
    return sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm, lupds
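A minimal usage sketch for the solver above, assuming the surrounding module exposes `minres` together with its helpers (the custom `scan`, `sqrt_inner_product`, `inner_product`, `symGivens2`); the toy matrix and every other name below are made up for illustration. Note that `compute_Av` must return a pair of (list of A*v products, updates dict), which is why an empty OrderedDict is passed along.

import numpy
import theano
import theano.tensor as TT
from collections import OrderedDict

rng = numpy.random.RandomState(0)
M = rng.randn(5, 5).astype('float32')
A = theano.shared(numpy.dot(M, M.T), name='A')  # symmetric by construction
b = theano.shared(rng.randn(5).astype('float32'), name='b')

def compute_Av(v):
    # minres expects (list of A*v products, updates dictionary);
    # `bs` has a single element here, so `v` is a single tensor
    return [TT.dot(A, v)], OrderedDict()

outs = minres(compute_Av, [b])
sol, flag, niters = outs[0], outs[1], outs[2]
f = theano.function([], [sol[0], flag, niters], updates=outs[-1])
x, flag_val, n_iter = f()
print numpy.allclose(numpy.dot(A.get_value(), x), b.get_value(), atol=1e-3)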
Example #44
0
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2,
          n_iters=10,
          profile=False):
    """
    Zoom stage of a strong Wolfe line search: find a step length between
    `a_lo` and `a_hi` that satisfies the strong Wolfe conditions.

    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Parameters
    ----------
    a_lo : float
        Step size
    a_hi : float
        Step size
    phi_lo : float
        Value of f at a_lo
    phi_hi : float
        Value of f at a_hi
    derphi_lo : float
        Value of derivative at a_lo
    phi : callable
        Generates computational graph
    derphi : callable
        Generates computational graph
    phi0 : float
        Value of f at 0
    derphi0 : float
        Value of the derivative at 0
    c1 : float
        Wolfe parameter
    c2 : float
        Wolfe parameter
    n_iters : int
        Maximum number of iterations of the zoom loop
    profile : bool
        True if you want printouts of profiling information
    """
    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # Interpolate to find a trial step length between a_lo and a_hi.
        # We need to choose an interpolation scheme here: use cubic
        # interpolation first; if the result is within delta * dalpha of
        # the end points, or outside the interval bounded by a_lo and
        # a_hi, fall back to quadratic interpolation; if that is still
        # too close, use bisection
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                              a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',
                         TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo +
                             numpy.asarray(0.5, dtype=theano.config.floatX) * \
                             dalpha, a_j_quad)

        # pick between the two ..
        cond_c = lazy_or('condc',
                         TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1,
                         phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo),
                         name='phi_rec')
        a_rec = ifelse(cond1,
                       a_hi,
                       TT.switch(cond2, a_hi, a_lo),
                       name='a_rec')
        a_hi = ifelse(cond1, a_j,
                      TT.switch(cond2, a_lo, a_hi),
                      name='a_hi')
        phi_hi = ifelse(cond1, phi_aj,
                        TT.switch(cond2, phi_lo, phi_hi),
                        name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan,
                          TT.switch(cond2, derphi_aj, nan), name='valprime')

        return ([phi_rec,
                 a_rec,
                 a_lo,
                 a_hi,
                 phi_hi,
                 phi_lo,
                 derphi_lo,
                 a_star,
                 val_star,
                 valprime],
                theano.scan_module.scan_utils.until(stop))

    maxiter = n_iters
    # cubic interpolant check
    delta1 = TT.constant(numpy.asarray(0.2,
                                       dtype=theano.config.floatX))
    # quadratic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1,
                                       dtype=theano.config.floatX))
    phi_rec = phi0
    a_rec = zero

    # Initial iteration

    dalpha = a_hi - a_lo
    a = TT.switch(dalpha < zero, a_hi, a_lo)
    b = TT.switch(dalpha < zero, a_lo, a_hi)
    #a = ifelse(dalpha < 0, a_hi, a_lo)
    #b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection

    # quadratic interpolation
    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',
                     TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk,
                                   a_j < a + qchk))

    a_j = TT.switch(cond_q, a_lo +
                    numpy.asarray(0.5, dtype=theano.config.floatX) * \
                    dalpha, a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse(cond1,
                     phi_hi,
                     TT.switch(cond2, phi_hi, phi_lo),
                     name='mphirec')
    a_rec = ifelse(cond1,
                   a_hi,
                   TT.switch(cond2, a_hi, a_lo),
                   name='marec')
    a_hi = ifelse(cond1,
                  a_j,
                  TT.switch(cond2, a_lo, a_hi),
                  name='mahi')
    phi_hi = ifelse(cond1,
                    phi_aj,
                    TT.switch(cond2, phi_lo, phi_hi),
                    name='mphihi')

    onlyif = lazy_and('only_if',
                      TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                     phi_aj < phi_lo),
                      abs(derphi_aj) <= -c2 * derphi0)

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main')
    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(phi_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # print'while_zoom'
    outs, updates = scan(while_zoom,
                         states=states,
                         n_steps=maxiter,
                         name='while_zoom',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while'
    a_star = ifelse(onlyif, a_j, outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
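For reference, a sketch of the quadratic fall-back used above, assuming `_quadmin` follows the standard formulation (as in scipy.optimize's line search): fit a parabola through phi(a_lo), derphi(a_lo) and phi(a_hi), and return its vertex,

$$B = \frac{\phi(a_{hi}) - \phi(a_{lo}) - \phi'(a_{lo})\,(a_{hi} - a_{lo})}{(a_{hi} - a_{lo})^{2}}, \qquad a_j = a_{lo} - \frac{\phi'(a_{lo})}{2B}.$$

The cchk/qchk guards then reject a candidate that lands within delta1*dalpha (cubic) or delta2*dalpha (quadratic) of the interval's end points, or outside the interval, finally falling back to bisection (a_lo + 0.5*dalpha).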
Example #45
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object return by numpy.load containing the
                data
            model : model
        """
        self.model = model
        # push dataset into shared var
        n_params = len(model.params)
        xdata = theano.shared(data['train_x'].astype('float32'),
                              name='xdata')
        # ! This works for 1 of k classification
        ydata = TT.cast(
            theano.shared(data['train_y'].astype('float32'),
                          name='ydata'), 'int32')

        shared_data = [xdata, ydata]
        self.xdata = xdata
        self.ydata = ydata
        # all sorts of indices
        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # vars for gradients
        # Store Euclidean gradients
        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Riemannian gradients (H^-1 * g)
        self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        loc_inputs = [x.type() for x in model.inputs]
        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp),0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]),0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        updates.update(dict(zip(self.gs, nw_gs)))
        grad_inps = [(x, y[gbdx*options['gbs']:(gbdx+1)*options['gbs']])
                     for x,y in zip(loc_inputs, shared_data)]
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            on_unused_input='warn',
            name='compute_eucledian_gradients',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * g for p, g in zip(model.params, self.gs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        update_vals = dict(zip(model.params, nw_ps))
        #updates.update(dict(zip(model.params, nw_ps)))
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, shared_data)]

        print 'compiling evaluation function'
        self.eval_fn = theano.function(
            [ebdx, lr],
            final_cost,
            givens=dict(grad_inps),
            updates= updates,
            on_unused_input='warn',
            name='eval_fn',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_vals,
            on_unused_input='warn',
            #givens=dict(grad_inps),
            name='update_params',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])
        self.options = options
        self.old_cost = 1e6

        n_steps = options['ebs'] // options['cbs']
        def ls_error(_idx, acc, acc_train_cost):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err,
                                         replace=replace),'float32')
            train_cost = TT.cast(safe_clone(model.train_cost,
                                          replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost,
                    acc_train_cost + train_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states = states,
                        n_steps = n_steps,
                        name='ls_err_step',
                        mode=theano.Mode(linker='cvm'),
                        profile = options['profile'])

        ferr = rvals[1][0] / const(n_steps)
        ftrain_cost = rvals[2][0] / const(n_steps)

        self.compute_error = theano.function(
            [ebdx],
            [ferr, ftrain_cost],
            givens=dict(grad_inps),
            name='compute_err',
            on_unused_input='warn',
            mode=theano.Mode(linker='cvm'),
            profile=options['profile'])
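The grad_step/cost_step loops above rely on a non-standard `scan` that accepts a `states=` keyword; below is a hedged sketch of the same accumulate-over-chunks pattern using stock theano.scan with `outputs_info` (toy data, made-up names).

import numpy
import theano
import theano.tensor as TT

data = theano.shared(numpy.arange(12, dtype='float32'), name='data')
cbs = 3  # chunk size, analogous to options['cbs']

def acc_step(idx, acc):
    # accumulate one chunk per scan step, as cost_step does per minibatch
    chunk = data[idx * cbs:(idx + 1) * cbs]
    return acc + chunk.sum()

rvals, updates = theano.scan(acc_step,
                             sequences=TT.arange(12 // cbs),
                             outputs_info=TT.constant(numpy.float32(0)))
total = rvals[-1]  # running sum after the last chunk
print theano.function([], total, updates=updates)()  # -> 66.0

Dividing `total` by the number of steps recovers the averaging done for `final_cost` above.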
Example #46
0
def scalar_search_wolfe2(phi, derphi, phi0=None,
                         old_phi0=None, derphi0=None,
                         n_iters=20,
                         c1=1e-4, c2=0.9,
                         mode=theano.Mode(linker='cvm'),
                         profile=False):
    """Find alpha that satisfies strong Wolfe conditions.

    alpha > 0 is assumed to be a descent direction.

    Parameters
    ----------
    phi : callable f(x)
        Objective scalar function.

    derphi : callable f'(x)
        Objective function derivative (can be None)
    phi0 : float, optional
        Value of phi at s=0
    old_phi0 : float, optional
        Value of phi at previous point
    derphi0 : float, optional
        Value of derphi at s=0
    c1 : float
        Parameter for Armijo condition rule.
    c2 : float
        Parameter for curvature condition rule.
    n_iters : int
        Maximum number of iterations of the search loop.
    mode : theano.Mode
        Compilation mode passed to the inner scan loops.
    profile : flag (boolean)
        True if you want printouts of profiling information

    Returns
    -------
    alpha_star : float
        Best alpha
    phi_star
        phi at alpha_star
    phi0
        phi at 0
    derphi_star
        derphi at alpha_star

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions.  See Wright and Nocedal, 'Numerical Optimization',
    1999, pg. 59-60.

    For the zoom phase it uses an algorithm by [...].

    """

    if phi0 is None:
        phi0 = phi(zero)
    else:
        phi0 = phi0
    if derphi0 is None and derphi is not None:
        derphi0 = derphi(zero)
    else:
        derphi0 = derphi0

    alpha0 = zero
    alpha0.name = 'alpha0'
    if old_phi0 is not None:
        alpha1 = TT.minimum(one, numpy.asarray(1.01,
                                               dtype=theano.config.floatX)* \
                            numpy.asarray(2, dtype=theano.config.floatX)*(phi0 - old_phi0)/derphi0)
    else:
        old_phi0 = nan
        alpha1   = one

    alpha1 = TT.switch(alpha1 < zero, one, alpha1)
    alpha1.name = 'alpha1'

    # This shouldn't happen. Perhaps the increment has slipped below
    # machine precision? For now, set the return variables, skip the
    # useless while loop, and raise warnflag=2 due to possible imprecision.
    phi0 = TT.switch(TT.eq(alpha1, zero), old_phi0, phi0)
    # I need a lazyif for alpha1 == 0 !!!
    phi_a1 = ifelse(TT.eq(alpha1,zero), phi0,
                    phi(alpha1), name='phi_a1')
    phi_a1.name = 'phi_a1'
    phi_a0 = phi0
    phi_a0.name = 'phi_a0'
    derphi_a0 = derphi0
    derphi_a0.name = 'derphi_a0'
    # Make sure variables are tensors otherwise strange things happen
    c1 = TT.as_tensor_variable(c1)
    c2 = TT.as_tensor_variable(c2)
    maxiter = n_iters
    def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                    alpha_star, phi_star, derphi_star):
        derphi_a1 = derphi(alpha1)
        cond1 = TT.bitwise_or(phi_a1 > phi0 + c1*alpha1*derphi0,
                              TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
        cond2 = abs(derphi_a1) <= -c2*derphi0
        cond3 = derphi_a1 >= zero
        alpha_star_c1, phi_star_c1, derphi_star_c1 = \
                _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
                      phi, derphi, phi0, derphi0, c1,c2,
                     profile = profile, mode=mode)
        alpha_star_c3, phi_star_c3, derphi_star_c3 = \
                _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi,
                      derphi, phi0, derphi0, c1,c2,
                     profile = profile, mode=mode)
        nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
        nw_phi = phi(nw_alpha1)
        alpha_star, phi_star, derphi_star = \
                ifelse(cond1,
                          (alpha_star_c1, phi_star_c1, derphi_star_c1),
                ifelse(cond2,
                          (alpha1, phi_a1, derphi_a1),
                ifelse(cond3,
                          (alpha_star_c3, phi_star_c3, derphi_star_c3),
                           (nw_alpha1, nw_phi, nan),
                      name = 'alphastar_c3'),
                      name = 'alphastar_c2'),
                      name ='alphastar_c1')

        return ( [alpha1,
                  nw_alpha1,
                  phi_a1,
                  ifelse(lazy_or('allconds',cond1, cond2, cond3),
                         phi_a1, nw_phi, name='nwphi1'),
                  ifelse(cond1, derphi_a0, derphi_a1, name='derphi'),
                  i_t + one,
                  alpha_star,
                  phi_star,
                  derphi_star],
                theano.scan_module.scan_utils.until(
                    lazy_or('until_cond_',TT.eq(nw_alpha1,zero), cond1, cond2, cond3)))
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(alpha0),0)]
    states += [TT.unbroadcast(TT.shape_padleft(alpha1),0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a0),0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_a1),0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_a0), 0)]
    # i_t
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    # alpha_star
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    # phi_star
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    # derphi_star
    states += [TT.unbroadcast(TT.shape_padleft(zero),0)]
    print 'while_search'
    outs, updates = scan(while_search,
                         states = states,
                         n_steps = maxiter,
                         name = 'while_search',
                         mode = mode,
                         profile = profile)
    print 'done_while_search'
    out3 = outs[-3][0]
    out2 = outs[-2][0]
    out1 = outs[-1][0]
    alpha_star, phi_star, derphi_star = \
            ifelse(TT.eq(alpha1, zero),
                   (nan, phi0, nan),
                   (out3, out2, out1), name='main_alphastar')
    return alpha_star, phi_star, phi0, derphi_star
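The loop above runs until the strong Wolfe conditions hold for the step size $\alpha$ (Wright and Nocedal, 1999):

$$\phi(\alpha) \le \phi(0) + c_1\,\alpha\,\phi'(0) \qquad\text{and}\qquad |\phi'(\alpha)| \le c_2\,|\phi'(0)|,$$

with $0 < c_1 < c_2 < 1$ (defaults here: $c_1 = 10^{-4}$, $c_2 = 0.9$). The code spells the curvature test as abs(derphi_a1) <= -c2 * derphi0, which is equivalent because $\phi'(0) < 0$ along a descent direction.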
Example #47
0
def minres(compute_Av,
           bs,
           rtol=constantX(1e-6),
           maxit=20,
           Ms=None,
           shift=constantX(0.),
           maxxnorm=constantX(1e15),
           Acondlim=constantX(1e16),
           profile=0):
    """
     minres attempts to find the minimum-length and minimum-residual-norm
     solution x to the system of linear equations A*x = b or
     least squares problem min||Ax-b||.  The n-by-n coefficient matrix A
     must be symmetric (but need not be positive definite or invertible).
     The right-hand-side column vector b must have length n.

     Parameters:

        compute_Av: callable returning the symbolic expression for
            `Av` (the product of the matrix A with some vector v).
            `v` should be a list of tensors, where the vector v means
            the vector obtained by concatenating and flattening all
            tensors in v
        bs: list of Theano expressions. We are looking to compute
            `A^-1\dot bs`.
        rtol: Optional, real, specifies the tolerance of the method.
            Default is 1e-6
        maxit: Optional, positive integer, specifies the maximum number
            of iterations. Default is 20
        Ms: List of theano expression of same shape as `bs`. The
            method uses these to precondition with diag(Ms)
        shift: Optional, scalar, real or complex.  Default is 0.
                   Effectively solve the system (A - shift I) * x = b.
        maxxnorm   real positive, maximum bound on NORM(x). Default is 1e15.
        Acondlim   real positive, maximum bound on COND(A). Default is 1e16.

     OUTPUTS:
        x       list of Theano tensor representing the solution
        flag    theano int scalar - convergence flag
                0 beta1 = 0.  The exact solution is  x = 0.
                1 A solution to (poss. singular) Ax = b found, given rtol.
                2 Pseudoinverse solution for singular LS problem, given rtol.
                3 A solution to (poss. singular) Ax = b found, given eps.
                4 Pseudoinverse solution for singular LS problem, given eps.
                5 x has converged to an eigenvector.
                6 xnorm has exceeded maxxnorm.
                7 Acond has exceeded Acondlim.
                8 The iteration limit was reached.
                9/10 It is a least squares problem but no converged
                 solution yet.
        iter    integer, iteration number at which x was computed:
                0 <= iter <= maxit.
        relres  real positive, the relative residual is defined as
                     NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)),
                computed recurrently here.  If flag is 1 or 3,  relres <= TOL.
        relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) ---
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of matrix 2-norm of A.
        Acond   real positive, estimate of condition number of A with
                respect to 2-norm.
        xnorm   non-negative real, recurrently computed NORM(x)
        Axnorm  non-negative real, recurrently computed NORM(A * x).

     REFERENCES:
        Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006.
             http://www.stanford.edu/group/SOL/software.html

    """

    if not isinstance(bs, (tuple, list)):
        bs = [bs]
        return_as_list = False
    else:
        bs = list(bs)
        return_as_list = True

    eps = constantX(1e-23)

    # Initialise
    flag = theano.shared(constantX(0.))
    beta1 = sqrt_inner_product(bs)

    #------------------------------------------------------------------
    # Set up p and v for the first Lanczos vector v1.
    # p  =  beta1 P' v1,  where  P = C**(-1).
    # v is really P' v1.
    #------------------------------------------------------------------
    r3s = [b for b in bs]
    r2s = [b for b in bs]
    r1s = [b for b in bs]
    if Ms is not None:
        r3s = [b / m for b, m in zip(bs, Ms)]
        beta1 = sqrt_inner_product(r3s, bs)
    #------------------------------------------------------------------
    ## Initialize other quantities.
    # Note that Anorm has been initialized by IsOpSym6.
    # ------------------------------------------------------------------
    bnorm = beta1
    n_params = len(bs)

    def loop(niter,
             beta,
             betan,
             phi,
             Acond,
             cs,
             dbarn,
             eplnn,
             rnorm,
             sn,
             Tnorm,
             rnorml,
             xnorm,
             Dnorm,
             gamma,
             pnorm,
             gammal,
             Axnorm,
             relrnorm,
             relArnorml,
             Anorm,
             flag,
             *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params: 1 * n_params]
        r1s = args[1 * n_params: 2 * n_params]
        r2s = args[2 * n_params: 3 * n_params]
        r3s = args[3 * n_params: 4 * n_params]
        dls = args[4 * n_params: 5 * n_params]
        ds = args[5 * n_params: 6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)

        r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
        r3s = [TT.switch(TT.ge(niter, constantX(1.)),
                         r3 - (beta / betal) * r1,
                         r3) for r3, r1 in zip(r3s, r1s)]

        alpha = inner_product(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = sqrt_inner_product(r2s, r3s)
        else:
            betan = sqrt_inner_product(r3s)
        pnorml = pnorm
        pnorm = TT.switch(TT.eq(niter, constantX(0.)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) +
                                  TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [TT.switch(TT.neq(gamma, constantX(0.)),
                        (v - epln * dl2 - dlta * dl) / gamma,
                        v)
              for v, dl2, dl in zip(vs, dl2s, dls)]
        d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                           sqrt_inner_product(ds),
                           constantX(numpy.inf))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = sqrt_inner_product(xs)
        xs = [TT.switch(TT.ge(xnorm, maxxnorm),
                        dl2, x)
              for dl2, x in zip(dl2s, xs)]

        flag = TT.switch(TT.ge(xnorm, maxxnorm),
                         constantX(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, constantX(6.))
        Dnorm = TT.switch(flag_no_6,
                          TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6,
                             rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(flag_no_6,
                          TT.switch(TT.eq(niter, constantX(0.)),
                                    TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                                    TT.sqrt(TT.sqr(Tnorm) +
                                            TT.sqr(beta) +
                                            TT.sqr(alpha) +
                                            TT.sqr(betan))),
                          Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        # Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = constantX(1) + relrnorm
        t2 = constantX(1) + relArnorml

        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, constantX(0)),
                          TT.eq(flag, constantX(6))),
            multiple_switch(TT.le(t1, constantX(1)),
                            constantX(3),
                            TT.le(t2, constantX(1)),
                            constantX(4),
                            TT.le(relrnorm, rtol),
                            constantX(1),
                            TT.le(Anorm, constantX(1e-20)),
                            constantX(12),
                            TT.le(relArnorml, rtol),
                            constantX(10),
                            TT.ge(epsx, beta1),
                            constantX(5),
                            TT.ge(xnorm, maxxnorm),
                            constantX(6),
                            TT.ge(niter, TT.cast(maxit,
                                                 theano.config.floatX)),
                            constantX(8),
                            flag),
            flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm),
                         constantX(11.),
                         flag)
        return [niter + constantX(1.),
                beta,
                betan,
                phi,
                Acond,
                cs,
                dbarn,
                eplnn,
                rnorm,
                sn,
                Tnorm,
                rnorml,
                xnorm,
                Dnorm,
                gamma,
                pnorm,
                gammal,
                Axnorm,
                relrnorm,
                relArnorml,
                Anorm,
                flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag, 0))

    states = []
    # 0 niter
    states.append(constantX([0]))
    # 1 beta
    states.append(constantX([0]))
    # 2 betan
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 3 phi
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 4 Acond
    states.append(constantX([1]))
    # 5 cs
    states.append(constantX([-1]))
    # 6 dbarn
    states.append(constantX([0]))
    # 7 eplnn
    states.append(constantX([0]))
    # 8 rnorm
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 9 sn
    states.append(constantX([0]))
    # 10 Tnorm
    states.append(constantX([0]))
    # 11 rnorml
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 12 xnorm
    states.append(constantX([0]))
    # 13 Dnorm
    states.append(constantX([0]))
    # 14 gamma
    states.append(constantX([0]))
    # 15 pnorm
    states.append(constantX([0]))
    # 16 gammal
    states.append(constantX([0]))
    # 17 Axnorm
    states.append(constantX([0]))
    # 18 relrnorm
    states.append(constantX([1]))
    # 19 relArnorml
    states.append(constantX([1]))
    # 20 Anorm
    states.append(constantX([0]))
    # 21 flag
    states.append(constantX([0]))
    xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s]
    r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s]
    r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s]

    rvals, loc_updates = scan(
        loop,
        states=states + xs + r1s + r2s + r3s + dls + ds,
        n_steps=maxit + numpy.int32(1),
        name='minres',
        profile=profile,
        mode=theano.Mode(linker='cvm'))
    assert isinstance(loc_updates, dict) and 'Ordered' in str(type(loc_updates))

    niters = TT.cast(rvals[0][0], 'int32')
    flag = TT.cast(rvals[21][0], 'int32')
    relres = rvals[18][0]
    relAres = rvals[19][0]
    Anorm = rvals[20][0]
    Acond = rvals[4][0]
    xnorm = rvals[12][0]
    Axnorm = rvals[17][0]
    sol = [x[0] for x in rvals[22: 22 + n_params]]
    return (sol,
            flag,
            niters,
            relres,
            relAres,
            Anorm,
            Acond,
            xnorm,
            Axnorm,
            loc_updates)
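The r1s/r2s/r3s updates inside `loop` implement the (shifted, optionally preconditioned) Lanczos recurrence that the comments sketch; in its textbook unpreconditioned form, with $\bar{A} = A - \mathrm{shift}\,I$ and $v_0 = 0$:

$$\alpha_k = v_k^{\top}\bar{A} v_k, \qquad q_{k+1} = \bar{A} v_k - \alpha_k v_k - \beta_k v_{k-1}, \qquad \beta_{k+1} = \|q_{k+1}\|, \qquad v_{k+1} = q_{k+1} / \beta_{k+1}.$$

Each of the flat lists xs/r1s/r2s/r3s stands in for one of these concatenated-and-flattened vectors.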
Example #48
0
def jobman(state, channel):
    # load dataset
    rng = numpy.random.RandomState(state['seed'])

    # declare the dimensionalities of the input and output
    if state['chunks'] == 'words':
        state['n_in'] = 10000
        state['n_out'] = 10000
    else:
        state['n_in'] = 50
        state['n_out'] = 50
    train_data, valid_data, test_data = get_text_data(state)

    ## BEGIN Tutorial
    ### Define Theano Input Variables
    x = TT.lvector('x')
    y = TT.lvector('y')
    h0 = theano.shared(numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))

    ### Neural Implementation of the Operators: \oplus
    #### Word Embedding
    emb_words = MultiLayer(
        rng,
        n_in=state['n_in'],
        n_hids=eval(state['inp_nhids']),
        activation=eval(state['inp_activ']),
        init_fn='sample_weights_classic',
        weight_noise=state['weight_noise'],
        rank_n_approx = state['rank_n_approx'],
        scale=state['inp_scale'],
        sparsity=state['inp_sparse'],
        learn_bias = True,
        bias_scale=eval(state['inp_bias']),
        name='emb_words')

    #### Deep Transition Recurrent Layer
    rec = eval(state['rec_layer'])(
            rng,
            eval(state['nhids']),
            activation = eval(state['rec_activ']),
            #activation = 'TT.nnet.sigmoid',
            bias_scale = eval(state['rec_bias']),
            scale=eval(state['rec_scale']),
            sparsity=eval(state['rec_sparse']),
            init_fn=eval(state['rec_init']),
            weight_noise=state['weight_noise'],
            name='rec')

    #### Stitching them together
    ##### (1) Get the embedding of a word
    x_emb = emb_words(x, no_noise_bias=state['no_noise_bias'])
    ##### (2) Embedding + Hidden State via DT Recurrent Layer
    reset = TT.scalar('reset')
    rec_layer = rec(x_emb, n_steps=x.shape[0],
                    init_state=h0*reset,
                    no_noise_bias=state['no_noise_bias'],
                    truncate_gradient=state['truncate_gradient'],
                    batch_size=1)

    ## BEGIN Exercise: DOT-RNN
    ### Neural Implementation of the Operators: \lhd

    #### Exercise (1)
    #### TODO: Define a layer from the hidden state to the intermediate layer

    #### Exercise (1)
    #### TODO: Define a layer from the input to the intermediate Layer

    #### Hidden State: Combine emb_state and emb_words_out
    #### Exercise (1)
    #### TODO: Define an activation layer

    #### Exercise (2)
    #### TODO: Define a dropout layer

    #### Softmax Layer
    output_layer = SoftmaxLayer(
        rng,
        eval(state['dout_nhid']),
        state['n_out'],
        scale=state['out_scale'],
        bias_scale=state['out_bias_scale'],
        init_fn="sample_weights_classic",
        weight_noise=state['weight_noise'],
        sparsity=state['out_sparse'],
        sum_over_time=True,
        name='out')

    ### Few Optional Things
    #### Direct shortcut from x to y
    if state['shortcut_inpout']:
        shortcut = MultiLayer(
            rng,
            n_in=state['n_in'],
            n_hids=eval(state['inpout_nhids']),
            activations=eval(state['inpout_activ']),
            init_fn='sample_weights_classic',
            weight_noise = state['weight_noise'],
            scale=eval(state['inpout_scale']),
            sparsity=eval(state['inpout_sparse']),
            learn_bias=eval(state['inpout_learn_bias']),
            bias_scale=eval(state['inpout_bias']),
            name='shortcut')

    #### Learning rate scheduling (1/(1+n/beta))
    state['clr'] = state['lr']
    def update_lr(obj, cost):
        stp = obj.step
        if isinstance(obj.state['lr_start'], int) and stp > obj.state['lr_start']:
            time = float(stp - obj.state['lr_start'])
            new_lr = obj.state['clr']/(1+time/obj.state['lr_beta'])
            obj.lr = new_lr
    if state['lr_adapt']:
        rec.add_schedule(update_lr)

    ### Neural Implementations of the Language Model
    #### Training
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x)]
    else:
        additional_inputs = [rec_layer]

    ##### Exercise (1): Compute the output intermediate layer
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer

    train_model = output_layer(
        outhid,
        no_noise_bias=state['no_noise_bias'],
        additional_inputs=additional_inputs).train(
            target=y,
            scale=numpy.float32(1. / state['seqlen']))

    nw_h0 = rec_layer.out[rec_layer.out.shape[0]-1]
    if state['carry_h0']:
        train_model.updates += [(h0, nw_h0)]

    #### Validation
    h0val = theano.shared(numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))
    rec_layer = rec(emb_words(x, use_noise=False),
                    n_steps = x.shape[0],
                    batch_size=1,
                    init_state=h0val*reset,
                    use_noise=False)
    nw_h0 = rec_layer.out[rec_layer.out.shape[0]-1]

    ##### Exercise (1):
    ##### TODO: Compute the output intermediate layer

    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer without noise

    if state['shortcut_inpout']:
        additional_inputs=[rec_layer, shortcut(x, use_noise=False)]
    else:
        additional_inputs=[rec_layer]
    valid_model = output_layer(outhid,
            additional_inputs=additional_inputs,
            use_noise=False).validate(target=y, sum_over_time=True)

    valid_updates = []
    if state['carry_h0']:
        valid_updates = [(h0val, nw_h0)]

    valid_fn = theano.function([x, y, reset], valid_model.out,
                               name='valid_fn', updates=valid_updates)

    #### Sampling
    ##### single-step sampling
    def sample_fn(word_tm1, h_tm1):
        x_emb = emb_words(word_tm1, use_noise = False, one_step=True)
        h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1]
        outhid = outhid_dropout(
            outhid_activ(
                emb_state(h0, use_noise=False, one_step=True) +
                emb_words_out(word_tm1, use_noise=False, one_step=True),
                one_step=True),
            use_noise=False, one_step=True)
        word = output_layer.get_sample(state_below=outhid, additional_inputs=[h0], temp=1.)
        return word, h0

    ##### scan for iterating the single-step sampling multiple times
    [samples, summaries], updates = scan(
        sample_fn,
        states=[
            TT.alloc(numpy.int64(0), state['sample_steps']),
            TT.alloc(numpy.float32(0), 1, eval(state['nhids'])[-1])],
        n_steps=state['sample_steps'],
        name='sampler_scan')

    ##### build a Theano function for sampling
    sample_fn = theano.function([], [samples],
        updates=updates, profile=False, name='sample_fn')

    ##### Load a dictionary
    dictionary = numpy.load(state['dictionary'])
    if state['chunks'] == 'chars':
        dictionary = dictionary['unique_chars']
    else:
        dictionary = dictionary['unique_words']
    def hook_fn():
        sample = sample_fn()[0]
        print 'Sample:',
        if state['chunks'] == 'chars':
            print "".join(dictionary[sample])
        else:
            for si in sample:
                print dictionary[si],
            print

    ### Build and Train a Model
    #### Define a model
    model = LM_Model(
        cost_layer = train_model,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn = valid_fn,
        clean_before_noise_fn = False,
        noise_fn = None,
        rng = rng)

    if state['reload']:
        model.load(state['prefix']+'model.npz')

    #### Define a trainer
    ##### Training algorithm (SGD)
    if state['moment'] < 0:
        algo = SGD(model, state, train_data)
    else:
        algo = SGD_m(model, state, train_data)
    ##### Main loop of the trainer
    main = MainLoop(train_data,
                    valid_data,
                    test_data,
                    model,
                    algo,
                    state,
                    channel,
                    train_cost = False,
                    hooks = hook_fn,
                    validate_postprocess =  eval(state['validate_postprocess']))
    ## Run!
    main.main()
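The schedule installed by update_lr above is plain inverse-time decay: once the step counter $t$ passes lr_start, the learning rate becomes

$$\eta_t = \frac{\eta_0}{1 + (t - t_{\mathrm{start}})/\beta},$$

with $\eta_0$ = state['lr'], $t_{\mathrm{start}}$ = state['lr_start'], and $\beta$ = state['lr_beta'].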
Example #49
0
def minres(compute_Av,
           bs,
           rtol=constantX(1e-6),
           maxit=20,
           Ms=None,
           shift=constantX(0.),
           maxxnorm=constantX(1e15),
           Acondlim=constantX(1e16),
           profile=0):
    """
     minres attempts to find the minimum-length and minimum-residual-norm
     solution x to the system of linear equations A*x = b or
     least squares problem min||Ax-b||.  The n-by-n coefficient matrix A
     must be symmetric (but need not be positive definite or invertible).
     The right-hand-side column vector b must have length n.

     Parameters:

        compute_Av: callable returning the symbolic expression for
            `Av` (the product of the matrix A with some vector v).
            `v` should be a list of tensors, where the vector v means
            the vector obtained by concatenating and flattening all
            tensors in v
        bs: list of Theano expressions. We are looking to compute
            `A^-1\dot bs`.
        rtol: Optional, real, specifies the tolerance of the method.
            Default is 1e-6
        maxit: Optional, positive integer, specifies the maximum number
            of iterations. Default is 20
        Ms: List of theano expression of same shape as `bs`. The
            method uses these to precondition with diag(Ms)
        shift: Optional, scalar, real or complex.  Default is 0.
                   Effectively solve the system (A - shift I) * x = b.
        maxxnorm   real positive, maximum bound on NORM(x). Default is 1e15.
        Acondlim   real positive, maximum bound on COND(A). Default is 1e16.

     OUTPUTS:
        x       list of Theano tensor representing the solution
        flag    theano int scalar - convergence flag
                0 beta1 = 0.  The exact solution is  x = 0.
                1 A solution to (poss. singular) Ax = b found, given rtol.
                2 Pseudoinverse solution for singular LS problem, given rtol.
                3 A solution to (poss. singular) Ax = b found, given eps.
                4 Pseudoinverse solution for singular LS problem, given eps.
                5 x has converged to an eigenvector.
                6 xnorm has exceeded maxxnorm.
                7 Acond has exceeded Acondlim.
                8 The iteration limit was reached.
                9/10 It is a least squares problem but no converged
                 solution yet.
        iter    integer, iteration number at which x was computed:
                0 <= iter <= maxit.
        relres  real positive, the relative residual is defined as
                     NORM(b-A*x)/(NORM(A) * NORM(x) + NORM(b)),
                computed recurrently here.  If flag is 1 or 3,  relres <= TOL.
        relAres real positive, the relative-NORM(Ar) := NORM(Ar) / NORM(A) ---
                computed recurrently here. If flag is 2 or 4, relAres <= TOL.
        Anorm   real positive, estimate of matrix 2-norm of A.
        Acond   real positive, estimate of condition number of A with
                respect to 2-norm.
        xnorm   non-negative real, recurrently computed NORM(x)
        Axnorm  non-negative real, recurrently computed NORM(A * x).

     REFERENCES:
        Sou-Cheng Choi's PhD Dissertation, Stanford University, 2006.
             http://www.stanford.edu/group/SOL/software.html

    """

    if not isinstance(bs, (tuple, list)):
        bs = [bs]
        return_as_list = False
    else:
        bs = list(bs)
        return_as_list = True

    eps = constantX(1e-23)

    # Initialise
    beta1 = sqrt_inner_product(bs)

    #------------------------------------------------------------------
    # Set up p and v for the first Lanczos vector v1.
    # p  =  beta1 P' v1,  where  P = C**(-1).
    # v is really P' v1.
    #------------------------------------------------------------------
    r3s = [b for b in bs]
    r2s = [b for b in bs]
    r1s = [b for b in bs]
    if Ms is not None:
        r3s = [b / m for b, m in zip(bs, Ms)]
        beta1 = sqrt_inner_product(r3s, bs)
    #------------------------------------------------------------------
    ## Initialize other quantities.
    # Note that Anorm has been initialized by IsOpSym6.
    # ------------------------------------------------------------------
    bnorm = beta1
    n_params = len(bs)

    def loop(niter, beta, betan, phi, Acond, cs, dbarn, eplnn, rnorm, sn,
             Tnorm, rnorml, xnorm, Dnorm, gamma, pnorm, gammal, Axnorm,
             relrnorm, relArnorml, Anorm, flag, *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p2  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params:1 * n_params]
        r1s = args[1 * n_params:2 * n_params]
        r2s = args[2 * n_params:3 * n_params]
        r3s = args[3 * n_params:4 * n_params]
        dls = args[4 * n_params:5 * n_params]
        ds = args[5 * n_params:6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)

        r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
        r3s = [
            TT.switch(TT.ge(niter, constantX(1.)), r3 - (beta / betal) * r1,
                      r3) for r3, r1 in zip(r3s, r1s)
        ]

        alpha = inner_product(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = sqrt_inner_product(r2s, r3s)
        else:
            betan = sqrt_inner_product(r3s)
        pnorml = pnorm
        pnorm = TT.switch(
            TT.eq(niter, constantX(0.)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
            TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) + TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [
            TT.switch(TT.neq(gamma, constantX(0.)),
                      (v - epln * dl2 - dlta * dl) / gamma, v)
            for v, dl2, dl in zip(vs, dl2s, dls)
        ]
        d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                           sqrt_inner_product(ds), constantX(numpy.inf))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = sqrt_inner_product(xs)
        xs = [
            TT.switch(TT.ge(xnorm, maxxnorm), dl2, x)
            for dl2, x in zip(dl2s, xs)
        ]

        flag = TT.switch(TT.ge(xnorm, maxxnorm), constantX(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, constantX(6.))
        Dnorm = TT.switch(flag_no_6, TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6, rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(
            flag_no_6,
            TT.switch(
                TT.eq(niter, constantX(0.)),
                TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                TT.sqrt(
                    TT.sqr(Tnorm) + TT.sqr(beta) + TT.sqr(alpha) +
                    TT.sqr(betan))), Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        # Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = constantX(1) + relrnorm
        t2 = constantX(1) + relArnorml

        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, constantX(0)), TT.eq(flag,
                                                           constantX(6))),
            multiple_switch(TT.le(t1, constantX(1)), constantX(3),
                            TT.le(t2, constantX(1)), constantX(4),
                            TT.le(relrnorm, rtol), constantX(1),
                            TT.le(Anorm, constantX(1e-20)), constantX(12),
                            TT.le(relArnorml, rtol), constantX(10),
                            TT.ge(epsx, beta1), constantX(5),
                            TT.ge(xnorm, maxxnorm), constantX(6),
                            TT.ge(niter, TT.cast(maxit, theano.config.floatX)),
                            constantX(8), flag), flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm), constantX(11.),
                         flag)
        return [niter + constantX(1.),
                beta,
                betan,
                phi,
                Acond,
                cs,
                dbarn,
                eplnn,
                rnorm,
                sn,
                Tnorm,
                rnorml,
                xnorm,
                Dnorm,
                gamma,
                pnorm,
                gammal,
                Axnorm,
                relrnorm,
                relArnorml,
                Anorm,
                flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag, 0))
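
    # The scan below runs for at most maxit + 1 steps and terminates
    # early once `flag` becomes non-zero (the `until` condition above).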

    states = []
    # 0 niter
    states.append(constantX([0]))
    # 1 beta
    states.append(constantX([0]))
    # 2 betan
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 3 phi
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 4 Acond
    states.append(constantX([1]))
    # 5 cs
    states.append(constantX([-1]))
    # 6 dbarn
    states.append(constantX([0]))
    # 7 eplnn
    states.append(constantX([0]))
    # 8 rnorm
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 9 sn
    states.append(constantX([0]))
    # 10 Tnorm
    states.append(constantX([0]))
    # 11 rnorml
    states.append(TT.unbroadcast(TT.shape_padleft(beta1), 0))
    # 12 xnorm
    states.append(constantX([0]))
    # 13 Dnorm
    states.append(constantX([0]))
    # 14 gamma
    states.append(constantX([0]))
    # 15 pnorm
    states.append(constantX([0]))
    # 16 gammal
    states.append(constantX([0]))
    # 17 Axnorm
    states.append(constantX([0]))
    # 18 relrnorm
    states.append(constantX([1]))
    # 19 relArnorml
    states.append(constantX([1]))
    # 20 Anorm
    states.append(constantX([0]))
    # 21 flag
    states.append(constantX([0]))
    xs = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    ds = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    dls = [TT.unbroadcast(TT.shape_padleft(TT.zeros_like(b)), 0) for b in bs]
    r1s = [TT.unbroadcast(TT.shape_padleft(r1), 0) for r1 in r1s]
    r2s = [TT.unbroadcast(TT.shape_padleft(r2), 0) for r2 in r2s]
    r3s = [TT.unbroadcast(TT.shape_padleft(r3), 0) for r3 in r3s]

    rvals, loc_updates = scan(loop,
                              states=states + xs + r1s + r2s + r3s + dls + ds,
                              n_steps=maxit + numpy.int32(1),
                              name='minres',
                              profile=profile,
                              mode=theano.Mode(linker='cvm'))
    assert isinstance(loc_updates, dict) and 'Ordered' in str(
        type(loc_updates))

    niters = TT.cast(rvals[0][0], 'int32')
    flag = TT.cast(rvals[21][0], 'int32')
    relres = rvals[18][0]
    relAres = rvals[19][0]
    Anorm = rvals[20][0]
    Acond = rvals[4][0]
    xnorm = rvals[12][0]
    Axnorm = rvals[17][0]
    sol = [x[0] for x in rvals[22:22 + n_params]]
    return (sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm,
            loc_updates)
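
# A hedged usage sketch for the solver above (names are illustrative;
# the real call sites appear in the examples below as minres.minres(...)):
#
#   sol, flag, niters, relres, relAres, Anorm, Acond, xnorm, Axnorm, upds = \
#       minres(compute_Av, bs, rtol=1e-6, maxit=100)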
Example #50
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
                `jreg` -> float
                    Initial value of the Jacobi diagonal used to
                    regularize the preconditioner (our reading of the
                    code)
                `rsch` -> int
                    Which Riemannian gradient to use (1 selects `rs1`,
                    anything else `rs2`)
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        eps = numpy.float32(1e-24)
        xdata = theano.shared(data['train_x'], name='xdata')
        ydata = theano.shared(data['train_y'], name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # Store Euclidean gradients
        self.gs = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store Riemannian gradients
        self.rs1 = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        self.rs2 = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]
        # Store the Jacobi diagonal
        self.js = [
            theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
            for shp in model.params_shape
        ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'

        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            # Compute jacobi
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout *
                                     (numpy.float32(1) - tnwout)) * factor
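                # Random +/-1 probe (the sign of a Gaussian, i.e.
                # Rademacher noise); squaring the Jacobian L-op of this
                # probe below gives a stochastic estimate of the diagonal
                # of the Gauss-Newton matrix.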
                r = TT.sgn(srng.normal(nw_out.shape, nstreams=128))
                r = r * factor
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [
                oj + final_results[p]
                for oj, p in zip(args[1 + n_params:1 +
                                      2 * n_params], model.params)
            ]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        ij = [
            TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              mode=gpu_mode,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
        #theano.printing.pydotprint(self.compute_eucledian_gradients,
        #        'eucledian_grad', scan_graphs=True)

        self.damping = theano.shared(numpy.float32(options['mreg']))
        # Step 2.1 Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']
        mode = gpu_mode

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
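                # One accumulation step of a Gauss-Newton/Fisher-style
                # matrix-vector product: Rop pushes the input vectors
                # through the output Jacobian, `factor` rescales per
                # output non-linearity, and Lop pulls the result back,
                # i.e. roughly J^T (J v / factor) summed over outputs.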
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.gf_outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.gf_outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

                # Unreachable alternative implementation (kept for
                # reference): the same Gv product computed through the
                # cost's pre-activations.
                #
                # nw_cost, nw_preactiv_out = safe_clone(
                #     [model.train_cost, model.preactiv_out], replace)
                # nw_gvs = TT.Lop(
                #     nw_preactiv_out, model.params,
                #     TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                #            model.params, args))
                #
                # Gvs = [ogv + ngv
                #        for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                # return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            #_final_Gvs = [x + self.damping * y
            #        for x,y in zip(final_Gvs, args)]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))

        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            #Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))
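        # (norm_ord0 is the max-abs, i.e. infinity, norm across all
        # parameter blocks, despite its name.)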

        updates.update(dict(zip(self.rs1, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients1 = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 2.2 Compile a second function for computing Riemannian
        # gradients (identical to Step 2.1 except that it uses
        # model.gc_outs / model.gc_outs_operator)
        rbpos = rbdx * options['mbs']
        mode = gpu_mode

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.gc_outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs,
                                                model.gc_outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *\
                                         tnwout * (1 - tnwout)/ factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

                # Unreachable alternative implementation (kept for
                # reference): the same Gv product computed through the
                # cost's pre-activations.
                #
                # nw_cost, nw_preactiv_out = safe_clone(
                #     [model.train_cost, model.preactiv_out], replace)
                # nw_gvs = TT.Lop(
                #     nw_preactiv_out, model.params,
                #     TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                #            model.params, args))
                #
                # Gvs = [ogv + ngv
                #        for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                # return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            #_final_Gvs = [x + self.damping * y
            #        for x,y in zip(final_Gvs, args)]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))

        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            #Ms = self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs2, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients2 = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        if options['rsch'] == 1:
            self.rs = self.rs1
        else:
            self.rs = self.rs2

        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1].sum() / const(n_steps)
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, shared_data)]
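        # First-order prediction of the change in cost for a step of size
        # lr along the Riemannian direction: -lr * <g, r>.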
        denom = -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.rs)])
        self.approx_change = theano.function([lr],
                                             denom,
                                             name='approx_change',
                                             mode=gpu_mode,
                                             allow_input_downcast=True,
                                             profile=options['profile'])

        print 'compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       allow_input_downcast=True,
                                       mode=gpu_mode,
                                       profile=options['profile'])

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        self.grad_lr_fn = theano.function([ebdx, lr],
                                          fgrad,
                                          givens=grad_inps,
                                          name='ls_grad_fn',
                                          on_unused_input='warn',
                                          mode=gpu_mode,
                                          allow_input_downcast=True,
                                          profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=mode,
                                             profile=options['profile'])

        self.options = options
        self.old_cost = numpy.inf
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
Example #51
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the krylov
                    subspace
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lbfgsIters` -> int
                    Number of L-BFGS iterations
                `krylovDim` -> int
                    Dimension of the Krylov subspace
                `device` -> str
                    Either 'gpu' or 'cpu'
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data
        xdata = theano.shared(data['train_x'], name='xdata')
        ydata = theano.shared(data['train_y'], name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]
        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        rng = self.rng  # already seeded above
        self.options = options
        self.channel = channel
        self.model = model
        n_dimensions = options['krylovDim']
        self.n_dimensions = n_dimensions
        if options['device'] == 'gpu':
            cfn_subspaces = \
                [theano.shared(numpy.zeros(
                                (n_dimensions,) + shp, dtype='float32'),
                               name='cfn{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in enumerate(zip(model.params_shape,
                                                      model.params))]
            old_deltas = \
                [theano.shared(numpy.zeros(shp, dtype='float32'),
                               name='delta{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in
                            enumerate(zip(model.params_shape, model.params))]
            self.gs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        else:
            cfn_subspaces = \
                [TT._shared(numpy.zeros(
                                (n_dimensions,) + shp, dtype='float32'),
                               name='cfn{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in enumerate(zip(model.params_shape,
                                                      model.params))]
            old_deltas = \
                [TT._shared(numpy.zeros(shp, dtype='float32'),
                               name='delta{%s|%d}' % (str(param.name), i))
                 for i, (shp, param) in
                            enumerate(zip(model.params_shape, model.params))]
            self.gs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        self.cfn_subspaces = cfn_subspaces
        self.old_deltas = old_deltas

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        print 'Constructing grad function'
        loc_inputs = [x.type(name='locx') for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        updates.update(dict(zip(self.gs, nw_gs)))
        gdx = TT.iscalar('gdx')
        grad_inps = zip(loc_inputs, [
            x[gdx * options['gbs']:(gdx + 1) * options['gbs']]
            for x in shared_data
        ])
        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        if options['device'] == 'gpu':
            mode = gpu_mode

            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               args))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode

            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out), model.params,
                               cgv))

                    Gvs = [
                        ogv + ngv for (ogv, ngv) in zip(gv_args[1:], nw_gvs)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}

        rvals, updates = krylov_subspace(compute_Gv,
                                         self.gs,
                                         old_deltas,
                                         n_dimensions,
                                         model.params_shape,
                                         profile=options['profile'],
                                         device=options['device'])
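        # `rvals` holds, for each parameter, an (n_dimensions,) + shape
        # array: the Krylov subspace basis built from the current
        # gradient and the previous update direction (our reading of
        # `krylov_subspace`).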

        gdx = TT.iscalar('gdx')
        grad_inps = zip(loc_inputs, [
            x[gdx * options['mbs']:(gdx + 1) * options['mbs']]
            for x in shared_data
        ])
        updates.update(dict(zip(cfn_subspaces, rvals)))
        self.update_krylov_subspace = theano.function(
            [gdx], [],
            updates=updates,
            givens=dict(grad_inps),
            profile=options['profile'],
            on_unused_input='warn',
            name='update_krylov_subspace',
            mode=mode)

        alphas = TT.vector('alphas')
        deltas = []
        nw_params = []
        if options['device'] == 'gpu':
            params = model.params
        else:
            params = model.cpu_params

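        # Each update direction `delta` is a linear combination of the
        # subspace vectors weighted by the coefficients `alphas`; the
        # dimshuffle broadcasts alphas across the parameter axes.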
        for param, subspace in zip(params, cfn_subspaces):
            alpha_reshuffle = [0] + ['x'] * param.ndim
            delta = (alphas.dimshuffle(*alpha_reshuffle) * \
                        subspace).sum(axis=0)
            nw_param = param + delta
            nw_params.append(nw_param)
            deltas.append(delta)

        print 'constructing evaluation function'
        ebdx = TT.iscalar('ebdx')

        updates_dict = dict(zip(model.params + old_deltas, nw_params + deltas))
        if options['device'] != 'gpu':
            updates_dict.update(dict(zip(model.cpu_params, nw_params)))

        self.update_params = theano.function([alphas],
                                             updates=updates_dict,
                                             name='update_params',
                                             allow_input_downcast=True,
                                             mode=mode,
                                             profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']

        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + nw_params))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_cost_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_cost_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + nw_params))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, alphas)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.zeros((1, n_dimensions), dtype='float32'))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        mode=gpu_mode,
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)

        grad_inps = zip(loc_inputs, [
            x[ebdx * options['ebs']:(ebdx + 1) * options['ebs']]
            for x in shared_data
        ])
        self.lbfgs_fn = theano.function(
            [alphas, ebdx],
            #theano.printing.Print('fcost')(fcost),
            fcost,
            givens=grad_inps,
            allow_input_downcast=True,
            on_unused_input='warn',
            name='lbfgs_fn',
            profile=options['profile'],
            mode=gpu_mode)
        self.lbfgs_grad = theano.function([alphas, ebdx],
                                          fgrad,
                                          givens=grad_inps,
                                          on_unused_input='warn',
                                          allow_input_downcast=True,
                                          name='lbfgs_grad',
                                          profile=options['profile'],
                                          mode=gpu_mode)

        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([],
                                             ferr,
                                             givens=dict(
                                                 zip(loc_inputs, shared_data)),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
Example #52
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
                `device` -> str
                    Either 'gpu' or 'cpu'
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
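        # Note: off-GPU only the first `gbs` training samples are kept in
        # shared storage (the givens later feed whole blocks), while on
        # GPU the entire training set is uploaded and sliced per batch
        # index.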

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store Euclidean gradients
            self.gs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store Riemannian gradients
            self.rs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        else:
            # Store Euclidean gradients
            self.gs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store Riemannian gradients
            self.rs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']

        if options['device'] == 'gpu':
            mode = gpu_mode

            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])
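                        # (No `eps` is added to the softmax factor here,
                        # unlike the earlier examples, so an exactly-zero
                        # softmax output would divide by zero.)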

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                    # Unreachable alternative implementation (kept for
                    # reference): the same Gv product computed through
                    # the cost's pre-activations.
                    #
                    # nw_cost, nw_preactiv_out = safe_clone(
                    #     [model.train_cost, model.preactiv_out], replace)
                    # nw_gvs = TT.Lop(
                    #     nw_preactiv_out, model.params,
                    #     TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                    #            model.params, args))
                    #
                    # Gvs = [ogv + ngv
                    #        for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    # return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode

            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(cgv, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              rtol=options['mrtol'],
                              shift=-options['mreg'],
                              maxit=options['miters'],
                              mode=mode,
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        # Infinity norm (largest absolute entry) of the Riemannian direction
        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs[:1], shared_data[:1])]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])
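        # The outputs above are MinRes diagnostics (termination flag,
        # iteration count, residual norms, condition-number estimates) plus
        # the two gradient norms; the Riemannian direction itself is written
        # into self.rs through `updates`.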

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'Constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       mode=gpu_mode,
                                       profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             mode=mode,
                                             profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
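Both Gv_step closures above assemble the Gauss-Newton metric-vector product
from Theano's Rop (Jacobian-times-vector) and Lop (vector-times-Jacobian). A
minimal numpy sketch of the same product for a toy model with an explicit
Jacobian (all names below are illustrative, not taken from the code above):

import numpy as np

rng = np.random.RandomState(0)
n_out, n_params = 3, 5
J = rng.randn(n_out, n_params)   # Jacobian of the outputs w.r.t. parameters
v = rng.randn(n_params)          # vector to multiply the metric with
factor = np.float32(n_out)       # plays the role of const(options['cbs'])

Jv = J.dot(v)                    # what TT.Rop(nw_out, params, v) computes
Gv = J.T.dot(Jv / factor)        # what TT.Lop(nw_out, params, Jv / factor) computes
assert np.allclose(Gv, J.T.dot(J).dot(v) / factor)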
Example #53
0
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Maximum number of iterations for the MinRes solver
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag indicating whether Theano profiling should be on
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
                `jreg` -> float
                    Value used to seed the Jacobi diagonal accumulator
                    (regularizes the preconditioner)
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load, containing
                the data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        eps = numpy.float32(1e-24)
        xdata = theano.shared(data['train_x'],
                              name='xdata')
        ydata = theano.shared(data['train_y'],
                              name='ydata')
        self.xdata = xdata
        self.ydata = ydata
        shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']

        # Store Euclidean gradients
        self.gs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Riemannian gradients
        self.rs = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]
        # Store Jacobi diagonal (used to precondition MinRes)
        self.js = [theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                   for shp in model.params_shape]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'

        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]

            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            # Estimate the diagonal of the Gauss-Newton metric (the Jacobi
            # preconditioner) with a random sign probe: for r with
            # independent +-1 entries, E[(J^T r)^2] = diag(J^T J), and the
            # sqrt factors below fold in the output-space metric.
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None]*n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout * (numpy.float32(1) -
                                               tnwout))*factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in model.params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1+n_params:1+2*n_params], model.params)]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
              for shp in model.params_shape]
        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              mode=gpu_mode,
                              name='grad_loop',
                              profile=options['profile'])

        # Average the accumulated gradients over the n_steps chunks; the
        # Jacobi accumulator keeps its running sum (it was seeded with
        # const(options['jreg']) above).
        nw_gs = [x[0] / const(n_steps) for x in rvals[1: 1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params: 1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, shared_data)]

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])
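        # The function returns nothing; it writes the averaged gradients and
        # the Jacobi accumulator into self.gs / self.js through `updates`.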
        #theano.printing.pydotprint(self.compute_eucledian_gradients,
        #        'eucledian_grad', scan_graphs=True)

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']
        self.damping = theano.shared(numpy.float32(options['mreg']))
        mode = gpu_mode
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])
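                    # `factor` encodes the output-space metric: the softmax
                    # branch divides by the predicted probabilities (the
                    # categorical Fisher), while for sigmoid outputs nw_out
                    # is the pre-activation, so the s(1 - s) term is applied
                    # explicitly in the else branch below.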
                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                     TT.Rop(nw_out, loc_params, loc_args) /\
                                     factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *
                                         tnwout * (1 - tnwout) / factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates


        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))

        # MinRes approximately solves the damped, Jacobi-preconditioned
        # metric system for the natural-gradient direction; the right-hand
        # side is normalized by ||g|| and the solution rescaled below.
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms=self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            mode=mode,
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:
                           (rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, shared_data)]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'Constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc0, acc1):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_cost2 = safe_clone(model.train_cost,
                                  replace=dict(zip(model.inputs, nw_inps)))
            return [_idx + const(1),
                    acc0 + nw_cost,
                    acc1 + nw_cost2]

        acc0 = const([0])
        acc1 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0, acc1],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1].sum() / const(n_steps)
        cost0 = rvals[2].sum() / const(n_steps)
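        # final_cost is the average cost at the tentative parameters nw_ps;
        # cost0 is the average cost at the current parameters, so their
        # difference is the actual improvement achieved by this step.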
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x,y in zip(loc_inputs, shared_data)]

        denom = -lr * sum(TT.sum(g * r) for g, r in zip(self.gs, self.rs))
        rho = (final_cost - cost0) / denom
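        # rho compares the actual cost change with the first-order
        # prediction -lr * <g, r>, a Levenberg-Marquardt style reduction
        # ratio commonly used to adapt the damping term.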
        print 'Compiling evaluation function'
        self.eval_fn = theano.function(
            [ebdx, lr],
            [final_cost, rho],
            givens=dict(grad_inps),
            on_unused_input='warn',
            updates=updates,
            name='eval_fn',
            mode=gpu_mode,
            profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        self.update_params = theano.function(
            [lr],
            [],
            updates=update_dict,
            name='update_params',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])
        self.options = options
        self.old_cost = numpy.inf
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
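A hypothetical driver loop for the class above (the original training script
is not shown here; the class name `Trainer` and the loop itself are
assumptions, while the compiled functions, `lr`, and the batch permutations
all come from `__init__` above):

trainer = Trainer(options, None, data, model)   # channel=None for a local run
for step in xrange(100):                        # number of steps is arbitrary
    gbdx = trainer.permg[step % trainer.grad_batches]
    rbdx = trainer.permr[step % trainer.metric_batches]
    ebdx = trainer.perme[step % trainer.eval_batches]
    trainer.compute_eucledian_gradients(gbdx)           # fills self.gs, self.js
    stats = trainer.compute_riemannian_gradients(rbdx)  # fills self.rs
    cost, rho = trainer.eval_fn(ebdx, trainer.lr)
    trainer.update_params(trainer.lr)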