Example #1
def matrix_weight_grad_calculator(xs, es, kp_x, kd_x, kp_e, kd_e, shapes, epsilon=1e-7):
    """
    :param xs:
    :param es:
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shapes:
    :param epsilon:
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    v1 = create_shared_variable(np.zeros((n_samples, n_in, n_out)))
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    xr_decayed = xr*rx
    er_decayed = er*re
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    v2 = xr_decayed[:, :, None]*er_decayed[:, None, :]
    dws = (spikes*(v2-v1))/(rx*re-1)
    new_xr = xr_decayed + xs/(kp_x+kd_x)
    new_er = er_decayed + es/(kp_e+kd_e)

    add_update(v1, tt.switch(spikes, new_xr[:, :, None]*new_er[:, None, :], v1))
    add_update(xr, new_xr)
    add_update(er, new_er)

    return dws.sum(axis=0)
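Between spikes, the product of the decayed traces xr and er shrinks geometrically with ratio rx*re, so with q = rx*re and v2 = v1*q**T the (v2 - v1)/(rx*re - 1) term equals the sum of the decayed products over the T steps since v1 was stored. A minimal NumPy check of that identity, using made-up scalar values:

import numpy as np

# With q = rx * re, the decayed product after t steps is v1 * q**t, and
#     (v1 * q**T - v1) / (q - 1) == sum_{t=0}^{T-1} v1 * q**t
rx, re, v1, T = 0.9, 0.8, 0.37, 25           # hypothetical values
q = rx * re
v2 = v1 * q ** T                             # product of the decayed traces
closed_form = (v2 - v1) / (q - 1)            # the expression used for dws
brute_force = sum(v1 * q ** t for t in range(T))
assert np.isclose(closed_form, brute_force)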
Example #2
    def armijo(alpha0, alpha1, phi_a0, phi_a1):
        factor = alpha0**2 * alpha1**2 * (alpha1 - alpha0)
        a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \
            alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)
        a = a / factor
        b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \
            alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)
        b = b / factor

        alpha2 = (-b + TT.sqrt(abs(b**2 - 3 * a * derphi0))) / (3.0 * a)
        phi_a2 = phi(alpha2)

        end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0
        end_condition = TT.bitwise_or(TT.isnan(alpha2), end_condition)
        end_condition = TT.bitwise_or(TT.isinf(alpha2), end_condition)
        alpha2 = TT.switch(
            TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.),
                          one - alpha2 / alpha1 < 0.96), alpha1 / constant(2.),
            alpha2)
        return [alpha1, alpha2, phi_a1, phi_a2], \
                theano.scan_module.until(end_condition)
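The a and b computed above are the leading coefficients of the cubic c(alpha) = a*alpha**3 + b*alpha**2 + derphi0*alpha + phi0 interpolating phi(0), phi'(0), phi(alpha0) and phi(alpha1); alpha2 is its stationary point (-b + sqrt(b**2 - 3*a*derphi0))/(3*a). A quick NumPy sanity check of that minimizer formula, with arbitrary test coefficients:

import numpy as np

# c(alpha) = a*alpha**3 + b*alpha**2 + derphi0*alpha + phi0 with a > 0 has a
# local minimum at alpha2 = (-b + sqrt(b**2 - 3*a*derphi0)) / (3*a).
a, b, derphi0, phi0 = 2.0, -3.0, -1.0, 5.0   # arbitrary test coefficients
alpha2 = (-b + np.sqrt(b ** 2 - 3 * a * derphi0)) / (3.0 * a)
c = lambda x: a * x ** 3 + b * x ** 2 + derphi0 * x + phi0
slope = 3 * a * alpha2 ** 2 + 2 * b * alpha2 + derphi0   # c'(alpha2)
assert abs(slope) < 1e-9                                 # stationary point
assert c(alpha2 - 1e-4) > c(alpha2) < c(alpha2 + 1e-4)   # and a minimum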
Example #3
    def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                     alpha_star, phi_star, derphi_star):
        derphi_a1 = derphi(alpha1)
        cond1 = TT.bitwise_or(phi_a1 > phi0 + c1 * alpha1 * derphi0,
                              TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
        cond2 = abs(derphi_a1) <= -c2 * derphi0
        cond3 = derphi_a1 >= zero
        alpha_star_c1, phi_star_c1, derphi_star_c1 = \
                _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
                      phi, derphi, phi0, derphi0, c1, c2,
                      profile=profile)
        alpha_star_c3, phi_star_c3, derphi_star_c3 = \
                _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi,
                      derphi, phi0, derphi0, c1, c2,
                      profile=profile)
        nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
        nw_phi = phi(nw_alpha1)
        alpha_star, phi_star, derphi_star = \
                ifelse(cond1,
                          (alpha_star_c1, phi_star_c1, derphi_star_c1),
                ifelse(cond2,
                          (alpha1, phi_a1, derphi_a1),
                ifelse(cond3,
                          (alpha_star_c3, phi_star_c3, derphi_star_c3),
                           (nw_alpha1, nw_phi, nan),
                      name='alphastar_c3'),
                      name='alphastar_c2'),
                      name='alphastar_c1')

        return ([alpha1,
                 nw_alpha1,
                 phi_a1,
                 ifelse(lazy_or('allconds',
                                cond1,
                                cond2,
                                cond3),
                        phi_a1,
                        nw_phi,
                        name='nwphi1'),
                 ifelse(cond1, derphi_a0, derphi_a1, name='derphi'),
                 i_t + one,
                 alpha_star,
                 phi_star,
                 derphi_star],
                theano.scan_module.scan_utils.until(
                    lazy_or('until_cond_',
                            TT.eq(nw_alpha1, zero),
                            cond1,
                            cond2,
                            cond3)))
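while_search is the bracketing loop of a strong-Wolfe line search: cond1 flags a sufficient-decrease (Armijo) violation, cond2 the strong curvature condition, and cond3 a nonnegative slope at the trial point; until one of them fires, the trial step is doubled (nw_alpha1). The two Wolfe tests restated in plain Python, with hypothetical values:

def wolfe_conditions(alpha, phi_a, derphi_a, phi0, derphi0, c1=1e-4, c2=0.9):
    """Return (sufficient_decrease, strong_curvature) for step size alpha."""
    armijo = phi_a <= phi0 + c1 * alpha * derphi0
    curvature = abs(derphi_a) <= -c2 * derphi0  # derphi0 < 0 on a descent direction
    return armijo, curvature

# A step that halves the function value and nearly flattens the slope passes both:
print(wolfe_conditions(alpha=0.4, phi_a=0.5, derphi_a=-0.01, phi0=1.0, derphi0=-1.0))
# -> (True, True)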
Example #4
    def armijo(alpha0, alpha1, phi_a0, phi_a1):
        factor = alpha0 ** 2 * alpha1 ** 2 * (alpha1 - alpha0)
        a = alpha0 ** 2 * (phi_a1 - phi0 - derphi0 * alpha1) - \
            alpha1 ** 2 * (phi_a0 - phi0 - derphi0 * alpha0)
        a = a / factor
        b = -alpha0 ** 3 * (phi_a1 - phi0 - derphi0 * alpha1) + \
            alpha1 ** 3 * (phi_a0 - phi0 - derphi0 * alpha0)
        b = b / factor

        alpha2 = (-b + TT.sqrt(abs(b ** 2 - 3 * a * derphi0))) / (3.0 * a)
        phi_a2 = phi(alpha2)

        end_condition = phi_a2 <= phi0 + c1 * alpha2 * derphi0
        end_condition = TT.bitwise_or(
            TT.isnan(alpha2), end_condition)
        end_condition = TT.bitwise_or(
            TT.isinf(alpha2), end_condition)
        alpha2 = TT.switch(
            TT.bitwise_or(alpha1 - alpha2 > alpha1 / constant(2.),
                  one - alpha2 / alpha1 < 0.96),
            alpha1 / constant(2.),
            alpha2)
        return [alpha1, alpha2, phi_a1, phi_a2], \
                theano.scan_module.until(end_condition)
Example #5
def past_weight_grad_calculator_reloaded(xs, es, kp_x, kd_x, kp_e, kd_e,
                                         shapes):
    """
    Do an efficient update of the weights given the two spike-trains.

    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: RESOLVE INSTABILITY ISSUE
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)

    tx_last = create_shared_variable(np.zeros((n_samples, n_in)))
    te_last = create_shared_variable(np.zeros((n_samples, n_out)))
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    t_last = tt.maximum(tx_last[:, :, None], te_last[:, None, :])
    sum_to_last = geoseries_sum(
        rx * re, t_start=t_last, t_end=0
    )  # Wasteful, since most of this is multiplied by zeros later, but for now it doesn't matter

    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    dw_es = (
        xr[:, :, None] * er[:, None, :] * spikes
    ) * sum_to_last  # PROBLEM HERE!!!! Can be a very small number times a very large number
    # dw_es = (xr[:, :, None]*(x_spikes[:, :, None]-x_spikes[:, :, None]*e_spikes[:, None, :]) * er[:, None, :] + xr[:, :, None] * (er*e_spikes)[:, None, :]) * sum_to_last
    add_update(xr, xr * rx + xs / (kp_x + kd_x))
    add_update(er, er * re + es / (kp_e + kd_e))
    add_update(tx_last, tt.switch(x_spikes, 0, tx_last - 1))
    add_update(te_last, tt.switch(e_spikes, 0, te_last - 1))

    return dw_es.sum(axis=0)
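The flagged instability comes from evaluating the two factors separately: after dt silent steps the trace product has decayed like (rx*re)**dt, while the geometric-series term grows like (rx*re)**(-dt) (assuming geoseries_sum(r, t_start, t_end) sums r**t over the exponent range tracked by t_last, which is decremented below zero between spikes). Their product is moderate, but each factor alone leaves float32 range. A toy demonstration with made-up numbers:

import numpy as np

r = np.float32(0.875)            # hypothetical decay ratio rx * re
dt = np.float32(1200)            # steps since the last spike on this synapse
trace = r ** dt                  # ~ 2.6e-70: underflows to 0.0 in float32
geo = r ** (-dt)                 # ~ 3.9e+69: overflows to inf in float32
print(trace, geo, trace * geo)   # prints 0.0 inf nan instead of a finite value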
Example #6
def matrix_weight_grad_calculator(xs,
                                  es,
                                  kp_x,
                                  kd_x,
                                  kp_e,
                                  kd_e,
                                  shapes,
                                  epsilon=1e-7):
    """
    :param xs:
    :param es:
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shapes:
    :param epsilon:
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    v1 = create_shared_variable(np.zeros((n_samples, n_in, n_out)))
    rx = kd_x / (kp_x + kd_x)
    re = kd_e / (kp_e + kd_e)
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    xr_decayed = xr * rx
    er_decayed = er * re
    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    v2 = xr_decayed[:, :, None] * er_decayed[:, None, :]
    dws = (spikes * (v2 - v1)) / (rx * re - 1)
    new_xr = xr_decayed + xs / (kp_x + kd_x)
    new_er = er_decayed + es / (kp_e + kd_e)

    add_update(v1,
               tt.switch(spikes, new_xr[:, :, None] * new_er[:, None, :], v1))
    add_update(xr, new_xr)
    add_update(er, new_er)

    return dws.sum(axis=0)
Example #7
    def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                     alpha_star, phi_star, derphi_star):
        derphi_a1 = derphi(alpha1)
        cond1 = TT.bitwise_or(phi_a1 > phi0 + c1 * alpha1 * derphi0,
                              TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
        cond2 = abs(derphi_a1) <= -c2 * derphi0
        cond3 = derphi_a1 >= zero
        alpha_star_c1, phi_star_c1, derphi_star_c1 = \
                _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
                      phi, derphi, phi0, derphi0, c1, c2,
                      profile=profile, mode=mode)
        alpha_star_c3, phi_star_c3, derphi_star_c3 = \
                _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi,
                      derphi, phi0, derphi0, c1, c2,
                      profile=profile, mode=mode)
        nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
        nw_phi = phi(nw_alpha1)
        alpha_star, phi_star, derphi_star = \
                ifelse(cond1,
                          (alpha_star_c1, phi_star_c1, derphi_star_c1),
                ifelse(cond2,
                          (alpha1, phi_a1, derphi_a1),
                ifelse(cond3,
                          (alpha_star_c3, phi_star_c3, derphi_star_c3),
                           (nw_alpha1, nw_phi, nan),
                      name='alphastar_c3'),
                      name='alphastar_c2'),
                      name='alphastar_c1')

        return ([alpha1,
                 nw_alpha1,
                 phi_a1,
                 ifelse(lazy_or('allconds', cond1, cond2, cond3),
                        phi_a1, nw_phi, name='nwphi1'),
                 ifelse(cond1, derphi_a0, derphi_a1, name='derphi'),
                 i_t + one,
                 alpha_star,
                 phi_star,
                 derphi_star],
                theano.scan_module.scan_utils.until(
                    lazy_or('until_cond_', TT.eq(nw_alpha1, zero),
                            cond1, cond2, cond3)))
Example #8
def past_weight_grad_calculator_reloaded(xs, es, kp_x, kd_x, kp_e, kd_e, shapes):
    """
    Do an efficient update of the weights given the two spike-trains.

    This isn't actually implemented as an efficient update, but it will produce the identical result as if it were.

    :param xs: An (n_samples, n_in) array
    :param es: An (n_samples, n_out) array
    :param kp_x: kp for the x units
    :param kd_x: kd for the x units
    :param kp_e: kp for the e units
    :param kd_e: kd for the e units
    :param shapes: (minibatch_size, n_in, n_out)
    :return: An (n_in, n_out) approximate weight gradient.
    """
    # TODO: RESOLVE INSTABILITY ISSUE
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_samples, n_in, n_out = shapes
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    tx_last = create_shared_variable(np.zeros((n_samples, n_in)))
    te_last = create_shared_variable(np.zeros((n_samples, n_out)))
    xr = create_shared_variable(np.zeros((n_samples, n_in)))
    er = create_shared_variable(np.zeros((n_samples, n_out)))

    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    t_last = tt.maximum(tx_last[:, :, None], te_last[:, None, :])
    sum_to_last = geoseries_sum(rx*re, t_start=t_last, t_end=0)  # Wasteful, since most of this is multiplied by zeros later, but for now it doesn't matter

    spikes = tt.bitwise_or(x_spikes[:, :, None], e_spikes[:, None, :])
    dw_es = (xr[:, :, None]*er[:, None, :]*spikes)*sum_to_last  # PROBLEM HERE!!!! Can be a very small number times a very large number
    # dw_es = (xr[:, :, None]*(x_spikes[:, :, None]-x_spikes[:, :, None]*e_spikes[:, None, :]) * er[:, None, :] + xr[:, :, None] * (er*e_spikes)[:, None, :]) * sum_to_last
    add_update(xr, xr*rx + xs/(kp_x+kd_x))
    add_update(er, er*re + es/(kp_e+kd_e))
    add_update(tx_last, tt.switch(x_spikes, 0, tx_last-1))
    add_update(te_last, tt.switch(e_spikes, 0, te_last-1))

    return dw_es.sum(axis=0)
Example #9
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2,
          n_iters=10,
          profile=False,
          mode=theano.Mode(linker='cvm')):
    """
    TODO: re-write me

    Part of the optimization algorithm in `scalar_search_wolfe2`.
    a_lo : scalar (step size)
    a_hi : scalar (step size)
    phi_lo : scalar (value of f at a_lo)
    phi_hi : scalar ( value of f at a_hi)
    derphi_lo : scalar ( value of derivative at a_lo)
    phi : callable -> generates computational graph
    derphi: callable -> generates computational graph
    phi0 : scalar ( value of f at 0)
    derphi0 : scalar (value of the derivative at 0)
    c1 : scalar  (wolfe parameter)
    c2 : scalar  (wolfe parameter)
    profile: if you want printouts of profiling information
    """
    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # interpolate to find a trial step length between a_lo and
        # a_hi Need to choose interpolation here.  Use cubic
        # interpolation and then if the result is within delta *
        # dalpha or outside of the interval bounded by a_lo or a_hi
        # then use quadratic interpolation, if the result is still too
        # close, then use bisection
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                              a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',
                         TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo +
                             numpy.asarray(0.5, dtype=theano.config.floatX) *
                             dalpha, a_j_quad)

        # pick between the two ..
        cond_c = lazy_or('condc',
                         TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1, phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo), name='phi_rec')
        a_rec = ifelse(cond1, a_hi,
                       TT.switch(cond2, a_hi, a_lo), name='a_rec')
        a_hi = ifelse(cond1, a_j,
                      TT.switch(cond2, a_lo, a_hi), name='a_hi')
        phi_hi = ifelse(cond1, phi_aj,
                        TT.switch(cond2, phi_lo, phi_hi), name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan,
                          TT.switch(cond2, derphi_aj, nan), name='valprime')

        return ([phi_rec,
                 a_rec,
                 a_lo,
                 a_hi,
                 phi_hi,
                 phi_lo,
                 derphi_lo,
                 a_star,
                 val_star,
                 valprime],
                theano.scan_module.scan_utils.until(stop))

    maxiter = n_iters
    delta1 = TT.constant(numpy.asarray(0.2,
                                       dtype=theano.config.floatX))  # cubic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1,
                                       dtype=theano.config.floatX))  # quadratic interpolant check
    phi_rec = phi0
    a_rec = zero

    # Initial iteration

    dalpha = a_hi - a_lo
    a = TT.switch(dalpha < zero, a_hi, a_lo)
    b = TT.switch(dalpha < zero, a_lo, a_hi)
    #a = ifelse(dalpha < 0, a_hi, a_lo)
    #b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection

    # quadratic interpolation
    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',
                     TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk,
                                   a_j < a + qchk))

    a_j = TT.switch(cond_q, a_lo +
                    numpy.asarray(0.5, dtype=theano.config.floatX) *
                    dalpha, a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse(cond1, phi_hi,
                     TT.switch(cond2, phi_hi, phi_lo), name='mphirec')
    a_rec = ifelse(cond1, a_hi,
                   TT.switch(cond2, a_hi, a_lo), name='marec')
    a_hi = ifelse(cond1, a_j,
                  TT.switch(cond2, a_lo, a_hi), name='mahi')
    phi_hi = ifelse(cond1, phi_aj,
                    TT.switch(cond2, phi_lo, phi_hi), name='mphihi')

    onlyif = lazy_and('only_if',
                      TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                     phi_aj < phi_lo),
                      abs(derphi_aj) <= -c2 * derphi0)

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main')
    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(phi_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    print('while_zoom')
    outs, updates = scan(while_zoom,
                         states=states,
                         n_steps=maxiter,
                         name='while_zoom',
                         mode=mode,
                         profile=profile)
    print('done_while')
    a_star = ifelse(onlyif, a_j, outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
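_cubicmin and _quadmin are not part of this listing; a minimal NumPy sketch of what they compute, modeled on the like-named helpers in scipy.optimize's Wolfe line search (the guards against degenerate denominators that the real helpers carry are omitted here):

import numpy as np

def _quadmin(a, fa, fpa, b, fb):
    """Minimizer of the quadratic interpolating f(a), f'(a), f(b)."""
    db = b - a
    B = (fb - fa - fpa * db) / (db * db)
    return a - fpa / (2.0 * B)

def _cubicmin(a, fa, fpa, b, fb, c, fc):
    """Minimizer of the cubic interpolating f(a), f'(a), f(b), f(c)."""
    C = fpa
    db, dc = b - a, c - a
    denom = (db * dc) ** 2 * (db - dc)
    d1 = np.array([[dc ** 2, -db ** 2],
                   [-dc ** 3, db ** 3]])
    A, B = d1.dot([fb - fa - C * db, fc - fa - C * dc]) / denom
    return a + (-B + np.sqrt(B * B - 3 * A * C)) / (3 * A)

When the radical goes negative or a denominator vanishes, these produce nan, which is exactly what the TT.isnan guards above test for.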
Example #10
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # interpolate to find a trial step length between a_lo and
        # a_hi Need to choose interpolation here.  Use cubic
        # interpolation and then if the result is within delta *
        # dalpha or outside of the interval bounded by a_lo or a_hi
        # then use quadratic interpolation, if the result is still too
        # close, then use bisection
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                              a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',
                         TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo +
                             numpy.asarray(0.5, dtype=theano.config.floatX) * \
                             dalpha, a_j_quad)

        # pick between the two ..
        cond_c = lazy_or('condc',
                         TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1,
                         phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo),
                         name='phi_rec')
        a_rec = ifelse(cond1,
                       a_hi,
                       TT.switch(cond2, a_hi, a_lo),
                       name='a_rec')
        a_hi = ifelse(cond1, a_j,
                      TT.switch(cond2, a_lo, a_hi),
                      name='a_hi')
        phi_hi = ifelse(cond1, phi_aj,
                        TT.switch(cond2, phi_lo, phi_hi),
                        name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan,
                          TT.switch(cond2, derphi_aj, nan), name='valprime')

        return ([phi_rec,
                 a_rec,
                 a_lo,
                 a_hi,
                 phi_hi,
                 phi_lo,
                 derphi_lo,
                 a_star,
                 val_star,
                 valprime],
                theano.scan_module.scan_utils.until(stop))
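The step-proposal logic above cascades from cubic to quadratic to bisection. The same policy in plain Python (an illustrative sketch; it assumes a_lo < a_hi so the cchk/qchk bands have positive width):

import math

def propose_step(a_lo, a_hi, a_cubic, a_quad, delta1=0.2, delta2=0.1):
    dalpha = a_hi - a_lo
    cchk, qchk = delta1 * dalpha, delta2 * dalpha
    if not math.isnan(a_cubic) and a_lo + cchk < a_cubic < a_hi - cchk:
        return a_cubic                 # cubic step, safely interior
    if not math.isnan(a_quad) and a_lo + qchk < a_quad < a_hi - qchk:
        return a_quad                  # quadratic fallback
    return a_lo + 0.5 * dalpha         # bisection fallback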
Example #11
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2,
          n_iters=10,
          profile=False):
    """
    WRITEME

    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Parameters
    ----------
    a_lo : float
        Step size
    a_hi : float
        Step size
    phi_lo : float
        Value of f at a_lo
    phi_hi : float
        Value of f at a_hi
    derphi_lo : float
        Value of derivative at a_lo
    phi : callable
        Generates computational graph
    derphi : callable
        Generates computational graph
    phi0 : float
        Value of f at 0
    derphi0 : float
        Value of the derivative at 0
    c1 : float
        Wolfe parameter
    c2 : float
        Wolfe parameter
    profile : bool
        True if you want printouts of profiling information
    """
    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # interpolate to find a trial step length between a_lo and
        # a_hi Need to choose interpolation here.  Use cubic
        # interpolation and then if the result is within delta *
        # dalpha or outside of the interval bounded by a_lo or a_hi
        # then use quadratic interpolation, if the result is still too
        # close, then use bisection
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo,
                              a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq',
                         TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q, a_lo +
                             numpy.asarray(0.5, dtype=theano.config.floatX) * \
                             dalpha, a_j_quad)

        # pick between the two ..
        cond_c = lazy_or('condc',
                         TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1,
                         phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo),
                         name='phi_rec')
        a_rec = ifelse(cond1,
                       a_hi,
                       TT.switch(cond2, a_hi, a_lo),
                       name='a_rec')
        a_hi = ifelse(cond1, a_j,
                      TT.switch(cond2, a_lo, a_hi),
                      name='a_hi')
        phi_hi = ifelse(cond1, phi_aj,
                        TT.switch(cond2, phi_lo, phi_hi),
                        name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan,
                          TT.switch(cond2, derphi_aj, nan), name='valprime')

        return ([phi_rec,
                 a_rec,
                 a_lo,
                 a_hi,
                 phi_hi,
                 phi_lo,
                 derphi_lo,
                 a_star,
                 val_star,
                 valprime],
                theano.scan_module.scan_utils.until(stop))

    maxiter = n_iters
    # cubic interpolant check
    delta1 = TT.constant(numpy.asarray(0.2,
                                       dtype=theano.config.floatX))
    # quadratic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1,
                                       dtype=theano.config.floatX))
    phi_rec = phi0
    a_rec = zero

    # Initial iteration

    dalpha = a_hi - a_lo
    a = TT.switch(dalpha < zero, a_hi, a_lo)
    b = TT.switch(dalpha < zero, a_lo, a_hi)
    #a = ifelse(dalpha < 0, a_hi, a_lo)
    #b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection

    # quadratic interpolation
    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q',
                     TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk,
                                   a_j < a + qchk))

    a_j = TT.switch(cond_q, a_lo +
                    numpy.asarray(0.5, dtype=theano.config.floatX) * \
                    dalpha, a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse(cond1,
                     phi_hi,
                     TT.switch(cond2, phi_hi, phi_lo),
                     name='mphirec')
    a_rec = ifelse(cond1,
                   a_hi,
                   TT.switch(cond2, a_hi, a_lo),
                   name='marec')
    a_hi = ifelse(cond1,
                  a_j,
                  TT.switch(cond2, a_lo, a_hi),
                  name='mahi')
    phi_hi = ifelse(cond1,
                    phi_aj,
                    TT.switch(cond2, phi_lo, phi_hi),
                    name='mphihi')

    onlyif = lazy_and('only_if',
                      TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                     phi_aj < phi_lo),
                      abs(derphi_aj) <= -c2 * derphi0)

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main')
    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'
    vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')
    states = []
    states += [TT.unbroadcast(TT.shape_padleft(phi_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    # print'while_zoom'
    outs, updates = scan(while_zoom,
                         states=states,
                         n_steps=maxiter,
                         name='while_zoom',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    # print 'done_while'
    a_star = ifelse(onlyif, a_j, outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
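The closing warning applies here too: when the graphs built by phi/derphi contain stateful ops such as random streams, scan reports their state transitions through updates, and dropping them silently freezes that state. A minimal sketch of the standard pattern with the plain theano.scan API (toy step function, names assumed):

import theano
import theano.tensor as TT
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=1234)

def step(acc):
    # the random draw is what makes scan return non-empty updates
    return acc + srng.uniform(())

outs, updates = theano.scan(step, outputs_info=[TT.constant(0.0)], n_steps=5)
# compiling WITH the updates keeps the random state advancing between calls
f = theano.function([], outs[-1], updates=updates)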
Example #12
    def init_gpu(self, options, channel, data, model):
        # Step 1. Compile function for computing Euclidean gradients
        eps = numpy.float32(1e-24)
        gbdx = TT.iscalar('grad_batch_idx')
        n_params = len(self.model.params)
        print('Constructing grad function')
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1: 1 + n_params], gs)]
            # Compute Jacobi preconditioner terms (diagonal curvature estimate)
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout * (numpy.float32(1) -
                                               tnwout)) * factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [x for x in model.params if
                              x in theano.gof.graph.inputs([nw_out])]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [oj + final_results[p] for oj, p in
                     zip(args[1 + n_params:1 + 2 * n_params], model.params)]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
              for shp in model.params_shape]
        ij = [TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        print('Compiling grad function')
        self.compute_eucledian_gradients = theano.function(
            [gbdx],
            [],
            updates=updates,
            givens=dict(grad_inps),
            allow_input_downcast=True,
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')
        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp)
                  for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [x for x in model.params
                                  if x in theano.gof.graph.inputs([nw_out])]
                    loc_args = [x for x, y in zip(args, model.params)
                                if y in theano.gof.graph.inputs([nw_out])]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) / factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *
                                         tnwout * (1 - tnwout) / factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [ogv + final_results[param]
                       for (ogv, param) in zip(gv_args[1:], model.params)]
                return [gv_args[0] + const(1)] + Gvs
            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates
        print('Constructing riemannian gradient function')
        norm_grads = TT.sqrt(sum(TT.sum(x ** 2) for x in self.gs))
        self.damping = theano.shared(numpy.float32(options['mreg']))
        rvals = minres.minres(
            compute_Gv,
            [x / norm_grads for x in self.gs],
            Ms=self.js,
            rtol=options['mrtol'],
            shift=self.damping,
            maxit=options['miters'],
            profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0,
                                   TT.max(abs(r)))

        reset = TT.scalar(dtype='int8', name='reset')

        norm_kkm1 = sum([(r * g).sum() for r, g in zip(self.rs, self.gs)])
        norm_kk = sum([(r * g).sum() for r, g in zip(nw_rs, self.gs)])
        norm_dk = sum([(d * g).sum() for d, g in zip(self.ds, self.gs)])

        norm_y = norm_kk - 2 * norm_kkm1 + self.norm_km1km1
        beta_k = (norm_kk - norm_kkm1) / (norm_dk - self.norm_dkm1) - \
                2 * norm_y * (norm_dk / ((norm_dk - self.norm_dkm1) ** 2))
        beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)),
                           beta_k)
        beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k),
                                         TT.isinf(beta_k)),
                           TT.constant(numpy.float32(0.)),
                           beta_k)

        nwds = [-r + beta_k * d for r, d in zip(nw_rs, self.ds)]
        self.nwds = nwds
        nw_normd = TT.sqrt(sum([(d * d).sum() for d in nwds])) + \
                numpy.float32(1e-25)

        updates.update(dict(zip(self.rs, nw_rs)))
        updates.update(dict(zip(self.ds, nwds)))
        updates[self.norm_km1km1] = norm_kk
        updates[self.norm_dkm1] = norm_dk
        updates[self.norm_d] = nw_normd
        print('Compiling riemannian gradient function')
        cst = time.time()
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.compute_riemannian_gradients = theano.function(
            [reset, rbdx],
            [flag,
             niters,
             rel_residual,
             rel_Aresidual,
             Anorm,
             Acond,
             xnorm,
             Axnorm,
             norm_grads,
             norm_ord0,
             beta_k],
            updates=updates,
            allow_input_downcast=True,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        print('Time to compile Riemannian', print_time(time.time() - cst))
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print('Constructing evaluation function')
        lr = TT.scalar('lr')
        newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
        nw_ds = [-r for r in self.rs]
        nw_normd = TT.sqrt(sum([(r * r).sum() for r in self.rs]))
        self.update_params = theano.function([lr],
                                             updates=dict(zip(model.params,
                                                              newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=gpu_mode,
                                             profile=options['profile'])
        self.reset_directions = theano.function([],
                                                updates=dict(zip(self.ds +
                                                                 [self.norm_d],
                                                                 nw_ds +
                                                                 [nw_normd])),
                                                name='reset_dirs',
                                                on_unused_input='warn',
                                                mode=cpu_mode,
                                                allow_input_downcast=True,
                                                profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']
        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_cost_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_cost_step',
                        profile=options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        ebdx = TT.iscalar('ebdx')
        grad_inps = [(x, y[ebdx * options['ebs']:
                           (ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.ls_cost_fn = theano.function(
            [lr, ebdx],
            fcost,
            givens=grad_inps,
            allow_input_downcast=True,
            name='ls_cost_fn',
            mode=gpu_mode,
            profile=options['profile'])

        self.approx_change = theano.function(
            [lr],
            -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.ds)]),
            allow_input_downcast=True,
            name='approx_change',
            mode=gpu_mode,
            profile=options['profile'])


        self.ls_grad_fn = theano.function(
            [lr, ebdx],
            fgrad,
            allow_input_downcast=True,
            givens=grad_inps,
            name='ls_grad_fn',
            mode=gpu_mode,
            profile=options['profile'])

        self.old_score = 50000
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(
                model.err, replace=replace), 'float32')
            return [_idx + const(1),
                    acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=cpu_mode,
                                             allow_input_downcast=True,
                                             on_unused_input='warn',
                                             profile=options['profile'])
        print('Compile eval time', print_time(time.time() - cst))
        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
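compute_Gv applies the generalized Gauss-Newton metric without materializing it: TT.Rop pushes the candidate direction through the model's Jacobian, the result is rescaled by the per-output metric (factor), and TT.Lop pulls it back to parameter space. For a toy model with an explicit Jacobian the same product can be written directly; a NumPy sketch with hypothetical values:

import numpy as np

rng = np.random.default_rng(0)
J = rng.standard_normal((4, 3))   # Jacobian of the outputs w.r.t. the params
metric = 1.0 / 4.0                # e.g. 1/cbs for the plain (non-softmax) case
v = rng.standard_normal(3)        # direction to multiply by

# Lop(out, params, Rop(out, params, v) / factor) computes J^T (J v / factor):
Gv = J.T @ ((J @ v) * metric)
G = J.T @ (metric * J)            # the explicit metric, for comparison only
assert np.allclose(Gv, G @ v)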
Example #13
    def init_gpu(self, options, channel, data, model):
        # Step 1. Compile function for computing Euclidean gradients
        eps = numpy.float32(1e-24)
        gbdx = TT.iscalar('grad_batch_idx')
        n_params = len(self.model.params)
        print('Constructing grad function')
        loc_inputs = [x.type() for x in model.inputs]
        srng = RandomStreams(numpy.random.randint(1e5))

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            # Compute Jacobi preconditioner terms (diagonal curvature estimate)
            nw_outs = safe_clone(model.outs, replace=replace)
            final_results = dict(zip(model.params, [None] * n_params))
            for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                if out_operator == 'sigmoid':
                    denom = numpy.float32(options['cbs'])
                    #denom *= nw_out
                    #denom *= (numpy.float32(1) - nw_out)
                elif out_operator == 'softmax':
                    denom = numpy.float32(options['cbs'])
                    denom *= (nw_out + eps)
                else:
                    denom = numpy.float32(options['cbs'])
                factor = TT.sqrt(numpy.float32(1) / denom)
                if out_operator == 'sigmoid':
                    tnwout = TT.nnet.sigmoid(nw_out)
                    factor = TT.sqrt(tnwout *
                                     (numpy.float32(1) - tnwout)) * factor
                r = TT.sgn(srng.normal(nw_out.shape))
                r = r * factor
                loc_params = [
                    x for x in model.params
                    if x in theano.gof.graph.inputs([nw_out])
                ]
                jvs = TT.Lop(nw_out, loc_params, r)
                for lp, lj in zip(loc_params, jvs):
                    if final_results[lp] is None:
                        final_results[lp] = TT.sqr(lj)
                    else:
                        final_results[lp] = final_results[lp] + TT.sqr(lj)
            nw_js = [
                oj + final_results[p]
                for oj, p in zip(args[1 + n_params:1 +
                                      2 * n_params], model.params)
            ]
            return [args[0] + const(1)] + nw_gs + nw_js

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        ij = [
            TT.unbroadcast(TT.alloc(const(options['jreg']), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig + ij,
                              n_steps=n_steps,
                              name='grad_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]
        nw_js = [x[0] for x in rvals[1 + n_params:1 + 2 * n_params]]
        updates.update(dict(zip(self.gs + self.js, nw_gs + nw_js)))
        grad_inps = [(x, y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        print('Compiling grad function')
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            allow_input_downcast=True,
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemmanian_batch_idx')

        def compute_Gv(*args):
            idx0 = const([0])
            ep = [TT.alloc(const(0), 1, *shp) for shp in model.params_shape]

            def Gv_step(*gv_args):
                idx = TT.cast(gv_args[0], 'int32')
                nw_inps = [x[idx * options['cbs']: \
                             (idx + 1) * options['cbs']] for x in
                           loc_inputs]
                replace = dict(zip(model.inputs, nw_inps))
                nw_outs = safe_clone(model.outs, replace)
                final_results = dict(
                    zip(model.params, [None] * len(model.params)))
                for nw_out, out_operator in zip(nw_outs, model.outs_operator):
                    loc_params = [
                        x for x in model.params
                        if x in theano.gof.graph.inputs([nw_out])
                    ]
                    loc_args = [
                        x for x, y in zip(args, model.params)
                        if y in theano.gof.graph.inputs([nw_out])
                    ]
                    if out_operator == 'softmax':
                        factor = const(options['cbs']) * (nw_out + eps)
                    elif out_operator == 'sigmoid':
                        factor = const(
                            options['cbs'])  # * nw_out * (1 - nw_out)
                    else:
                        factor = const(options['cbs'])

                    if out_operator != 'sigmoid':
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) / factor)
                    else:
                        tnwout = TT.nnet.sigmoid(nw_out)
                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params,
                                                loc_args) *
                                         tnwout * (1 - tnwout) / factor)

                    for lp, lgv in zip(loc_params, loc_Gvs):
                        if final_results[lp] is None:
                            final_results[lp] = lgv
                        else:
                            final_results[lp] += lgv

                Gvs = [
                    ogv + final_results[param]
                    for (ogv, param) in zip(gv_args[1:], model.params)
                ]
                return [gv_args[0] + const(1)] + Gvs

            states = [idx0] + ep
            n_steps = options['mbs'] // options['cbs']
            rvals, updates = scan(Gv_step,
                                  states=states,
                                  n_steps=n_steps,
                                  mode=theano.Mode(linker='cvm'),
                                  name='Gv_step',
                                  profile=options['profile'])

            final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
            return final_Gvs, updates

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        self.damping = theano.shared(numpy.float32(options['mreg']))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              Ms=self.js,
                              rtol=options['mrtol'],
                              shift=self.damping,
                              maxit=options['miters'],
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        reset = TT.scalar(dtype='int8', name='reset')

        norm_kkm1 = sum([(r * g).sum() for r, g in zip(self.rs, self.gs)])
        norm_kk = sum([(r * g).sum() for r, g in zip(nw_rs, self.gs)])
        norm_dk = sum([(d * g).sum() for d, g in zip(self.ds, self.gs)])

        norm_y = norm_kk - 2 * norm_kkm1 + self.norm_km1km1
        beta_k = (norm_kk - norm_kkm1)/(norm_dk - self.norm_dkm1) - \
                2 * norm_y * (norm_dk/((norm_dk - self.norm_dkm1) **2))
        beta_k = TT.switch(reset, TT.constant(numpy.float32(0.)), beta_k)
        beta_k = TT.switch(TT.bitwise_or(TT.isnan(beta_k), TT.isinf(beta_k)),
                           TT.constant(numpy.float32(0.)), beta_k)

        nwds = [-r + beta_k * d for r, d in zip(nw_rs, self.ds)]
        self.nwds = nwds
        nw_normd = TT.sqrt(sum([(d*d).sum() for d in nwds])) + \
                numpy.float32(1e-25)

        updates.update(dict(zip(self.rs, nw_rs)))
        updates.update(dict(zip(self.ds, nwds)))
        updates[self.norm_km1km1] = norm_kk
        updates[self.norm_dkm1] = norm_dk
        updates[self.norm_d] = nw_normd
        print 'Compiling riemannian gradient function'
        cst = time.time()
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.compute_riemannian_gradients = theano.function(
            [reset, rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0, beta_k
            ],
            updates=updates,
            allow_input_downcast=True,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            mode=cpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        print 'Time to compile Riemannian', print_time(time.time() - cst)
        cst = time.time()
        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'Constructing evaluation function'
        lr = TT.scalar('lr')
        newparams = [p + lr * d for p, d in zip(model.params, self.ds)]
        nw_ds = [-r for r in self.rs]
        nw_normd = TT.sqrt(sum([(r * r).sum() for r in self.rs]))
        self.update_params = theano.function([lr],
                                             updates=dict(
                                                 zip(model.params, newparams)),
                                             name='update_params',
                                             on_unused_input='warn',
                                             allow_input_downcast=True,
                                             mode=gpu_mode,
                                             profile=options['profile'])
        self.reset_directions = theano.function(
            [],
            updates=dict(zip(self.ds + [self.norm_d], nw_ds + [nw_normd])),
            name='reset_dirs',
            on_unused_input='warn',
            mode=cpu_mode,
            allow_input_downcast=True,
            profile=options['profile'])

        n_steps = options['ebs'] // options['cbs']

        def ls_cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_cost_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_cost_step',
                        profile=options['profile'])
        fcost = rvals[1][0] / const(n_steps)

        def ls_grad_step(_idx, gws):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [
                x[idx * options['cbs']:(idx + 1) * options['cbs']]
                for x in loc_inputs
            ]
            replace = dict(
                zip(model.inputs + model.params, nw_inps + newparams))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            nw_gs = TT.grad(nw_cost, lr)
            return _idx + numpy.float32(1), gws + nw_gs

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_grad_step,
                        states=states,
                        n_steps=n_steps,
                        name='ls_grad_step',
                        profile=options['profile'])

        fgrad = rvals[1][0] / const(n_steps)
        ebdx = TT.iscalar('ebdx')
        grad_inps = [(x, y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                     for x, y in zip(loc_inputs, self.shared_data)]
        self.ls_cost_fn = theano.function([lr, ebdx],
                                          fcost,
                                          givens=grad_inps,
                                          allow_input_downcast=True,
                                          name='ls_cost_fn',
                                          mode=gpu_mode,
                                          profile=options['profile'])

        self.approx_change = theano.function(
            [lr],
            -lr * sum([TT.sum(g * r) for g, r in zip(self.gs, self.ds)]),
            allow_input_downcast=True,
            name='approx_change',
            mode=gpu_mode,
            profile=options['profile'])

        self.ls_grad_fn = theano.function([lr, ebdx],
                                          fgrad,
                                          allow_input_downcast=True,
                                          givens=grad_inps,
                                          name='ls_grad_fn',
                                          mode=gpu_mode,
                                          profile=options['profile'])

        self.old_score = 50000
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=gpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=cpu_mode,
                                             allow_input_downcast=True,
                                             on_unused_input='warn',
                                             profile=options['profile'])
        print 'Compile eval time', print_time(time.time() - cst)
        self.old_cost = 1e6
        self.options = options
        self.perm = self.rng.permutation(4)
        self.pos = 0
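
The direction update above (`beta_k`, `nwds`) is a nonlinear conjugate-gradient
step with an explicit reset and a NaN/Inf guard. As a reading aid, here is a
minimal plain-numpy sketch of the same update, assuming flattened parameter
vectors and a `state` dict standing in for the shared variables `self.rs`,
`self.norm_km1km1` and `self.norm_dkm1`; it illustrates the formula rather
than reproducing the author's code.

import numpy as np

def cg_direction(r, d, g, state, reset=False):
    # r: new (Riemannian) gradient, d: previous direction,
    # g: Euclidean gradient; `state` caches the previous iteration's
    # inner products, mirroring the shared variables above.
    norm_kk = np.dot(r, g)
    norm_kkm1 = np.dot(state['r'], g)
    norm_dk = np.dot(d, g)
    norm_y = norm_kk - 2.0 * norm_kkm1 + state['norm_km1km1']
    denom = norm_dk - state['norm_dkm1']
    beta_k = (norm_kk - norm_kkm1) / denom - \
        2.0 * norm_y * norm_dk / denom ** 2
    if reset or not np.isfinite(beta_k):  # same guard as the TT.switch
        beta_k = 0.0
    d_new = -r + beta_k * d
    state.update(r=r, norm_km1km1=norm_kk, norm_dkm1=norm_dk)
    return d_new, beta_k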
Example #14
0
def logic_or(x, y):
  return T.bitwise_or(x, y)
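
A minimal usage sketch for the helper above (assuming the usual
`import theano` / `import theano.tensor as T` setup): on int8 tensors, such as
the masks produced by comparison ops, `T.bitwise_or` acts as an elementwise
logical OR, which is exactly how the line-search snippets in this collection
use it.

import numpy as np
import theano
import theano.tensor as T

x = T.bvector('x')  # int8 vectors, e.g. results of comparison ops
y = T.bvector('y')
f = theano.function([x, y], logic_or(x, y))
print(f(np.array([0, 1, 0], dtype='int8'),
        np.array([0, 0, 1], dtype='int8')))  # -> [0 1 1]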
Example #15
0
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi,
                   phi_lo, derphi_lo, a_star, val_star, valprime):
        # Interpolate to find a trial step length between a_lo and a_hi.
        # We need to choose an interpolation scheme here: use cubic
        # interpolation first; if the result is within delta * dalpha of
        # an endpoint, or outside the interval bounded by a_lo and a_hi,
        # fall back to quadratic interpolation; if that result is still
        # too close, use bisection.
        dalpha = a_hi-a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1*dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2*dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq', TT.isnan(a_j_quad),
                         a_j_quad > b - qchk, a_j_quad < a + qchk)
        a_j_quad = TT.switch(cond_q,
                             a_lo + numpy.asarray(0.5, dtype=theano.config.floatX)*dalpha,
                             a_j_quad)


        # pick between the two ..
        cond_c = lazy_or('condc', TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # this lazy if actually decides whether we need to run the
        # quadratic interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        #a_j = ifelse(cond_c, a_j_quad,  a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1*a_j*derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2*derphi0)


        cond1 = TT.bitwise_or(phi_aj > phi0 + c1*a_j*derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj*(a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1, phi_hi,
                         TT.switch(cond2, phi_hi, phi_lo), name='phi_rec')
        a_rec   = ifelse(cond1, a_hi,
                         TT.switch(cond2, a_hi, a_lo), name='a_rec')
        a_hi    = ifelse(cond1, a_j,
                         TT.switch(cond2, a_lo, a_hi), name='a_hi')
        phi_hi  = ifelse(cond1, phi_aj,
                         TT.switch(cond2, phi_lo, phi_hi), name='phi_hi')

        a_lo      = TT.switch(cond1, a_lo, a_j)
        phi_lo    = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan,
                          TT.switch(cond2, derphi_aj, nan), name='valprime')

        return ([phi_rec,
                 a_rec,
                 a_lo,
                 a_hi,
                 phi_hi,
                 phi_lo,
                 derphi_lo,
                 a_star,
                 val_star,
                 valprime],
                theano.scan_module.scan_utils.until(stop))
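
The interpolation helpers `_cubicmin` and `_quadmin` are not shown in this
snippet. As a sketch of the safeguarding logic, here is a plain-numpy analogue
of the quadratic fallback path (`_quadmin_np` and `safeguarded_step` are
illustrative names, not from the source):

import numpy as np

def _quadmin_np(a, fa, fpa, b, fb):
    # Minimizer of the parabola matching f(a), f'(a) and f(b); the
    # symbolic _quadmin above plays the same role.
    C, D = fpa, fa
    db = b - a
    B = (fb - D - C * db) / (db * db)
    return a - C / (2.0 * B)

def safeguarded_step(a_lo, phi_lo, derphi_lo, a_hi, phi_hi, delta2=0.1):
    # If the quadratic trial point is NaN or lands within
    # delta2 * |a_hi - a_lo| of an endpoint, fall back to bisection --
    # the same role the `cond_q` switch plays in while_zoom above.
    dalpha = a_hi - a_lo
    a, b = min(a_lo, a_hi), max(a_lo, a_hi)
    qchk = delta2 * abs(dalpha)
    a_j = _quadmin_np(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    if np.isnan(a_j) or a_j > b - qchk or a_j < a + qchk:
        a_j = a_lo + 0.5 * dalpha  # bisection
    return a_j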
Example #16
0
    def loop(niter,
             beta,
             betan,
             phi,
             Acond,
             cs,
             dbarn,
             eplnn,
             rnorm,
             sn,
             Tnorm,
             rnorml,
             xnorm,
             Dnorm,
             gamma,
             pnorm,
             gammal,
             Axnorm,
             relrnorm,
             relArnorml,
             Anorm,
             flag,
             *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p1  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params: 1 * n_params]
        r1s = args[1 * n_params: 2 * n_params]
        r2s = args[2 * n_params: 3 * n_params]
        r3s = args[3 * n_params: 4 * n_params]
        dls = args[4 * n_params: 5 * n_params]
        ds = args[5 * n_params: 6 * n_params]
        betal = beta
        beta = betan
        vs = [r3 / beta for r3 in r3s]
        r3s, upds = compute_Av(*vs)

        r3s = [r3 - shift * v for r3, v in zip(r3s, vs)]
        r3s = [TT.switch(TT.ge(niter, constantX(1.)),
                         r3 - (beta / betal) * r1,
                         r3) for r3, r1 in zip(r3s, r1s)]

        alpha = inner_product(r3s, vs)
        r3s = [r3 - (alpha / beta) * r2 for r3, r2 in zip(r3s, r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3 / M for r3, M in zip(r3s, Ms)]
            betan = sqrt_inner_product(r2s, r3s)
        else:
            betan = sqrt_inner_product(r3s)
        pnorml = pnorm
        pnorm = TT.switch(TT.eq(niter, constantX(0.)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) +
                                  TT.sqr(beta)))

        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs * dbar + sn * alpha
        gbar = sn * dbar - cs * alpha

        eplnn = sn * betan
        dbarn = -cs * betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs * phi
        phi = sn * phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [TT.switch(TT.neq(gamma, constantX(0.)),
                        (v - epln * dl2 - dlta * dl) / gamma,
                        v)
              for v, dl2, dl in zip(vs, dl2s, dls)]
        d_norm = TT.switch(TT.neq(gamma, constantX(0.)),
                           sqrt_inner_product(ds),
                           constantX(numpy.inf))

        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau * d for x, d in zip(xs, ds)]

        xnorm = sqrt_inner_product(xs)
        xs = [TT.switch(TT.ge(xnorm, maxxnorm),
                        dl2, x)
              for dl2, x in zip(dl2s, xs)]

        flag = TT.switch(TT.ge(xnorm, maxxnorm),
                         constantX(6.), flag)
        # Estimate various norms
        rnorml = rnorm  # ||r_{k-1}||
        Anorml = Anorm
        Acondl = Acond
        relrnorml = relrnorm
        flag_no_6 = TT.neq(flag, constantX(6.))
        Dnorm = TT.switch(flag_no_6,
                          TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, sqrt_inner_product(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6,
                             rnorm / (Anorm * xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(flag_no_6,
                          TT.switch(TT.eq(niter, constantX(0.)),
                                    TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                                    TT.sqrt(TT.sqr(Tnorm) +
                                            TT.sqr(beta) +
                                            TT.sqr(alpha) +
                                            TT.sqr(betan))),
                          Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml * rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        #Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = constantX(1) + relrnorm
        t2 = constantX(1) + relArnorml

        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, constantX(0)),
                          TT.eq(flag, constantX(6))),
            multiple_switch(TT.le(t1, constantX(1)),
                            constantX(3),
                            TT.le(t2, constantX(1)),
                            constantX(4),
                            TT.le(relrnorm, rtol),
                            constantX(1),
                            TT.le(Anorm, constantX(1e-20)),
                            constantX(12),
                            TT.le(relArnorml, rtol),
                            constantX(10),
                            TT.ge(epsx, beta1),
                            constantX(5),
                            TT.ge(xnorm, maxxnorm),
                            constantX(6),
                            TT.ge(niter, TT.cast(maxit,
                                                 theano.config.floatX)),
                            constantX(8),
                            flag),
            flag)

        flag = TT.switch(TT.lt(Axnorm, rtol * Anorm * xnorm),
                         constantX(11.),
                         flag)
        return [niter + constantX(1.),
                beta,
                betan,
                phi,
                Acond,
                cs,
                dbarn,
                eplnn,
                rnorm,
                sn,
                Tnorm,
                rnorml,
                xnorm,
                Dnorm,
                gamma,
                pnorm,
                gammal,
                Axnorm,
                relrnorm,
                relArnorml,
                Anorm,
                flag] + xs + r1s + r2s + r3s + dls + ds, upds, \
                theano.scan_module.scan_utils.until(TT.neq(flag, 0))
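
`multiple_switch` is not defined in this snippet. Judging from the call above,
and from Example #17 below, which spells out the same flag update as explicit
nested TT.switch calls, a plausible sketch of the helper is:

import theano.tensor as TT

def multiple_switch(*args):
    # Sketch (an assumption, not the source): fold alternating
    # (condition, value) arguments with a trailing default into nested
    # switches, so multiple_switch(c1, v1, c2, v2, default) equals
    # TT.switch(c1, v1, TT.switch(c2, v2, default)).
    if len(args) == 1:
        return args[0]  # the default value
    return TT.switch(args[0], args[1], multiple_switch(*args[2:]))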
Example #17
0
    def loop(niter,
             beta,
             betan,
             phi,
             Acond,
             cs,
             dbarn,
             eplnn,
             rnorm,
             sn,
             Tnorm,
             rnorml,
             xnorm,
             Dnorm,
             gamma,
             pnorm,
             gammal,
             Axnorm,
             relrnorm,
             relArnorml,
             Anorm,
             flag,
             *args):
        #-----------------------------------------------------------------
        ## Obtain quantities for the next Lanczos vector vk+1, k = 1, 2,...
        # The general iteration is similar to the case k = 1 with v0 = 0:
        #
        #   p1      = Operator * v1  -  beta1 * v0,
        #   alpha1  = v1'p1,
        #   q2      = p1  -  alpha1 * v1,
        #   beta2^2 = q2'q2,
        #   v2      = (1/beta2) q2.
        #
        # Again, p = betak P vk,  where  P = C**(-1).
        # .... more description needed.
        #-----------------------------------------------------------------
        xs = args[0 * n_params: 1 * n_params]
        r1s = args[1 * n_params: 2 * n_params]
        r2s = args[2 * n_params: 3 * n_params]
        r3s = args[3 * n_params: 4 * n_params]
        dls = args[4 * n_params: 5 * n_params]
        ds = args[5 * n_params: 6 * n_params]
        betal = beta
        beta = betan
        vs = [r3/beta for r3 in r3s]
        r3s = compute_Av(*vs)
        r3s = [r3 + damp*v for r3,v in zip(r3s, vs)]
        r3s = [TT.switch(TT.ge(niter, numpy.float64(1.)),
                         r3 - (beta/betal)*r1,
                         r3) for r3, r1 in zip(r3s, r1s)]

        alpha = sqnorm(r3s, vs)
        r3s = [r3 - (alpha/beta)*r2 for r3,r2 in zip(r3s,r2s)]
        r1s = [r2 for r2 in r2s]
        r2s = [r3 for r3 in r3s]
        if Ms is not None:
            r3s = [r3/M for r3, M in zip(r3s, Ms)]
            betan = norm(r2s, r3s)
        else:
            betan = norm(r3s)
        pnorml = pnorm
        pnorm = TT.switch(TT.eq(niter, npy_floatX(0.)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                          TT.sqrt(TT.sqr(alpha) + TT.sqr(betan) +
                                  TT.sqr(beta)))


        #-----------------------------------------------------------------
        ## Apply previous rotation Qk-1 to get
        #   [dlta_k epln_{k+1}] = [cs  sn][dbar_k    0      ]
        #   [gbar_k  dbar_{k+1} ]   [sn -cs][alpha_k beta_{k+1}].
        #-----------------------------------------------------------------
        dbar = dbarn
        epln = eplnn
        dlta = cs*dbar + sn*alpha
        gbar = sn*dbar - cs*alpha

        eplnn = sn*betan
        dbarn = -cs*betan

        ## Compute the current plane rotation Qk
        gammal2 = gammal
        gammal  = gamma
        cs, sn, gamma = symGivens2(gbar, betan)
        tau = cs*phi
        phi = sn*phi
        Axnorm = TT.sqrt(TT.sqr(Axnorm) + TT.sqr(tau))
        # Update d

        dl2s = [dl for dl in dls]
        dls = [d for d in ds]
        ds = [TT.switch(TT.neq(gamma, npy_floatX(0.)),
                        (v - epln*dl2 - dlta*dl)/gamma,
                        v)
              for v,dl2,dl in zip(vs,dl2s, dls)]
        d_norm = TT.switch(TT.neq(gamma, npy_floatX(0.)),
                           norm(ds),
                           TT.constant(npy_floatX(numpy.inf)))


        # Update x except if it will become too big
        xnorml = xnorm
        dl2s = [x for x in xs]
        xs = [x + tau*d for x,d in zip(xs,ds)]

        xnorm = norm(xs)
        xs = [TT.switch(TT.ge(xnorm, maxxnorm),
                        dl2,
                        x) for dl2,x in zip(dl2s,xs)]

        flag = TT.switch(TT.ge(xnorm, maxxnorm),
                         npy_floatX(6.), flag)
        # Estimate various norms
        rnorml      = rnorm # ||r_{k-1}||
        Anorml      = Anorm
        Acondl      = Acond
        relrnorml   = relrnorm
        flag_no_6 = TT.neq(flag, npy_floatX(6.))
        Dnorm = TT.switch(flag_no_6,
                          TT.sqrt(TT.sqr(Dnorm) + TT.sqr(d_norm)),
                          Dnorm)
        xnorm = TT.switch(flag_no_6, norm(xs), xnorm)
        rnorm = TT.switch(flag_no_6, phi, rnorm)
        relrnorm = TT.switch(flag_no_6,
                             rnorm / (Anorm*xnorm + bnorm),
                             relrnorm)
        Tnorm = TT.switch(flag_no_6,
                          TT.switch(TT.eq(niter, npy_floatX(0.)),
                                    TT.sqrt(TT.sqr(alpha) + TT.sqr(betan)),
                                    TT.sqrt(TT.sqr(Tnorm) +
                                            TT.sqr(beta) +
                                            TT.sqr(alpha) +
                                            TT.sqr(betan))),
                          Tnorm)
        Anorm = TT.maximum(Anorm, pnorm)
        Acond = Anorm * Dnorm
        rootl = TT.sqrt(TT.sqr(gbar) + TT.sqr(dbarn))
        Anorml = rnorml*rootl
        relArnorml = rootl / Anorm

        #---------------------------------------------------------------
        # See if any of the stopping criteria are satisfied.
        # In rare cases, flag is already -1 from above (Abar = const*I).
        #---------------------------------------------------------------
        epsx = Anorm * xnorm * eps
        epsr = Anorm * xnorm * rtol
        #Test for singular Hk (hence singular A)
        # or x is already an LS solution (so again A must be singular).
        t1 = npy_floatX(1) + relrnorm
        t2 = npy_floatX(1) + relArnorml
        flag = TT.switch(
            TT.bitwise_or(TT.eq(flag, npy_floatX(0.)),
                          TT.eq(flag, npy_floatX(6.))),
                      TT.switch(TT.le(t1, npy_floatX(1.)),
                                npy_floatX(3.),
                      TT.switch(TT.le(t2, npy_floatX(1.)),
                                npy_floatX(4.),
                      TT.switch(TT.le(relrnorm, rtol),
                                npy_floatX(1.),
                      TT.switch(TT.le(Anorm, npy_floatX(1e-20)),
                                npy_floatX(12.),
                      TT.switch(TT.le(relArnorml, rtol),
                                npy_floatX(10.),
                      TT.switch(TT.ge(epsx, beta1),
                                npy_floatX(5.),
                      TT.switch(TT.ge(xnorm, maxxnorm),
                                npy_floatX(6.),
                      TT.switch(TT.ge(niter, TT.cast(maxiter,floatX)),
                                npy_floatX(8.),
                                flag)))))))),
            flag)

        flag = TT.switch(TT.lt(Axnorm, rtol*Anorm*xnorm),
                               npy_floatX(11.), flag)
        return [
            niter + npy_floatX(1.),
            beta,
            betan,
            phi,
            Acond,
            cs,
            dbarn,
            eplnn,
            rnorm,
            sn,
            Tnorm,
            rnorml,
            xnorm,
            Dnorm,
            gamma,
            pnorm,
            gammal,
            Axnorm,
            relrnorm,
            relArnorml,
            Anorm,
            flag] + xs + r1s + r2s + r3s + dls + ds, \
                theano.scan_module.scan_utils.until(TT.neq(flag,0))
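
Both MINRES loops rely on `symGivens2`, which is also not shown. For
reference, here is a plain-numpy version of the stable symmetric Givens
rotation it is expected to compute (a sketch under that assumption;
`sym_givens2_np` is an illustrative name):

import numpy as np

def sym_givens2_np(a, b):
    # Returns (c, s, d) with [c s; s -c] @ [a; b] == [d; 0] and
    # d = sqrt(a**2 + b**2) >= 0, branching so that neither a/b nor
    # b/a overflows.
    if b == 0.0:
        c = 1.0 if a == 0.0 else np.sign(a)
        return c, 0.0, abs(a)
    if a == 0.0:
        return 0.0, np.sign(b), abs(b)
    if abs(b) > abs(a):
        t = a / b
        s = np.sign(b) / np.sqrt(1.0 + t * t)
        return s * t, s, b / s
    t = b / a
    c = np.sign(a) / np.sqrt(1.0 + t * t)
    return c, c * t, a / c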
Example #18
0
    def fmin_cg_loop(old_fval, old_old_fval, *rest):
        xks  = rest[:n_elems]
        gfks = rest[n_elems:n_elems * 2]

        maxs = [abs(gfk).max(axis=range(gfk.ndim)) for gfk in gfks]
        if len(maxs) == 1:
            gnorm = maxs[0]
        else:
            gnorm = TT.maximum(maxs[0], maxs[1])
            for dx in maxs[2:]:
                gnorm = TT.maximum(gnorm, dx)

        pks  = rest[n_elems*2:]
        deltak = sum((gfk * gfk).sum() for gfk in gfks)

        old_fval_backup = old_fval
        old_old_fval_backup = old_old_fval

        alpha_k, old_fval, old_old_fval, derphi0, nw_gfks = \
                linesearch.line_search_wolfe2(f, myfprime, xks, pks,
                                              old_fval_backup,
                                              old_old_fval_backup,
                                              profile=profile,
                                              gfks=gfks)

        xks = [ifelse(gnorm <= gtol, xk,
                      ifelse(TT.bitwise_or(TT.isnan(alpha_k),
                                           TT.eq(alpha_k, zero)),
                             xk,
                             xk + alpha_k * pk))
               for xk, pk in zip(xks, pks)]
        gfkp1s_tmp = myfprime(*xks)
        gfkp1s = [ifelse(TT.isnan(derphi0), nw_x, x)
                  for nw_x, x in zip(gfkp1s_tmp, nw_gfks)]

        yks = [gfkp1 - gfk for gfkp1, gfk in izip(gfkp1s, gfks)]
        # Polak-Ribiere formula.
        beta_k = TT.maximum(
                zero,
                sum((x * y).sum() for x, y in izip(yks, gfkp1s)) / deltak)
        pks = [ifelse(gnorm <= gtol, pk,
                      ifelse(TT.bitwise_or(TT.isnan(alpha_k),
                                           TT.eq(alpha_k, zero)),
                             pk,
                             -gfkp1 + beta_k * pk))
               for gfkp1, pk in zip(gfkp1s, pks)]
        gfks = [ifelse(gnorm <= gtol,
                       gfk,
                       ifelse(
                           TT.bitwise_or(TT.isnan(alpha_k),
                                         TT.eq(alpha_k, zero)),
                           gfk,
                           gfkp1))
                for (gfk, gfkp1) in izip(gfks, gfkp1s)]

        # A NaN or zero step size means the line search failed
        # (warnflag = 2 in scipy's fmin_cg terms), so we stop.
        stop = lazy_or(gnorm <= gtol,
                       TT.bitwise_or(TT.isnan(alpha_k),
                                     TT.eq(alpha_k, zero)))
        old_fval = ifelse(gnorm > gtol, old_fval, old_fval_backup)
        old_old_fval = ifelse(gnorm > gtol, old_old_fval,
                              old_old_fval_backup)
        return ([old_fval, old_old_fval] + xks + gfks + pks,
                until(stop))
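
For comparison, here is the same Polak-Ribiere scheme in plain numpy/scipy
terms (a reference sketch rather than the symbolic code's exact behaviour;
`fmin_cg_np` is an illustrative name and the Wolfe search is delegated to
`scipy.optimize.line_search`):

import numpy as np
from scipy.optimize import line_search

def fmin_cg_np(f, grad, x0, gtol=1e-5, maxiter=200):
    # Polak-Ribiere nonlinear CG with a Wolfe line search; stops when
    # the gradient max-norm drops below gtol or the line search fails,
    # mirroring the NaN / zero-alpha guard in fmin_cg_loop above.
    xk = np.asarray(x0, dtype=float)
    gfk = grad(xk)
    pk = -gfk
    old_fval, old_old_fval = f(xk), None
    for _ in range(maxiter):
        if np.abs(gfk).max() <= gtol:
            break
        deltak = gfk.dot(gfk)
        ls = line_search(f, grad, xk, pk, gfk, old_fval, old_old_fval)
        alpha = ls[0]
        if alpha is None:  # failed line search ("warnflag = 2")
            break
        old_fval, old_old_fval = ls[3], ls[4]
        xk = xk + alpha * pk
        gfkp1 = grad(xk)
        yk = gfkp1 - gfk
        beta_k = max(0.0, yk.dot(gfkp1) / deltak)  # Polak-Ribiere(+)
        pk = -gfkp1 + beta_k * pk
        gfk = gfkp1
    return xk

# e.g. fmin_cg_np(lambda x: float((x ** 2).sum()),
#                 lambda x: 2.0 * x, np.ones(3))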