Example #1
def sgd_short_min(L_grad, meta, x0, alpha, gamma, N_iters,
                callback=None, forward_pass_only=True):

    N_safe_sampling = 1000
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    x_init = X.val
    for i in range(N_iters):
        g = L_grad(X.val, meta, i, record_results=True)
        if callback: callback(X.val, V.val, g, i)
        V.mul(gamma).sub((1.0 - gamma) * g)
        X.add(alpha * V.val)
    x_final = X.val
    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        beta = np.linspace(0.001,0.999,N_safe_sampling)
        for i in range(N_safe_sampling)[::-1]:
            x_current = (1-beta[i])*x_init + beta[i]*x_final
            d_v += d_x * alpha
            d_x -= (1.0 - gamma) * L_hvp_x(x_current, meta, d_v, i)
            d_meta -= (1.0 - gamma) * L_hvp_meta(x_current, meta, d_v, i)
            d_v *= gamma
        # assert np.all(ExactRep(x0).val == X.val)
        return d_meta

    return x_final, [None, hypergrad]
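
All of these examples keep the parameters and velocity inside an ExactRep object, whose interface can be read directly off the calls above: a .val array plus chainable add/sub/mul/div (so V.mul(gamma).sub(...) must return the object itself). A minimal, non-exact stand-in with that interface, enough to run the forward passes, is sketched below; the real ExactRep presumably stores an exactly reversible representation, which is what lets the reverse passes reconstruct the trajectory bit-for-bit and satisfy the assert np.all(ExactRep(x0).val == X.val) checks. Note also that the examples are written for Python 2 (print statements, dict.iteritems, list-returning zip/range sliced with [::-1]).

import numpy as np

class ExactRepStandIn:
    """Hypothetical, non-exact stand-in for ExactRep: same interface only."""
    def __init__(self, val):
        self.val = np.array(val, dtype=float)
    def add(self, other):
        self.val = self.val + other
        return self                      # chainable, as in V.mul(gamma).sub(...)
    def sub(self, other):
        self.val = self.val - other
        return self
    def mul(self, other):
        self.val = self.val * other
        return self
    def div(self, other):
        self.val = self.val / other
        return self

# To try the sketches below without the original module, one could alias
# ExactRep = ExactRepStandIn, accepting that the bit-exact reversal asserts
# may then fail because of floating-point round-off.
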
Example #2
def sgd_meta_only(L_grad, meta, x0, alpha, beta, N_iters,
                  callback=None, forward_pass_only=True):
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    for i in range(N_iters):
        g = L_grad(X.val, meta, i, record_results=True)
        if callback: callback(X.val, V.val, g, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val
    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i in range(N_iters)[::-1]:
            X.sub(alpha * V.val)  # Reverse position update
            g = L_grad(X.val, meta, i)  # Evaluate gradient
            V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
            d_v += d_x * alpha
            d_x -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
            d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
            d_v *= beta
        assert np.all(ExactRep(x0).val == X.val)
        return d_meta

    return x_final, [None, hypergrad]
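
A hedged sketch of how sgd_meta_only might be driven end to end: the training gradient is built with autograd, the forward pass returns x_final, and hypergrad is then fed the gradient of a validation loss at x_final, returning d(validation loss)/d(meta). The losses, shapes, and step sizes below are toy placeholders, and the L_grad wrapper accepts the record_results keyword the forward loop passes.

import autograd.numpy as np
from autograd import grad

def train_loss(x, meta, i):
    # Toy regularized quadratic; meta acts as per-weight log-penalties.
    return np.sum((x - 1.0) ** 2) + np.sum(np.exp(meta) * x ** 2)

def L_grad(x, meta, i, record_results=False):
    # The forward loop passes record_results=True; accept and ignore it here.
    return grad(train_loss)(x, meta, i)

valid_loss = lambda x: np.sum((x - 2.0) ** 2)

x0, meta = np.zeros(5), np.zeros(5)
x_final, (_, hypergrad) = sgd_meta_only(L_grad, meta, x0,
                                        alpha=0.1, beta=0.9, N_iters=100,
                                        forward_pass_only=False)
d_meta = hypergrad(grad(valid_loss)(x_final))  # d(valid_loss)/d(meta)
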
Example #3
def sgd4(L_grad, hypers, callback=None, forward_pass_only=True):
    x0, alphas, betas, meta = hypers
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = zip(range(len(alphas)), alphas, betas)
    for i, alpha, beta in iters:
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val

    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(len(alphas)), np.zeros(len(betas))
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x    = grad(grad_proj, 0) # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1) # Returns a size(meta) output.
        for i, alpha, beta in iters[::-1]:
            d_alphas[i] = np.dot(d_x, V.val)
            X.sub(alpha * V.val)               # Reverse position update
            g = L_grad(X.val, meta, i)         # Evaluate gradient
            V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
            d_v += d_x * alpha
            d_betas[i] = np.dot(d_v, V.val + g)
            d_x    -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
            d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
            d_v    *= beta
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta

    return x_final, [None, hypergrad]
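
Reading the forward loop of sgd4 together with its hypergrad closure, each iteration applies the momentum update below, the reverse loop inverts it exactly (which is what ExactRep makes possible), and the adjoints are accumulated in the same order as the code. With g_t = L_grad(x_t, meta, t):

    forward:   v_{t+1} = beta_t * v_t - (1 - beta_t) * g_t
               x_{t+1} = x_t + alpha_t * v_{t+1}

    reverse:   x_t = x_{t+1} - alpha_t * v_{t+1}
               v_t = (v_{t+1} + (1 - beta_t) * g_t) / beta_t

    adjoints:  d_alphas[t]  = dot(d_x, v_{t+1})
               d_v         += alpha_t * d_x
               d_betas[t]   = dot(d_v, v_t + g_t)
               d_x         -= (1 - beta_t) * L_hvp_x(x_t, meta, d_v, t)     # Hessian-vector product
               d_meta      -= (1 - beta_t) * L_hvp_meta(x_t, meta, d_v, t)  # mixed second derivative
               d_v         *= beta_t
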
Example #4
def sgd_meta_only(L_grad, meta, x0, alpha, beta, N_iters,
                  callback=None, forward_pass_only=True):
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    for i in range(N_iters):
        g = L_grad(X.val, meta, i, record_results=True)
        if callback: callback(X.val, V.val, g, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val
    if forward_pass_only:
        return x_final
    def hypergrad(outgrad):
        d_x = outgrad
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x    = grad(grad_proj, 0) # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1) # Returns a size(meta) output.
        for i in range(N_iters)[::-1]:
            X.sub(alpha * V.val)               # Reverse position update
            g = L_grad(X.val, meta, i)         # Evaluate gradient
            V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
            d_v += d_x * alpha
            d_x    -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
            d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
            d_v    *= beta
        assert np.all(ExactRep(x0).val == X.val)
        return d_meta
    return x_final, [None, hypergrad]
Example #5
def sgd4(L_grad, hypers, callback=None, forward_pass_only=True):
    x0, alphas, betas, meta = hypers
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = zip(range(len(alphas)), alphas, betas)
    for i, alpha, beta in iters:
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val

    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(len(alphas)), np.zeros(len(betas))
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i, alpha, beta in iters[::-1]:
            d_alphas[i] = np.dot(d_x, V.val)
            X.sub(alpha * V.val)  # Reverse position update
            g = L_grad(X.val, meta, i)  # Evaluate gradient
            V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
            d_v += d_x * alpha
            d_betas[i] = np.dot(d_v, V.val + g)
            d_x -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
            d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
            d_v *= beta
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta

    return x_final, [None, hypergrad]
def sgd_short_min(L_grad, meta, x0, alpha, gamma, N_iters,
                callback=None, forward_pass_only=True):

    N_safe_sampling = 1000
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    x_init = X.val
    for i in range(N_iters):
        g = L_grad(X.val, meta, i, record_results=True)
        if callback: callback(X.val, V.val, g, i)
        V.mul(gamma).sub((1.0 - gamma) * g)
        X.add(alpha * V.val)
    x_final = X.val
    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        beta = np.linspace(0.001,0.999,N_safe_sampling)
        for i in range(N_safe_sampling)[::-1]:
            x_current = (1-beta[i])*x_init + beta[i]*x_final
            d_v += d_x * alpha
            d_x -= (1.0 - gamma) * L_hvp_x(x_current, meta, d_v, i)
            d_meta -= (1.0 - gamma) * L_hvp_meta(x_current, meta, d_v, i)
            d_v *= gamma
        # assert np.all(ExactRep(x0).val == X.val)
        return d_meta

    return x_final, [None, hypergrad]
Example #7
def sgd(loss_fun, batches, N_iter, x, v, alphas, betas, record_learning_curve=False):
    # TODO: Warp alpha and beta to map from real-valued domains (exp and logistic?)
    def print_perf():
        pass
        if (i + 1) % iter_per_epoch == 0:
            print "End of epoch {0}: loss is {1}".format(i / iter_per_epoch,
                                                        loss_fun(X.val, batches.all_idxs))
            
    X, V = ExactRep(x), ExactRep(v)
    x_orig = X.val
    iter_per_epoch = len(batches)
    num_epochs = N_iter/len(batches) + 1
    iters = zip(range(N_iter), alphas, betas, batches * num_epochs)
    loss_grad = grad(loss_fun)
    loss_hvp = grad(lambda x, d, idxs : np.dot(loss_grad(x, idxs), d))
    learning_curve = [loss_fun(x_orig, batches.all_idxs)]
    for i, alpha, beta, batch in iters:
        V.mul(beta)
        g = loss_grad(X.val, batch)
        V.sub((1.0 - beta) * g)
        X.add(alpha * V.val)
        if record_learning_curve and (i+1) % iter_per_epoch == 0:
            learning_curve.append(loss_fun(X.val, batches.all_idxs))
        #print_perf()

    x_final = X.val
    d_x = loss_grad(X.val, batches.all_idxs)
    loss_final = loss_fun(x_final, batches.all_idxs)
    d_v = np.zeros(d_x.shape)
    d_alphas = deque()
    d_betas = deque()
    print_perf()

    for i, alpha, beta, batch in iters[::-1]:
        print_perf()
        d_v += d_x * alpha
        X.sub(alpha * V.val)
        g = loss_grad(X.val, batch)
        d_alphas.appendleft(np.dot(d_x, V.val))
        V.add((1.0 - beta) * g)
        V.div(beta)
        d_betas.appendleft(np.dot(d_v, V.val + g))
        d_x = d_x - (1.0 - beta) * loss_hvp(X.val, d_v, batch)
        d_v = d_v * beta

    d_alphas = np.array(d_alphas)
    d_betas = np.array(d_betas)

    # print "-"*80
    assert np.all(x_orig == X.val)
    return {'x_final'    : x_final,
            'learning_curve' : learning_curve,
            'loss_final' : loss_final,
            'd_x' : d_x,
            'd_v' : d_v,
            'd_alphas' : d_alphas,
            'd_betas'  : d_betas}
Example #8
def sgd_parsed_mad(L_grad, hypers, parser, callback=None, forward_pass_only=True):
    """This version has alphas and betas be TxN_weight_types matrices.
       parser is a dict containing the indices for the different types of weights."""
    x0, alphas, gammas, meta = hypers
    N_safe_sampling = len(alphas)
    x_init = np.copy(x0)
    x_current = np.copy(x0)
    global v_current
    v_current = np.zeros(x0.size)
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = zip(range(len(alphas)), alphas, gammas)
    for i, alpha, gamma in iters:
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        cur_alpha_vect = fill_parser(parser, alpha)
        cur_gamma_vect  = fill_parser(parser, gamma)
        V.mul(cur_gamma_vect).sub((1.0 - cur_gamma_vect) * g)
        X.add(cur_alpha_vect * V.val)
    x_final = X.val

    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        global v_current
        v = v_current
        d_alphas, d_gammas = np.zeros(alphas.shape), np.zeros(gammas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x    = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        beta = np.linspace(0.001, 0.999, N_safe_sampling)
        for i, alpha, gamma in iters[::-1]:
            # build alpha and beta vector
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_gamma_vect  = fill_parser(parser, gamma)

            x = (1 - beta[i]) * x_init + beta[i] * x_final
            x_previous = (1 - beta[i - 1]) * x_init + beta[i - 1] * x_final
            v = (np.subtract(x, x_previous)) / cur_alpha_vect  # recover velocity
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_alphas[i,j] = np.dot(d_x[ixs], v[ixs])
            g = L_grad(x, meta, i)                           # Evaluate gradient

            d_v += d_x * cur_alpha_vect

            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_gammas[i,j] = np.dot(d_v[ixs], v[ixs] + g[ixs])

            d_x    -= L_hvp_x(x, meta, (1.0 - cur_gamma_vect)*d_v, i)
            d_meta -= L_hvp_meta(x, meta, (1.0 - cur_gamma_vect)* d_v, i)
            d_v    *= cur_gamma_vect
        # assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_gammas, d_meta

    return x_final, [None, hypergrad]
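
fill_parser and the parser object are assumed by the parsed variants. From the way they are used, fill_parser(parser, alpha) broadcasts one scalar per weight type over that type's slice of the flat parameter vector, and parser.idxs_and_shapes yields (name, (indices, shape)) pairs. The real helpers presumably live elsewhere in this codebase; a minimal hypothetical stand-in consistent with those calls might look like:

import numpy as np
from collections import OrderedDict

class WeightsParserStandIn:
    """Hypothetical stand-in for `parser`: one slice per named weight type."""
    def __init__(self):
        self.idxs_and_shapes = OrderedDict()
        self.N = 0
    def add_weights(self, name, shape):
        size = int(np.prod(shape))
        self.idxs_and_shapes[name] = (slice(self.N, self.N + size), shape)
        self.N += size

def fill_parser(parser, items):
    """Broadcast items[j] (one scalar per weight type) over that type's slice,
    in the same order the examples iterate parser.idxs_and_shapes."""
    vect = np.zeros(parser.N)
    for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.items()):
        vect[ixs] = items[j]
    return vect
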
def sgd4_mad(L_grad, hypers, callback=None, forward_pass_only=True):

    x0, alphas, gammas, meta = hypers
    N_safe_sampling = len(alphas)
    x_init = np.copy(x0)
    x_current = np.copy(x0)
    global v_current
    v_current = np.zeros(x0.size)
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = zip(range(len(alphas)), alphas, gammas)
    for i, alpha, gamma in iters:
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        V.mul(gamma).sub((1.0 - gamma) * g)
        X.add(alpha * V.val)
    x_final = X.val

    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        global v_current
        v = v_current
        d_alphas, d_gammas = np.zeros(len(alphas)), np.zeros(len(gammas))
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.

        beta = np.linspace(0.001, 0.999,
                           N_safe_sampling)  #evenly spaced, Michael

        for i, alpha, gamma in iters[::-1]:

            # Here is the averaging sequence, Michael
            x = (1 - beta[i]) * x_init + beta[i] * x_final

            x_previous = (1 - beta[i - 1]) * x_init + beta[i - 1] * x_final
            v = np.subtract(x, x_previous) / alpha  #recover velocity
            d_alphas[i] = np.dot(d_x, v)
            g = L_grad(x, meta, i)  # Evaluate gradient
            # v = (v+(1.0 - gamma)*g)/gamma
            d_v += d_x * alpha
            d_gammas[i] = np.dot(d_v, v + g)
            d_x -= (1.0 - gamma) * L_hvp_x(
                x, meta, d_v,
                i)  #DrMad paper forgot to mention this line, Michael
            d_meta -= (1.0 - gamma) * L_hvp_meta(x, meta, d_v, i)
            d_v *= gamma  #DrMad paper forgot to mention this line, Michael
        # assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_gammas, d_meta

    return x_final, [None, hypergrad]
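
sgd_short_min, sgd_parsed_mad, and sgd4_mad above replace the exact reversal with the DrMAD-style shortcut referenced in the comments: instead of reconstructing each iterate, the backward pass evaluates the Hessian-vector products at points interpolated linearly between the initial and final weights,

    beta_i = linspace(0.001, 0.999, N_safe_sampling)
    x_i   ~= (1 - beta_i) * x_init + beta_i * x_final
    v_i   ~= (x_i - x_{i-1}) / alpha        # velocity recovered by finite differences

which is why the ExactRep reversal assert is commented out in these variants and no reverse position or momentum updates are needed. Note that at i = 0 the expression beta[i - 1] evaluates to beta[-1], the last interpolation point.
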
def sgd_parsed(L_grad, hypers, parser, callback=None, forward_pass_only=True):
    """This version has alphas and betas be TxN_weight_types matrices.
       parser is a dict containing the indices for the different types of weights."""
    x0, alphas, betas, meta = hypers
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = zip(range(len(alphas)), alphas, betas)
    for i, alpha, beta in iters:
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        cur_alpha_vect = fill_parser(parser, alpha)
        cur_beta_vect = fill_parser(parser, beta)
        V.mul(cur_beta_vect).sub((1.0 - cur_beta_vect) * g)
        X.add(cur_alpha_vect * V.val)
    x_final = X.val

    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i, alpha, beta in iters[::-1]:

            # build alpha and beta vector
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_beta_vect = fill_parser(parser, beta)
            for j, (_, (ixs,
                        _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_alphas[i, j] = np.dot(d_x[ixs], V.val[ixs])

            X.sub(cur_alpha_vect * V.val)  # Reverse position update
            g = L_grad(X.val, meta, i)  # Evaluate gradient
            V.add((1.0 - cur_beta_vect) * g).div(
                cur_beta_vect)  # Reverse momentum update

            d_v += d_x * cur_alpha_vect

            for j, (_, (ixs,
                        _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_betas[i, j] = np.dot(d_v[ixs], V.val[ixs] + g[ixs])

            d_x -= L_hvp_x(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
            d_meta -= L_hvp_meta(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
            d_v *= cur_beta_vect
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta

    return x_final, [None, hypergrad]
Example #11
def sgd_parsed(L_grad, hypers, parser, callback=None, forward_pass_only=True):
    """This version has alphas and betas be TxN_weight_types matrices.
       parser is a dict containing the indices for the different types of weights."""
    x0, alphas, betas, meta = hypers
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    iters = zip(range(len(alphas)), alphas, betas)
    for i, alpha, beta in iters:
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        cur_alpha_vect = fill_parser(parser, alpha)
        cur_beta_vect  = fill_parser(parser, beta)
        V.mul(cur_beta_vect).sub((1.0 - cur_beta_vect) * g)
        X.add(cur_alpha_vect * V.val)
    x_final = X.val

    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x    = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i, alpha, beta in iters[::-1]:

            # build alpha and beta vector
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_beta_vect  = fill_parser(parser, beta)
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_alphas[i,j] = np.dot(d_x[ixs], V.val[ixs])

            X.sub(cur_alpha_vect * V.val)                        # Reverse position update
            g = L_grad(X.val, meta, i)                           # Evaluate gradient
            V.add((1.0 - cur_beta_vect) * g).div(cur_beta_vect)  # Reverse momentum update

            d_v += d_x * cur_alpha_vect

            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_betas[i,j] = np.dot(d_v[ixs], V.val[ixs] + g[ixs])

            d_x    -= L_hvp_x(X.val, meta, (1.0 - cur_beta_vect)*d_v, i)
            d_meta -= L_hvp_meta(X.val, meta, (1.0 - cur_beta_vect)* d_v, i)
            d_v    *= cur_beta_vect
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta

    return x_final, [None, hypergrad]
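
A hedged sketch of how sgd_parsed might be called, building on the hypothetical parser stand-in above: alphas and betas have shape (T, number of weight types), and the returned d_alphas / d_betas have the same shape, one hypergradient per iteration per weight type. The loss, shapes, and outgrad below are toy placeholders.

import autograd.numpy as np
from autograd import grad

parser = WeightsParserStandIn()
parser.add_weights('weights', (4, 3))
parser.add_weights('biases', (3,))

def train_loss(x, meta, i):
    return np.sum(x ** 2) + np.dot(meta, x)   # toy loss touching both x and meta

L_grad = grad(train_loss)
T = 50
hypers = (np.zeros(parser.N),                 # x0
          np.full((T, 2), 0.1),               # alphas: one column per weight type
          np.full((T, 2), 0.9),               # betas
          np.zeros(parser.N))                 # meta
x_final, (_, hypergrad) = sgd_parsed(L_grad, hypers, parser,
                                     forward_pass_only=False)
d_x, d_alphas, d_betas, d_meta = hypergrad(np.ones(parser.N))  # stand-in outgrad
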
Example #12
    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i, alpha, beta in iters[::-1]:

            # build alpha and beta vector
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_beta_vect = fill_parser(parser, beta)
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_alphas[i, j] = np.dot(d_x[ixs], V.val[ixs])

            X.sub(cur_alpha_vect * V.val)  # Reverse position update
            g = L_grad(X.val, meta, i)  # Evaluate gradient
            V.add((1.0 - cur_beta_vect) * g).div(cur_beta_vect)  # Reverse momentum update

            d_v += d_x * cur_alpha_vect

            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_betas[i, j] = np.dot(d_v[ixs], V.val[ixs] + g[ixs])

            d_x -= L_hvp_x(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
            d_meta -= L_hvp_meta(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
            d_v *= cur_beta_vect
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta
def sgd_meta_only_sub(L_grad_sub,
                      meta,
                      x0,
                      alpha,
                      beta,
                      N_iters,
                      N_sub,
                      callback=None,
                      forward_pass_only=True):
    # Allows adding the gradient of multiple sub-batches (minibatch within a minibatch)
    # Signature of L_grad_sub is x, meta, i_batch, i_sub
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    L_grad = sum_args(L_grad_sub, range(N_sub))
    for i in range(N_iters):
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val

    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj_sub = lambda x, meta, d, i, i_sub: np.dot(
            L_grad_sub(x, meta, i, i_sub), d)
        L_hvp_x_sub = grad(grad_proj_sub, 0)  # Returns a size(x) output.
        L_hvp_meta_sub = grad(grad_proj_sub, 1)  # Returns a size(meta) output.
        L_hvp_x = sum_args(L_hvp_x_sub, range(N_sub))
        L_hvp_meta = sum_args(L_hvp_meta_sub, range(N_sub))
        for i in range(N_iters)[::-1]:
            X.sub(alpha * V.val)  # Reverse position update
            g = L_grad(X.val, meta, i)  # Evaluate gradient
            V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
            d_v += d_x * alpha
            d_x -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
            d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
            d_v *= beta
        assert np.all(ExactRep(x0).val == X.val)
        return d_meta

    return x_final, [None, hypergrad]
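
sum_args is another helper assumed by the sub-batch variant above: it turns L_grad_sub(x, meta, i, i_sub) into an L_grad(x, meta, i) that sums over the sub-batch index. A plausible hypothetical stand-in, consistent with that usage, is:

def sum_args(fun, arglist):
    """Hypothetical stand-in: sum `fun` over an extra trailing argument, so
    sum_args(L_grad_sub, range(N_sub))(x, meta, i)
        == sum over i_sub of L_grad_sub(x, meta, i, i_sub)."""
    def summed(*args):
        return sum(fun(*(args + (a,))) for a in arglist)
    return summed
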
Example #14
def sgd(loss_fun, batches, N_iter, x, v, alphas, betas, record_learning_curve=False):
    # TODO: Warp alpha and beta to map from real-valued domains (exp and logistic?)
    def print_perf():
        pass
        if (i + 1) % iter_per_epoch == 0:
            print "End of epoch {0}: loss is {1}".format(i / iter_per_epoch,
                                                         loss_fun(X.val, batches.all_idxs))

    X, V = ExactRep(x), ExactRep(v)
    x_orig = X.val
    iter_per_epoch = len(batches)
    num_epochs = N_iter / len(batches) + 1
    iters = zip(range(N_iter), alphas, betas, batches * num_epochs)
    loss_grad = grad(loss_fun)
    loss_hvp = grad(lambda x, d, idxs: np.dot(loss_grad(x, idxs), d))
    learning_curve = [loss_fun(x_orig, batches.all_idxs)]
    for i, alpha, beta, batch in iters:
        V.mul(beta)
        g = loss_grad(X.val, batch)
        V.sub((1.0 - beta) * g)
        X.add(alpha * V.val)
        if record_learning_curve and (i + 1) % iter_per_epoch == 0:
            learning_curve.append(loss_fun(X.val, batches.all_idxs))
            # print_perf()

    x_final = X.val
    d_x = loss_grad(X.val, batches.all_idxs)
    loss_final = loss_fun(x_final, batches.all_idxs)
    d_v = np.zeros(d_x.shape)
    d_alphas = deque()
    d_betas = deque()
    print_perf()

    for i, alpha, beta, batch in iters[::-1]:
        print_perf()
        d_v += d_x * alpha
        X.sub(alpha * V.val)
        g = loss_grad(X.val, batch)
        d_alphas.appendleft(np.dot(d_x, V.val))
        V.add((1.0 - beta) * g)
        V.div(beta)
        d_betas.appendleft(np.dot(d_v, V.val + g))
        d_x = d_x - (1.0 - beta) * loss_hvp(X.val, d_v, batch)
        d_v = d_v * beta

    d_alphas = np.array(d_alphas)
    d_betas = np.array(d_betas)

    # print "-"*80
    assert np.all(x_orig == X.val)
    return {'x_final': x_final,
            'learning_curve': learning_curve,
            'loss_final': loss_final,
            'd_x': d_x,
            'd_v': d_v,
            'd_alphas': d_alphas,
            'd_betas': d_betas}
Example #15
def sgd3(optimizing_loss, secondary_loss, x0, v0, alphas, betas, meta, callback=None):
    """Same as sgd2 but simplifies things by not bothering with grads of
    optimizing loss (can always just pass that in as the secondary loss)"""
    X, V = ExactRep(x0), ExactRep(v0)
    L_grad = grad(optimizing_loss)  # Gradient wrt parameters.
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x    = grad(grad_proj, 0) # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1) # Returns a size(meta) output.
    iters = zip(range(len(alphas)), alphas, betas)
    for i, alpha, beta in iters:
        if callback: callback(X.val, i)
        g = L_grad(X.val, meta, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val
    M_grad      = grad(secondary_loss, 0)  # Gradient wrt parameters.
    M_meta_grad = grad(secondary_loss, 1)  # Gradient wrt metaparameters.
    dMd_x = M_grad(X.val, meta)
    dMd_v = np.zeros(dMd_x.shape)
    dMd_alphas = deque()
    dMd_betas  = deque()
    dMd_meta = M_meta_grad(X.val, meta)
    for i, alpha, beta in iters[::-1]:
        dMd_alphas.appendleft(np.dot(dMd_x, V.val))
        X.sub(alpha * V.val)
        g = L_grad(X.val, meta, i)
        V.add((1.0 - beta) * g).div(beta)
        dMd_v += dMd_x * alpha
        dMd_betas.appendleft(np.dot(dMd_v, V.val + g))
        dMd_x    -= (1.0 - beta) * L_hvp_x(X.val, meta, dMd_v, i)
        dMd_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, dMd_v, i)
        dMd_v    *= beta

    assert np.all(ExactRep(x0).val == X.val)
    return {'x_final' : x_final,
            'dMd_x'      : dMd_x,
            'dMd_v'      : dMd_v,
            'dMd_alphas' : dMd_alphas,
            'dMd_betas'  : dMd_betas,
            'dMd_meta'   : dMd_meta}
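
Since sgd3 returns the gradient of the secondary loss with respect to the meta-parameters directly, a simple hyperparameter-optimization loop around it might look like the hedged sketch below (toy losses and step sizes; assumes autograd and an ExactRep implementation are importable).

import autograd.numpy as np

def train_loss(x, meta, i):
    return np.sum((x - 1.0) ** 2) + np.sum(np.exp(meta) * x ** 2)

def valid_loss(x, meta):
    return np.sum((x - 2.0) ** 2) + 0.0 * np.sum(meta)  # keep meta in the graph

T = 100
alphas, betas = np.full(T, 0.1), np.full(T, 0.9)
meta = np.zeros(5)
for outer_iter in range(10):
    results = sgd3(train_loss, valid_loss,
                   x0=np.zeros(5), v0=np.zeros(5),
                   alphas=alphas, betas=betas, meta=meta)
    meta = meta - 0.01 * results['dMd_meta']  # gradient step on the hyperparameters
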
def sgd3(optimizing_loss,
         secondary_loss,
         x0,
         v0,
         alphas,
         betas,
         meta,
         callback=None):
    """Same as sgd2 but simplifies things by not bothering with grads of
    optimizing loss (can always just pass that in as the secondary loss)"""
    X, V = ExactRep(x0), ExactRep(v0)
    L_grad = grad(optimizing_loss)  # Gradient wrt parameters.
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
    iters = zip(range(len(alphas)), alphas, betas)
    for i, alpha, beta in iters:
        if callback: callback(X.val, i)
        g = L_grad(X.val, meta, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val
    M_grad = grad(secondary_loss, 0)  # Gradient wrt parameters.
    M_meta_grad = grad(secondary_loss, 1)  # Gradient wrt metaparameters.
    dMd_x = M_grad(X.val, meta)
    dMd_v = np.zeros(dMd_x.shape)
    dMd_alphas = deque()
    dMd_betas = deque()
    dMd_meta = M_meta_grad(X.val, meta)
    for i, alpha, beta in iters[::-1]:
        dMd_alphas.appendleft(np.dot(dMd_x, V.val))
        X.sub(alpha * V.val)
        g = L_grad(X.val, meta, i)
        V.add((1.0 - beta) * g).div(beta)
        dMd_v += dMd_x * alpha
        dMd_betas.appendleft(np.dot(dMd_v, V.val + g))
        dMd_x -= (1.0 - beta) * L_hvp_x(X.val, meta, dMd_v, i)
        dMd_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, dMd_v, i)
        dMd_v *= beta

    assert np.all(ExactRep(x0).val == X.val)
    return {
        'x_final': x_final,
        'dMd_x': dMd_x,
        'dMd_v': dMd_v,
        'dMd_alphas': dMd_alphas,
        'dMd_betas': dMd_betas,
        'dMd_meta': dMd_meta
    }
Example #17
def sgd_meta_only_sub(L_grad_sub, meta, x0, alpha, beta, N_iters,
                        N_sub, callback=None, forward_pass_only=True):
    # Allows adding the gradient of multiple sub-batches (minibatch within a minibatch)
    # Signature of L_grad_sub is x, meta, i_batch, i_sub
    X, V = ExactRep(x0), ExactRep(np.zeros(x0.size))
    L_grad = sum_args(L_grad_sub, range(N_sub))
    for i in range(N_iters):
        g = L_grad(X.val, meta, i)
        if callback: callback(X.val, V.val, g, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val

    if forward_pass_only:
        return x_final

    def hypergrad(outgrad):
        d_x = outgrad
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj_sub = lambda x, meta, d, i, i_sub: np.dot(L_grad_sub(x, meta, i, i_sub), d)
        L_hvp_x_sub    = grad(grad_proj_sub, 0) # Returns a size(x) output.
        L_hvp_meta_sub = grad(grad_proj_sub, 1) # Returns a size(meta) output.
        L_hvp_x    = sum_args(L_hvp_x_sub, range(N_sub))
        L_hvp_meta = sum_args(L_hvp_meta_sub, range(N_sub))
        for i in range(N_iters)[::-1]:
            X.sub(alpha * V.val)               # Reverse position update
            g = L_grad(X.val, meta, i)         # Evaluate gradient
            V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
            d_v += d_x * alpha
            d_x    -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
            d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
            d_v    *= beta
        assert np.all(ExactRep(x0).val == X.val)
        return d_meta

    return x_final, [None, hypergrad]
Example #18
    def hypergrad(outgrad):
        d_x = outgrad
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i in range(N_iters)[::-1]:
            X.sub(alpha * V.val)  # Reverse position update
            g = L_grad(X.val, meta, i)  # Evaluate gradient
            V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
            d_v += d_x * alpha
            d_x -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
            d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
            d_v *= beta
        assert np.all(ExactRep(x0).val == X.val)
        return d_meta
Example #19
def sgd2(optimizing_loss, secondary_loss, batches, N_iter, x0, v0, alphas, betas, meta):
    """
    This version takes a secondary loss, and also returns gradients w.r.t. the data.
    :param optimizing_loss: The loss to be optimized by SGD.
    The first argument must be the parameters, the second must be the metaparameters,
    the third is data indices.
    :param secondary_loss: Another loss we want to compute the gradient wrt.
    It takes parameters and metaparameters.
    :param batches: A list of slices into the data.
    :param N_iter: Number of iterations of SGD.
    :param x0: Starting parameter values.
    :param v0: Starting velocity.  Should probably be zero.
    :param alphas: Stepsize schedule.
    :param betas: Drag schedule.
    :param meta: A second parameter of the loss function that doesn't get optimized here.
    :return:
    a dict containing:
    Gradients wrt x0, v0, alphas, betas, and meta.
    """
    # TODO: Warp alpha and beta to map from real-valued domains (exp and logistic?)
    def print_perf():
        pass
        if (i + 1) % iter_per_epoch == 0:
            print "End of epoch {0}: loss is {1}".format(i / iter_per_epoch,
                optimizing_loss(X.val, meta, batches.all_idxs))

    X, V = ExactRep(x0), ExactRep(v0)
    x_orig = X.val
    iter_per_epoch = len(batches)
    num_epochs = N_iter/len(batches) + 1
    iters = zip(range(N_iter), alphas, betas, batches * num_epochs)
    L_grad      = grad(optimizing_loss)    # Gradient wrt parameters.
    M_grad      = grad(secondary_loss)     # Gradient wrt parameters.
    L_meta_grad = grad(optimizing_loss, 1) # Gradient wrt metaparameters.
    M_meta_grad = grad(secondary_loss, 1)  # Gradient wrt metaparameters.
    L_hvp      = grad(lambda x, d, idxs:
                      np.dot(L_grad(x, meta, idxs), d))    # Hessian-vector product.
    L_hvp_meta = grad(lambda x, meta, d, idxs:
                      np.dot(L_grad(x, meta, idxs), d), 1) # Returns a size(meta) output.

    learning_curve = [optimizing_loss(X.val, meta, batches.all_idxs)]
    for i, alpha, beta, batch in iters:
        V.mul(beta)
        g = L_grad(X.val, meta, batch)
        V.sub((1.0 - beta) * g)
        X.add(alpha * V.val)
        learning_curve.append(optimizing_loss(X.val, meta, batches.all_idxs))
        #print_perf()

    x_final = X.val
    dLd_x = L_grad(X.val, meta, batches.all_idxs)
    dMd_x = M_grad(X.val, meta)
    L_final = optimizing_loss(x_final, meta, batches.all_idxs)
    M_final = secondary_loss(x_final, meta)
    dLd_v = np.zeros(dLd_x.shape)
    dMd_v = np.zeros(dMd_x.shape)
    dLd_alphas = deque()
    dLd_betas  = deque()
    dMd_alphas = deque()
    dMd_betas  = deque()
    dLd_meta = L_meta_grad(X.val, meta, batches.all_idxs)
    dMd_meta = M_meta_grad(X.val, meta)
    print_perf()

    for i, alpha, beta, batch in iters[::-1]:
        #print_perf()
        dLd_v += dLd_x * alpha
        dMd_v += dMd_x * alpha
        X.sub(alpha * V.val)
        g = L_grad(X.val, meta, batch)
        dLd_alphas.appendleft(np.dot(dLd_x, V.val))
        dMd_alphas.appendleft(np.dot(dMd_x, V.val))
        V.add((1.0 - beta) * g)
        V.div(beta)
        dLd_betas.appendleft(np.dot(dLd_v, V.val + g))
        dMd_betas.appendleft(np.dot(dMd_v, V.val + g))
        dLd_x    -= (1.0 - beta) * L_hvp(X.val, dLd_v, batch)
        dMd_x    -= (1.0 - beta) * L_hvp(X.val, dMd_v, batch)
        dLd_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, dLd_v, batch)
        dMd_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, dMd_v, batch)
        dLd_v = dLd_v * beta
        dMd_v = dMd_v * beta

    dLd_alphas = np.array(dLd_alphas)
    dLd_betas = np.array(dLd_betas)

    # print "-"*80
    assert np.all(x_orig == X.val)
    return {'x_final' : x_final,
            'learning_curve' : learning_curve,
            'L_final' : L_final,
            'M_final' : M_final,
            'dLd_x' : dLd_x,
            'dMd_x' : dMd_x,
            'dLd_v' : dLd_v,
            'dMd_v' : dMd_v,
            'dLd_alphas' : dLd_alphas,
            'dMd_alphas' : dMd_alphas,
            'dLd_betas' : dLd_betas,
            'dMd_betas' : dMd_betas,
            'dLd_meta'  : dLd_meta,
            'dMd_meta'  : dMd_meta}
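
sgd2 threads two adjoint streams through the same reversed trajectory: one for the optimizing loss L and one for the secondary loss M. A hedged usage sketch follows, with toy losses and a hypothetical stand-in for the batches object, which the code above expects to be a list of index arrays that also exposes an all_idxs attribute.

import autograd.numpy as np

class BatchListStandIn(list):
    """Hypothetical stand-in for `batches`: a list of index arrays with .all_idxs.
    List repetition (batches * num_epochs) falls back to plain-list behaviour."""
    def __init__(self, N_data, batch_size):
        list.__init__(self, [np.arange(i, min(i + batch_size, N_data))
                             for i in range(0, N_data, batch_size)])
        self.all_idxs = np.arange(N_data)

N_data, D, N_iter = 100, 5, 50
batches = BatchListStandIn(N_data, batch_size=10)

def train_loss(x, meta, idxs):
    return np.sum((x - 1.0) ** 2) + np.sum(np.exp(meta) * x ** 2)

def valid_loss(x, meta):
    return np.sum((x - 2.0) ** 2) + 0.0 * np.sum(meta)   # keep meta in the graph

results = sgd2(train_loss, valid_loss, batches, N_iter,
               x0=np.zeros(D), v0=np.zeros(D),
               alphas=np.full(N_iter, 0.1), betas=np.full(N_iter, 0.9),
               meta=np.zeros(D))
d_meta = results['dMd_meta']   # gradient of the secondary loss wrt meta
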
Example #20
def sgd2(optimizing_loss, secondary_loss, batches, N_iter, x0, v0, alphas, betas, meta):
    """
    This version takes a secondary loss, and also returns gradients w.r.t. the data.
    :param optimizing_loss: The loss to be optimized by SGD.
    The first argument must be the parameters, the second must be the metaparameters,
    the third is data indices.
    :param secondary_loss: Another loss we want to compute the gradient wrt.
    It takes parameters and metaparameters.
    :param batches: A list of slices into the data.
    :param N_iter: Number of iterations of SGD.
    :param x0: Starting parameter values.
    :param v0: Starting velocity.  Should probably be zero.
    :param alphas: Stepsize schedule.
    :param betas: Drag schedule.
    :param meta: A second parameter of the loss function that doesn't get optimized here.
    :return:
    a dict containing:
    Gradients wrt x0, v0, alphas, betas, and meta.
    """

    # TODO: Warp alpha and beta to map from real-valued domains (exp and logistic?)
    def print_perf():
        pass
        if (i + 1) % iter_per_epoch == 0:
            print "End of epoch {0}: loss is {1}".format(i / iter_per_epoch,
                                                         optimizing_loss(X.val, meta, batches.all_idxs))

    X, V = ExactRep(x0), ExactRep(v0)
    x_orig = X.val
    iter_per_epoch = len(batches)
    num_epochs = N_iter / len(batches) + 1
    iters = zip(range(N_iter), alphas, betas, batches * num_epochs)
    L_grad = grad(optimizing_loss)  # Gradient wrt parameters.
    M_grad = grad(secondary_loss)  # Gradient wrt parameters.
    L_meta_grad = grad(optimizing_loss, 1)  # Gradient wrt metaparameters.
    M_meta_grad = grad(secondary_loss, 1)  # Gradient wrt metaparameters.
    L_hvp = grad(lambda x, d, idxs:
                 np.dot(L_grad(x, meta, idxs), d))  # Hessian-vector product.
    L_hvp_meta = grad(lambda x, meta, d, idxs:
                      np.dot(L_grad(x, meta, idxs), d), 1)  # Returns a size(meta) output.

    learning_curve = [optimizing_loss(X.val, meta, batches.all_idxs)]
    for i, alpha, beta, batch in iters:
        V.mul(beta)
        g = L_grad(X.val, meta, batch)
        V.sub((1.0 - beta) * g)
        X.add(alpha * V.val)
        learning_curve.append(optimizing_loss(X.val, meta, batches.all_idxs))
        # print_perf()

    x_final = X.val
    dLd_x = L_grad(X.val, meta, batches.all_idxs)
    dMd_x = M_grad(X.val, meta)
    L_final = optimizing_loss(x_final, meta, batches.all_idxs)
    M_final = secondary_loss(x_final, meta)
    dLd_v = np.zeros(dLd_x.shape)
    dMd_v = np.zeros(dMd_x.shape)
    dLd_alphas = deque()
    dLd_betas = deque()
    dMd_alphas = deque()
    dMd_betas = deque()
    dLd_meta = L_meta_grad(X.val, meta, batches.all_idxs)
    dMd_meta = M_meta_grad(X.val, meta)
    print_perf()

    for i, alpha, beta, batch in iters[::-1]:
        # print_perf()
        dLd_v += dLd_x * alpha
        dMd_v += dMd_x * alpha
        X.sub(alpha * V.val)
        g = L_grad(X.val, meta, batch)
        dLd_alphas.appendleft(np.dot(dLd_x, V.val))
        dMd_alphas.appendleft(np.dot(dMd_x, V.val))
        V.add((1.0 - beta) * g)
        V.div(beta)
        dLd_betas.appendleft(np.dot(dLd_v, V.val + g))
        dMd_betas.appendleft(np.dot(dMd_v, V.val + g))
        dLd_x -= (1.0 - beta) * L_hvp(X.val, dLd_v, batch)
        dMd_x -= (1.0 - beta) * L_hvp(X.val, dMd_v, batch)
        dLd_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, dLd_v, batch)
        dMd_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, dMd_v, batch)
        dLd_v = dLd_v * beta
        dMd_v = dMd_v * beta

    dLd_alphas = np.array(dLd_alphas)
    dLd_betas = np.array(dLd_betas)

    # print "-"*80
    assert np.all(x_orig == X.val)
    return {'x_final': x_final,
            'learning_curve': learning_curve,
            'L_final': L_final,
            'M_final': M_final,
            'dLd_x': dLd_x,
            'dMd_x': dMd_x,
            'dLd_v': dLd_v,
            'dMd_v': dMd_v,
            'dLd_alphas': dLd_alphas,
            'dMd_alphas': dMd_alphas,
            'dLd_betas': dLd_betas,
            'dMd_betas': dMd_betas,
            'dLd_meta': dLd_meta,
            'dMd_meta': dMd_meta}