    def hypergrad(outgrad):
        d_x = outgrad
        global v_current
        v = v_current
        d_alphas, d_gammas = np.zeros(len(alphas)), np.zeros(len(gammas))
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.

        beta = np.linspace(0.001, 0.999, N_safe_sampling)  # evenly spaced, Michael

        for i, alpha, gamma in iters[::-1]:

            # Here is the averaging sequence, Michael
            x = (1 - beta[i]) * x_init + beta[i] * x_final

            x_previous = (1 - beta[i - 1]) * x_init + beta[i - 1] * x_final
            v = np.subtract(x, x_previous) / alpha  #recover velocity
            d_alphas[i] = np.dot(d_x, v)
            g = L_grad(x, meta, i)  # Evaluate gradient
            # v = (v+(1.0 - gamma)*g)/gamma
            d_v += d_x * alpha
            d_gammas[i] = np.dot(d_v, v + g)
            d_x -= (1.0 - gamma) * L_hvp_x(x, meta, d_v, i)  # DrMAD paper forgot to mention this line, Michael
            d_meta -= (1.0 - gamma) * L_hvp_meta(x, meta, d_v, i)
            d_v *= gamma  # DrMAD paper forgot to mention this line, Michael
        # assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_gammas, d_meta
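The snippet above follows the DrMAD approach: rather than exactly reversing every SGD step, the backward pass reconstructs each intermediate iterate by linearly interpolating between the initial and final weights, then recovers the velocity from consecutive interpolants. A minimal sketch of just that reconstruction step, assuming plain NumPy and illustrative toy values for the endpoints and a constant step size alpha:

import numpy as np

x_init, x_final = np.array([5.0, -3.0]), np.array([0.2, 0.1])  # toy endpoints
N_safe_sampling = 10
alpha = 0.1                                          # assumed constant step size

beta = np.linspace(0.001, 0.999, N_safe_sampling)    # evenly spaced mixing weights
for i in range(N_safe_sampling - 1, 0, -1):          # walk the path backwards
    x = (1 - beta[i]) * x_init + beta[i] * x_final              # reconstructed iterate
    x_prev = (1 - beta[i - 1]) * x_init + beta[i - 1] * x_final
    v = (x - x_prev) / alpha                                     # recovered velocity
    # The hypergradient recursion above would now use x and v in place of an
    # exactly stored trajectory.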
Example #2
    def hypergrad(outgrad):
        d_x = outgrad
        global v_current
        v = v_current
        d_alphas, d_gammas = np.zeros(alphas.shape), np.zeros(gammas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x    = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        beta = np.linspace(0.001, 0.999, N_safe_sampling)
        for i, alpha, gamma in iters[::-1]:
            # build alpha and beta vector
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_gamma_vect  = fill_parser(parser, gamma)

            x = (1 - beta[i]) * x_init + beta[i] * x_final
            x_previous = (1 - beta[i - 1]) * x_init + beta[i - 1] * x_final
            v = (np.subtract(x, x_previous)) / cur_alpha_vect  # recover velocity
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_alphas[i,j] = np.dot(d_x[ixs], v[ixs])
            g = L_grad(x, meta, i)                           # Evaluate gradient

            d_v += d_x * cur_alpha_vect

            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_gammas[i,j] = np.dot(d_v[ixs], v[ixs] + g[ixs])

            d_x    -= L_hvp_x(x, meta, (1.0 - cur_gamma_vect)*d_v, i)
            d_meta -= L_hvp_meta(x, meta, (1.0 - cur_gamma_vect)* d_v, i)
            d_v    *= cur_gamma_vect
        # assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_gammas, d_meta
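Example #2 indexes d_alphas[i, j] per weight type through parser.idxs_and_shapes and broadcasts a per-type scalar over the flat weight vector with fill_parser. A rough, self-contained sketch of that behaviour (an assumption about what the repo's VectorParser and fill_parser do, not their actual implementation):

import numpy as np
from collections import OrderedDict

class TinyVectorParser(object):
    """Lays named parameter blocks out end-to-end in one flat vector."""
    def __init__(self):
        self.idxs_and_shapes = OrderedDict()
        self.vect = np.zeros(0)

    def add_shape(self, name, shape):
        start = self.vect.size
        size = int(np.prod(shape))
        self.idxs_and_shapes[name] = (slice(start, start + size), shape)
        self.vect = np.concatenate([self.vect, np.zeros(size)])

def tiny_fill_parser(parser, values):
    """Broadcast one scalar per named block onto that block's slice."""
    out = np.zeros(parser.vect.size)
    for value, (idxs, _) in zip(values, parser.idxs_and_shapes.values()):
        out[idxs] = value
    return out

p = TinyVectorParser()
p.add_shape('weights', (2, 3))
p.add_shape('biases', (3,))
print(tiny_fill_parser(p, [0.1, 0.5]))   # 0.1 over the 6 weights, 0.5 over the 3 biases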
Example #3
    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)

        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)
        L_hvp_meta = grad(grad_proj, 1)

        for i, alpha, beta in iters[::-1]:

            # build alpha and beta vector
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_beta_vect = fill_parser(parser, beta)
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_alphas[i, j] = np.dot(d_x[ixs], V.val[ixs])

            # Exactly reverse SGD
            X.sub(cur_alpha_vect * V.val)
            g = L_grad(X.val, meta, i)
            V.add(g).div(cur_beta_vect)

            d_v += d_x * cur_alpha_vect

            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_betas[i, j] = np.dot(d_v[ixs], V.val[ixs])

            d_x -= L_hvp_x(X.val, meta, d_v, i)
            d_meta -= L_hvp_meta(X.val, meta, d_v, i)
            d_v *= cur_beta_vect

        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta
Example #4
    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i, alpha, beta in iters[::-1]:

            # build alpha and beta vector
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_beta_vect = fill_parser(parser, beta)
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_alphas[i, j] = np.dot(d_x[ixs], V.val[ixs])

            X.sub(cur_alpha_vect * V.val)  # Reverse position update
            g = L_grad(X.val, meta, i)  # Evaluate gradient
            V.add((1.0 - cur_beta_vect) * g).div(cur_beta_vect)  # Reverse momentum update

            d_v += d_x * cur_alpha_vect

            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_betas[i, j] = np.dot(d_v[ixs], V.val[ixs] + g[ixs])

            d_x -= L_hvp_x(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
            d_meta -= L_hvp_meta(X.val, meta, (1.0 - cur_beta_vect) * d_v, i)
            d_v *= cur_beta_vect
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta
Example #5
def run():
    print "Running experiment..."
    sgd_optimized_points = []
    ed_optimized_points = []
    for i in xrange(N_samples):
        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        sgd_optimized_points.append(
            sgd(grad(nllfunt),
                x=x0,
                v=v0,
                learn_rate=alpha,
                decay=decay,
                iters=N_iter))

        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        ed_optimized_points.append(
            entropic_descent(grad(nllfunt),
                             x=x0,
                             v=v0,
                             learn_rate=alpha,
                             decay=decay,
                             iters=N_iter,
                             theta=theta,
                             rs=rs))
        entropy = np.log(decay) * D * N_iter

    return sgd_optimized_points, ed_optimized_points, entropy
Example #6
def run():
    print "Running experiment..."
    sgd_optimized_points = []
    ed_optimized_points = []
    aed_optimized_points = []
    asgd_optimized_points = []
    for i in xrange(N_samples):
        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        sgd_optimized_points.append(
            sgd(grad(nllfunt), x=x0, v=v0, learn_rate=alpha, decay=decay, iters=N_iter))

        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        ed_optimized_points.append(
            entropic_descent(grad(nllfunt), x=x0, v=v0, learn_rate=alpha, decay=decay, iters=N_iter, theta=theta, rs=rs))
        entropy = np.log(decay) * D * N_iter

        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        aed_optimized_points.append(
            adaptive_entropic_descent(grad(nllfunt), x=x0, v=v0, init_learn_rate=alpha, init_log_decay=np.log(decay), meta_learn_rate=meta_alpha, meta_decay=meta_decay, iters=N_iter))

        rs = RandomState((seed, i))
        x0 = rs.randn(D) * x_init_scale
        v0 = rs.randn(D) * v_init_scale
        asgd_optimized_points.append(
            adaptive_sgd(grad(nllfunt), x=x0, v=v0, init_learn_rate=alpha, init_log_decay=np.log(decay), meta_learn_rate=meta_alpha, meta_decay=meta_decay, iters=N_iter))

    return sgd_optimized_points, ed_optimized_points, aed_optimized_points, asgd_optimized_points, entropy
Example #7
    def hypergrad(outgrad):
        d_x = outgrad
        d_alphas, d_betas = np.zeros(alphas.shape), np.zeros(betas.shape)
        d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
        grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
        L_hvp_x    = grad(grad_proj, 0)  # Returns a size(x) output.
        L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
        for i, alpha, beta in iters[::-1]:

            # build alpha and beta vector
            cur_alpha_vect = fill_parser(parser, alpha)
            cur_beta_vect  = fill_parser(parser, beta)
            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_alphas[i,j] = np.dot(d_x[ixs], V.val[ixs])

            X.sub(cur_alpha_vect * V.val)                        # Reverse position update
            g = L_grad(X.val, meta, i)                           # Evaluate gradient
            V.add((1.0 - cur_beta_vect) * g).div(cur_beta_vect)  # Reverse momentum update

            d_v += d_x * cur_alpha_vect

            for j, (_, (ixs, _)) in enumerate(parser.idxs_and_shapes.iteritems()):
                d_betas[i,j] = np.dot(d_v[ixs], V.val[ixs] + g[ixs])

            d_x    -= L_hvp_x(X.val, meta, (1.0 - cur_beta_vect)*d_v, i)
            d_meta -= L_hvp_meta(X.val, meta, (1.0 - cur_beta_vect)* d_v, i)
            d_v    *= cur_beta_vect
        assert np.all(ExactRep(x0).val == X.val)
        return d_x, d_alphas, d_betas, d_meta
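The backward passes above undo each momentum-SGD step algebraically (X.sub(alpha * V.val), then V.add((1 - beta) * g).div(beta)); wrapping x and v in ExactRep makes the reversal bit-exact, which the final assert checks. A minimal sketch of the same forward/reverse pair with ordinary floats, a toy quadratic loss, and constant illustrative hyperparameters (without ExactRep the reversal is only approximate):

import numpy as np

def loss_grad(x):
    return 2.0 * x                 # gradient of the toy loss x**2

x, v = np.array([1.0]), np.array([0.0])
alpha, beta = 0.1, 0.9

for i in range(50):                # forward pass
    g = loss_grad(x)
    v = beta * v - (1.0 - beta) * g
    x = x + alpha * v

for i in range(50)[::-1]:          # reverse pass: undo each update in opposite order
    x = x - alpha * v
    g = loss_grad(x)
    v = (v + (1.0 - beta) * g) / beta

print(x)                           # close to the initial [1.0], up to rounding error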
Example #8
def test_sub():
    fun = lambda x, y : to_scalar(x - y)
    d_fun_0 = lambda x, y : to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y : to_scalar(grad(fun, 1)(x, y))
    for arg1, arg2 in arg_pairs():
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
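check_grads in these tests compares autograd's analytic gradient against a numeric estimate. A minimal sketch of that comparison, assuming the autograd package (numeric_grad below is an illustrative stand-in, not the repo's helper):

import autograd.numpy as np
from autograd import grad

def numeric_grad(f, x, eps=1e-5):
    g = np.zeros(x.shape)
    for i in range(x.size):
        dx = np.zeros(x.shape)
        dx[i] = eps
        g[i] = (f(x + dx) - f(x - dx)) / (2 * eps)   # central difference
    return g

f = lambda x: np.sum(np.sin(x) * x)
x = np.linspace(0.1, 1.0, 4)
assert np.allclose(grad(f)(x), numeric_grad(f, x), rtol=1e-4, atol=1e-6)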
Example #9
def test_sub():
    fun = lambda x, y: to_scalar(x - y)
    d_fun_0 = lambda x, y: to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y: to_scalar(grad(fun, 1)(x, y))
    for arg1, arg2 in arg_pairs():
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
Example #10
def sgd(loss_fun, batches, N_iter, x, v, alphas, betas, record_learning_curve=False):
    # TODO: Warp alpha and beta to map from real-valued domains (exp and logistic?)
    def print_perf():
        pass
        if (i + 1) % iter_per_epoch == 0:
            print "End of epoch {0}: loss is {1}".format(i / iter_per_epoch,
                                                        loss_fun(X.val, batches.all_idxs))
            
    X, V = ExactRep(x), ExactRep(v)
    x_orig = X.val
    iter_per_epoch = len(batches)
    num_epochs = N_iter/len(batches) + 1
    iters = zip(range(N_iter), alphas, betas, batches * num_epochs)
    loss_grad = grad(loss_fun)
    loss_hvp = grad(lambda x, d, idxs : np.dot(loss_grad(x, idxs), d))
    learning_curve = [loss_fun(x_orig, batches.all_idxs)]
    for i, alpha, beta, batch in iters:
        V.mul(beta)
        g = loss_grad(X.val, batch)
        V.sub((1.0 - beta) * g)
        X.add(alpha * V.val)
        if record_learning_curve and (i+1) % iter_per_epoch == 0:
            learning_curve.append(loss_fun(X.val, batches.all_idxs))
        #print_perf()

    x_final = X.val
    d_x = loss_grad(X.val, batches.all_idxs)
    loss_final = loss_fun(x_final, batches.all_idxs)
    d_v = np.zeros(d_x.shape)
    d_alphas = deque()
    d_betas = deque()
    print_perf()

    for i, alpha, beta, batch in iters[::-1]:
        print_perf()
        d_v += d_x * alpha
        X.sub(alpha * V.val)
        g = loss_grad(X.val, batch)
        d_alphas.appendleft(np.dot(d_x, V.val))
        V.add((1.0 - beta) * g)
        V.div(beta)
        d_betas.appendleft(np.dot(d_v, V.val + g))
        d_x = d_x - (1.0 - beta) * loss_hvp(X.val, d_v, batch)
        d_v = d_v * beta

    d_alphas = np.array(d_alphas)
    d_betas = np.array(d_betas)

    # print "-"*80
    assert np.all(x_orig == X.val)
    return {'x_final'    : x_final,
            'learning_curve' : learning_curve,
            'loss_final' : loss_final,
            'd_x' : d_x,
            'd_v' : d_v,
            'd_alphas' : d_alphas,
            'd_betas'  : d_betas}
Example #11
def sgd(loss_fun, batches, N_iter, x, v, alphas, betas, record_learning_curve=False):
    # TODO: Warp alpha and beta to map from real-valued domains (exp and logistic?)
    def print_perf():
        pass
        if (i + 1) % iter_per_epoch == 0:
            print "End of epoch {0}: loss is {1}".format(i / iter_per_epoch,
                                                         loss_fun(X.val, batches.all_idxs))

    X, V = ExactRep(x), ExactRep(v)
    x_orig = X.val
    iter_per_epoch = len(batches)
    num_epochs = N_iter / len(batches) + 1
    iters = zip(range(N_iter), alphas, betas, batches * num_epochs)
    loss_grad = grad(loss_fun)
    loss_hvp = grad(lambda x, d, idxs: np.dot(loss_grad(x, idxs), d))
    learning_curve = [loss_fun(x_orig, batches.all_idxs)]
    for i, alpha, beta, batch in iters:
        V.mul(beta)
        g = loss_grad(X.val, batch)
        V.sub((1.0 - beta) * g)
        X.add(alpha * V.val)
        if record_learning_curve and (i + 1) % iter_per_epoch == 0:
            learning_curve.append(loss_fun(X.val, batches.all_idxs))
            # print_perf()

    x_final = X.val
    d_x = loss_grad(X.val, batches.all_idxs)
    loss_final = loss_fun(x_final, batches.all_idxs)
    d_v = np.zeros(d_x.shape)
    d_alphas = deque()
    d_betas = deque()
    print_perf()

    for i, alpha, beta, batch in iters[::-1]:
        print_perf()
        d_v += d_x * alpha
        X.sub(alpha * V.val)
        g = loss_grad(X.val, batch)
        d_alphas.appendleft(np.dot(d_x, V.val))
        V.add((1.0 - beta) * g)
        V.div(beta)
        d_betas.appendleft(np.dot(d_v, V.val + g))
        d_x = d_x - (1.0 - beta) * loss_hvp(X.val, d_v, batch)
        d_v = d_v * beta

    d_alphas = np.array(d_alphas)
    d_betas = np.array(d_betas)

    # print "-"*80
    assert np.all(x_orig == X.val)
    return {'x_final': x_final,
            'learning_curve': learning_curve,
            'loss_final': loss_final,
            'd_x': d_x,
            'd_v': d_v,
            'd_alphas': d_alphas,
            'd_betas': d_betas}
Example #12
def test_div():
    fun = lambda x, y: to_scalar(x / y)
    d_fun_0 = lambda x, y: to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y: to_scalar(grad(fun, 1)(x, y))
    make_gap_from_zero = lambda x: np.sqrt(x**2 + 0.5)
    for arg1, arg2 in arg_pairs():
        arg1 = make_gap_from_zero(arg1)
        arg2 = make_gap_from_zero(arg2)
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
Example #13
def test_div():
    fun = lambda x, y : to_scalar(x / y)
    d_fun_0 = lambda x, y : to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y : to_scalar(grad(fun, 1)(x, y))
    make_gap_from_zero = lambda x : np.sqrt(x **2 + 0.5)
    for arg1, arg2 in arg_pairs():
        arg1 = make_gap_from_zero(arg1)
        arg2 = make_gap_from_zero(arg2)
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
Example #14
def test_pow():
    fun = lambda x, y : to_scalar(x ** y)
    d_fun_0 = lambda x, y : to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y : to_scalar(grad(fun, 1)(x, y))
    make_positive = lambda x : np.abs(x) + 1.1 # Numeric derivatives fail near zero
    for arg1, arg2 in arg_pairs():
        arg1 = make_positive(arg1)
        arg2 = np.round(arg2)
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
Example #15
def test_pow():
    fun = lambda x, y: to_scalar(x**y)
    d_fun_0 = lambda x, y: to_scalar(grad(fun, 0)(x, y))
    d_fun_1 = lambda x, y: to_scalar(grad(fun, 1)(x, y))
    make_positive = lambda x: np.abs(x) + 1.1  # Numeric derivatives fail near zero
    for arg1, arg2 in arg_pairs():
        arg1 = make_positive(arg1)
        arg2 = np.round(arg2)
        check_grads(fun, arg1, arg2)
        check_grads(d_fun_0, arg1, arg2)
        check_grads(d_fun_1, arg1, arg2)
Example #16
def test_hess_vector_prod():
    npr.seed(1)
    randv = npr.randn(10)
    def fun(x):
        return np.sin(np.dot(x, randv))
    df = grad(fun)
    def vector_product(x, v):
        return np.sin(np.dot(v, df(x)))
    ddf = grad(vector_product)
    A = npr.randn(10)
    B = npr.randn(10)
    check_grads(fun, A)
    check_grads(vector_product, A, B)
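test_hess_vector_prod above, and the L_hvp_x / L_hvp_meta definitions used throughout, rely on the same trick: the gradient of np.dot(grad(f)(x), d) with respect to x is the Hessian-vector product H(x) d, computed without ever forming H. A minimal self-contained sketch, assuming the autograd package:

import autograd.numpy as np
from autograd import grad

def f(x):
    return np.sin(np.sum(x ** 2))

f_grad = grad(f)
grad_proj = lambda x, d: np.dot(f_grad(x), d)   # scalar: g(x) . d
hvp = grad(grad_proj, 0)                        # d/dx of that scalar is H(x) d

x = np.linspace(-1.0, 1.0, 5)
d = np.ones(5)
print(hvp(x, d))                                # Hessian-vector product at x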
Example #17
def sgd3(optimizing_loss,
         secondary_loss,
         x0,
         v0,
         alphas,
         betas,
         meta,
         callback=None):
    """Same as sgd2 but simplifies things by not bothering with grads of
    optimizing loss (can always just pass that in as the secondary loss)"""
    X, V = ExactRep(x0), ExactRep(v0)
    L_grad = grad(optimizing_loss)  # Gradient wrt parameters.
    grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
    L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
    L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
    iters = zip(range(len(alphas)), alphas, betas)
    for i, alpha, beta in iters:
        if callback: callback(X.val, i)
        g = L_grad(X.val, meta, i)
        V.mul(beta).sub((1.0 - beta) * g)
        X.add(alpha * V.val)
    x_final = X.val
    M_grad = grad(secondary_loss, 0)  # Gradient wrt parameters.
    M_meta_grad = grad(secondary_loss, 1)  # Gradient wrt metaparameters.
    dMd_x = M_grad(X.val, meta)
    dMd_v = np.zeros(dMd_x.shape)
    dMd_alphas = deque()
    dMd_betas = deque()
    dMd_meta = M_meta_grad(X.val, meta)
    for i, alpha, beta in iters[::-1]:
        dMd_alphas.appendleft(np.dot(dMd_x, V.val))
        X.sub(alpha * V.val)
        g = L_grad(X.val, meta, i)
        V.add((1.0 - beta) * g).div(beta)
        dMd_v += dMd_x * alpha
        dMd_betas.appendleft(np.dot(dMd_v, V.val + g))
        dMd_x -= (1.0 - beta) * L_hvp_x(X.val, meta, dMd_v, i)
        dMd_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, dMd_v, i)
        dMd_v *= beta

    assert np.all(ExactRep(x0).val == X.val)
    return {
        'x_final': x_final,
        'dMd_x': dMd_x,
        'dMd_v': dMd_v,
        'dMd_alphas': dMd_alphas,
        'dMd_betas': dMd_betas,
        'dMd_meta': dMd_meta
    }
Example #18
 def hypergrad(outgrad):
     d_x = outgrad
     d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
     grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
     L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
     L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
     beta = np.linspace(0.001,0.999,N_safe_sampling)
     for i in range(N_safe_sampling)[::-1]:
         x_current = (1-beta[i])*x_init + beta[i]*x_final
         d_v += d_x * alpha
         d_x -= (1.0 - gamma) * L_hvp_x(x_current, meta, d_v, i)
         d_meta -= (1.0 - gamma) * L_hvp_meta(x_current, meta, d_v, i)
         d_v *= gamma
     # assert np.all(ExactRep(x0).val == X.val)
     return d_meta
Example #19
 def hypergrad(outgrad):
     d_x = outgrad
     d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
     grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
     L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
     L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
     beta = np.linspace(0.001,0.999,N_safe_sampling)
     for i in range(N_safe_sampling)[::-1]:
         x_current = (1-beta[i])*x_init + beta[i]*x_final
         d_v += d_x * alpha
         d_x -= (1.0 - gamma) * L_hvp_x(x_current, meta, d_v, i)
         d_meta -= (1.0 - gamma) * L_hvp_meta(x_current, meta, d_v, i)
         d_v *= gamma
     # assert np.all(ExactRep(x0).val == X.val)
     return d_meta
Example #20
def build_lstm(seq_width, state_size, output_size, l2_penalty=0.0):
    parser = VectorParser()
    parser.add_shape('change', (seq_width + state_size + 1, state_size))
    parser.add_shape('gate',   (seq_width + state_size + 1, state_size))
    parser.add_shape('keep',   (seq_width + state_size + 1, state_size))
    parser.add_shape('output', (state_size, output_size))

    def update_lstm(input, state, change_weights, gate_weights, keep_weights):
        """One iteration of an LSTM layer without an output."""
        change = activations(input, state, change_weights)
        gate   = activations(input, state, gate_weights)
        keep   = activations(input, state, keep_weights)
        return state * keep + gate * change

    def compute_hiddens(weights_vect, seqs):
        """Goes from right to left, updating the state."""
        weights = parser.new_vect(weights_vect)
        num_seqs = seqs.shape[1]
        state = np.zeros((num_seqs, state_size))
        for cur_input in seqs:  # Iterate over time steps.
            state = update_lstm(cur_input, state,
                                weights['change'], weights['gate'], weights['keep'])
        return state

    def predictions(weights_vect, seqs):
        weights = parser.new_vect(weights_vect)
        return np.dot(compute_hiddens(weights_vect, seqs), weights['output'])

    def loss(weights, seqs, targets):
        log_lik = -np.sum((predictions(weights, seqs) - targets)**2)
        log_prior = -l2_penalty * np.dot(weights, weights)
        return (-log_prior - log_lik) / targets.shape[0]

    return loss, grad(loss), predictions, compute_hiddens, parser
Example #21
    def hyperloss(hyperparam_vect,
                  i_hyper,
                  alphabets,
                  verbose=True,
                  report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 10 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = RS.randn(N_weights) * initialization_scale
        W_final = sgd(grad(primal_loss),
                      hyperparam_vect,
                      W0,
                      alpha,
                      beta,
                      N_iters,
                      callback=None)
        return reg_loss_fun(W_final,
                            valid_data,
                            hyperparam_vect,
                            reg_penalty=False)
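hyperloss above trains the inner weights with sgd on a training split and returns the (regularization-free) loss on a validation split; the gradient of that scalar with respect to the hyperparameters is the hypergradient driving the meta-optimization. A minimal, self-contained sketch of the same pattern on a 1-D toy problem, assuming the autograd package and a single log-learning-rate hyperparameter (all names here are illustrative):

import autograd.numpy as np
from autograd import grad

def train_loss(w):
    return (w - 2.0) ** 2

def valid_loss(w):
    return (w - 1.5) ** 2

def hyperloss(log_alpha):
    alpha = np.exp(log_alpha)
    w = 0.0
    for _ in range(20):                       # unrolled inner optimization
        w = w - alpha * grad(train_loss)(w)
    return valid_loss(w)

print(grad(hyperloss)(np.log(0.05)))          # d(validation loss) / d(log alpha)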
Example #22
def run():
    train_data, valid_data, test_data = load_data_subset(N_train, N_valid, N_test)
    kernel = make_sq_exp_kernel(L0)
    def loss_fun(transform, train_data, valid_data):
        train_data = augment_data(train_data, transform)
        return weighted_neighbors_loss(train_data, valid_data, kernel)
    loss_grad = batchwise_function(grad(loss_fun))
    loss_fun  = batchwise_function(loss_fun)

    batch_idxs = BatchList(N_valid, batch_size)
    A = np.eye(N_pix)
    valid_losses = [loss_fun(A, train_data, valid_data)]
    test_losses  = [loss_fun(A, train_data,  test_data)]
    A += A_init_scale * npr.randn(N_pix, N_pix)
    for meta_iter in range(N_meta_iters):
        print "Iter {0} valid {1} test {2}".format(
            meta_iter, valid_losses[-1], test_losses[-1])

        for idxs in batch_idxs:
            valid_batch = [x[idxs] for x in valid_data]
            d_A        = loss_grad(A, train_data, valid_batch)
            A -= meta_alpha * (d_A + meta_L1 * np.sign(A))
        valid_losses.append(loss_fun(A, train_data, valid_data))
        test_losses.append( loss_fun(A, train_data, test_data))

    return A, valid_losses, test_losses
Example #23
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data["X"][idxs], train_data["T"][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data))
                learning_curve_dict["grad_norm"].append(np.linalg.norm(g))
                learning_curve_dict["weight_norm"].append(np.linalg.norm(x))
                learning_curve_dict["velocity_norm"].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"]))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams["log_alphas"])
        betas = logit(cur_hyperparams["invlogit_betas"])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"]))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        # callback(W_opt, N_iters)
        return W_opt, learning_curve_dict
Example #24
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper + i_iter * 10000).randint(1000))
            seed = i_hyper * 10**6 + i_iter   # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0: # N_batches=10 times
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        # TODO: why doesn't the following line work with N_iter=1?
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale'])) #don't update scale
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        # TODO: Put on proper scale; no SGD on log/invlogit scale
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        
        # TODO: check this
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), exact_metagrad, callback)
        #W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        #callback(W_opt, N_iters)
        return W_opt, learning_curve_dict
Example #25
def sgd3_naive(optimizing_loss,
               x,
               v,
               alphas,
               betas,
               meta,
               fwd_callback=None,
               reverse_callback=None):
    """Same as sgd2 but simplifies things by not bothering with grads of
    optimizing loss (can always just pass that in as the secondary loss)"""
    x = x.astype(np.float16)
    v = v.astype(np.float16)
    L_grad = grad(optimizing_loss)  # Gradient wrt parameters.
    iters = zip(range(len(alphas)), alphas, betas)

    # Forward pass
    for i, alpha, beta in iters:
        if fwd_callback: fwd_callback(x, i)
        g = L_grad(x, meta, i)
        v = v * beta
        v = v - ((1.0 - beta) * g)
        x = x + alpha * v
        x = x.astype(np.float16)
        v = v.astype(np.float16)

    # Reverse pass
    for i, alpha, beta in iters[::-1]:
        x = x - alpha * v
        g = L_grad(x, meta, i)
        v = v + (1.0 - beta) * g
        v = v / beta
        if reverse_callback: reverse_callback(x, i)
        x = x.astype(np.float16)
        v = v.astype(np.float16)
Example #26
def sgd3_naive(optimizing_loss, x, v, alphas, betas, meta, fwd_callback=None, reverse_callback=None):
    """Same as sgd2 but simplifies things by not bothering with grads of
    optimizing loss (can always just pass that in as the secondary loss)"""
    x = x.astype(np.float16)
    v = v.astype(np.float16)
    L_grad = grad(optimizing_loss)  # Gradient wrt parameters.
    iters = zip(range(len(alphas)), alphas, betas)

    # Forward pass
    for i, alpha, beta in iters:
        if fwd_callback:
            fwd_callback(x, i)
        g = L_grad(x, meta, i)
        v = v * beta
        v = v - ((1.0 - beta) * g)
        x = x + alpha * v
        x = x.astype(np.float16)
        v = v.astype(np.float16)

    # Reverse pass
    for i, alpha, beta in iters[::-1]:
        x = x - alpha * v
        g = L_grad(x, meta, i)
        v = v + (1.0 - beta) * g
        v = v / beta
        if reverse_callback:
            reverse_callback(x, i)
        x = x.astype(np.float16)
        v = v.astype(np.float16)
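A small sketch of why the float16 version above is "naive": replaying the same updates in reverse at low precision does not return exactly to the starting point, which is the problem ExactRep solves in the other examples (toy quadratic loss, constant illustrative hyperparameters):

import numpy as np

x = np.array([1.0], dtype=np.float16)
v = np.array([0.0], dtype=np.float16)
x0 = x.copy()
alpha, beta = np.float16(0.1), np.float16(0.9)
grad_f = lambda w: np.float16(2.0) * w           # gradient of the toy loss w**2

for _ in range(100):                             # forward pass
    v = (v * beta - (np.float16(1.0) - beta) * grad_f(x)).astype(np.float16)
    x = (x + alpha * v).astype(np.float16)

for _ in range(100):                             # naive reverse pass
    x = (x - alpha * v).astype(np.float16)
    v = ((v + (np.float16(1.0) - beta) * grad_f(x)) / beta).astype(np.float16)

print(np.abs(x - x0))                            # typically nonzero: rounding leaked information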
Example #27
    def hyperloss(hyperparam_vect, i):
        learning_curve = []

        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(
                    loss_fun(x, X=train_images, T=train_labels))

        npr.seed(i)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [
            np.full(parser[name].size,
                    np.exp(cur_hyperparams['log_param_scale'][i]))
            for i, name in enumerate(parser.names)
        ]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        W_opt = sgd5(grad(indexed_loss_fun),
                     kylist(W0, alphas, betas, log_L2_reg), callback)
        all_x.append(getval(W_opt))
        all_learning_curves.append(learning_curve)
        return valid_loss_fun(W_opt)
Example #28
 def hypergrad(outgrad):
     d_x = outgrad
     d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
     grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
     L_hvp_x = grad(grad_proj, 0)  # Returns a size(x) output.
     L_hvp_meta = grad(grad_proj, 1)  # Returns a size(meta) output.
     for i in range(N_iters)[::-1]:
         X.sub(alpha * V.val)  # Reverse position update
         g = L_grad(X.val, meta, i)  # Evaluate gradient
         V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
         d_v += d_x * alpha
         d_x -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
         d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
         d_v *= beta
     assert np.all(ExactRep(x0).val == X.val)
     return d_meta
Example #29
    def train_reg(transform_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)
        cur_transform = transform_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_transform, i_hyper, train_data,
                                       tests_data)
                all_tests_loss.append(tests_loss)
                all_transforms.append(cur_transform.copy())
                print "Hyper iter {0}, test loss {1}".format(
                    i_hyper, all_tests_loss[-1])
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS,
                                         [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
            constrained_grad = constrain_transform(raw_grad, constraint)
            cur_transform -= constrained_grad * meta_alpha
        return cur_transform
Example #30
def adam(grad,
         x,
         callback=None,
         num_iters=100,
         step_size=0.1,
         b1=0.1,
         b2=0.01,
         eps=10**-4,
         lam=10**-4):
    m = np.zeros(len(x))
    v = np.zeros(len(x))
    for i in xrange(num_iters):
        b1t = 1 - (1 - b1) * (lam**i)
        g = grad(x, i)

        if callback:
            callback(x, i, g)

        m = b1t * g + (1 - b1t) * m
        v = b2 * (g**2) + (1 - b2) * v
        mhat = m / (1 - (1 - b1)**(i + 1))
        vhat = v / (1 - (1 - b2)**(i + 1))
        x -= step_size * mhat / (np.sqrt(vhat) + eps)

    return x
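A hypothetical usage sketch for the adam routine above, assuming autograd's grad and a toy least-squares objective (the iteration index that adam passes to its gradient callback is simply ignored here):

import autograd.numpy as np
from autograd import grad

target = np.array([1.0, -2.0, 0.5])

def loss(x, i):                      # i is the iteration counter adam supplies
    return np.sum((x - target) ** 2)

x0 = np.zeros(3)
x_min = adam(grad(loss), x0, num_iters=500, step_size=0.1)
print(x_min)                         # should end up close to target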
Example #31
def primal_optimizer(hyperparams_vect, meta_epoch):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, meta_epoch, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                        L2_vect)

    cur_hyperparams = hyperparams.new_vect(hyperparams_vect)

    rs = RandomState((seed, meta_epoch))

    # Randomly initialize weights
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    # Init regularization term
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    # Set step sizes
    alphas = np.exp(cur_hyperparams['log_alphas'])
    # Momentum terms
    betas = logit(cur_hyperparams['invlogit_betas'])

    # Train model
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser)

    cur_primal_results['weights'] = getval(W_opt).copy()
    return W_opt
Example #32
def test_concatenate_axis_1():
    A = npr.randn(5, 6, 4)
    B = npr.randn(5, 6, 4)
    def fun(x): return to_scalar(np.concatenate((B, x, B), axis=1))
    d_fun = lambda x : to_scalar(grad(fun)(x))
    check_grads(fun, A)
    check_grads(d_fun, A)
Example #33
def run():
    (train_images, train_labels),\
    (tests_images, tests_labels) = load_data_subset(N_train, N_tests)
    parser, pred_fun, nllfun, frac_err = make_nn_funs(layer_sizes, L2_per_dpt)
    N_param = len(parser.vect)

    print "Running experiment..."
    results = defaultdict(list)
    for i in xrange(N_samples):
        x_init_scale = np.full(N_param, init_scale)

        def indexed_loss_fun(w, i_iter):
            rs = RandomState((seed, i, i_iter))
            idxs = rs.randint(N_train, size=batch_size)
            return nllfun(w, train_images[idxs], train_labels[idxs]) * N_train
        gradfun = grad(indexed_loss_fun)

        def callback(x, t, v, entropy):
            results[("entropy", i)].append(entropy / N_train)
            results[("v_norm", i)].append(norm(v) / np.sqrt(N_param))
            results[("minibatch_likelihood", i)].append(-indexed_loss_fun(x, t))
            if t % thin != 0 and t != N_iter and t != 0: return
            results[('iterations', i)].append(t)
            results[("train_likelihood", i)].append(-nllfun(x, train_images, train_labels))
            results[("tests_likelihood", i)].append(-nllfun(x, tests_images, tests_labels))
            results[("tests_error", i)].append(frac_err(x, tests_images, tests_labels))
            print "Iteration {0:5} Train likelihood {1:2.4f}  Test likelihood {2:2.4f}" \
                  "  Test Err {3:2.4f}".format(t, results[("train_likelihood", i)][-1],
                                                  results[("tests_likelihood", i)][-1],
                                                  results[("tests_error",      i)][-1])
        rs = RandomState((seed, i))
        entropic_descent2(gradfun, callback=callback, x_scale=x_init_scale,
                          epsilon=epsilon, gamma=gamma, alpha=alpha,
                          annealing_schedule=annealing_schedule, rs=rs)
    return results
Example #34
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict
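These primal optimizers keep step sizes as log_alphas and momentum decays as invlogit_betas, so unconstrained meta-gradient updates always map back to a positive alpha and a beta in (0, 1). A tiny sketch of that reparameterization, assuming (as the usage betas = logit(cur_hyperparams['invlogit_betas']) implies) that logit here denotes the logistic sigmoid:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

log_alpha = np.log(0.1)          # unconstrained representation of the step size
invlogit_beta = 2.0              # unconstrained representation of the momentum decay

# A meta-gradient step acts on the unconstrained values directly
# (the 0.01 step and the gradient values below are purely illustrative) ...
log_alpha -= 0.01 * 1.3
invlogit_beta -= 0.01 * (-0.7)

# ... while the primal optimizer always sees valid hyperparameters:
alpha = np.exp(log_alpha)        # strictly positive
beta = sigmoid(invlogit_beta)    # strictly inside (0, 1)
print(alpha)
print(beta)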
Example #35
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
                print "Cur_reg", cur_reg
                # print "Cur_reg", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            # print("calculate hypergradients")
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            # print "constrained_grad",constrained_grad
            print "\n"
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            cur_reg -= constrained_grad * meta_alpha
            # cur_reg -= np.sign(constrained_grad) * meta_alpha

        return cur_reg
Example #36
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            cur_reg -= np.sign(constrained_grad) * meta_alpha
        return cur_reg
Example #37
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0 or i_iter == N_iters or i_iter == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))
                learning_curve_dict['iteration'].append(i_iter + 1)
                print "iteration", i_iter

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict
Example #38
def test_adam():
    N_weights = 5
    W0 = 0.1 * npr.randn(N_weights)
    (loss_fun, true_argmin) = make_optimization_problem(N_weights)
    x_min = adam(grad(loss_fun), W0)
    assert np.allclose(x_min, true_argmin, rtol=1e-3, atol=1e-4), \
        "Diffs are: {0}".format(x_min - true_argmin)
Example #39
    def hyperloss(transform_vect, i_hyper, record_results=False):
        def primal_stochastic_loss(z_vect, transform_vect, i_primal):
            RS = RandomState((seed, i_hyper, i_primal))
            loss = 0.0
            for _ in range(N_scripts_per_iter):
                i_script = RS.randint(N_scripts)
                N_train = train_data[i_script]["X"].shape[0]
                idxs = RS.permutation(N_train)[:batch_size]
                minibatch = dictslice(train_data[i_script], idxs)
                loss += loss_from_latents(z_vect, transform_vect, i_script, minibatch)
            loss /= N_scripts_per_iter
            reg = regularization(z_vect)
            # if i_primal % 10 == 0:
            #     print "Iter {0}, loss {1}, reg {2}".format(i_primal, getval(loss), getval(reg))
            #     print "Full losses: train: {0}, valid: {1}".format(
            #         total_loss(train_data, getval(z_vect)),
            #         total_loss(valid_data, getval(z_vect)))
            return loss + reg

        def total_loss(data, z_vect):
            return np.mean(
                [loss_from_latents(z_vect, transform_vect, i_script, data[i_script]) for i_script in range(N_scripts)]
            )

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(primal_stochastic_loss), transform_vect, z_vect_0, alpha, beta, N_iters, callback=None)
        valid_loss = total_loss(valid_data, z_vect_final)
        if record_results:
            results["valid_loss"].append(valid_loss)
            results["train_loss"].append(total_loss(train_data, z_vect_final))
            # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
        return valid_loss
Example #40
 def hypergrad(outgrad):
     d_x = outgrad
     d_v, d_meta = np.zeros(d_x.shape), np.zeros(meta.shape)
     grad_proj = lambda x, meta, d, i: np.dot(L_grad(x, meta, i), d)
     L_hvp_x    = grad(grad_proj, 0) # Returns a size(x) output.
     L_hvp_meta = grad(grad_proj, 1) # Returns a size(meta) output.
     for i in range(N_iters)[::-1]:
         X.sub(alpha * V.val)               # Reverse position update
         g = L_grad(X.val, meta, i)         # Evaluate gradient
         V.add((1.0 - beta) * g).div(beta)  # Reverse momentum update
         d_v += d_x * alpha
         d_x    -= (1.0 - beta) * L_hvp_x(X.val, meta, d_v, i)
         d_meta -= (1.0 - beta) * L_hvp_meta(X.val, meta, d_v, i)
         d_v    *= beta
     assert np.all(ExactRep(x0).val == X.val)
     return d_meta
Example #41
def test_sign():
    fun = lambda x : 3.0 * np.sign(x)
    d_fun = grad(fun)
    check_grads(fun, 1.1)
    check_grads(fun, -1.1)
    check_grads(d_fun, 1.1)
    check_grads(d_fun, -1.1)
Example #42
def test_abs():
    fun = lambda x : 3.0 * np.abs(x)
    d_fun = grad(fun)
    check_grads(fun, 1.1)
    check_grads(fun, -1.1)
    check_grads(d_fun, 1.1)
    check_grads(d_fun, -1.1)
Example #43
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(
                npr.RandomState(global_seed + i_hyper).randint(1000))
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve = []

        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                     callback)
        callback(W_opt, N_iters)
        return W_opt, learning_curve
Example #44
def test_sgd_parser():
    N_weights = 6
    W0 = 0.1 * npr.randn(N_weights)
    N_data = 12
    batch_size = 4
    num_epochs = 4
    batch_idxs = BatchList(N_data, batch_size)

    parser = VectorParser()
    parser.add_shape('first',  [2,])
    parser.add_shape('second', [1,])
    parser.add_shape('third',  [3,])
    N_weight_types = 3

    alphas = 0.1 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    betas = 0.5 + 0.2 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    meta = 0.1 * npr.randn(N_weights*2)

    A = npr.randn(N_data, N_weights)
    def loss_fun(W, meta, i=None):
        idxs = batch_idxs.all_idxs if i is None else batch_idxs[i % len(batch_idxs)]
        sub_A = A[idxs, :]
        return np.dot(np.dot(W + meta[:N_weights] + meta[N_weights:], np.dot(sub_A.T, sub_A)), W)

    def full_loss(params):
        (W0, alphas, betas, meta) = params
        result = sgd_parsed(grad(loss_fun), kylist(W0, alphas, betas, meta), parser)
        return loss_fun(result, meta)

    d_num = nd(full_loss, (W0, alphas, betas, meta))
    d_an_fun = grad(full_loss)
    d_an = d_an_fun([W0, alphas, betas, meta])
    for i, (an, num) in enumerate(zip(d_an, d_num[0])):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)
Example #45
def run():
    train_data, valid_data, test_data = load_data_subset(
        N_train, N_valid, N_test)
    kernel = make_sq_exp_kernel(L0)

    def loss_fun(transform, train_data, valid_data):
        train_data = augment_data(train_data, transform)
        return weighted_neighbors_loss(train_data, valid_data, kernel)

    loss_grad = batchwise_function(grad(loss_fun))
    loss_fun = batchwise_function(loss_fun)

    batch_idxs = BatchList(N_valid, batch_size)
    A = np.eye(N_pix)
    valid_losses = [loss_fun(A, train_data, valid_data)]
    test_losses = [loss_fun(A, train_data, test_data)]
    A += A_init_scale * npr.randn(N_pix, N_pix)
    for meta_iter in range(N_meta_iters):
        print "Iter {0} valid {1} test {2}".format(meta_iter, valid_losses[-1],
                                                   test_losses[-1])

        for idxs in batch_idxs:
            valid_batch = [x[idxs] for x in valid_data]
            d_A = loss_grad(A, train_data, valid_batch)
            A -= meta_alpha * (d_A + meta_L1 * np.sign(A))
        valid_losses.append(loss_fun(A, train_data, valid_data))
        test_losses.append(loss_fun(A, train_data, test_data))

    return A, valid_losses, test_losses
Example #46
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg)
            # fraction_error = frac_err(w_vect_final,**cur_valid_data)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        #reg is the list of hyperparameters
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
                # print "Cur_reg", np.mean(cur_reg)
                print "Cur_reg", cur_reg

            for client_i in range(0, clientNum):

                RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
                cur_split = random_partition(train_data_subclass.__getitem__(client_i), RS, [N_train - N_valid, N_valid])
                # print("calculate hypergradients")
                raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
                constrained_grad = constrain_reg(w_parser, raw_grad, constraint)


                # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
                # cur_reg -= constrained_grad * meta_alpha/clientNum
                cur_reg -= np.sign(constrained_grad) * meta_alpha/clientNum
            print "\n"
            # print "constrained_grad",constrained_grad
        return cur_reg
Example #47
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            cur_reg -= np.sign(constrained_grad) * meta_alpha
        return cur_reg
Example #48
    def hyperloss(transform_vect, i_hyper, record_results=False):
        def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal, i_script):
            RS = RandomState((seed, i_hyper, i_primal, i_script))
            N_train = train_data[i_script]['X'].shape[0]
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data[i_script], idxs)
            loss = loss_from_latents(z_vect, transform_vect, i_script, minibatch)
            if i_primal % N_thin == 0 and i_script == 0:
                print "Iter {0}, full losses: train: {1}, valid: {2}".format(
                    i_primal,
                    total_loss(train_data, getval(z_vect)),
                    total_loss(valid_data, getval(z_vect)))
            if i_script == 0: # Only add regularization once
                loss += regularization(z_vect)

            return loss

        def total_loss(data, z_vect):
            return np.mean([loss_from_latents(z_vect, transform_vect, i_script, data[i_script])
                            for i_script in range(N_scripts)])

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(sub_primal_stochastic_loss), transform_vect, z_vect_0,
                           alpha, beta, N_iters, N_scripts_per_iter, callback=None)
        valid_loss = total_loss(valid_data, z_vect_final)
        if record_results:
            results['valid_loss'].append(valid_loss)
            results['train_loss'].append(total_loss(train_data, z_vect_final))
            # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
        return valid_loss
Example #49
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
        #        learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))


        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas  = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict
Example #50
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg']      = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)

    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)
    #fixed_hyperparams = VectorParser()
    #fixed_hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper).randint(1000))
            seed = i_hyper * 10**6 + i_iter   # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []
        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        # return loss_fun(W_opt, **valid_data)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, gamma=0.0)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Example #51
def test_concatenate_axis_1_unnamed():
    """Tests whether you can specify the axis without saying "axis=1"."""
    A = npr.randn(5, 6, 4)
    B = npr.randn(5, 6, 4)
    def fun(x): return to_scalar(np.concatenate((B, x, B), 1))
    d_fun = lambda x : to_scalar(grad(fun)(x))
    check_grads(fun, A)
    check_grads(d_fun, A)
Example #52
def test_index_multiple_slices():
    A = npr.randn(7)
    def fun(x):
        y = x[2:6]
        z = y[1:3]
        return to_scalar(z)
    d_fun = lambda x : to_scalar(grad(fun)(x))
    check_grads(fun, A)
    check_grads(d_fun, A)
Example #53
def test_index_slice_fanout():
    A = npr.randn(5, 6, 4)
    def fun(x):
        y = x[::-1, 2:4, :]
        z = x[::-1, 3:5, :]
        return to_scalar(y + z)
    d_fun = lambda x : to_scalar(grad(fun)(x))
    check_grads(fun, A)
    check_grads(d_fun, A)