Ejemplo n.º 1
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad( hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad( hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)

    parser.vect = None # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
Ejemplo n.º 2
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    N_weights = len(parser.vect)
    hyperparams = VectorParser()
    rs = RandomState((seed))
    hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg)\
                              + rs.randn(N_weights) * init_log_L2_reg_noise
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                        init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                            init_invlogit_betas)

    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = np.exp(cur_hyperparams['log_L2_reg'])
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        #x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        x, learning_curve_dict = cur_primal_results[
            'weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field] = cur_hyperparams[field]
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, parsed_init_hypergrad
Ejemplo n.º 3
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['train_loss'][-1], meta_results['test_err'][-1])


    initial_hypergrad = hyperloss_grad( hyperparams.vect, 0)
    hypergrads = np.zeros((N_meta_iter, len(initial_hypergrad)))
    for i in xrange(N_meta_iter):
        hypergrads[i] = hyperloss_grad( hyperparams.vect, i)
        print i
    avg_hypergrad = np.mean(hypergrads, axis=0)
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)

    parser.vect = None # No need to pickle zeros
    return parser, parsed_avg_hypergrad
Ejemplo n.º 4
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        init_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(init_hyperparams['log_alphas'])
        betas  = logit(init_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['train_loss'][-1], meta_results['test_err'][-1])

    # Average many gradient evaluations at the initial point.
    hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size))
    for i in xrange(N_gradients_in_average):
        hypergrads[i] = hyperloss_grad(hyperparams.vect, i)
        print i
    first_gradient = hypergrads[0]
    avg_gradient = np.mean(hypergrads, axis=0)

    # Now do a line search along that direction.
    parsed_avg_grad = hyperparams.new_vect(avg_gradient)
    stepsize_scale = stepsize_search_rescale/np.max(np.exp(parsed_avg_grad['log_alphas'].ravel()))
    stepsizes = np.linspace(-stepsize_scale, stepsize_scale, N_points_in_line_search)
    for i, stepsize in enumerate(stepsizes):
        cur_hypervect = hyperparams.vect - stepsize * avg_gradient
        meta_callback(cur_hypervect, 0)   # Use the same random seed every time.

    parser.vect = None # No need to pickle zeros
    return meta_results, parser, first_gradient, parsed_avg_grad, stepsizes
Ejemplo n.º 5
0
    meta_results['example_weights'] = x
    
    if metagrad is not None:
        meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
        meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                               / (np.linalg.norm(metagrad)*
                                                  np.linalg.norm(old_metagrad[0])))
    old_metagrad[0] = metagrad
    print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
          " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
        i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
        meta_results['train_loss'][-1], meta_results['test_err'][-1])

initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)

parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
meta_callback(final_result, N_meta_iter)
parser.vect = None # No need to pickle zeros

results = (meta_results, parser, parsed_init_hypergrad)
pickle.dump(results, open('results.pkl', 'w'))

# --
# Plot

results, parser, parsed_init_hypergrad = pickle.load(open('bak/results.pkl'))

fig = plt.figure(0)
fig.clf()
ax = fig.add_subplot(111)
Ejemplo n.º 6
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                        init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                            init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        init_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(init_hyperparams['log_alphas'])
        betas = logit(init_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['train_loss'][-1], meta_results['test_err'][-1])

    # Average many gradient evaluations at the initial point.
    hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size))
    for i in xrange(N_gradients_in_average):
        hypergrads[i] = hyperloss_grad(hyperparams.vect, i)
        print i
    first_gradient = hypergrads[0]
    avg_gradient = np.mean(hypergrads, axis=0)

    # Now do a line search along that direction.
    parsed_avg_grad = hyperparams.new_vect(avg_gradient)
    stepsize_scale = 1000. / np.max(
        np.exp(parsed_avg_grad['log_alphas'].ravel()))
    stepsizes = np.linspace(-stepsize_scale, stepsize_scale,
                            N_points_in_line_search)
    for i, stepsize in enumerate(stepsizes):
        cur_hypervect = hyperparams.vect + stepsize * avg_gradient
        meta_callback(cur_hypervect, 0)  # Use the same random seed every time.

    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, first_gradient, avg_gradient, stepsizes
Ejemplo n.º 7
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                        init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                            init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad(hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)

    parser.vect = None  # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad