Beispiel #1
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            seed = i_hyper * 10**6 + i_iter
            idxs = npr.RandomState(seed).randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve = []

        def callback(x, v, g, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                     callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)

    def meta_callback(hyperparam_vect, i_hyper, g):
        print "Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg']      = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)

    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)
    #fixed_hyperparams = VectorParser()
    #fixed_hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper).randint(1000))
            seed = i_hyper * 10**6 + i_iter   # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []
        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        # return loss_fun(W_opt, **valid_data)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, gamma=0.0)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad( hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad( hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)

    parser.vect = None # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
Beispiel #4
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    def build_hypervect(init_log_alphas, init_invlogit_betas,
                        init_log_param_scale):
        hyperparams = VectorParser()
        hyperparams['log_param_scale'] = np.full(N_weight_types,
                                                 init_log_param_scale)
        hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                            init_log_alphas)
        hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                                init_invlogit_betas)
        return hyperparams

    hyperparams = build_hypervect(
        init_log_alphas, init_invlogit_betas,
        init_log_param_scale)  # Build just for parser.
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def whetlab_optimize(loss, max_iters, callback):
        for i in xrange(max_iters):
            params = scientist.suggest()
            hyperparams = build_hypervect(**params)
            cur_loss = loss(hyperparams.vect, 0)  # No randomness
            scientist.update(params, -cur_loss)
            if callback: callback(hyperparams.vect, 0)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['train_loss'][-1], meta_results['test_err'][-1])

    whetlab_optimize(hyperloss, N_meta_iter, meta_callback)
    best_params = scientist.best()
    print "best params:", best_params

    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, best_params
Beispiel #5
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0 or i_iter == N_iters or i_iter == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))
                learning_curve_dict['iteration'].append(i_iter + 1)
                print "iteration", i_iter

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['train_loss'][-1], meta_results['test_err'][-1])

    meta_callback(hyperparams.vect, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Beispiel #6
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0 or i_iter == 0 or i_iter == N_iters - 1:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))
                learning_curve_dict['iteration'].append(i_iter)

        init_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(init_hyperparams['log_alphas'])
        betas  = logit(init_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['train_loss'][-1], meta_results['test_err'][-1])

    # Now do a line search along that direction.


    log_stepsizes = np.linspace(-init_log_alphas, init_log_alphas*4, N_points_in_line_search)

    for log_stepsizes in log_stepsizes:
        hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), log_stepsizes)
        meta_callback(hyperparams.vect, 0)   # Use the same random seed every time.

    parser.vect = None # No need to pickle zeros
    return meta_results, parser, log_stepsizes
Beispiel #7
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    N_weights = len(parser.vect)
    hyperparams = VectorParser()
    rs = RandomState((seed))
    hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg)\
                              + rs.randn(N_weights) * init_log_L2_reg_noise
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                        init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                            init_invlogit_betas)

    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = np.exp(cur_hyperparams['log_L2_reg'])
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        #x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        x, learning_curve_dict = cur_primal_results[
            'weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field] = cur_hyperparams[field]
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, parsed_init_hypergrad
Beispiel #8
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)

    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_iters,
                                                   init_log_param_scale)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(
                npr.RandomState(global_seed + i_hyper).randint(1000))
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % N_batches == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                     callback)
        #callback(W_opt, N_iters)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)

    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)

    final_result = rms_prop(hyperloss_grad,
                            hyperparams.vect,
                            meta_callback,
                            N_meta_iter,
                            meta_alpha,
                            gamma=0.0)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams["log_param_scale"] = np.full(N_weight_types, init_log_param_scale)
    hyperparams["log_alphas"] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams["invlogit_betas"] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    for name in parser.names:
        hyperparams[("rescale", name)] = np.full(N_iters, init_rescales)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams["log_L2_reg"] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data["X"][idxs], train_data["T"][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data))
                learning_curve_dict["grad_norm"].append(np.linalg.norm(g))
                learning_curve_dict["weight_norm"].append(np.linalg.norm(x))
                learning_curve_dict["velocity_norm"].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"]))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams["log_alphas"])
        betas = logit(cur_hyperparams["invlogit_betas"])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"]))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results["train_loss"].append(loss_fun(x, **train_data))
        meta_results["valid_loss"].append(loss_fun(x, **valid_data))
        meta_results["tests_loss"].append(loss_fun(x, **tests_data))
        meta_results["test_err"].append(frac_err(x, **tests_data))
        meta_results["learning_curves"].append(learning_curve_dict)
        if metagrad is not None:
            meta_results["meta_grad_magnitude"].append(np.linalg.norm(metagrad))
            meta_results["meta_grad_angle"].append(
                np.dot(old_metagrad[0], metagrad) / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0]))
            )
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper,
            meta_results["train_loss"][-1],
            meta_results["valid_loss"][-1],
            meta_results["train_loss"][-1],
            meta_results["test_err"][-1],
        )

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
Beispiel #10
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        init_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(init_hyperparams['log_alphas'])
        betas  = logit(init_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['train_loss'][-1], meta_results['test_err'][-1])

    # Average many gradient evaluations at the initial point.
    hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size))
    for i in xrange(N_gradients_in_average):
        hypergrads[i] = hyperloss_grad(hyperparams.vect, i)
        print i
    first_gradient = hypergrads[0]
    avg_gradient = np.mean(hypergrads, axis=0)

    # Now do a line search along that direction.
    parsed_avg_grad = hyperparams.new_vect(avg_gradient)
    stepsize_scale = stepsize_search_rescale/np.max(np.exp(parsed_avg_grad['log_alphas'].ravel()))
    stepsizes = np.linspace(-stepsize_scale, stepsize_scale, N_points_in_line_search)
    for i, stepsize in enumerate(stepsizes):
        cur_hypervect = hyperparams.vect - stepsize * avg_gradient
        meta_callback(cur_hypervect, 0)   # Use the same random seed every time.

    parser.vect = None # No need to pickle zeros
    return meta_results, parser, first_gradient, parsed_avg_grad, stepsizes
Beispiel #11
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K : np.array(x[:,None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data']  = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}

    loss_meta_parser = VectorParser()
    loss_meta_parser['']

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
        #        learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))


        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas  = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        #meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print metagrad
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Beispiel #12
0
meta_alpha  = 0.04
N_meta_iter = 50
seed = 0

# --
# Helpers

def fill_parser(parser, items):
    partial_vects = [np.full(parser[name].size, items[i]) for i, name in enumerate(parser.names)]
    return np.concatenate(partial_vects, axis=0)

# --
# Run

train_data, valid_data, test_data = load_data_dicts(N_train, N_valid, N_tests)

parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)

N_weight_types = len(parser.names)

hyperparams = VectorParser()
hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)

fixed_hyperparams = VectorParser()
fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

cur_primal_results = {}
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) #only uses two different regularization hyperparameters, one for each layer?
    N_weight_types = len(parser.names) # = 2
    print(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg']      = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale']  = np.full(N_iters, init_log_param_scale) #don't update scale
    #TODO: remove scale from gradient, then?
    
    exact_metagrad = VectorParser()
    exact_metagrad['log_L2_reg']      = fill_parser(parser, hyperparams['log_L2_reg']) #np.zeros(N_weight_types)
    exact_metagrad['log_param_scale'] = fill_parser(parser, fixed_hyperparams['log_param_scale']) #np.zeros(N_weight_types)
    exact_metagrad['log_alphas']      = np.zeros(N_iters)
    exact_metagrad['invlogit_betas']  = np.zeros(N_iters)
    
    exact_metagrad2 = VectorParser()
    exact_metagrad2['log_L2_reg']      = np.zeros(N_weight_types)
    exact_metagrad2['log_param_scale'] = np.zeros(N_weight_types)
    exact_metagrad2['log_alphas']      = np.zeros(N_iters)
    exact_metagrad2['invlogit_betas']  = np.zeros(N_iters)
    
    #exact_metagrad = exact_metagradV.vect
    #print(hyperparams.vect)
    #exact_metagrad = [np.zeros(N_weight_types), np.zeros(N_weight_types), np.zeros(N_iters), np.zeros(N_iters)] #initialize

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper + i_iter * 10000).randint(1000))
            seed = i_hyper * 10**6 + i_iter   # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0: # N_batches=10 times
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        # TODO: why doesn't the following line work with N_iter=1?
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale'])) #don't update scale
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        # TODO: Put on proper scale; no SGD on log/invlogit scale
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        
        # TODO: check this
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), exact_metagrad, callback)
        #W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        #callback(W_opt, N_iters)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)
    # TODO: This is where the chain rule happens, dhyperloss/dW_opt x dW_opt/dhyperparam_vect; first term is SGD

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    #def meta_callback(hyperparam_vect, i_hyper, metagrad):
    def meta_callback(hyperparam_vect, i_hyper, metagrad, exact_metagrad=exact_metagrad):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        # these are the unregularized losses below; default sets L2_reg=0.0
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['train_err'].append(frac_err(x, **train_data))
        meta_results['valid_err'].append(frac_err(x, **valid_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        print("metagrad", len(metagrad))
        meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
        meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                               / (np.linalg.norm(metagrad)*
                                                  np.linalg.norm(old_metagrad[0])))
        #Michael: added comparisons with exact metagrad here
        #(2) Angle condition:  More strongly, is the cosine of the angle between the two strictly bounded away from 0?
        #(3) Length: Since hypergradient optimization procedures do not necessarily use a proper line search, it may also be important for the approximate hypergradient to have a length comparable to the true hypergradient.
        
        
        exact_metagrad2['log_L2_reg']      = [sum(exact_metagrad['log_L2_reg'][0:7840]), sum(exact_metagrad['log_L2_reg'][7840:7850])]
        exact_metagrad2['log_param_scale'] = [sum(exact_metagrad['log_param_scale'][0:7840]), sum(exact_metagrad['log_param_scale'][7840:7850])]
        exact_metagrad2['log_alphas']      = exact_metagrad['log_alphas']
        exact_metagrad2['invlogit_betas']  = exact_metagrad['invlogit_betas']
    
        meta_results['exact_meta_grad_magnitude'].append(np.linalg.norm(exact_metagrad2.vect))
        meta_results['DrMAD_exact_angle'].append(np.dot(exact_metagrad2.vect, metagrad) \
                                               / (np.linalg.norm(metagrad)*
                                                  np.linalg.norm(exact_metagrad2.vect)))
    
        #TODO: do the above for parameters separately? E.g. check log_alphas separately
                                                  
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])  #Michael: train->tests
#    final_result = adam(hyperloss_grad, hyperparams.vect,
#                            meta_callback, N_meta_iter, meta_alpha)
    final_result = adam(hyperloss_grad, hyperparams.vect, exact_metagrad,
                            meta_callback, N_meta_iter, meta_alpha)
    #write modified adam to ignore exact hypergrad in sgd4_mad_with_exact
    #meta_callback(final_result, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Beispiel #14
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                        init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                            init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad(hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)

    parser.vect = None  # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
Beispiel #15
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :],
                                    dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes,
                          N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types,
                                                   init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                              init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                                  init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}

    loss_meta_parser = VectorParser()
    loss_meta_parser['']

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                #        learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, meta),
                           parser,
                           callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results[
            'weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        #meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print metagrad
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser