def run(): train_data, valid_data, tests_data = load_data_dicts( N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas) hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): seed = i_hyper * 10**6 + i_iter idxs = npr.RandomState(seed).randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve = [] def callback(x, v, g, i_iter): if i_iter % N_batches == 0: learning_curve.append(loss_fun(x, **train_data)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= npr.RandomState(i_hyper).randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg'])) V0 = np.zeros(W0.size) W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback) return W_opt, learning_curve def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **valid_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) def meta_callback(hyperparam_vect, i_hyper, g): print "Epoch {0}".format(i_hyper) x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) 
meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['learning_curves'].append(learning_curve) final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha) parser.vect = None # No need to pickle zeros return meta_results, parser
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas) hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) #fixed_hyperparams = VectorParser() #fixed_hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) # TODO: memoize def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = npr.RandomState(npr.RandomState(global_seed + i_hyper).randint(1000)) seed = i_hyper * 10**6 + i_iter # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve = [] def callback(x, i_iter): if i_iter % N_batches == 0: learning_curve.append(loss_fun(x, **train_data)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg'])) V0 = np.zeros(W0.size) W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback) return W_opt, learning_curve def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) # return loss_fun(W_opt, **valid_data) return loss_fun(W_opt, **train_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) def meta_callback(hyperparam_vect, i_hyper): print "Meta Epoch {0}".format(i_hyper) x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = 
hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['learning_curves'].append(learning_curve) final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha, gamma=0.0) parser.vect = None # No need to pickle zeros return meta_results, parser
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **train_data) hyperloss_grad = grad(hyperloss) initial_hypergrad = hyperloss_grad( hyperparams.vect, 0) parsed_init_hypergrad = 
hyperparams.new_vect(initial_hypergrad.copy()) avg_hypergrad = initial_hypergrad.copy() for i in xrange(1, N_meta_iter): avg_hypergrad += hyperloss_grad( hyperparams.vect, i) print i parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad) parser.vect = None # No need to pickle zeros return parser, parsed_init_hypergrad, parsed_avg_hypergrad
def run(): train_data, valid_data, tests_data = load_data_dicts( N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) def build_hypervect(init_log_alphas, init_invlogit_betas, init_log_param_scale): hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) return hyperparams hyperparams = build_hypervect( init_log_alphas, init_invlogit_betas, init_log_param_scale) # Build just for parser. fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def whetlab_optimize(loss, max_iters, callback): for i in xrange(max_iters): params = scientist.suggest() hyperparams = build_hypervect(**params) cur_loss = loss(hyperparams.vect, 0) # No randomness scientist.update(params, -cur_loss) if callback: callback(hyperparams.vect, 0) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState( (seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. 
idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append( loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **train_data) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) if metagrad is not None: meta_results['meta_grad_magnitude'].append( np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = 
metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['train_loss'][-1], meta_results['test_err'][-1]) whetlab_optimize(hyperloss, N_meta_iter, meta_callback) best_params = scientist.best() print "best params:", best_params parser.vect = None # No need to pickle zeros return meta_results, parser, best_params
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0 or i_iter == N_iters or i_iter == 0: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) learning_curve_dict['iteration'].append(i_iter + 1) print "iteration", i_iter cur_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = 
primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) meta_results['example_weights'] = x if metagrad is not None: meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['train_loss'][-1], meta_results['test_err'][-1]) meta_callback(hyperparams.vect, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0 or i_iter == 0 or i_iter == N_iters - 1: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) learning_curve_dict['iteration'].append(i_iter) init_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(init_hyperparams['log_alphas']) betas = logit(init_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = 
primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) if metagrad is not None: meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['train_loss'][-1], meta_results['test_err'][-1]) # Now do a line search along that direction. log_stepsizes = np.linspace(-init_log_alphas, init_log_alphas*4, N_points_in_line_search) for log_stepsizes in log_stepsizes: hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), log_stepsizes) meta_callback(hyperparams.vect, 0) # Use the same random seed every time. parser.vect = None # No need to pickle zeros return meta_results, parser, log_stepsizes
def run(): train_data, valid_data, tests_data = load_data_dicts( N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) N_weights = len(parser.vect) hyperparams = VectorParser() rs = RandomState((seed)) hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg)\ + rs.randn(N_weights) * init_log_L2_reg_noise hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) cur_primal_results = {} def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState( (seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append( loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = np.exp(cur_hyperparams['log_L2_reg']) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) cur_primal_results['weights'] = getval(W_opt).copy() cur_primal_results['learning_curve'] = getval(learning_curve_dict) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return 
loss_fun(W_opt, **valid_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): #x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) x, learning_curve_dict = cur_primal_results[ 'weights'], cur_primal_results['learning_curve'] cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field] = cur_hyperparams[field] meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) meta_results['example_weights'] = x if metagrad is not None: meta_results['meta_grad_magnitude'].append( np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['tests_loss'][-1], meta_results['test_err'][-1]) initial_hypergrad = hyperloss_grad(hyperparams.vect, 0) parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy()) final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha) meta_callback(final_result, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser, parsed_init_hypergrad
def run(): train_data, valid_data, tests_data = load_data_dicts( N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas) hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_param_scale'] = np.full(N_iters, init_log_param_scale) # TODO: memoize def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = npr.RandomState( npr.RandomState(global_seed + i_hyper).randint(1000)) seed = i_hyper * 10**6 + i_iter # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % N_batches == 0: learning_curve_dict['learning_curve'].append( loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale'])) W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size) alphas = np.exp(cur_hyperparams['log_alphas']) betas = logit(cur_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg'])) W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback) #callback(W_opt, N_iters) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **valid_data) hyperloss_grad = 
grad(hyperloss) meta_results = defaultdict(list) def meta_callback(hyperparam_vect, i_hyper): print "Meta Epoch {0}".format(i_hyper) x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha, gamma=0.0) meta_callback(final_result, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams["log_param_scale"] = np.full(N_weight_types, init_log_param_scale) hyperparams["log_alphas"] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams["invlogit_betas"] = np.full((N_iters, N_weight_types), init_invlogit_betas) for name in parser.names: hyperparams[("rescale", name)] = np.full(N_iters, init_rescales) fixed_hyperparams = VectorParser() fixed_hyperparams["log_L2_reg"] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data["X"][idxs], train_data["T"][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data)) learning_curve_dict["grad_norm"].append(np.linalg.norm(g)) learning_curve_dict["weight_norm"].append(np.linalg.norm(x)) learning_curve_dict["velocity_norm"].append(np.linalg.norm(v)) cur_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"])) W0 *= rs.randn(W0.size) alphas = np.exp(cur_hyperparams["log_alphas"]) betas = logit(cur_hyperparams["invlogit_betas"]) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **train_data) hyperloss_grad = grad(hyperloss) meta_results = 
defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, i_hyper, metagrad=None): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results["train_loss"].append(loss_fun(x, **train_data)) meta_results["valid_loss"].append(loss_fun(x, **valid_data)) meta_results["tests_loss"].append(loss_fun(x, **tests_data)) meta_results["test_err"].append(frac_err(x, **tests_data)) meta_results["learning_curves"].append(learning_curve_dict) if metagrad is not None: meta_results["meta_grad_magnitude"].append(np.linalg.norm(metagrad)) meta_results["meta_grad_angle"].append( np.dot(old_metagrad[0], metagrad) / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])) ) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results["train_loss"][-1], meta_results["valid_loss"][-1], meta_results["train_loss"][-1], meta_results["test_err"][-1], ) final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha) meta_callback(final_result, N_meta_iter) parser.vect = None # No need to pickle zeros return meta_results, parser
def run(): train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests) parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weight_types = len(parser.names) hyperparams = VectorParser() hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale) hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas) hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas) fixed_hyperparams = VectorParser() fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg) def primal_optimizer(hyperparam_vect, i_hyper): def indexed_loss_fun(w, L2_vect, i_iter): rs = RandomState((seed, i_hyper, i_iter)) # Deterministic seed needed for backwards pass. idxs = rs.randint(N_train, size=batch_size) return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect) learning_curve_dict = defaultdict(list) def callback(x, v, g, i_iter): if i_iter % thin == 0: learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data)) learning_curve_dict['grad_norm'].append(np.linalg.norm(g)) learning_curve_dict['weight_norm'].append(np.linalg.norm(x)) learning_curve_dict['velocity_norm'].append(np.linalg.norm(v)) init_hyperparams = hyperparams.new_vect(hyperparam_vect) rs = RandomState((seed, i_hyper)) W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale'])) W0 *= rs.randn(W0.size) alphas = np.exp(init_hyperparams['log_alphas']) betas = logit(init_hyperparams['invlogit_betas']) L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg'])) W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback) return W_opt, learning_curve_dict def hyperloss(hyperparam_vect, i_hyper): W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper) return loss_fun(W_opt, **train_data) hyperloss_grad = grad(hyperloss) meta_results = defaultdict(list) old_metagrad = [np.ones(hyperparams.vect.size)] def meta_callback(hyperparam_vect, 
i_hyper, metagrad=None): x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper) cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy()) for field in cur_hyperparams.names: meta_results[field].append(cur_hyperparams[field]) meta_results['train_loss'].append(loss_fun(x, **train_data)) meta_results['valid_loss'].append(loss_fun(x, **valid_data)) meta_results['tests_loss'].append(loss_fun(x, **tests_data)) meta_results['test_err'].append(frac_err(x, **tests_data)) meta_results['learning_curves'].append(learning_curve_dict) if metagrad is not None: meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad)) meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \ / (np.linalg.norm(metagrad)* np.linalg.norm(old_metagrad[0]))) old_metagrad[0] = metagrad print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \ " Test Loss {3:2.4f} Test Err {4:2.4f}".format( i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1], meta_results['train_loss'][-1], meta_results['test_err'][-1]) # Average many gradient evaluations at the initial point. hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size)) for i in xrange(N_gradients_in_average): hypergrads[i] = hyperloss_grad(hyperparams.vect, i) print i first_gradient = hypergrads[0] avg_gradient = np.mean(hypergrads, axis=0) # Now do a line search along that direction. parsed_avg_grad = hyperparams.new_vect(avg_gradient) stepsize_scale = stepsize_search_rescale/np.max(np.exp(parsed_avg_grad['log_alphas'].ravel())) stepsizes = np.linspace(-stepsize_scale, stepsize_scale, N_points_in_line_search) for i, stepsize in enumerate(stepsizes): cur_hypervect = hyperparams.vect - stepsize * avg_gradient meta_callback(cur_hypervect, 0) # Use the same random seed every time. parser.vect = None # No need to pickle zeros return meta_results, parser, first_gradient, parsed_avg_grad, stepsizes
def run():
    """Optimise a synthetic ("fake") training set against the validation loss.

    The only differentiated hyperparameter is ``fake_data``, an array with the
    same shape as ``train_data['X']``.  Parameter scale, learning rates,
    momenta and L2 penalties are held fixed.  The inner optimiser trains on
    the fake data with fixed round-robin labels, and ``adam`` updates the fake
    data using the gradient of the validation loss of the trained weights.

    Returns:
        Tuple ``(meta_results, parser)``.  ``meta_results`` maps metric names
        to per-meta-iteration lists, except ``'example_weights'``, which holds
        only the most recently recorded weight vector.
    """
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    # One of each: labels cycle through the classes in order.
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)

    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data

    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    # Filled by primal_optimizer so that meta_callback can report on the most
    # recent inner run without repeating it.
    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        """Train on the current fake data; return (final weights, curves)."""

        def indexed_loss_fun(w, meta_vect, i_iter):
            # Unpack the argument rather than the enclosing `meta`, so that
            # differentiating with respect to this argument sees the data.
            (fake_inputs, fake_targets, L2_vect) = meta_vect
            return loss_fun(w, fake_inputs, fake_targets, L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            # Only every `thin`-th iteration is recorded; the training loss on
            # the fake data is not tracked.
            if i_iter % thin == 0:
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    # One-element list so that the nested callback can rebind its content.
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        """Append metrics of the most recent inner run to meta_results."""
        x = cur_primal_results['weights']
        learning_curve_dict = cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        # The loss on the fake data is not evaluated; 0 is stored as a
        # placeholder so the progress line below can still be formatted.
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        # Deliberately overwritten (not appended): only the latest is kept.
        meta_results['example_weights'] = x
        if metagrad is not None:
            print(metagrad)
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            # Cosine of the angle between this and the previous hypergradient.
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
            old_metagrad[0] = metagrad
        print(("Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}"
               " Test Loss {3:2.4f} Test Err {4:2.4f}").format(
            i_hyper,
            meta_results['train_loss'][-1],
            meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1],
            meta_results['test_err'][-1]))

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    # NOTE(review): meta_callback reports whatever inner run last populated
    # cur_primal_results; confirm that run corresponds to final_result.
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
meta_alpha = 0.04
N_meta_iter = 50
seed = 0

# --
# Helpers


def fill_parser(parser, items):
    """Expand one value per named entry of `parser` into a flat vector.

    For the i-th name in ``parser.names`` a constant block of length
    ``parser[name].size`` filled with ``items[i]`` is produced; the blocks are
    concatenated in that order.
    """
    blocks = []
    for position, name in enumerate(parser.names):
        block_length = parser[name].size
        blocks.append(np.full(block_length, items[position]))
    return np.concatenate(blocks, axis=0)


# --
# Run

train_data, valid_data, test_data = load_data_dicts(N_train, N_valid, N_tests)
parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
N_weight_types = len(parser.names)

# Hyperparameters exposed to the meta-optimiser: one parameter scale per
# weight type, plus one step size and one momentum entry per iteration and
# weight type.
hyperparams = VectorParser()
hyperparams['log_param_scale'] = np.full(
    shape=N_weight_types, fill_value=init_log_param_scale)
hyperparams['log_alphas'] = np.full(
    shape=(N_iters, N_weight_types), fill_value=init_log_alphas)
hyperparams['invlogit_betas'] = np.full(
    shape=(N_iters, N_weight_types), fill_value=init_invlogit_betas)

# The L2 penalty is stored in a separate parser from the entries above.
fixed_hyperparams = VectorParser()
fixed_hyperparams['log_L2_reg'] = np.full(
    shape=N_weight_types, fill_value=init_log_L2_reg)

cur_primal_results = {}
def run():
    """Meta-optimise hyperparameters while tracking a second hypergradient.

    ``adam`` updates L2 penalties, parameter scales, step sizes and momenta
    using the gradient of the validation loss.  ``sgd4`` and ``adam`` are also
    handed the ``exact_metagrad`` parser; after every meta-iteration the
    gradient passed to the callback is compared (norm and cosine) with the
    contents of that parser.

    Returns:
        Tuple ``(meta_results, parser)``; ``meta_results`` maps metric names
        to per-meta-iteration lists.
    """
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    # only uses two different regularization hyperparameters, one for each layer?
    N_weight_types = len(parser.names)  # = 2
    print(parser.names)

    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    # The parameter scale is not updated; it is read from this fixed parser.
    # It needs one entry per weight type because fill_parser is called with it
    # together with `parser`.  It used to be sized by N_iters, which is what
    # broke runs with N_iters == 1.
    # TODO: remove scale from gradient, then?
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)

    # Per-weight buffers handed to sgd4 and adam.
    exact_metagrad = VectorParser()
    exact_metagrad['log_L2_reg'] = fill_parser(parser, hyperparams['log_L2_reg'])
    exact_metagrad['log_param_scale'] = fill_parser(parser, fixed_hyperparams['log_param_scale'])
    exact_metagrad['log_alphas'] = np.zeros(N_iters)
    exact_metagrad['invlogit_betas'] = np.zeros(N_iters)

    # Same fields reduced to one entry per weight type (filled in meta_callback).
    exact_metagrad2 = VectorParser()
    exact_metagrad2['log_L2_reg'] = np.zeros(N_weight_types)
    exact_metagrad2['log_param_scale'] = np.zeros(N_weight_types)
    exact_metagrad2['log_alphas'] = np.zeros(N_iters)
    exact_metagrad2['invlogit_betas'] = np.zeros(N_iters)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        """Run the inner optimisation; return (final weights, learning curves)."""

        def indexed_loss_fun(w, L2_vect, i_iter):
            # Deterministic seed needed for backwards pass.
            rs = npr.RandomState(
                npr.RandomState(global_seed + i_hyper + i_iter * 10000).randint(1000))
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            # Only every `thin`-th iteration is recorded.
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        # Scale comes from the fixed parser, i.e. it is not updated.
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        # TODO: Put on proper scale; no SGD on log/invlogit scale
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        # TODO: check this
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                     exact_metagrad, callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    # TODO: This is where the chain rule happens,
    # dhyperloss/dW_opt x dW_opt/dhyperparam_vect; first term is SGD
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    # One-element list so that the nested callback can rebind its content.
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad, exact_metagrad=exact_metagrad):
        """Re-run the inner optimisation and append its metrics to meta_results."""
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        # Original author's note: these are the unregularized losses below;
        # default sets L2_reg=0.0
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['train_err'].append(frac_err(x, **train_data))
        meta_results['valid_err'].append(frac_err(x, **valid_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        print("metagrad", len(metagrad))
        meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
        # Cosine of the angle between this and the previous hypergradient.
        meta_results['meta_grad_angle'].append(
            np.dot(old_metagrad[0], metagrad)
            / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))

        # Comparisons with the exact metagrad:
        # (2) Angle condition: is the cosine of the angle between the two
        #     strictly bounded away from 0?
        # (3) Length: since hypergradient optimization procedures do not
        #     necessarily use a proper line search, it may also be important
        #     for the approximate hypergradient to have a length comparable to
        #     the true hypergradient.
        # NOTE(review): the hard-coded 7840/7850 boundaries assume the first
        # weight type has 7840 entries and the second has 10; confirm against
        # layer_sizes before changing the architecture.
        exact_metagrad2['log_L2_reg'] = [
            sum(exact_metagrad['log_L2_reg'][0:7840]),
            sum(exact_metagrad['log_L2_reg'][7840:7850])]
        exact_metagrad2['log_param_scale'] = [
            sum(exact_metagrad['log_param_scale'][0:7840]),
            sum(exact_metagrad['log_param_scale'][7840:7850])]
        exact_metagrad2['log_alphas'] = exact_metagrad['log_alphas']
        exact_metagrad2['invlogit_betas'] = exact_metagrad['invlogit_betas']

        meta_results['exact_meta_grad_magnitude'].append(
            np.linalg.norm(exact_metagrad2.vect))
        meta_results['DrMAD_exact_angle'].append(
            np.dot(exact_metagrad2.vect, metagrad)
            / (np.linalg.norm(metagrad) * np.linalg.norm(exact_metagrad2.vect)))
        # TODO: do the above for parameters separately? E.g. check log_alphas separately

        old_metagrad[0] = metagrad
        print(("Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}"
               " Test Loss {3:2.4f} Test Err {4:2.4f}").format(
            i_hyper,
            meta_results['train_loss'][-1],
            meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1],
            meta_results['test_err'][-1]))

    # This adam variant takes the exact_metagrad parser as an extra argument.
    final_result = adam(hyperloss_grad, hyperparams.vect, exact_metagrad,
                        meta_callback, N_meta_iter, meta_alpha)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    """Compare the first hypergradient with the average over several seeds.

    The hyper-objective is the full training loss of the weights produced by
    the inner SGD run.  Its gradient with respect to the hyperparameter vector
    is evaluated at the initial hyperparameters for ``i_hyper`` in
    ``0 .. N_meta_iter - 1``.

    Returns:
        Tuple ``(parser, parsed_init_hypergrad, parsed_avg_hypergrad)``.  The
        last two are the ``i_hyper == 0`` hypergradient and the mean of all
        evaluated hypergradients, each wrapped by ``hyperparams.new_vect``.
    """
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    # Hyperparameters that are differentiated.
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)

    # Hyperparameters that are held fixed.
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        """Run the inner optimisation; return (final weights, learning curves)."""

        def indexed_loss_fun(w, L2_vect, i_iter):
            # Deterministic seed needed for backwards pass.
            rs = RandomState((seed, i_hyper, i_iter))
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            # Only every `thin`-th iteration is recorded.
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        # The hyper-objective here is the loss on the full training set.
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())

    # Accumulate the remaining evaluations, then normalise by how many were
    # actually taken so the result is a mean rather than a running sum.
    avg_hypergrad = initial_hypergrad.copy()
    n_evaluated = 1
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad(hyperparams.vect, i)
        n_evaluated += 1
        print(i)
    avg_hypergrad = avg_hypergrad / float(n_evaluated)
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)

    parser.vect = None  # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
def run():
    """Optimise a synthetic ("fake") training set against the validation loss.

    The only differentiated hyperparameter is ``fake_data``, an array with the
    same shape as ``train_data['X']``.  Parameter scale, learning rates,
    momenta and L2 penalties are held fixed.  The inner optimiser trains on
    the fake data with fixed round-robin labels, and ``adam`` updates the fake
    data using the gradient of the validation loss of the trained weights.

    Returns:
        Tuple ``(meta_results, parser)``.  ``meta_results`` maps metric names
        to per-meta-iteration lists, except ``'example_weights'``, which holds
        only the most recently recorded weight vector.
    """
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :],
                                    dtype=int)
    # One of each: labels cycle through the classes in order.
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)

    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data

    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types,
                                                   init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                              init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                                  init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    # Filled by primal_optimizer so that meta_callback can report on the most
    # recent inner run without repeating it.
    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        """Train on the current fake data; return (final weights, curves)."""

        def indexed_loss_fun(w, meta_vect, i_iter):
            # Unpack the argument rather than the enclosing `meta`, so that
            # differentiating with respect to this argument sees the data.
            (fake_inputs, fake_targets, L2_vect) = meta_vect
            return loss_fun(w, fake_inputs, fake_targets, L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            # Only every `thin`-th iteration is recorded; the training loss on
            # the fake data is not tracked.
            if i_iter % thin == 0:
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    # One-element list so that the nested callback can rebind its content.
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        """Append metrics of the most recent inner run to meta_results."""
        x = cur_primal_results['weights']
        learning_curve_dict = cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        # The loss on the fake data is not evaluated; 0 is stored as a
        # placeholder so the progress line below can still be formatted.
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        # Deliberately overwritten (not appended): only the latest is kept.
        meta_results['example_weights'] = x
        if metagrad is not None:
            print(metagrad)
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            # Cosine of the angle between this and the previous hypergradient.
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
            old_metagrad[0] = metagrad
        print(("Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}"
               " Test Loss {3:2.4f} Test Err {4:2.4f}").format(
            i_hyper,
            meta_results['train_loss'][-1],
            meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1],
            meta_results['test_err'][-1]))

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    # NOTE(review): meta_callback reports whatever inner run last populated
    # cur_primal_results; confirm that run corresponds to final_result.
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser