def run(superparams):
    """Evaluate the initial hyperparameters on the training alphabets.

    Args:
        superparams: sequence unpacked as ``(alpha, log_scale_init,
            offset_init_std)``: the step size handed to ``sgd`` and the
            initial values used for the ``log_scale`` and ``offset``
            hyperparameters.

    Returns:
        A two-element list ``[train_loss, valid_loss]`` recorded for the
        initial hyperparameter vector (meta-iteration 0).
    """
    alpha, log_scale_init, offset_init_std = superparams
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    # Held-out alphabets; split off here but not evaluated in this function.
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale'] = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        """Loss of the affinely transformed weights plus a scaled ``W.W`` term."""
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True,
                  report_train_loss=False):
        """Train on one randomly chosen alphabet and return the held-out loss.

        With ``report_train_loss`` the final loss is evaluated on the first
        ``N_valid_dpts`` training points instead of the held-out tail.
        """
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            # The minibatch depends only on (seed, i_hyper, i_primal).
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 30 == 0:
                print("Iter {0}, loss, {1}".format(i_primal, getval(loss)))
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta,
                      N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect,
                            reg_penalty=False)

    results = defaultdict(list)

    def record_results(hyperparam_vect, i_hyper, g):
        """Append thinned hyperparameters and mean train/valid losses to ``results``."""
        RS = RandomState((seed, i_hyper, "evaluation"))
        # One seed per evaluation: hyperloss is a deterministic function of
        # its seed, so reusing a single seed makes every term of the mean the
        # same value. The same seeds are used for the train and the valid pass
        # so both numbers are measured on the same alphabets.
        eval_seeds = [RS.int32() for _ in range(N_alphabets_eval)]

        def mean_hyperloss(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, eval_seed,
                                      alphabets=alphabets, verbose=False,
                                      report_train_loss=report_train_loss)
                            for eval_seed in eval_seeds])

        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and
            # store in low precision.
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(
            mean_hyperloss(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(
            mean_hyperloss(train_alphabets, report_train_loss=False))

    record_results(hyperparams_0.vect, 0, None)
    return [results['train_loss'][0], results['valid_loss'][0]]
def primal_optimizer(hyperparam_vect, i_hyper):
    """Train the network on the learned ("fake") data set.

    The fake inputs are read from ``hyperparam_vect`` (via ``hyperparams``);
    step sizes, momenta, initial scale and L2 penalty come from
    ``fixed_hyperparams``. Initial weights are drawn from a RandomState
    seeded with ``(seed, i_hyper)``.

    Returns:
        ``(W_opt, learning_curve_dict)``: the result of ``sgd_parsed`` and the
        norms collected by the callback. Both are also stored (after
        ``getval``) in ``cur_primal_results``.
    """
    def indexed_loss_fun(w, meta_vect, i_iter):
        # Unpack the argument handed in by the optimizer rather than the
        # enclosing ``meta``: the loss has to depend on ``meta_vect`` for a
        # derivative with respect to it to be non-trivial.
        (fake_inputs, fake_targets, L2_vect) = meta_vect
        return loss_fun(w, fake_inputs, fake_targets, L2_vect)

    learning_curve_dict = defaultdict(list)

    def callback(x, v, g, i_iter):
        # Record every ``thin``-th iteration only.
        if i_iter % thin == 0:
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    fake_data = cur_hyperparams['fake_data']
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(fixed_hyperparams['log_alphas'])
    betas = logit(fixed_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    meta = kylist(fake_data, fake_labels, L2_reg)
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                       parser, callback=callback)
    cur_primal_results['weights'] = getval(W_opt).copy()
    cur_primal_results['learning_curve'] = getval(learning_curve_dict)
    return W_opt, learning_curve_dict
def primal_optimizer(hyperparam_vect, i_hyper):
    """Run one inner training pass for the given hyperparameter vector.

    Initial scale, step sizes and momenta are read from ``hyperparam_vect``
    (via ``hyperparams``); the L2 penalty comes from ``fixed_hyperparams``.

    Returns:
        ``(W_opt, learning_curve_dict)``: the result of ``sgd_parsed`` and the
        statistics gathered by the callback.
    """
    learning_curve_dict = defaultdict(list)

    def indexed_loss_fun(w, L2_vect, i_iter):
        # Deterministic seed needed for backwards pass.
        batch_rng = RandomState((seed, i_hyper, i_iter))
        batch = batch_rng.randint(N_train, size=batch_size)
        inputs, targets = train_data['X'][batch], train_data['T'][batch]
        return loss_fun(w, inputs, targets, L2_vect)

    def callback(x, v, g, i_iter):
        wanted = i_iter % thin == 0 or i_iter == N_iters or i_iter == 0
        if not wanted:
            return
        learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
        for key, vec in (('grad_norm', g), ('weight_norm', x),
                         ('velocity_norm', v)):
            learning_curve_dict[key].append(np.linalg.norm(vec))
        learning_curve_dict['iteration'].append(i_iter + 1)
        print("iteration" + " " + str(i_iter))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    init_rng = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
    W0 *= init_rng.randn(W0.size)
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    sgd_inputs = kylist(W0, alphas, betas, L2_reg)
    W_opt = sgd_parsed(grad(indexed_loss_fun), sgd_inputs, parser,
                       callback=callback)
    return W_opt, learning_curve_dict
def primal_optimizer(hyperparam_vect, i_hyper):
    """Run one inner training pass with ``sgd4``.

    Initial scale, step sizes and momenta are read from ``hyperparam_vect``
    (via ``hyperparams``); the L2 penalty comes from ``fixed_hyperparams``.

    Returns:
        ``(W_opt, learning_curve_dict)``: the result of ``sgd4`` and the
        statistics gathered by the callback.
    """
    learning_curve_dict = defaultdict(list)

    def indexed_loss_fun(w, L2_vect, i_iter):
        # Deterministic seed needed for backwards pass.
        batch_rng = RandomState((seed, i_hyper, i_iter))
        batch = batch_rng.randint(N_train, size=batch_size)
        inputs, targets = train_data["X"][batch], train_data["T"][batch]
        return loss_fun(w, inputs, targets, L2_vect)

    def callback(x, v, g, i_iter):
        if i_iter % thin != 0:
            return
        learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data))
        for key, vec in (("grad_norm", g), ("weight_norm", x),
                         ("velocity_norm", v)):
            learning_curve_dict[key].append(np.linalg.norm(vec))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    init_rng = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"]))
    W0 *= init_rng.randn(W0.size)
    alphas = np.exp(cur_hyperparams["log_alphas"])
    betas = logit(cur_hyperparams["invlogit_betas"])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"]))
    sgd_inputs = kylist(W0, alphas, betas, L2_reg)
    W_opt = sgd4(grad(indexed_loss_fun), sgd_inputs, callback)
    return W_opt, learning_curve_dict
def primal_optimizer(hyperparams_vect, meta_epoch):
    """Train the network once and return the final weights.

    Step sizes and momenta are read from ``hyperparams_vect`` (via
    ``hyperparams``); the initial scale and L2 penalty come from
    ``fixed_hyperparams``. A ``getval`` copy of the result is stored in
    ``cur_primal_results['weights']``.
    """
    def indexed_loss_fun(w, L2_vect, i_iter):
        # Deterministic seed needed for backwards pass.
        batch_rng = RandomState((seed, meta_epoch, i_iter))
        batch = batch_rng.randint(N_train, size=batch_size)
        inputs, targets = train_data['X'][batch], train_data['T'][batch]
        return loss_fun(w, inputs, targets, L2_vect)

    cur_hyperparams = hyperparams.new_vect(hyperparams_vect)

    # Random initial weights, scaled per weight type.
    init_rng = RandomState((seed, meta_epoch))
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= init_rng.randn(W0.size)

    # Regularization, step sizes and momenta for the inner optimizer.
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])

    sgd_inputs = kylist(W0, alphas, betas, L2_reg)
    W_opt = sgd_parsed(grad(indexed_loss_fun), sgd_inputs, parser)
    cur_primal_results['weights'] = getval(W_opt).copy()
    return W_opt
def run(superparams):
    """Evaluate the initial hyperparameters on the training alphabets.

    Args:
        superparams: sequence unpacked as ``(alpha, log_scale_init,
            offset_init_std)``: the step size handed to ``sgd`` and the
            initial values used for the ``log_scale`` and ``offset``
            hyperparameters.

    Returns:
        A two-element list ``[train_loss, valid_loss]`` recorded for the
        initial hyperparameter vector (meta-iteration 0).
    """
    alpha, log_scale_init, offset_init_std = superparams
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    # Held-out alphabets; split off here but not evaluated in this function.
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale'] = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        """Loss of the affinely transformed weights plus a scaled ``W.W`` term."""
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True,
                  report_train_loss=False):
        """Train on one randomly chosen alphabet and return the held-out loss.

        With ``report_train_loss`` the final loss is evaluated on the first
        ``N_valid_dpts`` training points instead of the held-out tail.
        """
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            # The minibatch depends only on (seed, i_hyper, i_primal).
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 30 == 0:
                print("Iter {0}, loss, {1}".format(i_primal, getval(loss)))
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta,
                      N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect,
                            reg_penalty=False)

    results = defaultdict(list)

    def record_results(hyperparam_vect, i_hyper, g):
        """Append thinned hyperparameters and mean train/valid losses to ``results``."""
        RS = RandomState((seed, i_hyper, "evaluation"))
        # One seed per evaluation: hyperloss is a deterministic function of
        # its seed, so reusing a single seed makes every term of the mean the
        # same value. The same seeds are used for the train and the valid pass
        # so both numbers are measured on the same alphabets.
        eval_seeds = [RS.int32() for _ in range(N_alphabets_eval)]

        def mean_hyperloss(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, eval_seed,
                                      alphabets=alphabets, verbose=False,
                                      report_train_loss=report_train_loss)
                            for eval_seed in eval_seeds])

        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and
            # store in low precision.
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(
            mean_hyperloss(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(
            mean_hyperloss(train_alphabets, report_train_loss=False))

    record_results(hyperparams_0.vect, 0, None)
    return [results['train_loss'][0], results['valid_loss'][0]]
def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
    """Train on one randomly chosen alphabet and return the evaluation loss.

    The alphabet choice, its shuffling and the initial weights all come from
    a RandomState seeded with ``(seed, i_hyper, "hyperloss")``. The last
    ``N_valid_dpts`` points are held out; with ``report_train_loss`` the
    final loss is computed on the first ``N_valid_dpts`` training points
    instead. The returned loss carries no weight penalty.
    """
    RS = RandomState((seed, i_hyper, "hyperloss"))
    alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
    N_train = alphabet['X'].shape[0] - N_valid_dpts
    train_data = dictslice(alphabet, slice(None, N_train))
    eval_slice = (slice(None, N_valid_dpts) if report_train_loss
                  else slice(N_train, None))
    valid_data = dictslice(alphabet, eval_slice)

    def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
        # The minibatch depends only on (seed, i_hyper, i_primal).
        batch_rng = RandomState((seed, i_hyper, i_primal))
        batch = batch_rng.permutation(N_train)[:batch_size]
        loss = reg_loss_fun(W, dictslice(train_data, batch),
                            hyperparam_vect, reg_penalty)
        if verbose and i_primal % 10 == 0:
            print("Iter {0}, loss, {1}".format(i_primal, getval(loss)))
        return loss

    W0 = RS.randn(N_weights) * initialization_scale
    W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta,
                  N_iters, callback=None)
    return reg_loss_fun(W_final, valid_data, hyperparam_vect,
                        reg_penalty=False)
def primal_optimizer(hyperparam_vect, i_hyper):
    """Train the network on the learned ("fake") data set.

    The fake inputs are read from ``hyperparam_vect`` (via ``hyperparams``);
    step sizes, momenta, initial scale and L2 penalty come from
    ``fixed_hyperparams``. Initial weights are drawn from a RandomState
    seeded with ``(seed, i_hyper)``.

    Returns:
        ``(W_opt, learning_curve_dict)``: the result of ``sgd_parsed`` and the
        norms collected by the callback. Both are also stored (after
        ``getval``) in ``cur_primal_results``.
    """
    def indexed_loss_fun(w, meta_vect, i_iter):
        # Unpack the argument handed in by the optimizer rather than the
        # enclosing ``meta``: the loss has to depend on ``meta_vect`` for a
        # derivative with respect to it to be non-trivial.
        (fake_inputs, fake_targets, L2_vect) = meta_vect
        return loss_fun(w, fake_inputs, fake_targets, L2_vect)

    learning_curve_dict = defaultdict(list)

    def callback(x, v, g, i_iter):
        # Record every ``thin``-th iteration only.
        if i_iter % thin == 0:
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    fake_data = cur_hyperparams['fake_data']
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(fixed_hyperparams['log_alphas'])
    betas = logit(fixed_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    meta = kylist(fake_data, fake_labels, L2_reg)
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                       parser, callback=callback)
    cur_primal_results['weights'] = getval(W_opt).copy()
    cur_primal_results['learning_curve'] = getval(learning_curve_dict)
    return W_opt, learning_curve_dict
def primal_optimizer(hyperparam_vect, i_hyper):
    """Run one inner training pass with ``sgd4``.

    Initial scale, step sizes and momenta are read from ``hyperparam_vect``
    (via ``hyperparams``); the L2 penalty comes from ``fixed_hyperparams``.

    Returns:
        ``(W_opt, learning_curve_dict)``: the result of ``sgd4`` and the
        statistics gathered by the callback.
    """
    learning_curve_dict = defaultdict(list)

    def indexed_loss_fun(w, L2_vect, i_iter):
        # Deterministic seed needed for backwards pass.
        batch_rng = RandomState((seed, i_hyper, i_iter))
        batch = batch_rng.randint(N_train, size=batch_size)
        inputs, targets = train_data['X'][batch], train_data['T'][batch]
        return loss_fun(w, inputs, targets, L2_vect)

    def callback(x, v, g, i_iter):
        if i_iter % thin != 0:
            return
        learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
        for key, vec in (('grad_norm', g), ('weight_norm', x),
                         ('velocity_norm', v)):
            learning_curve_dict[key].append(np.linalg.norm(vec))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    init_rng = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
    W0 *= init_rng.randn(W0.size)
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    sgd_inputs = kylist(W0, alphas, betas, L2_reg)
    W_opt = sgd4(grad(indexed_loss_fun), sgd_inputs, callback)
    return W_opt, learning_curve_dict
def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad):
    """Train with ``train_z`` and return the validation loss.

    Initial ``z`` values are drawn from a RandomState seeded with
    ``(seed, i_top, i_hyper, "hyperloss")`` and scaled by
    ``exp(log_init_scale)``. After training, loss and error rate on the
    train, validation and test sets passed in are printed.

    Returns:
        ``loss_fun`` of the final transformed weights on ``cur_valid_data``.
    """
    RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
    z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
    z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad)
    w_vect_final = transform_weights(z_vect_final, transform)

    # Test metrics use the ``cur_tests_data`` argument, like the train and
    # validation metrics use theirs, rather than a variable from the
    # enclosing scope.
    reports = (
        ("Training loss (unregularized) = ", loss_fun, cur_train_data),
        ("Validation loss = ", loss_fun, cur_valid_data),
        ("Test loss = ", loss_fun, cur_tests_data),
        ("Training error = ", frac_err, cur_train_data),
        ("Validation error = ", frac_err, cur_valid_data),
        ("Test error = ", frac_err, cur_tests_data),
    )
    for prefix, metric, data in reports:
        print(prefix + str(getval(metric(w_vect_final, **data))))
    return loss_fun(w_vect_final, **cur_valid_data)
def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data):
    """Train with ``train_z``, log and plot metrics, return the validation loss.

    Losses and error rates on the three data sets are printed, appended to
    the ``all_*_loss`` / ``all_*_rates`` histories, and the histories are
    re-plotted to ``loss2000_corrected.png`` and ``error2000_corrected.png``.
    """
    RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
    z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
    z_vect_final = train_z(cur_train_data, z_vect_0, transform)
    w_vect_final = transform_weights(z_vect_final, transform)
    # TODO: initial scale AND regularization
    splits = (cur_train_data, cur_valid_data, cur_tests_data)

    def track(metric, prefixes, histories, labels, title, ylabel, filename):
        # Evaluate, print and store the metric on each split, then redraw
        # the history plot and save it.
        for data, prefix, history in zip(splits, prefixes, histories):
            value = getval(metric(w_vect_final, **data))
            print(prefix + str(value))
            history.append(value)
        for history, label in zip(histories, labels):
            plt.plot(history, label=label)
        plt.title(title)
        plt.xlabel("meta iteration")
        plt.ylabel(ylabel)
        plt.legend()
        plt.savefig(filename)
        plt.clf()

    track(loss_fun,
          ("Training loss (unregularized) = ", "Validation loss = ",
           "Test loss = "),
          (all_train_loss, all_valid_loss, all_tests_loss),
          ("training loss (unregularized)", "validation loss", "test loss"),
          "loss vs meta iteration", "loss", "loss2000_corrected.png")
    track(frac_err,
          ("Training error rate = ", "Validation error rate = ",
           "Test error rate = "),
          (all_train_rates, all_valid_rates, all_tests_rates),
          ("training error rate", "validation error rate", "test error rate"),
          "error rate vs meta iteration", "error rate",
          "error2000_corrected.png")
    return loss_fun(w_vect_final, **cur_valid_data)
def hyperloss_exact(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, meta_it=0):
    """Train with ``train_z_exact`` and return the validation loss.

    Initial ``z`` values are drawn from a RandomState seeded with
    ``(seed, i_top, i_hyper, "hyperloss")`` and scaled by
    ``exp(log_init_scale)``. ``meta_it`` is forwarded to ``train_z_exact``
    as ``meta_iteration``; ``cur_tests_data`` is accepted but not used.
    """
    init_rng = RandomState((seed, i_top, i_hyper, "hyperloss"))
    initial_z = init_rng.randn(N_weights) * np.exp(log_init_scale)
    trained_z = train_z_exact(cur_train_data, initial_z, transform,
                              meta_iteration=meta_it)
    trained_w = transform_weights(trained_z, transform)
    return loss_fun(trained_w, **cur_valid_data)
def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
    """Train on one randomly chosen alphabet and return the evaluation loss.

    The alphabet choice, its shuffling and the initial weights all come from
    a RandomState seeded with ``(seed, i_hyper, "hyperloss")``. The last
    ``N_valid_dpts`` points are held out; with ``report_train_loss`` the
    final loss is computed on the first ``N_valid_dpts`` training points
    instead. The returned loss carries no weight penalty.
    """
    RS = RandomState((seed, i_hyper, "hyperloss"))
    alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
    N_train = alphabet['X'].shape[0] - N_valid_dpts
    train_data = dictslice(alphabet, slice(None, N_train))
    eval_slice = (slice(None, N_valid_dpts) if report_train_loss
                  else slice(N_train, None))
    valid_data = dictslice(alphabet, eval_slice)

    def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
        # The minibatch depends only on (seed, i_hyper, i_primal).
        batch_rng = RandomState((seed, i_hyper, i_primal))
        batch = batch_rng.permutation(N_train)[:batch_size]
        loss = reg_loss_fun(W, dictslice(train_data, batch),
                            hyperparam_vect, reg_penalty)
        if verbose and i_primal % 10 == 0:
            print("Iter {0}, loss, {1}".format(i_primal, getval(loss)))
        return loss

    W0 = RS.randn(N_weights) * initialization_scale
    W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta,
                  N_iters, callback=None)
    return reg_loss_fun(W_final, valid_data, hyperparam_vect,
                        reg_penalty=False)
def hyperloss(transform_vect, i_hyper, record_results=True):
    """Train all scripts jointly and return the total validation loss.

    Initial ``z`` values are drawn from a RandomState seeded with
    ``(seed, i_hyper, "hyperloss")`` and scaled by
    ``exp(log_initialization_scale)``. When ``record_results`` is true the
    per-script validation and training losses are appended to ``results``.

    Returns:
        ``total_loss`` of the final transformed weights on ``valid_data``
        (not divided by ``N_scripts``).
    """
    RS = RandomState((seed, i_hyper, "hyperloss"))

    def primal_loss(z_vect, transform_vect, i_primal, record_results):
        # No random state is created here: the loss uses the whole training
        # set and never drew from one.
        w_vect = transform_weights(z_vect, transform_vect)
        loss = total_loss(w_vect, train_data)
        reg = regularization(z_vect)
        if VERBOSE and record_results and i_primal % N_thin == 0:
            print("Iter {0}: train: {1}, valid: {2}, reg: {3}".format(
                i_primal,
                getval(loss) / N_scripts,
                total_loss(getval(w_vect), valid_data) / N_scripts,
                getval(reg)))
        return loss + reg

    z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
    z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta,
                       N_iters, callback=None)
    w_vect_final = transform_weights(z_vect_final, transform_vect)
    valid_loss = total_loss(w_vect_final, valid_data)
    if record_results:
        # Store plain values for both entries; the training loss is unwrapped
        # with getval exactly like the validation loss.
        results["valid_loss"].append(getval(valid_loss) / N_scripts)
        results["train_loss"].append(
            getval(total_loss(w_vect_final, train_data)) / N_scripts)
    return valid_loss
def hyperloss(transform_vect, i_hyper, record_results=True):
    """Train all scripts jointly and return the total validation loss.

    Initial ``z`` values are drawn from a RandomState seeded with
    ``(seed, i_hyper, "hyperloss")`` and scaled by
    ``exp(log_initialization_scale)``. When ``record_results`` is true the
    per-script validation, training and test losses are appended to
    ``results``.

    Returns:
        ``total_loss`` of the final transformed weights on ``valid_data``
        (not divided by ``N_scripts``).
    """
    RS = RandomState((seed, i_hyper, "hyperloss"))

    def primal_loss(z_vect, transform_vect, i_primal, record_results=False):
        w_vect = transform_weights(z_vect, transform_vect)
        loss = total_loss(w_vect, train_data)
        reg = regularization(z_vect)
        if VERBOSE and record_results and i_primal % N_thin == 0:
            print("Iter {0}: train: {1}, valid: {2}, reg: {3}".format(
                i_primal,
                getval(loss) / N_scripts,
                total_loss(getval(w_vect), valid_data) / N_scripts,
                getval(reg)))
        return loss + reg

    z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
    z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta,
                       N_iters, callback=None)
    w_vect_final = transform_weights(z_vect_final, transform_vect)
    valid_loss = total_loss(w_vect_final, valid_data)
    if record_results:
        # Store plain values for every entry; training and test losses are
        # unwrapped with getval exactly like the validation loss.
        results['valid_loss'].append(getval(valid_loss) / N_scripts)
        results['train_loss'].append(
            getval(total_loss(w_vect_final, train_data)) / N_scripts)
        results['tests_loss'].append(
            getval(total_loss(w_vect_final, tests_data)) / N_scripts)
    return valid_loss
def run():
    """Meta-optimize L2 penalties, initial scales and SGD schedules.

    Builds the hyperparameter vector (``log_L2_reg``, ``log_param_scale``,
    ``log_alphas``, ``invlogit_betas``), defines the inner training run and
    the validation hyperloss, then optimizes the hyperparameters with
    ``adam`` while recording statistics after every meta-iteration.

    Returns:
        ``(meta_results, parser, parsed_init_hypergrad)``: the recorded
        statistics, the weight parser with ``vect`` cleared, and the
        hypergradient at the initial hyperparameters wrapped by
        ``hyperparams.new_vect``.
    """
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    N_weights = len(parser.vect)
    schedule_shape = (N_iters, N_weight_types)

    hyperparams = VectorParser()
    rs = RandomState((seed))
    L2_noise = rs.randn(N_weights) * init_log_L2_reg_noise
    hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg) + L2_noise
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(schedule_shape, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(schedule_shape, init_invlogit_betas)
    # Latest inner-run outputs, handed from primal_optimizer to meta_callback.
    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        """Run one inner training pass; return the weights and learning curve."""
        learning_curve_dict = defaultdict(list)

        def indexed_loss_fun(w, L2_vect, i_iter):
            # Deterministic seed needed for backwards pass.
            batch_rng = RandomState((seed, i_hyper, i_iter))
            batch = batch_rng.randint(N_train, size=batch_size)
            inputs, targets = train_data['X'][batch], train_data['T'][batch]
            return loss_fun(w, inputs, targets, L2_vect)

        def callback(x, v, g, i_iter):
            if i_iter % thin != 0:
                return
            learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
            for key, vec in (('grad_norm', g), ('weight_norm', x),
                             ('velocity_norm', v)):
                learning_curve_dict[key].append(np.linalg.norm(vec))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        init_rng = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= init_rng.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = np.exp(cur_hyperparams['log_L2_reg'])
        sgd_inputs = kylist(W0, alphas, betas, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), sgd_inputs, parser,
                           callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        """Validation loss of the weights trained under ``hyperparam_vect``."""
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    # One-element list so the callback can rebind the previous metagradient.
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        """Record losses, hyperparameters and metagradient statistics."""
        x = cur_primal_results['weights']
        learning_curve_dict = cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        # Each hyperparameter field is overwritten, so only the most recent
        # value is kept.
        for field in cur_hyperparams.names:
            meta_results[field] = cur_hyperparams[field]
        evaluations = (('train_loss', loss_fun, train_data),
                       ('valid_loss', loss_fun, valid_data),
                       ('tests_loss', loss_fun, tests_data),
                       ('test_err', frac_err, tests_data))
        for key, metric, data in evaluations:
            meta_results[key].append(metric(x, **data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            previous = old_metagrad[0]
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            cosine = np.dot(previous, metagrad) / (
                np.linalg.norm(metagrad) * np.linalg.norm(previous))
            meta_results['meta_grad_angle'].append(cosine)
        old_metagrad[0] = metagrad
        summary = ("Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}"
                   " Test Loss {3:2.4f} Test Err {4:2.4f}")
        print(summary.format(i_hyper,
                             meta_results['train_loss'][-1],
                             meta_results['valid_loss'][-1],
                             meta_results['tests_loss'][-1],
                             meta_results['test_err'][-1]))

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, parsed_init_hypergrad
def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
    """Train with ``train_z`` under ``reg`` and return the validation loss.

    Initial weights are drawn from a RandomState seeded with
    ``(seed, i_top, i_hyper, "hyperloss")`` and multiplied by ``init_scales``.
    """
    init_rng = RandomState((seed, i_top, i_hyper, "hyperloss"))
    initial_weights = init_rng.randn(N_weights) * init_scales
    trained_weights = train_z(cur_train_data, initial_weights, reg)
    return loss_fun(trained_weights, **cur_valid_data)
def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
    """Train with ``train_z`` under ``reg`` and return the validation loss.

    Initial weights are drawn from a RandomState seeded with
    ``(seed, i_hyper, "hyperloss")`` and multiplied by ``init_scales``.
    """
    init_rng = RandomState((seed, i_hyper, "hyperloss"))
    initial_weights = init_rng.randn(N_weights) * init_scales
    trained_weights = train_z(cur_train_data, initial_weights, reg)
    return loss_fun(trained_weights, **cur_valid_data)
def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
    """Train with ``train_z`` and return the validation loss.

    Initial ``z`` values are drawn from a RandomState seeded with
    ``(seed, i_top, i_hyper, "hyperloss")`` and scaled by
    ``exp(log_init_scale)``; the trained values are mapped through
    ``transform_weights`` before the loss is evaluated.
    """
    init_rng = RandomState((seed, i_top, i_hyper, "hyperloss"))
    initial_z = init_rng.randn(N_weights) * np.exp(log_init_scale)
    trained_z = train_z(cur_train_data, initial_z, transform)
    trained_w = transform_weights(trained_z, transform)
    return loss_fun(trained_w, **cur_valid_data)
def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
    """Train with ``train_z`` under ``reg`` and return the validation loss.

    Initial weights are drawn from a RandomState seeded with
    ``(seed, i_top, i_hyper, "hyperloss")`` and multiplied by
    ``init_scales``. ``loss_fun`` is handed to ``train_z`` as its first
    argument.
    """
    init_rng = RandomState((seed, i_top, i_hyper, "hyperloss"))
    initial_weights = init_rng.randn(N_weights) * init_scales
    trained_weights = train_z(loss_fun, cur_train_data, initial_weights, reg)
    return loss_fun(trained_weights, **cur_valid_data)
def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
    """Train with ``train_z`` and return the validation loss.

    Initial ``z`` values are drawn from a RandomState seeded with
    ``(seed, i_top, i_hyper, "hyperloss")`` and scaled by
    ``exp(log_init_scale)``; the trained values are mapped through
    ``transform_weights`` before the loss is evaluated.
    """
    init_rng = RandomState((seed, i_top, i_hyper, "hyperloss"))
    initial_z = init_rng.randn(N_weights) * np.exp(log_init_scale)
    trained_z = train_z(cur_train_data, initial_z, transform)
    trained_w = transform_weights(trained_z, transform)
    return loss_fun(trained_w, **cur_valid_data)
def run():
    """Meta-optimize L2 penalties, initial scales and SGD schedules.

    Builds the hyperparameter vector (``log_L2_reg``, ``log_param_scale``,
    ``log_alphas``, ``invlogit_betas``), defines the inner training run and
    the validation hyperloss, then optimizes the hyperparameters with
    ``adam`` while recording statistics after every meta-iteration.

    Returns:
        ``(meta_results, parser, parsed_init_hypergrad)``: the recorded
        statistics, the weight parser with ``vect`` cleared, and the
        hypergradient at the initial hyperparameters wrapped by
        ``hyperparams.new_vect``.
    """
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    N_weights = len(parser.vect)
    schedule_shape = (N_iters, N_weight_types)

    hyperparams = VectorParser()
    rs = RandomState((seed))
    L2_noise = rs.randn(N_weights) * init_log_L2_reg_noise
    hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg) + L2_noise
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(schedule_shape, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(schedule_shape, init_invlogit_betas)
    # Latest inner-run outputs, handed from primal_optimizer to meta_callback.
    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        """Run one inner training pass; return the weights and learning curve."""
        learning_curve_dict = defaultdict(list)

        def indexed_loss_fun(w, L2_vect, i_iter):
            # Deterministic seed needed for backwards pass.
            batch_rng = RandomState((seed, i_hyper, i_iter))
            batch = batch_rng.randint(N_train, size=batch_size)
            inputs, targets = train_data['X'][batch], train_data['T'][batch]
            return loss_fun(w, inputs, targets, L2_vect)

        def callback(x, v, g, i_iter):
            if i_iter % thin != 0:
                return
            learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
            for key, vec in (('grad_norm', g), ('weight_norm', x),
                             ('velocity_norm', v)):
                learning_curve_dict[key].append(np.linalg.norm(vec))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        init_rng = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= init_rng.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = np.exp(cur_hyperparams['log_L2_reg'])
        sgd_inputs = kylist(W0, alphas, betas, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), sgd_inputs, parser,
                           callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        """Validation loss of the weights trained under ``hyperparam_vect``."""
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    # One-element list so the callback can rebind the previous metagradient.
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        """Record losses, hyperparameters and metagradient statistics."""
        x = cur_primal_results['weights']
        learning_curve_dict = cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        # Each hyperparameter field is overwritten, so only the most recent
        # value is kept.
        for field in cur_hyperparams.names:
            meta_results[field] = cur_hyperparams[field]
        evaluations = (('train_loss', loss_fun, train_data),
                       ('valid_loss', loss_fun, valid_data),
                       ('tests_loss', loss_fun, tests_data),
                       ('test_err', frac_err, tests_data))
        for key, metric, data in evaluations:
            meta_results[key].append(metric(x, **data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            previous = old_metagrad[0]
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            cosine = np.dot(previous, metagrad) / (
                np.linalg.norm(metagrad) * np.linalg.norm(previous))
            meta_results['meta_grad_angle'].append(cosine)
        old_metagrad[0] = metagrad
        summary = ("Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}"
                   " Test Loss {3:2.4f} Test Err {4:2.4f}")
        print(summary.format(i_hyper,
                             meta_results['train_loss'][-1],
                             meta_results['valid_loss'][-1],
                             meta_results['tests_loss'][-1],
                             meta_results['test_err'][-1]))

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, parsed_init_hypergrad
def run():
    """Meta-optimize a learned ("fake") training set.

    The only hyperparameter being optimized is ``fake_data``, an array with
    the shape of ``train_data['X']``; its labels cycle through the classes.
    Initial scale, step sizes, momenta and L2 penalty are held in
    ``fixed_hyperparams``. The hyperloss is the validation loss of a network
    trained on the fake data, and it is optimized with ``adam``.

    Returns:
        ``(meta_results, parser)``: the recorded statistics and the weight
        parser with ``vect`` cleared.
    """
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    # Latest inner-run outputs, handed from primal_optimizer to meta_callback.
    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        """Train on the current fake data; return the weights and norm curves."""
        def indexed_loss_fun(w, meta_vect, i_iter):
            # Unpack the argument handed in by the optimizer rather than the
            # enclosing ``meta``: the loss has to depend on ``meta_vect`` for
            # a derivative with respect to it to be non-trivial.
            (fake_inputs, fake_targets, L2_vect) = meta_vect
            return loss_fun(w, fake_inputs, fake_targets, L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            # Record every ``thin``-th iteration only.
            if i_iter % thin == 0:
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        """Validation loss of the weights trained under ``hyperparam_vect``."""
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    # One-element list so the callback can rebind the previous metagradient.
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        """Record losses, hyperparameters and metagradient statistics."""
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        # No training loss is evaluated for the fake data; 0 is recorded as
        # a placeholder so the summary line below can still be formatted.
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print(metagrad)
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print("Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}"
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
                  i_hyper,
                  meta_results['train_loss'][-1],
                  meta_results['valid_loss'][-1],
                  meta_results['tests_loss'][-1],
                  meta_results['test_err'][-1]))

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def run():
    """Meta-optimise a synthetic ("fake") training set for a neural net.

    The only hyperparameter that is optimised is ``fake_data``: an array
    with the same shape as ``train_data['X']``, initialised with scaled
    Gaussian noise.  Each meta-iteration trains the network on the fake
    data (paired with the fixed ``fake_labels``) via ``sgd_parsed`` and
    scores the resulting weights on the validation set; ``adam`` is driven
    by the gradient of that validation loss with respect to the
    hyperparameter vector.

    Experiment settings (``N_train``, ``N_valid``, ``N_tests``,
    ``layer_sizes``, ``seed``, ``N_classes``, ``N_iters``, ``thin``,
    ``N_meta_iter``, ``meta_alpha`` and the ``init_*`` values) are read
    from the enclosing scope.

    Returns:
        (meta_results, parser) where ``meta_results`` is a
        ``defaultdict(list)`` holding, per ``meta_callback`` call, one entry
        for every hyperparameter field plus ``'train_loss'``,
        ``'valid_loss'``, ``'tests_loss'``, ``'test_err'`` and
        ``'learning_curves'``; ``'meta_grad_magnitude'`` and
        ``'meta_grad_angle'`` get an entry only when a meta-gradient is
        supplied; ``'example_weights'`` holds the most recent weights only.
        ``parser`` is the weight parser from ``make_nn_funs`` with its
        ``vect`` attribute set to ``None``.
    """
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    # `(seed)` in the original is just `seed` (not a 1-tuple).
    rs = RandomState(seed)
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale

    def one_hot(x, K):
        # Row i is the indicator vector of class x[i] among K classes.
        return np.array(x[:, None] == np.arange(K)[None, :], dtype=int)

    # Labels cycle 0, 1, ..., N_classes - 1, 0, 1, ...  ("one of each").
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)

    # Hyperparameters that the meta-optimiser updates.
    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data

    # Hyperparameters that stay at their initial values for the whole run.
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(
        N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full(
        (N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full(
        (N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(
        N_weight_types, init_log_L2_reg)

    # Side channel: primal_optimizer stores plain-value copies of its latest
    # weights and learning curves here so meta_callback can report on them.
    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        """Train weights on the current fake data; return (W_opt, curves)."""

        def indexed_loss_fun(w, meta_vect, i_iter):
            # NOTE(review): `meta_vect` is ignored; the tuple is unpacked
            # from the closure variable `meta` assigned further down.  If
            # sgd_parsed differentiates with respect to this argument the
            # result would not depend on it -- confirm this is intended.
            fake_inputs, fake_targets, L2_vect = meta
            return loss_fun(w, fake_inputs, fake_targets, L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            # Record norms every `thin` iterations.  (Recording the loss on
            # the fake data was disabled in the original code.)
            if i_iter % thin == 0:
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']

        # Initial weights: per-weight scale times Gaussian noise, seeded by
        # (seed, i_hyper) so each meta-iteration is reproducible.
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)

        alphas = np.exp(fixed_hyperparams['log_alphas'])
        # NOTE(review): presumably the project's `logit` helper maps the
        # stored 'invlogit_betas' values into (0, 1) -- confirm.
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)

        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        """Validation loss of the weights trained under hyperparam_vect."""
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    # One-element list so meta_callback can rebind the previous meta-gradient.
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        """Append the current hyperparameters and losses to meta_results."""
        x = cur_primal_results['weights']
        learning_curve_dict = cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])

        # The training loss on the fake data is not evaluated (it was
        # disabled in the original code); 0 is stored as a placeholder so
        # the series stays aligned with the others.
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x

        if metagrad is not None:
            print(metagrad)
            metagrad_norm = np.linalg.norm(metagrad)
            meta_results['meta_grad_magnitude'].append(metagrad_norm)
            # Stored under 'meta_grad_angle', but the value is the cosine
            # between the previous and the current meta-gradient.
            meta_results['meta_grad_angle'].append(
                np.dot(old_metagrad[0], metagrad)
                / (metagrad_norm * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad

        print("Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}"
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
                  i_hyper,
                  meta_results['train_loss'][-1],
                  meta_results['valid_loss'][-1],
                  meta_results['tests_loss'][-1],
                  meta_results['test_err'][-1]))

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    # Record once more for the final hyperparameters (no meta-gradient).
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser