def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, meta_vect, i_iter):
        (train_data, train_labels, L2_vect) = meta
        return loss_fun(w, train_data, train_labels, L2_vect)
        #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

    learning_curve_dict = defaultdict(list)
    def callback(x, v, g, i_iter):
        if i_iter % thin == 0:
            # learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    fake_data = cur_hyperparams['fake_data']
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(fixed_hyperparams['log_alphas'])
    betas = logit(fixed_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    meta = kylist(fake_data, fake_labels, L2_reg)
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                       parser, callback=callback)
    cur_primal_results['weights'] = getval(W_opt).copy()
    cur_primal_results['learning_curve'] = getval(learning_curve_dict)
    return W_opt, learning_curve_dict
def primal_optimizer(hyperparams_vect, meta_epoch):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, meta_epoch, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

    cur_hyperparams = hyperparams.new_vect(hyperparams_vect)
    rs = RandomState((seed, meta_epoch))
    # Randomly initialize weights
    W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    # Init regularization term
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    # Set step sizes
    alphas = np.exp(cur_hyperparams['log_alphas'])
    # Momentum terms
    betas = logit(cur_hyperparams['invlogit_betas'])
    # Train model
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser)
    cur_primal_results['weights'] = getval(W_opt).copy()
    return W_opt
def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
    RS = RandomState((seed, i_hyper, i_primal))
    idxs = RS.permutation(N_train)[:batch_size]
    minibatch = dictslice(train_data, idxs)
    loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
    if verbose and i_primal % 10 == 0:
        print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
    return loss
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data["X"][idxs], train_data["T"][idxs], L2_vect)

    learning_curve_dict = defaultdict(list)
    def callback(x, v, g, i_iter):
        if i_iter % thin == 0:
            learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data))
            learning_curve_dict["grad_norm"].append(np.linalg.norm(g))
            learning_curve_dict["weight_norm"].append(np.linalg.norm(x))
            learning_curve_dict["velocity_norm"].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"]))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(cur_hyperparams["log_alphas"])
    betas = logit(cur_hyperparams["invlogit_betas"])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"]))
    W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
    # callback(W_opt, N_iters)
    return W_opt, learning_curve_dict
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

    learning_curve_dict = defaultdict(list)
    def callback(x, v, g, i_iter):
        if i_iter % thin == 0:
            learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                       parser, callback=callback)
    return W_opt, learning_curve_dict
def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
    RS = RandomState((seed, i_hyper, "hyperloss"))
    alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
    N_train = alphabet['X'].shape[0] - N_valid_dpts
    train_data = dictslice(alphabet, slice(None, N_train))
    if report_train_loss:
        valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
    else:
        valid_data = dictslice(alphabet, slice(N_train, None))

    def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
        RS = RandomState((seed, i_hyper, i_primal))
        idxs = RS.permutation(N_train)[:batch_size]
        minibatch = dictslice(train_data, idxs)
        loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
        if verbose and i_primal % 10 == 0:
            print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
        return loss

    W0 = RS.randn(N_weights) * initialization_scale
    W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
    return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)
def primal_optimizer(hyperparam_vect, i_hyper):
    def indexed_loss_fun(w, L2_vect, i_iter):
        rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
        idxs = rs.randint(N_train, size=batch_size)
        return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

    learning_curve_dict = defaultdict(list)
    def callback(x, v, g, i_iter):
        if i_iter % thin == 0 or i_iter == N_iters or i_iter == 0:
            learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
            learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
            learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
            learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))
            learning_curve_dict['iteration'].append(i_iter + 1)
            print "iteration", i_iter

    cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
    rs = RandomState((seed, i_hyper))
    W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
    W0 *= rs.randn(W0.size)
    alphas = np.exp(cur_hyperparams['log_alphas'])
    betas = logit(cur_hyperparams['invlogit_betas'])
    L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
    W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                       parser, callback=callback)
    return W_opt, learning_curve_dict
def indexed_loss_fun(w, L2_vect, i_iter):
    rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
    idxs = rs.randint(N_train, size=batch_size)
    return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)
def hyperloss2(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data):
    RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
    z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
    z_vect_final = train_z2(cur_train_data, z_vect_0, transform)
    w_vect_final = transform_weights(z_vect_final, transform)
    return loss_fun(w_vect_final, **cur_valid_data)
def run(superparams):
    alpha, log_scale_init, offset_init_std = superparams
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale'] = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 30 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)

    results = defaultdict(list)
    def record_results(hyperparam_vect, i_hyper, g):
        # print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        new_seed = RS.int32()
        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, new_seed, alphabets=alphabets,
                                      verbose=False, report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])

        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))

    record_results(hyperparams_0.vect, 0, None)
    return [results['train_loss'][0], results['valid_loss'][0]]
def record_results(hyperparam_vect, i_hyper, g):
    # print "Meta iter {0}. Recording results".format(i_hyper)
    RS = RandomState((seed, i_hyper, "evaluation"))
    new_seed = RS.int32()
    def loss_fun(alphabets, report_train_loss):
        return np.mean([hyperloss(hyperparam_vect, new_seed, alphabets=alphabets,
                                  verbose=False, report_train_loss=report_train_loss)
                        for i in range(N_alphabets_eval)])

    cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
    if i_hyper % N_hyper_thin == 0:
        # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
        for field in cur_hyperparams.names:
            results[field].append(cur_hyperparams[field].astype(np.float16))
    results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
    results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))
def record_results(hyperparam_vect, i_hyper, g):
    # print "Meta iter {0}. Recording results".format(i_hyper)
    RS = RandomState((seed, i_hyper, "evaluation"))
    new_seed = RS.int32()

    def loss_fun(alphabets, report_train_loss):
        return np.mean(
            [
                hyperloss(
                    hyperparam_vect,
                    new_seed,
                    alphabets=alphabets,
                    verbose=False,
                    report_train_loss=report_train_loss,
                )
                for i in range(N_alphabets_eval)
            ]
        )

    cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
    if i_hyper % N_hyper_thin == 0:
        # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
        for field in cur_hyperparams.names:
            results[field].append(cur_hyperparams[field].astype(np.float16))
    results["train_loss"].append(loss_fun(train_alphabets, report_train_loss=True))
    results["valid_loss"].append(loss_fun(train_alphabets, report_train_loss=False))
def primal_loss(z_vect, transform, i_primal, record_results=False): RS = RandomState((seed, i_primal, "primal")) idxs = RS.randint(N_data, size=batch_size) minibatch = dictslice(data, idxs) w_vect = transform_weights(z_vect, transform) loss = loss_fun(w_vect, **minibatch) reg = regularization(z_vect) return loss + reg
def primal_loss(w_vect, reg, i_primal, record_results=False):
    RS = RandomState((seed, i_primal, "primal"))
    idxs = RS.randint(N_data, size=batch_size)
    minibatch = dictslice(data, idxs)
    loss = loss_fun(w_vect, **minibatch)
    reg = regularization(w_vect, reg)
    if record_results and i_primal % N_thin == 0:
        print "Iter {0}: train: {1}".format(i_primal, getval(loss))
    return loss + reg
def loss_fun(alphabets, report_train_loss):
    RS = RandomState((seed, "evaluation"))  # Same alphabet with i_hyper now
    return np.mean([hyperloss(hyperparam_vect, RS.int32(), alphabets=alphabets,
                              verbose=False, report_train_loss=report_train_loss)
                    for i in range(N_alphabets_eval)])
def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad):
    RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
    z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
    z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad)
    w_vect_final = transform_weights(z_vect_final, transform)
    # TODO: print/store losses and error rates here
    print "Training loss (unregularized) = " + str(getval(loss_fun(w_vect_final, **cur_train_data)))
    print "Validation loss = " + str(getval(loss_fun(w_vect_final, **cur_valid_data)))
    print "Test loss = " + str(getval(loss_fun(w_vect_final, **tests_data)))
    print "Training error = " + str(getval(frac_err(w_vect_final, **cur_train_data)))
    print "Validation error = " + str(getval(frac_err(w_vect_final, **cur_valid_data)))
    print "Test error = " + str(getval(frac_err(w_vect_final, **tests_data)))
    return loss_fun(w_vect_final, **cur_valid_data)
def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data):
    RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
    z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
    z_vect_final = train_z(cur_train_data, z_vect_0, transform)
    w_vect_final = transform_weights(z_vect_final, transform)
    # TODO: initial scale AND regularization

    train_loss = getval(loss_fun(w_vect_final, **cur_train_data))
    print "Training loss (unregularized) = " + str(train_loss)
    all_train_loss.append(train_loss)
    valid_loss = getval(loss_fun(w_vect_final, **cur_valid_data))
    print "Validation loss = " + str(valid_loss)
    all_valid_loss.append(valid_loss)
    tests_loss = getval(loss_fun(w_vect_final, **cur_tests_data))
    print "Test loss = " + str(tests_loss)
    all_tests_loss.append(tests_loss)

    plt.plot(all_train_loss, label="training loss (unregularized)")
    plt.plot(all_valid_loss, label="validation loss")
    plt.plot(all_tests_loss, label="test loss")
    plt.title("loss vs meta iteration")
    plt.xlabel("meta iteration")
    plt.ylabel("loss")
    plt.legend()
    plt.savefig("loss2000_corrected.png")
    plt.clf()

    train_rate = getval(frac_err(w_vect_final, **cur_train_data))
    print "Training error rate = " + str(train_rate)
    all_train_rates.append(train_rate)
    valid_rate = getval(frac_err(w_vect_final, **cur_valid_data))
    print "Validation error rate = " + str(valid_rate)
    all_valid_rates.append(valid_rate)
    tests_rate = getval(frac_err(w_vect_final, **cur_tests_data))
    print "Test error rate = " + str(tests_rate)
    all_tests_rates.append(tests_rate)

    plt.plot(all_train_rates, label="training error rate")
    plt.plot(all_valid_rates, label="validation error rate")
    plt.plot(all_tests_rates, label="test error rate")
    plt.title("error rate vs meta iteration")
    plt.xlabel("meta iteration")
    plt.ylabel("error rate")
    plt.legend()
    plt.savefig("error2000_corrected.png")
    plt.clf()

    return loss_fun(w_vect_final, **cur_valid_data)
def primal_loss(z_vect, transform, i_primal, record_results=False): RS = RandomState((seed, i_primal, "primal")) idxs = RS.randint(N_data, size=batch_size) minibatch = dictslice(data, idxs) w_vect = transform_weights(z_vect, transform) #TODO: this is a scale transformation, not regularization! loss = loss_fun(w_vect, **minibatch) #use new scale for prediction reg = regularization(z_vect) #regularize original scale #TODO: should be equivalent: w = z*e^transform, so # f(z*e^transform) + e^\lambda||z||^2 = f(w) + e^\lambda||z||^2 = f(w) + e^(\lambda)||e^-2transform w||^2 # see process_transform #if record_results and i_primal % N_thin == 0: #print "Iter {0}: train: {1}".format(i_primal, getval(loss)) return loss + reg
def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal, i_script):
    RS = RandomState((seed, i_hyper, i_primal, i_script))
    N_train = train_data[i_script]['X'].shape[0]
    idxs = RS.permutation(N_train)[:batch_size]
    minibatch = dictslice(train_data[i_script], idxs)
    loss = loss_from_latents(z_vect, transform_vect, i_script, minibatch)
    reg = regularization(z_vect) if i_script == 0 else 0.0
    if i_primal % N_thin == 0 and i_script == 0:
        print "Iter {0}, full losses: train: {1}, valid: {2}, reg: {3}".format(
            i_primal,
            total_loss(train_data, getval(z_vect)),
            total_loss(valid_data, getval(z_vect)),
            getval(reg) / N_scripts_per_iter)
    return loss + reg
def train_reg(reg_0, constraint, N_meta_iter, i_top):
    def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)
    hypergrad = grad(hyperloss)

    cur_reg = reg_0
    for i_hyper in range(N_meta_iter):
        if i_hyper % N_meta_thin == 0:
            tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
            all_tests_loss.append(tests_loss)
            all_regs.append(cur_reg.copy())
            print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
            print "Cur_reg", np.mean(cur_reg)
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
        raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
        constrained_grad = constrain_reg(raw_grad, constraint)
        print constrained_grad
        # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
        cur_reg -= constrained_grad * meta_alpha
    return cur_reg
def train_reg(reg_0, constraint, N_meta_iter, i_top):
    def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
        z_vect_final = train_z(cur_train_data, z_vect_0, transform)
        w_vect_final = transform_weights(z_vect_final, transform)
        return loss_fun(w_vect_final, **cur_valid_data)
    hypergrad = grad(hyperloss)

    def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
        z_vect_final = train_z(cur_train_data, z_vect_0, transform)
        w_vect_final = transform_weights(z_vect_final, transform)
        return frac_err(w_vect_final, **cur_valid_data)

    cur_reg = reg_0
    for i_hyper in range(N_meta_iter):
        if i_hyper % N_meta_thin == 0:
            test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
            all_tests_rates.append(test_rate)
            all_transforms.append(cur_reg.copy())
            all_avg_regs.append(np.mean(cur_reg))
            print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
            print "Cur_transform", np.mean(cur_reg)
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
        raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
        constrained_grad = constrain_reg(raw_grad, constraint)
        cur_reg -= np.sign(constrained_grad) * meta_alpha
    return cur_reg
def plot(): import matplotlib.pyplot as plt import matplotlib as mpl mpl.rcParams['font.family'] = 'serif' mpl.rcParams['image.interpolation'] = 'none' with open('results.pkl') as f: transform_parser, transform_vects, train_losses, tests_losses = pickle.load(f) RS = RandomState((seed, "plotting")) fig = plt.figure(0) fig.clf() ax = fig.add_subplot(111) omniglot.show_alphabets(omniglot.load_rotated_alphabets(RS, normalize=False, angle=90), ax=ax) ax.plot([0, 20 * 28], [5 * 28, 5 * 28], '--k') ax.text(-15, 5 * 28 * 3 / 2 - 60, "Rotated alphabets", rotation='vertical') plt.savefig("all_alphabets.png") # Plotting transformations names = ['no_sharing', 'full_sharing', 'learned_sharing'] title_strings = {'no_sharing' : 'Independent nets', 'full_sharing' : 'Shared bottom layer', 'learned_sharing' : 'Learned sharing'} covar_imgs = {name : build_covar_image(transform_vects[name]) for name in names} for i, name in enumerate(names): fig = plt.figure(0) fig.clf() fig.set_size_inches((2, 6)) ax = fig.add_subplot(111) ax.matshow(covar_imgs[name], cmap = mpl.cm.binary) ax.set_xticks([]) ax.set_yticks([]) plt.savefig('learned_corr_{0}.png'.format(i)) plt.savefig('learned_corr_{0}.pdf'.format(i))
def primal_stochastic_loss(z_vect, transform_vect, i_primal):
    RS = RandomState((seed, i_hyper, i_primal))
    loss = 0.0
    for _ in range(N_scripts_per_iter):
        i_script = RS.randint(N_scripts)
        N_train = train_data[i_script]['X'].shape[0]
        idxs = RS.permutation(N_train)[:batch_size]
        minibatch = dictslice(train_data[i_script], idxs)
        loss += loss_from_latents(z_vect, transform_vect, i_script, minibatch)
    reg = regularization(z_vect)
    if i_primal % 1 == 0:
        print "Iter {0}, loss {1}, reg {2}".format(i_primal, getval(loss), getval(reg))
        print "Full losses: train: {0}, valid: {1}".format(
            total_loss(train_data, getval(z_vect)),
            total_loss(valid_data, getval(z_vect)))
    return loss + reg
def train_reg(transform_0, constraint, N_meta_iter, i_top):
    def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
        z_vect_final = train_z(cur_train_data, z_vect_0, transform)
        w_vect_final = transform_weights(z_vect_final, transform)
        return loss_fun(w_vect_final, **cur_valid_data)
    hypergrad = grad(hyperloss)

    cur_transform = transform_0
    for i_hyper in range(N_meta_iter):
        if i_hyper % N_meta_thin == 0:
            tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data)
            all_tests_loss.append(tests_loss)
            all_transforms.append(cur_transform.copy())
            print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
        raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
        constrained_grad = constrain_transform(raw_grad, constraint)
        cur_transform -= constrained_grad * meta_alpha
    return cur_transform
def primal_stochastic_loss(z_vect, transform_vect, i_primal):
    RS = RandomState((seed, i_hyper, i_primal))
    loss = 0.0
    for _ in range(N_scripts_per_iter):
        i_script = RS.randint(N_scripts)
        N_train = train_data[i_script]['X'].shape[0]
        idxs = RS.permutation(N_train)[:batch_size]
        minibatch = dictslice(train_data[i_script], idxs)
        loss += loss_from_latents(z_vect, transform_vect, i_script, minibatch)
    reg = regularization(z_vect)
    if i_primal % 20 == 0:
        print "Iter {0}, loss {1}, reg {2}".format(i_primal, getval(loss), getval(reg))
        print "Full losses: train: {0}, valid: {1}".format(
            total_loss(train_data, getval(z_vect)),
            total_loss(valid_data, getval(z_vect)))
    return loss + reg
def train_reg(reg_0, constraint, N_meta_iter, i_top, exact_metagrad):
    def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
        z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad)
        w_vect_final = transform_weights(z_vect_final, transform)
        # TODO: print/store losses and error rates here
        print "Training loss (unregularized) = " + str(getval(loss_fun(w_vect_final, **cur_train_data)))
        print "Validation loss = " + str(getval(loss_fun(w_vect_final, **cur_valid_data)))
        print "Test loss = " + str(getval(loss_fun(w_vect_final, **tests_data)))
        print "Training error = " + str(getval(frac_err(w_vect_final, **cur_train_data)))
        print "Validation error = " + str(getval(frac_err(w_vect_final, **cur_valid_data)))
        print "Test error = " + str(getval(frac_err(w_vect_final, **tests_data)))
        return loss_fun(w_vect_final, **cur_valid_data)
    hypergrad = grad(hyperloss)  # No chain rule here

    '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
        z_vect_final = train_z(cur_train_data, z_vect_0, transform)  # TODO: recomputing path?
        w_vect_final = transform_weights(z_vect_final, transform)
        return frac_err(w_vect_final, **cur_valid_data)'''

    cur_reg = reg_0
    for i_hyper in range(N_meta_iter):
        print "Hyper iter " + str(i_hyper)
        """if i_hyper % N_meta_thin == 0:
            test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
            all_tests_rates.append(test_rate)
            all_transforms.append(cur_reg.copy())
            all_avg_regs.append(np.mean(cur_reg))
            print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
            print "Cur_transform", np.mean(cur_reg)"""
        RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
        # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])  # cur_train_data, cur_valid_data
        # raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
        cur_train_data, cur_valid_data = random_partition(train_data, RS, [N_train - N_valid, N_valid])
        raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data, exact_metagrad)
        # print "before constraining grad"
        constrained_grad = constrain_reg(raw_grad, constraint)  # TODO: can put exact hypergradient here, using constraint
        # print "after constraining grad, before constraining exact"
        # TODO: DrMAD norm matches after constraining, but not exact norm?? Why???
        # This one is about 4x larger than constrained one
        print np.linalg.norm(raw_grad)
        print np.linalg.norm(exact_metagrad[0])
        constrained_exact_grad = constrain_reg(exact_metagrad[0], constraint)
        # print "after constraining exact"
        # TODO: compute statistics
        # TODO: sometimes negative???
        print("cosine of angle between DrMAD and exact = " +
              str(np.dot(constrained_grad, constrained_exact_grad) /
                  (np.linalg.norm(constrained_grad) * np.linalg.norm(constrained_exact_grad))))
        print("cosine of angle between signs of DrMAD and exact = " +
              str(np.dot(np.sign(constrained_grad), np.sign(constrained_exact_grad)) / len(constrained_grad)))
        print("DrMAD norm = " + str(np.linalg.norm(constrained_grad)))
        print("Exact norm = " + str(np.linalg.norm(constrained_exact_grad)))
        cur_reg -= np.sign(constrained_grad) * meta_alpha  # TODO: signs of gradient...
        # TODO: momentum
    return cur_reg
def hyperloss(transform_vect, i_hyper):
    RS = RandomState((seed, i_hyper, "hyperloss"))
    cur_train_data, cur_valid_data = random_partition(train_data, RS, [10, 2])
    z_vect_final = train_z(cur_train_data, transform_vect, RS)
    w_vect_final = transform_weights(z_vect_final, transform_vect)
    return likelihood_loss(w_vect_final, cur_valid_data) / N_scripts
def plot():
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams['font.family'] = 'serif'
    mpl.rcParams['image.interpolation'] = 'none'

    with open('results.pkl') as f:
        transform_parser, transform_vects, train_losses, tests_losses = pickle.load(f)

    RS = RandomState((seed, "plotting"))
    fig = plt.figure(0)
    fig.clf()
    ax = fig.add_subplot(111)
    alphabets = omniglot.load_rotated_alphabets(RS, normalize=False, angle=90)
    num_cols = 15
    num_rows = 5
    omniglot.show_alphabets(alphabets, ax=ax, n_cols=num_cols)
    ax.plot([0, num_cols * 28], [num_rows * 28, num_rows * 28], '--k')
    # ax.text(-15, 5 * 28 * 3 / 2 - 60, "Rotated alphabets", rotation='vertical')
    plt.savefig("all_alphabets.png", bbox_inches='tight')

    # Plotting transformations
    names = ['no_sharing', 'full_sharing', 'learned_sharing']
    title_strings = {'no_sharing': 'Independent nets',
                     'full_sharing': 'Shared bottom layer',
                     'learned_sharing': 'Learned sharing'}
    covar_imgs = {name: build_covar_image(transform_vects[name]) for name in names}

    for model_ix, model_name in enumerate(names):
        image_list = covar_imgs[model_name]
        for layer_ix, image in enumerate(image_list):
            fig = plt.figure(0)
            fig.clf()
            fig.set_size_inches((1, 1))
            ax = fig.add_subplot(111)
            ax.matshow(image, cmap=mpl.cm.binary, vmin=0.0, vmax=1.0)
            ax.set_xticks([])
            ax.set_yticks([])
            plt.savefig('minifigs/learned_corr_{0}_{1}.png'.format(model_name, layer_ix),
                        bbox_inches='tight')
            plt.savefig('minifigs/learned_corr_{0}_{1}.pdf'.format(model_name, layer_ix),
                        bbox_inches='tight')

    # Write results to a nice latex table for paper.
    with open('results_table.tex', 'w') as f:
        f.write(" & No Sharing & Full Sharing & Learned \\\\\n")
        f.write("Training loss & {:2.2f} & {:2.2f} & {:2.2f} \\\\\n".format(
            train_losses['no_sharing'], train_losses['full_sharing'], train_losses['learned_sharing']))
        f.write("Test loss & {:2.2f} & {:2.2f} & \\bf {:2.2f} ".format(
            tests_losses['no_sharing'], tests_losses['full_sharing'], tests_losses['learned_sharing']))
def primal_loss(z_vect, transform_vect, i_primal, record_results):
    RS = RandomState((seed, i_hyper, i_primal, i_script))
    w_vect = transform_weights(z_vect, transform_vect)
    loss = total_loss(w_vect, train_data)
    reg = regularization(z_vect)
    if VERBOSE and record_results and i_primal % N_thin == 0:
        print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format(
            i_primal,
            getval(loss) / N_scripts,
            total_loss(getval(w_vect), valid_data) / N_scripts,
            getval(reg))
    return loss + reg
def hyperloss(transform_vect, i_hyper, record_results=True): RS = RandomState((seed, i_hyper, "hyperloss")) def primal_loss(z_vect, transform_vect, i_primal, record_results): RS = RandomState((seed, i_hyper, i_primal, i_script)) w_vect = transform_weights(z_vect, transform_vect) loss = total_loss(w_vect, train_data) reg = regularization(z_vect) if VERBOSE and record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format( i_primal, getval(loss) / N_scripts, total_loss(getval(w_vect), valid_data) / N_scripts, getval(reg) ) return loss + reg z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale) z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters, callback=None) w_vect_final = transform_weights(z_vect_final, transform_vect) valid_loss = total_loss(w_vect_final, valid_data) if record_results: results["valid_loss"].append(getval(valid_loss) / N_scripts) results["train_loss"].append(total_loss(w_vect_final, train_data) / N_scripts) return valid_loss
def show_alphabets(alphabets, ax=None, n_cols=20):
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from nn_utils import plot_images
    seed = 1
    n_rows = len(alphabets)
    full_image = np.zeros((0, n_cols * 28))
    for alphabet in alphabets:
        RS = RandomState(seed)
        char_idxs = RS.randint(alphabet['X'].shape[0], size=n_cols)
        char_ids = np.argmax(alphabet['T'][char_idxs], axis=1)
        image = alphabet['X'][char_idxs].reshape((n_cols, 28, 28))
        image = np.transpose(image, axes=[1, 0, 2]).reshape((28, n_cols * 28))
        full_image = np.concatenate((full_image, image))

    if ax is None:
        fig = plt.figure()
        fig.set_size_inches((8, 8 * n_rows / n_cols))
        ax = fig.add_subplot(111)

    ax.imshow(full_image, cmap=mpl.cm.binary)
    ax.set_xticks(np.array([]))
    ax.set_yticks(np.array([]))
    plt.tight_layout()
    plt.savefig("all_alphabets.png")
def hyperloss(transform_vect, i_hyper, record_results=True): RS = RandomState((seed, i_hyper, "hyperloss")) def primal_loss(z_vect, transform_vect, i_primal, record_results=False): w_vect = transform_weights(z_vect, transform_vect) loss = total_loss(w_vect, train_data) reg = regularization(z_vect) if VERBOSE and record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format( i_primal, getval(loss) / N_scripts, total_loss(getval(w_vect), valid_data) / N_scripts, getval(reg)) return loss + reg z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale) z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters, callback=None) w_vect_final = transform_weights(z_vect_final, transform_vect) valid_loss = total_loss(w_vect_final, valid_data) if record_results: results['valid_loss'].append(getval(valid_loss) / N_scripts) results['train_loss'].append(total_loss(w_vect_final, train_data) / N_scripts) results['tests_loss'].append(total_loss(w_vect_final, tests_data) / N_scripts) return valid_loss
def plot():
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams['font.family'] = 'serif'

    with open('results.pkl') as f:
        transform_parser, transform_vects, train_losses, tests_losses = pickle.load(f)

    RS = RandomState((seed, "top_rs"))
    omniglot.show_alphabets(omniglot.load_flipped_alphabets(RS, normalize=False))

    # Plotting transformations
    names = ['no_sharing', 'full_sharing', 'learned_sharing']
    title_strings = {'no_sharing'      : 'Independent\nnets',
                     'full_sharing'    : 'Shared\nbottom layer',
                     'learned_sharing' : 'Learned\nsharing'}
    covar_imgs = {name : build_covar_image(transform_vects[name]) for name in names}
    prop = {'family': 'serif', 'size': '12'}

    fig = plt.figure(0)
    fig.clf()
    fig.set_size_inches((4, 4))
    for i, name in enumerate(names):
        ax = fig.add_subplot(1, 3, i + 1)
        ax.matshow(covar_imgs[name], cmap=mpl.cm.binary)
        ax.set_title(title_strings[name])
        ax.set_xticks([])
        ax.set_yticks([])
        if i == 0:
            labels = ["Layer {0}".format(layer) for layer in [3, 2, 1]]
            ypos = [5, 15, 25]
            for s, y in zip(labels, ypos):
                ax.text(-3, y, s, rotation='vertical')
    plt.tight_layout()
    plt.savefig('learned_corr.png')
    plt.savefig('learned_corr.pdf')
def run(): RS = RandomState((seed, "top_rs")) all_data = mnist.load_data_as_dict() train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests]) w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) N_weights = w_parser.vect.size def transform_weights(z_vect, transform): return z_vect * np.exp(transform) def regularization(z_vect): return np.dot(z_vect, z_vect) * np.exp(log_L2) def constrain_transform(t_vect, name): all_t = w_parser.new_vect(t_vect) for i in range(N_layers): all_t[('biases', i)] = 0.0 if name == 'universal': t_mean = np.mean([np.mean(all_t[('weights', i)]) for i in range(N_layers)]) for i in range(N_layers): all_t[('weights', i)] = t_mean elif name == 'layers': for i in range(N_layers): all_t[('weights', i)] = np.mean(all_t[('weights', i)]) elif name == 'units': for i in range(N_layers): all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True) else: raise Exception return all_t.vect def process_transform(t_vect): # Remove the redundancy due to sharing transformations within units all_t = w_parser.new_vect(t_vect) new_t = np.zeros((0,)) for i in range(N_layers): layer = all_t[('weights', i)] assert np.all(layer[:, 0] == layer[:, 1]) cur_t = log_L2 - 2 * layer[:, 0] new_t = np.concatenate((new_t, cur_t)) return new_t def train_z(data, z_vect_0, transform): N_data = data['X'].shape[0] def primal_loss(z_vect, transform, i_primal, record_results=False): RS = RandomState((seed, i_primal, "primal")) idxs = RS.randint(N_data, size=batch_size) minibatch = dictslice(data, idxs) w_vect = transform_weights(z_vect, transform) loss = loss_fun(w_vect, **minibatch) reg = regularization(z_vect) if record_results and i_primal % N_thin == 0: print "Iter {0}: train: {1}".format(i_primal, getval(loss)) return loss + reg return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters) all_transforms, all_tests_loss = [], [] def train_reg(transform_0, constraint, N_meta_iter, i_top): def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data): RS = RandomState((seed, i_top, i_hyper, "hyperloss")) z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale) z_vect_final = train_z(cur_train_data, z_vect_0, transform) w_vect_final = transform_weights(z_vect_final, transform) return loss_fun(w_vect_final, **cur_valid_data) hypergrad = grad(hyperloss) cur_transform = transform_0 for i_hyper in range(N_meta_iter): if i_hyper % N_meta_thin == 0: tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data) all_tests_loss.append(tests_loss) all_transforms.append(cur_transform.copy()) print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1]) RS = RandomState((seed, i_top, i_hyper, "hyperloss")) cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) raw_grad = hypergrad(cur_transform, i_hyper, *cur_split) constrained_grad = constrain_transform(raw_grad, constraint) cur_transform -= constrained_grad * meta_alpha return cur_transform transform = np.zeros(N_weights) constraints = ['universal', 'layers', 'units'] for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)): print "Top level iter {0}".format(i_top) transform = train_reg(transform, constraint, N_meta_iter, i_top) all_L2_regs = np.array(zip(*map(process_transform, all_transforms))) return all_L2_regs, all_tests_loss
def indexed_loss_fun(w, L2_vect, i_iter):
    rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
    idxs = rs.randint(N_train, size=batch_size)
    return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)
def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
    RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
    z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
    z_vect_final = train_z(cur_train_data, z_vect_0, transform)
    w_vect_final = transform_weights(z_vect_final, transform)
    return loss_fun(w_vect_final, **cur_valid_data)
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    N_weights = len(parser.vect)
    hyperparams = VectorParser()
    rs = RandomState((seed))
    hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg) \
        + rs.randn(N_weights) * init_log_L2_reg_noise
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)

    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = np.exp(cur_hyperparams['log_L2_reg'])
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        #x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field] = cur_hyperparams[field]
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
                  i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
                  meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, parsed_init_hypergrad
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}

    loss_meta_parser = VectorParser()
    # loss_meta_parser['']  # incomplete leftover; indexing an empty parser would raise, so it is left commented out

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                # learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        #meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print metagrad
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
                  i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
                  meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
    RS = RandomState((seed, i_hyper, "hyperloss"))
    w_vect_0 = RS.randn(N_weights) * init_scales
    w_vect_final = train_z(cur_train_data, w_vect_0, reg)
    return loss_fun(w_vect_final, **cur_valid_data)
def loss_fun(alphabets, report_train_loss):
    RS = RandomState((seed, "evaluation"))  # Same alphabet with i_hyper now
    return np.mean([hyperloss(hyperparam_vect, RS.int32(), alphabets=alphabets,
                              verbose=False, report_train_loss=report_train_loss)
                    for i in range(N_alphabets_eval)])
def run(script_corr):
    """Three different parsers:
    w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single script
    script_parser[i_script]       : weights vector for each script
    transform_parser[i_layer]     : transform matrix (scripts x scripts) for each alphabet"""
    RS = RandomState((seed, "top_rs"))
    train_data, valid_data, tests_data = omniglot.load_data_split([11, 2, 2], RS, num_alphabets=N_scripts)
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    uncorrelated_mat = np.eye(N_scripts)
    fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    transform_mat = (1 - script_corr) * uncorrelated_mat + script_corr * fully_correlated_mat
    transform_parser = VectorParser()
    for i_layer in range(N_layers):
        if i_layer == N_layers - 1:
            transform_parser[i_layer] = uncorrelated_mat
        else:
            transform_parser[i_layer] = transform_mat

    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)

    def transform_weights(all_z_vect, transform_vect, i_script_out):
        all_z = script_parser.new_vect(all_z_vect)
        transform = transform_parser.new_vect(transform_vect)
        # Can't use parser because setting plain array ranges with funkyyak nodes not yet supported
        W = OrderedDict()
        for k in w_parser.idxs_and_shapes.keys():
            W[k] = 0.0
        for i_layer in range(N_layers):
            script_weightings = transform[i_layer][i_script_out, :]
            for i_script in range(N_scripts):
                z_i_script = w_parser.new_vect(all_z[i_script])
                script_weighting = script_weightings[i_script]
                W[('biases', i_layer)] += z_i_script[('biases', i_layer)] * script_weighting
                W[('weights', i_layer)] += z_i_script[('weights', i_layer)] * script_weighting
        return np.concatenate([v.ravel() for v in W.values()])

    def loss_from_latents(z_vect, transform_vect, i_script, data):
        w_vect = transform_weights(z_vect, transform_vect, i_script)
        return loss_fun(w_vect, **data)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2_init)

    results = defaultdict(list)
    def hyperloss(transform_vect, i_hyper, record_results=False):
        def primal_stochastic_loss(z_vect, transform_vect, i_primal):
            RS = RandomState((seed, i_hyper, i_primal))
            loss = 0.0
            for _ in range(N_scripts_per_iter):
                i_script = RS.randint(N_scripts)
                N_train = train_data[i_script]['X'].shape[0]
                idxs = RS.permutation(N_train)[:batch_size]
                minibatch = dictslice(train_data[i_script], idxs)
                loss += loss_from_latents(z_vect, transform_vect, i_script, minibatch)
            reg = regularization(z_vect)
            if i_primal % 20 == 0:
                print "Iter {0}, loss {1}, reg {2}".format(i_primal, getval(loss), getval(reg))
                print "Full losses: train: {0}, valid: {1}".format(
                    total_loss(train_data, getval(z_vect)),
                    total_loss(valid_data, getval(z_vect)))
            return loss + reg

        def total_loss(data, z_vect):
            return np.mean([loss_from_latents(z_vect, transform_vect, i_script, data[i_script])
                            for i_script in range(N_scripts)])

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(primal_stochastic_loss), transform_vect, z_vect_0,
                           alpha, beta, N_iters, callback=None)
        valid_loss = total_loss(valid_data, z_vect_final)
        if record_results:
            results['valid_loss'].append(valid_loss)
            results['train_loss'].append(total_loss(train_data, z_vect_final))
            # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
        return valid_loss

    hyperloss(transform_parser.vect, 0, record_results=True)
    return results['train_loss'][-1], results['valid_loss'][-1]
def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
    RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
    w_vect_0 = RS.randn(N_weights) * init_scales
    w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg)
    # fraction_error = frac_err(w_vect_final, **cur_valid_data)
    return loss_fun(w_vect_final, **cur_valid_data)