def test_rms_prop():
    N_weights = 5
    W0 = 0.1 * npr.randn(N_weights)
    (loss_fun, true_argmin) = make_optimization_problem(N_weights)
    x_min = rms_prop(grad(loss_fun), W0)
    assert np.allclose(x_min, true_argmin, rtol=1e-3, atol=1e-4), \
        "Diffs are: {0}".format(x_min - true_argmin)
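# A point of reference for the test above and the scripts below: a minimal,
# self-contained sketch of the rms_prop interface they appear to assume.
# This is NOT the repo's implementation; the signature, the defaults, and
# the convention of passing an iteration index to the gradient function and
# callback are assumptions read off the call sites (which are not all
# identical across these scripts).
import numpy as np

def rms_prop_sketch(grad_fun, x, callback=None, num_iters=100,
                    step_size=0.1, gamma=0.9, eps=1e-8):
    avg_sq_grad = np.ones(x.size)  # running average of squared gradients
    for i in range(num_iters):
        g = grad_fun(x, i)
        avg_sq_grad = gamma * avg_sq_grad + (1.0 - gamma) * g**2
        if callback:
            callback(x, i, g)
        x = x - step_size * g / (np.sqrt(avg_sq_grad) + eps)
    return x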
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            seed = i_hyper * 10**6 + i_iter
            idxs = npr.RandomState(seed).randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []
        def callback(x, v, g, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper, g):
        print "Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
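# All hyperparameters above live in an unconstrained parameterization so that
# meta-optimization can move freely over R^n: learning rates are stored as
# log_alphas (positive after exp) and momentum decays as invlogit_betas
# (squashed into (0, 1)).  Judging from how it is used here, this codebase's
# `logit` acts as the logistic sigmoid and `d_logit` as its derivative --
# an assumption about naming worth flagging, since "logit" conventionally
# denotes the inverse of the sigmoid.  Sketches of the assumed maps:
import numpy as np

def logit_sketch(x):      # R -> (0, 1), as needed for momentum decays
    return 1.0 / (1.0 + np.exp(-x))

def inv_logit_sketch(y):  # (0, 1) -> R, the usual "logit"
    return -np.log(1.0 / y - 1.0)

def d_logit_sketch(x):    # derivative of the squashing map
    s = logit_sketch(x)
    return s * (1.0 - s)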
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    #fixed_hyperparams = VectorParser()
    #fixed_hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = npr.RandomState(seed).randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []
        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        # return loss_fun(W_opt, **valid_data)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, gamma=0.0)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
def run():
    N_iters = N_epochs
    parser, loss_fun = make_toy_funs()
    N_weight_types = len(parser.names)
    N_weights = parser.vect.size
    hyperparams = VectorParser()
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    hyperparams['V0'] = np.full(N_weights, init_V0)

    all_learning_curves = []
    all_param_curves = []
    all_x = []
    def hyperloss_grad(hyperparam_vect, ii):
        learning_curve = []
        params_curve = []
        def callback(x, i):
            params_curve.append(x)
            learning_curve.append(loss_fun(x))
        def indexed_loss_fun(w, log_L2_reg, j):
            return loss_fun(w)

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = np.ones(N_weights) * init_param_scale
        V0 = cur_hyperparams['V0']
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = 0.0
        results = sgd3(indexed_loss_fun, loss_fun, W0, V0, alphas, betas,
                       log_L2_reg, callback=callback)
        hypergrads = hyperparams.copy()
        hypergrads['V0'] = results['dMd_v']
        hypergrads['log_alphas'] = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (results['dMd_betas'] *
                                        d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        all_param_curves.append(params_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num']
    meta_results = {field: [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i):
        if i % N_meta_thin == 0:
            print "Meta iter {0}".format(i)
            x = all_x[-1]
            cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
            for field in cur_hyperparams.names:
                meta_results[field].append(cur_hyperparams[field])
            meta_results['train_loss'].append(loss_fun(x))
            meta_results['iter_num'].append(i)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, meta_gamma)
    meta_results['all_learning_curves'] = all_learning_curves
    meta_results['all_param_curves'] = all_param_curves
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
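# sgd3 above returns gradients with respect to the constrained quantities
# (alphas, betas); the hypergrads block converts them to gradients with
# respect to the unconstrained storage via the chain rule:
#     d loss / d log_alpha     = dMd_alphas * alphas
#     d loss / d invlogit_beta = dMd_betas * d_logit(invlogit_beta)
# A quick finite-difference check of the first identity, with an
# illustrative one-dimensional loss (names here are hypothetical):
import numpy as np

def check_log_chain_rule(loss=lambda a: (a - 0.3)**2, log_alpha=-1.0, h=1e-6):
    alpha = np.exp(log_alpha)
    d_alpha = (loss(alpha + h) - loss(alpha - h)) / (2 * h)
    d_log_alpha = (loss(np.exp(log_alpha + h)) -
                   loss(np.exp(log_alpha - h))) / (2 * h)
    assert np.allclose(d_log_alpha, d_alpha * alpha, rtol=1e-4)
    return d_log_alpha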
def run():
    parser, loss_fun = make_parabola(dimension)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def primal_optimizer(hyperparam_vect, i_hyper):
        learning_curve = []
        def callback(x, i_iter):
            learning_curve.append(loss_fun(x))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(hash(i_hyper)).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        callback(W_opt, N_iters)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, gamma=0.0)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
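# These experiment scripts read their settings from module-level globals.
# An illustrative driver for the parabola experiment, assuming the script
# stands alone; every value below is a made-up example, not a setting taken
# from the original experiments:
if __name__ == '__main__':
    dimension            = 3
    N_iters              = 50
    N_meta_iter          = 40
    init_log_L2_reg      = -4.0
    init_log_param_scale = -2.0
    init_log_alphas      = -1.0
    init_invlogit_betas  = 2.0
    meta_alpha           = 0.05
    meta_results, parser = run()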
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = npr.RandomState(seed).randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % N_batches == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        #callback(W_opt, N_iters)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, gamma=0.0)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
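# fill_parser, used above to build W0 and L2_reg, broadcasts one scalar per
# named weight block into a full-length weight vector.  A minimal sketch of
# the assumed behavior (it matches the per-name expansion spelled out inline
# as `partial_vects` in the scripts below):
import numpy as np

def fill_parser_sketch(parser, values):
    """Return a vector with values[i] repeated across the i-th named block."""
    return np.concatenate([np.full(parser[name].size, values[i])
                           for i, name in enumerate(parser.names)])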
def run():
    (train_images, train_labels),\
    (valid_images, valid_labels),\
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def indexed_loss_fun(w, log_L2_reg, i):
        idxs = batch_idxs[i % len(batch_idxs)]
        partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[j]))
                         for j, name in enumerate(parser.names)]
        L2_reg_vect = np.concatenate(partial_vects, axis=0)
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs],
                        L2_reg=L2_reg_vect)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)
    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)
    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []
    def hyperloss_grad(hyperparam_vect, i):
        learning_curve = []
        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(loss_fun(x, X=train_images, T=train_labels))

        npr.seed(i)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [np.full(parser[name].size,
                                     np.exp(cur_hyperparams['log_param_scale'][j]))
                             for j, name in enumerate(parser.names)]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        results = sgd3(indexed_loss_fun, valid_loss_fun, W0, V0, alphas, betas,
                       log_L2_reg, callback=callback)
        hypergrads = hyperparams.copy()
        hypergrads['log_L2_reg'] = results['dMd_meta']
        weights_grad = parser.new_vect(W0 * results['dMd_x'])
        hypergrads['log_param_scale'] = [np.sum(weights_grad[name])
                                         for name in parser.names]
        hypergrads['log_alphas'] = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (results['dMd_betas'] *
                                        d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss']
    meta_results = {field: [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i):
        print "Meta iter {0}".format(i)
        x = all_x[-1]
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        log_L2_reg = cur_hyperparams['log_L2_reg']
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(train_loss_fun(x))
        meta_results['valid_loss'].append(valid_loss_fun(x))
        meta_results['tests_loss'].append(tests_loss_fun(x))

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
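# Note on the log_param_scale hypergradient above: the weights are
# initialized as W0 = exp(log_scale) * eps with eps ~ N(0, I), so
# dW0 / d log_scale = W0 and the chain rule gives
#     d loss / d log_scale = sum over the block of (d loss / d W0) * W0.
# That is exactly why weights_grad is W0 * results['dMd_x'] and each entry
# of hypergrads['log_param_scale'] is a per-block sum.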
def run():
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale'] = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True,
                  report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 10 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = RS.randn(N_weights) * initialization_scale
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta,
                      N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect,
                            reg_penalty=False)

    results = defaultdict(list)
    def record_results(hyperparam_vect, i_hyper, g):
        print "Meta iter {0}. Recording results".format(i_hyper)
        # RS = RandomState((seed, i_hyper, "evaluation"))
        def loss_fun(alphabets, report_train_loss):
            RS = RandomState((seed, "evaluation"))  # Same alphabet with i_hyper now
            return np.mean([hyperloss(hyperparam_vect, RS.int32(),
                                      alphabets=alphabets, verbose=False,
                                      report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])
        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))
        results['tests_loss'].append(loss_fun(tests_alphabets, report_train_loss=False))
        print "Train:", results['train_loss']
        print "Valid:", results['valid_loss']
        print "Tests:", results['tests_loss']

    train_hyperloss = partial(hyperloss, alphabets=train_alphabets)
    rms_prop(grad(train_hyperloss), hyperparams_0.vect, record_results,
             N_meta_iter, meta_alpha, gamma=0)
    return results
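# The Omniglot scripts seed random streams with tuples such as
# (seed, i_hyper, "hyperloss") so that every random draw on the forward pass
# can be reproduced exactly on the reverse pass.  numpy's RandomState does
# not accept tuple seeds, so RandomState here is presumably a hashing
# wrapper; a minimal sketch of one way to get the same effect (the hashing
# scheme below is an assumption, not the repo's):
import hashlib
import numpy as np
import numpy.random as npr

def seeded_state(*seed_parts):
    digest = hashlib.md5(repr(seed_parts).encode()).hexdigest()
    return npr.RandomState(int(digest[:8], 16))  # 32-bit seed from the hash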
def run():
    (train_images, train_labels),\
    (valid_images, valid_labels),\
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def indexed_loss_fun(w, log_L2_reg, i):
        idxs = batch_idxs[i % len(batch_idxs)]
        partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[j]))
                         for j, name in enumerate(parser.names)]
        L2_reg_vect = np.concatenate(partial_vects, axis=0)
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs],
                        L2_reg=L2_reg_vect)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)
    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)
    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []
    def hyperloss(hyperparam_vect, i):
        learning_curve = []
        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(loss_fun(x, X=train_images, T=train_labels))

        npr.seed(i)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [np.full(parser[name].size,
                                     np.exp(cur_hyperparams['log_param_scale'][j]))
                             for j, name in enumerate(parser.names)]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        W_opt = sgd5(grad(indexed_loss_fun), kylist(W0, alphas, betas, log_L2_reg), callback)
        all_x.append(getval(W_opt))
        all_learning_curves.append(learning_curve)
        return valid_loss_fun(W_opt)
    hyperloss_grad = grad(hyperloss)

    add_fields = ['train_loss', 'valid_loss', 'tests_loss']
    meta_results = {field: [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i):
        x = all_x[-1]
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        log_L2_reg = cur_hyperparams['log_L2_reg']
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(train_loss_fun(x))
        meta_results['valid_loss'].append(valid_loss_fun(x))
        meta_results['tests_loss'].append(tests_loss_fun(x))

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
def run():
    (train_images, train_labels),\
    (valid_images, valid_labels),\
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)
    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)
    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []
    def hyperloss_grad(hyperparam_vect, ii):
        learning_curve = []
        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(loss_fun(x, X=train_images, T=train_labels))

        def indexed_loss_fun(w, log_L2_reg, j):
            # idxs = batch_idxs[i % len(batch_idxs)]
            npr.seed(1000 * ii + j)
            idxs = npr.randint(N_train, size=len(batch_idxs))
            partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[i]))
                             for i, name in enumerate(parser.names)]
            L2_reg_vect = np.concatenate(partial_vects, axis=0)
            return loss_fun(w, X=train_images[idxs], T=train_labels[idxs],
                            L2_reg=L2_reg_vect)

        npr.seed(ii)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [np.full(parser[name].size,
                                     np.exp(cur_hyperparams['log_param_scale'][i]))
                             for i, name in enumerate(parser.names)]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        results = sgd3(indexed_loss_fun, valid_loss_fun, W0, V0, alphas, betas,
                       log_L2_reg, callback=callback)
        hypergrads = hyperparams.copy()
        hypergrads['log_L2_reg'] = results['dMd_meta']
        weights_grad = parser.new_vect(W0 * results['dMd_x'])
        hypergrads['log_param_scale'] = [np.sum(weights_grad[name])
                                         for name in parser.names]
        hypergrads['log_alphas'] = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (results['dMd_betas'] *
                                        d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num']
    meta_results = {field: [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i):
        if i % N_meta_thin == 0:
            print "Meta iter {0}".format(i)
            x = all_x[-1]
            cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
            log_L2_reg = cur_hyperparams['log_L2_reg']
            for field in cur_hyperparams.names:
                meta_results[field].append(cur_hyperparams[field])
            meta_results['train_loss'].append(train_loss_fun(x))
            meta_results['valid_loss'].append(valid_loss_fun(x))
            meta_results['tests_loss'].append(tests_loss_fun(x))
            meta_results['iter_num'].append(i)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, meta_gamma)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
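# The explicit npr.seed(1000 * ii + j) inside indexed_loss_fun above is what
# makes the reverse pass work: sgd3 re-evaluates the training loss at each
# step while running backwards, so the minibatch for step j must be
# reconstructible from (ii, j) alone rather than from ambient random state --
# the same requirement flagged as "Deterministic seed needed for backwards
# pass" in the scripts earlier in this file.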
def run():
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale'] = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True,
                  report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))

        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 10 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta,
                      N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect,
                            reg_penalty=False)

    results = defaultdict(list)
    def record_results(hyperparam_vect, i_hyper, g):
        print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, RS.int32(),
                                      alphabets=alphabets, verbose=False,
                                      report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])
        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))
        results['tests_loss'].append(loss_fun(tests_alphabets, report_train_loss=False))
        print "Train:", results['train_loss']
        print "Valid:", results['valid_loss']
        print "Tests:", results['tests_loss']

    train_hyperloss = partial(hyperloss, alphabets=train_alphabets)
    rms_prop(grad(train_hyperloss), hyperparams_0.vect, record_results,
             N_meta_iter, meta_alpha, gamma=0)
    return results