def test_sgd():
    N_weights = 5
    W0 = 0.1 * npr.randn(N_weights)
    V0 = 0.1 * npr.randn(N_weights)
    N_data = 12
    batch_size = 4
    num_epochs = 3
    batch_idxs = BatchList(N_data, batch_size)
    N_iter = num_epochs * len(batch_idxs)
    alphas = 0.1 * npr.rand(N_iter)
    betas = 0.5 + 0.2 * npr.rand(N_iter)
    A = npr.randn(N_data, N_weights)

    def loss_fun(W, idxs):
        sub_A = A[idxs, :]
        return np.dot(np.dot(W, np.dot(sub_A.T, sub_A)), W)

    result = sgd(loss_fun, batch_idxs, N_iter, W0, V0, alphas, betas)
    d_x = result['d_x']
    d_v = result['d_v']
    d_alphas = result['d_alphas']
    d_betas = result['d_betas']

    def full_loss(W0, V0, alphas, betas):
        result = sgd(loss_fun, batch_idxs, N_iter, W0, V0, alphas, betas)
        x_final = result['x_final']
        return loss_fun(x_final, batch_idxs.all_idxs)

    d_an = (d_x, d_v, d_alphas, d_betas)
    d_num = nd(full_loss, W0, V0, alphas, betas)
    for i, (an, num) in enumerate(zip(d_an, d_num)):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)

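# The test above checks the analytic hypergradients returned by sgd()
# against `nd`, a numerical differencer. A minimal sketch of such a helper,
# assuming central differences applied elementwise to each argument (the
# repo's own `nd` may differ in detail):
def nd_sketch(fun, *args, **kwargs):
    eps = kwargs.get('eps', 1e-4)
    grads = []
    for i, arg in enumerate(args):
        arg = np.asarray(arg, dtype=float)
        g = np.zeros(arg.size)
        for j in range(arg.size):
            plus = arg.ravel().copy()
            minus = arg.ravel().copy()
            plus[j] += eps
            minus[j] -= eps
            perturbed = list(args)
            perturbed[i] = plus.reshape(arg.shape)
            f_plus = fun(*perturbed)
            perturbed[i] = minus.reshape(arg.shape)
            f_minus = fun(*perturbed)
            g[j] = (f_plus - f_minus) / (2 * eps)  # Central difference.
        grads.append(g.reshape(arg.shape))
    return tuple(grads)
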
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = len(parser.vect)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs],
                        L2_reg=L2_reg)

    losses = []
    d_losses = []
    for log_alpha_0 in all_log_alpha_0:
        npr.seed(0)
        V0 = npr.randn(N_weights) * velocity_scale
        alpha_0 = np.exp(log_alpha_0)
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        W0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0,
                      alphas, betas)
        losses.append(results['loss_final'])
        d_losses.append(d_log_loss(alpha_0, results['d_alphas']))
    return losses, d_losses

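# `d_log_loss` above converts a gradient with respect to a parameter into
# the gradient with respect to a uniform shift of its log. A one-line
# sketch under that assumption (the repo's helper may differ), consistent
# with both calls in this file, d_log_loss(alpha_0, d_alphas) and
# d_log_loss(W0, d_x):
def d_log_loss_sketch(x, d_x):
    # Chain rule for L(exp(theta)): dL/dtheta = exp(theta) * dL/dx = x * d_x,
    # summed because the same log-scale shift applies to every component.
    return np.sum(x * d_x)
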
def run():
    train_data, valid_data, test_data = load_data_subset(
        N_train, N_valid, N_test)
    kernel = make_sq_exp_kernel(L0)

    def loss_fun(transform, train_data, valid_data):
        train_data = augment_data(train_data, transform)
        return weighted_neighbors_loss(train_data, valid_data, kernel)

    loss_grad = batchwise_function(grad(loss_fun))
    loss_fun = batchwise_function(loss_fun)
    batch_idxs = BatchList(N_valid, batch_size)
    A = np.eye(N_pix)
    valid_losses = [loss_fun(A, train_data, valid_data)]
    test_losses = [loss_fun(A, train_data, test_data)]
    A += A_init_scale * npr.randn(N_pix, N_pix)
    for meta_iter in range(N_meta_iters):
        print "Iter {0} valid {1} test {2}".format(
            meta_iter, valid_losses[-1], test_losses[-1])
        for idxs in batch_idxs:
            valid_batch = [x[idxs] for x in valid_data]
            d_A = loss_grad(A, train_data, valid_batch)
            A -= meta_alpha * (d_A + meta_L1 * np.sign(A))
        valid_losses.append(loss_fun(A, train_data, valid_data))
        test_losses.append(loss_fun(A, train_data, test_data))
    return A, valid_losses, test_losses

def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(1)
    V0 = npr.randn(N_weights) * velocity_scale
    W0 = npr.randn(N_weights) * np.exp(log_param_scale)
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0,
                      np.exp(log_alphas), betas,
                      record_learning_curve=True)
        learning_curve = results['learning_curve']
        d_log_alphas = np.exp(log_alphas) * results['d_alphas']
        output.append((learning_curve, log_alphas, d_log_alphas))
        log_alphas = log_alphas - meta_alpha * step_smooth(
            d_log_alphas, iter_per_epoch)
    return output

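# `step_smooth` regularizes the per-iteration learning-rate gradient before
# the meta update. Purely as an illustration (hypothetical; the repo's
# version may differ), one plausible choice is a boxcar average over a
# window of one epoch:
def step_smooth_sketch(d, window):
    # Moving average; mode='same' keeps the output the same length as d.
    kernel = np.ones(window) / float(window)
    return np.convolve(d, kernel, mode='same')
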
def new_fun(A, B, C):
    # Evaluates fun(A, B, C) over memory-sized chunks of the data lists in
    # C and averages the results. `fun` and `mem_batch_size` are free
    # variables, closed over by the enclosing wrapper (see the sketch below).
    N = C[0].shape[0]
    batch_idxs = BatchList(N, mem_batch_size)
    for i, idxs in enumerate(batch_idxs):
        cur_result = fun(A, B, [x[idxs] for x in C])
        result = cur_result if i == 0 else result + cur_result
    return result / len(batch_idxs)

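# For context: `new_fun` above reads as the closure returned by a
# memory-saving wrapper such as the `batchwise_function` used in the
# kernel-regression experiment. A minimal sketch of that enclosing wrapper,
# assuming `mem_batch_size` is a module-level constant:
def batchwise_function_sketch(fun):
    def new_fun(A, B, C):
        N = C[0].shape[0]
        batch_idxs = BatchList(N, mem_batch_size)
        for i, idxs in enumerate(batch_idxs):
            cur_result = fun(A, B, [x[idxs] for x in C])
            result = cur_result if i == 0 else result + cur_result
        return result / len(batch_idxs)
    return new_fun
# Note the mean over chunks equals the full-batch value only when every
# chunk has the same size, i.e. when mem_batch_size divides N.
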
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    V0 = npr.randn(N_weights) * velocity_scale
    losses = []
    d_losses = []
    for N_iters in all_N_iters:
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        loss_curve = []
        d_loss_curve = []
        for log_param_scale in all_log_param_scale:
            print "log_param_scale {0}, N_iters {1}".format(
                log_param_scale, N_iters)
            npr.seed(1)
            W0 = npr.randn(N_weights) * np.exp(log_param_scale)
            results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0,
                          alphas, betas)
            loss_curve.append(results['loss_final'])
            d_loss_curve.append(d_log_loss(W0, results['d_x']))
        losses.append(loss_curve)
        d_losses.append(d_loss_curve)
    with open('results.pkl', 'wb') as f:  # Binary mode for pickle.
        pickle.dump((losses, d_losses), f)

def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg,
                                                 return_parser=True)
    N_weights = parser.N

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(2)
    V0 = npr.randn(N_weights) * velocity_scale
    #W0 = npr.randn(N_weights) * np.exp(log_param_scale)
    # Weights are uniform samples passed through a per-layer inverse CDF.
    X_uniform = npr.rand(N_weights)
    bindict = {k: np.linspace(-1, 1, N_bins) * np.exp(log_param_scale)
               for k, v in parser.idxs_and_shapes.iteritems()}
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        X0 = np.zeros(N_weights)
        dX_dbins = {}
        for k, cur_bins in bindict.iteritems():
            cur_slice, cur_shape = parser.idxs_and_shapes[k]
            cur_xs = X_uniform[cur_slice]
            cur_X0, cur_dX_dbins = bininvcdf(cur_xs, cur_bins)
            X0[cur_slice] = cur_X0
            dX_dbins[k] = cur_dX_dbins
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, X0, V0,
                      np.exp(log_alphas), betas,
                      record_learning_curve=True)
        dL_dx = results['d_x']
        learning_curve = results['learning_curve']
        output.append((learning_curve, bindict))
        # Update each layer's bins with one gradient step.
        for k, bins in bindict.iteritems():
            dL_dbins = np.dot(parser.get(dL_dx, k).flatten(), dX_dbins[k])
            bins = bins - dL_dbins * bin_stepsize
            bins[[0, -1]] = bins[[0, -1]] - dL_dbins[[0, 1]] * bin_stepsize
            bins.sort()  # Sort in place.
            bindict[k] = bins
    return output

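# `bininvcdf` maps uniform samples through a piecewise-linear inverse CDF
# whose knots are the (sorted) bin values, returning both the samples and
# their Jacobian with respect to the bins. A sketch under that assumption
# (hypothetical; the repo's version may interpolate differently), used here
# per layer and in the single-bins variant later in this file:
def bininvcdf_sketch(u, bins):
    K = len(bins)
    scaled = u * (K - 1)                       # Position along the knots.
    j = np.clip(np.floor(scaled).astype(int), 0, K - 2)
    t = scaled - j                             # Fraction within segment j.
    x = (1 - t) * bins[j] + t * bins[j + 1]    # Linear interpolation.
    dx_dbins = np.zeros((len(u), K))           # Jacobian: two nonzeros
    dx_dbins[np.arange(len(u)), j] = 1 - t     # per row.
    dx_dbins[np.arange(len(u)), j + 1] = t
    return x, dx_dbins
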
def run(oiter):
    # ----- Variable for this run -----
    log_alpha_0 = all_log_alpha_0[oiter]
    print "Running job {0} on {1}".format(oiter + 1, socket.gethostname())
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    V0 = npr.randn(N_weights) * velocity_scale
    losses = []
    d_losses = []
    alpha_0 = np.exp(log_alpha_0)
    for N_iters in all_N_iters:
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        npr.seed(1)
        W0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0,
                      alphas, betas)
        losses.append(results['loss_final'])
        d_losses.append(d_log_loss(alpha_0, results['d_alphas']))
    return losses, d_losses

def run():
    (train_images, train_labels), (val_images, val_labels), \
        (test_images, test_labels) = load_data_subset(
            N_train_data, N_val_data, N_test_data)
    batch_idxs = BatchList(N_train_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = parser.N
    hyperparser = WeightsParser()
    hyperparser.add_weights('log_L2_reg', (N_weights,))
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N
    npr.seed(0)
    hyperparser.set(metas, 'log_L2_reg',
                    log_L2_reg_scale + np.ones(N_weights))

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        L2_reg = np.exp(hyperparser.get(meta_params, 'log_L2_reg'))
        return loss_fun(x, X=train_images[idxs], T=train_labels[idxs],
                        L2_reg=L2_reg)

    def meta_loss_fun(x, meta_params):  # To be optimized in the outer loop.
        L2_reg = np.exp(hyperparser.get(meta_params, 'log_L2_reg'))
        log_prior = -meta_L2_reg * np.dot(L2_reg.ravel(), L2_reg.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):  # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)
    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs,
                       N_iters, x0, v0, np.exp(log_alphas), betas, metas)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        test_loss = test_loss_fun(results['x_final'])
        weightparser = parser.new_vect(results['x_final'])
        l2parser = parser.new_vect(
            np.exp(hyperparser.get(metas, 'log_L2_reg')))
        output.append((learning_curve, validation_loss, test_loss,
                       weightparser[('weights', 0)],
                       l2parser[('weights', 0)]))
        metas -= results['dMd_meta'] * meta_stepsize
        print "Meta iteration {0} Validation loss {1} Test loss {2}"\
            .format(i, validation_loss, test_loss)
    return output

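# `WeightsParser` packs named hyperparameter blocks into one flat vector so
# sgd2 can differentiate through all of them at once. A minimal sketch
# consistent with the add_weights/get/set/N usage above (the repo's class
# may carry more functionality):
class WeightsParserSketch(object):
    def __init__(self):
        self.idxs_and_shapes = {}
        self.N = 0  # Total length of the flat vector.

    def add_weights(self, name, shape):
        start = self.N
        self.N += int(np.prod(shape))
        self.idxs_and_shapes[name] = (slice(start, self.N), shape)

    def get(self, vect, name):
        idxs, shape = self.idxs_and_shapes[name]
        return np.reshape(vect[idxs], shape)

    def set(self, vect, name, val):
        idxs, _ = self.idxs_and_shapes[name]
        vect[idxs] = np.ravel(val)
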
def test_sgd2():
    N_weights = 5
    W0 = 0.1 * npr.randn(N_weights)
    V0 = 0.1 * npr.randn(N_weights)
    N_data = 12
    batch_size = 4
    num_epochs = 3
    batch_idxs = BatchList(N_data, batch_size)
    N_iter = num_epochs * len(batch_idxs)
    alphas = 0.1 * npr.rand(N_iter)
    betas = 0.5 + 0.2 * npr.rand(N_iter)
    meta = 0.1 * npr.randn(N_weights * 2)
    A = npr.randn(N_data, N_weights)

    def loss_fun(W, meta, idxs):
        sub_A = A[idxs, :]
        return np.dot(np.dot(W + meta[:N_weights] + meta[N_weights:],
                             np.dot(sub_A.T, sub_A)), W)

    def meta_loss_fun(w, meta):
        return np.dot(w, w) + np.dot(meta, meta)

    def full_loss(W0, V0, alphas, betas, meta):
        result = sgd2(loss_fun, meta_loss_fun, batch_idxs, N_iter,
                      W0, V0, alphas, betas, meta)
        return result['L_final']

    def meta_loss(W0, V0, alphas, betas, meta):
        result = sgd2(loss_fun, meta_loss_fun, batch_idxs, N_iter,
                      W0, V0, alphas, betas, meta)
        return result['M_final']

    result = sgd2(loss_fun, meta_loss_fun, batch_idxs, N_iter,
                  W0, V0, alphas, betas, meta)

    d_an = (result['dLd_x'], result['dLd_v'], result['dLd_alphas'],
            result['dLd_betas'], result['dLd_meta'])
    d_num = nd(full_loss, W0, V0, alphas, betas, meta)
    for i, (an, num) in enumerate(zip(d_an, d_num)):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)
        print "Type {0}, diffs are: {1}".format(i, an - num)

    d_an = (result['dMd_x'], result['dMd_v'], result['dMd_alphas'],
            result['dMd_betas'], result['dMd_meta'])
    d_num = nd(meta_loss, W0, V0, alphas, betas, meta)
    for i, (an, num) in enumerate(zip(d_an, d_num)):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)
        print "Type {0}, diffs are: {1}".format(i, an - num)

def run():
    val_images, val_labels, test_images, test_labels, _ = load_data(
        normalize=True)
    val_images = val_images[:N_val_data, :]
    val_labels = val_labels[:N_val_data, :]
    true_data_scale = np.std(val_images)
    test_images = test_images[:N_test_data, :]
    test_labels = test_labels[:N_test_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg,
                                                 return_parser=True)
    N_weights = parser.N
    fake_data = npr.randn(
        *(val_images[:N_fake_data, :].shape)) * init_fake_data_scale
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes,
                          N_classes)  # One of each.

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])

    def meta_loss_fun(x, meta_params):  # To be optimized in the outer loop.
        log_prior = -fake_data_L2_reg * np.dot(meta_params.ravel(),
                                               meta_params.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):  # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)
    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs,
                       N_iters, x0, v0, np.exp(log_alphas), betas,
                       fake_data)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        fake_data_scale = np.std(fake_data) / true_data_scale
        test_loss = test_loss_fun(results['x_final'])
        output.append((learning_curve, validation_loss, test_loss,
                       fake_data, fake_data_scale))
        # Update the fake data with one gradient step.
        fake_data -= results['dMd_meta'] * data_stepsize
        print "Meta iteration {0} Validation loss {1} Test loss {2}"\
            .format(i, validation_loss, test_loss)
    return output

def test_sgd_parser():
    N_weights = 6
    W0 = 0.1 * npr.randn(N_weights)
    N_data = 12
    batch_size = 4
    num_epochs = 4
    batch_idxs = BatchList(N_data, batch_size)
    parser = VectorParser()
    parser.add_shape('first', [2])
    parser.add_shape('second', [1])
    parser.add_shape('third', [3])
    N_weight_types = 3
    alphas = 0.1 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    betas = 0.5 + 0.2 * npr.rand(len(batch_idxs) * num_epochs,
                                 N_weight_types)
    meta = 0.1 * npr.randn(N_weights * 2)
    A = npr.randn(N_data, N_weights)

    def loss_fun(W, meta, i=None):
        idxs = batch_idxs.all_idxs if i is None \
            else batch_idxs[i % len(batch_idxs)]
        sub_A = A[idxs, :]
        return np.dot(np.dot(W + meta[:N_weights] + meta[N_weights:],
                             np.dot(sub_A.T, sub_A)), W)

    def full_loss(params):
        (W0, alphas, betas, meta) = params
        result = sgd_parsed(grad(loss_fun),
                            kylist(W0, alphas, betas, meta), parser)
        return loss_fun(result, meta)

    d_num = nd(full_loss, (W0, alphas, betas, meta))
    d_an_fun = grad(full_loss)
    d_an = d_an_fun([W0, alphas, betas, meta])
    for i, (an, num) in enumerate(zip(d_an, d_num[0])):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)

def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_real_data, :]
    train_labels = train_labels[:N_real_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg,
                                                 return_parser=True)
    N_weights = parser.N
    #fake_data = npr.randn(*(train_images[:N_fake_data, :].shape))
    fake_data = np.zeros(train_images[:N_fake_data, :].shape)
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :],
                                    dtype=int)
    fake_labels = one_hot(np.array(range(0, 10)), 10)  # One of each label.

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])

    def meta_loss_fun(x):  # To be optimized in the outer loop.
        return loss_fun(x, X=train_images, T=train_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs,
                       N_iters, x0, v0, np.exp(log_alphas), betas,
                       fake_data)
        learning_curve = results['learning_curve']
        output.append((learning_curve, fake_data))
        # Update the fake data with one gradient step.
        fake_data -= results['dMd_meta'] * data_stepsize
    return output

def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(2)
    V0 = npr.randn(N_weights) * velocity_scale
    #W0 = npr.randn(N_weights) * np.exp(log_param_scale)
    bins = np.linspace(-1, 1, N_bins) * np.exp(log_param_scale)
    W_uniform = npr.rand(N_weights)
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        W0, dW_dbins = bininvcdf(W_uniform, bins)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0,
                      np.exp(log_alphas), betas,
                      record_learning_curve=True)
        dL_dx = results['d_x']
        dL_dbins = np.dot(dL_dx, dW_dbins)
        learning_curve = results['learning_curve']
        output.append((learning_curve, bins))
        bins = bins - dL_dbins * bin_stepsize
        bins[[0, -1]] = bins[[0, -1]] - dL_dbins[[0, 1]] * bin_stepsize
        bins.sort()  # Sort in place.
    return output

def run():
    val_images, val_labels, test_images, test_labels, _ = load_data(
        normalize=True)
    val_images = val_images[:N_val_data, :]
    val_labels = val_labels[:N_val_data, :]
    true_data_scale = np.std(val_images)
    test_images = test_images[:N_test_data, :]
    test_labels = test_labels[:N_test_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = len(parser.vect)
    npr.seed(0)
    init_fake_data = npr.randn(
        *(val_images[:N_fake_data, :].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :],
                                    dtype=int)
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes,
                          N_classes)  # One of each.

    hyperparser = WeightsParser()
    hyperparser.add_weights('log_L2_reg', (1,))
    hyperparser.add_weights('fake_data', init_fake_data.shape)
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N
    hyperparser.set(metas, 'log_L2_reg', init_log_L2_reg)
    hyperparser.set(metas, 'fake_data', init_fake_data)

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        L2_reg = np.exp(hyperparser.get(meta_params, 'log_L2_reg')[0])
        fake_data = hyperparser.get(meta_params, 'fake_data')
        return loss_fun(x, X=fake_data[idxs], T=fake_labels[idxs],
                        L2_reg=L2_reg)

    def meta_loss_fun(x, meta_params):  # To be optimized in the outer loop.
        fake_data = hyperparser.get(meta_params, 'fake_data')
        log_prior = -fake_data_L2_reg * np.dot(fake_data.ravel(),
                                               fake_data.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):  # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    output = []
    velocity = np.zeros(hyperparser.N)
    for i in range(N_meta_iter):
        print "L2 reg is ", np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), "| ",
        npr.seed(0)
        v0 = npr.randn(N_weights) * velocity_scale
        x0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs,
                       N_iters, x0, v0, np.exp(log_alphas), betas, metas)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        test_err = frac_err(results['x_final'], test_images, test_labels)
        fake_data_scale = np.std(hyperparser.get(metas, 'fake_data')) \
            / true_data_scale
        test_loss = test_loss_fun(results['x_final'])
        output.append((learning_curve, validation_loss, test_loss,
                       fake_data_scale,
                       np.exp(hyperparser.get(metas, 'log_L2_reg')[0]),
                       test_err))
        # Meta-SGD with momentum.
        g = results['dMd_meta']
        velocity = meta_momentum * velocity - (1.0 - meta_momentum) * g
        metas += velocity * meta_stepsize
        print "Meta iteration {0} Validation loss {1} Test loss {2} Test err {3}"\
            .format(i, validation_loss, test_loss, test_err)
    return output, hyperparser.get(metas, 'fake_data')

def run():
    (train_images, train_labels), \
        (valid_images, valid_labels), \
        (tests_images, tests_labels) = load_data_subset(N_train, N_valid,
                                                        N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def indexed_loss_fun(w, log_L2_reg, i):
        idxs = batch_idxs[i % len(batch_idxs)]
        partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[i]))
                         for i, name in enumerate(parser.names)]
        L2_reg_vect = np.concatenate(partial_vects, axis=0)
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs],
                        L2_reg=L2_reg_vect)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)

    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)

    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []

    def hyperloss_grad(hyperparam_vect, i):
        learning_curve = []

        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(loss_fun(x, X=train_images,
                                               T=train_labels))

        npr.seed(i)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [
            np.full(parser[name].size,
                    np.exp(cur_hyperparams['log_param_scale'][i]))
            for i, name in enumerate(parser.names)]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale,
                                                   axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        results = sgd3(indexed_loss_fun, valid_loss_fun, W0, V0,
                       alphas, betas, log_L2_reg, callback=callback)
        hypergrads = hyperparams.copy()
        hypergrads['log_L2_reg'] = results['dMd_meta']
        weights_grad = parser.new_vect(W0 * results['dMd_x'])
        hypergrads['log_param_scale'] = [np.sum(weights_grad[name])
                                         for name in parser.names]
        hypergrads['log_alphas'] = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (
            results['dMd_betas'] * d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss']
    meta_results = {field: [] for field in add_fields + hyperparams.names}

    def meta_callback(hyperparam_vect, i):
        print "Meta iter {0}".format(i)
        x = all_x[-1]
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(train_loss_fun(x))
        meta_results['valid_loss'].append(valid_loss_fun(x))
        meta_results['tests_loss'].append(tests_loss_fun(x))

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None  # No need to pickle zeros.
    return meta_results, parser

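# In the hyperloss above, the momentum decays `betas` are kept in (0, 1) by
# storing them as `invlogit_betas` and mapping through `logit`. Assuming
# `logit` here denotes the logistic sigmoid (consistent with that
# constraint), with `d_logit` its derivative as used in the chain rule for
# hypergrads['invlogit_betas']:
def logit_sketch(x):
    return 1.0 / (1.0 + np.exp(-x))

def d_logit_sketch(x):
    s = logit_sketch(x)
    return s * (1.0 - s)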