def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_real_data, :]
    train_labels = train_labels[:N_real_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N

    fake_data = npr.randn(*(train_images[:N_fake_data, :].shape)) * init_fake_data_scale
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes, N_classes)  # One of each.

    def indexed_loss_fun(x, meta_params, idxs):   # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])

    def meta_loss_fun(x):                         # To be optimized in the outer loop.
        return loss_fun(x, X=train_images, T=train_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, fake_data)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        output.append((learning_curve, validation_loss, fake_data))
        fake_data -= results['dMd_meta'] * data_stepsize  # Update data with one gradient step.
        print "Meta iteration {0} Validation loss {1}".format(i, validation_loss)
    return output
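# The one_hot helper used above is assumed to behave like the inline lambda
# defined in a later variant of this script: it maps integer labels to
# indicator rows. A minimal sketch:
def one_hot(x, K):
    """Return a (len(x), K) matrix with a 1 at column x[i] in row i."""
    return np.array(x[:, None] == np.arange(K)[None, :], dtype=int)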
def run():
    (train_images, train_labels), (val_images, val_labels), (test_images, test_labels) \
        = load_data_subset(N_train_data, N_val_data, N_test_data)
    batch_idxs = BatchList(N_train_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = parser.N

    hyperparser = WeightsParser()
    hyperparser.add_weights('log_L2_reg', (N_weights,))
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N

    npr.seed(0)
    hyperparser.set(metas, 'log_L2_reg', log_L2_reg_scale + np.ones(N_weights))

    def indexed_loss_fun(x, meta_params, idxs):   # To be optimized by SGD.
        L2_reg = np.exp(hyperparser.get(meta_params, 'log_L2_reg'))
        return loss_fun(x, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg)

    def meta_loss_fun(x, meta_params):            # To be optimized in the outer loop.
        L2_reg = np.exp(hyperparser.get(meta_params, 'log_L2_reg'))
        log_prior = -meta_L2_reg * np.dot(L2_reg.ravel(), L2_reg.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):                         # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, metas)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        test_loss = test_loss_fun(results['x_final'])
        weightparser = parser.new_vect(results['x_final'])
        l2parser = parser.new_vect(np.exp(hyperparser.get(metas, 'log_L2_reg')))
        output.append((learning_curve, validation_loss, test_loss,
                       weightparser[('weights', 0)], l2parser[('weights', 0)]))
        metas -= results['dMd_meta'] * meta_stepsize
        print "Meta iteration {0} Validation loss {1} Test loss {2}"\
            .format(i, validation_loss, test_loss)
    return output
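# A minimal sketch of the WeightsParser interface relied on above, assuming
# it packs named arrays into one flat vector (the real class may differ in
# detail):
class WeightsParser(object):
    def __init__(self):
        self.idxs_and_shapes = {}
        self.N = 0                      # Total number of packed entries.

    def add_weights(self, name, shape):
        size = int(np.prod(shape))
        self.idxs_and_shapes[name] = (slice(self.N, self.N + size), shape)
        self.N += size

    def get(self, vect, name):
        idxs, shape = self.idxs_and_shapes[name]
        return np.reshape(vect[idxs], shape)

    def set(self, vect, name, value):
        idxs, _ = self.idxs_and_shapes[name]
        vect[idxs] = np.ravel(value)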
def run():
    (train_images, train_labels), (val_images, val_labels), (test_images, test_labels) = load_data_subset(
        N_train_data, N_val_data, N_test_data
    )
    batch_idxs = BatchList(N_train_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = parser.N

    hyperparser = WeightsParser()
    hyperparser.add_weights("log_L2_reg", (N_weights,))
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N

    npr.seed(0)
    hyperparser.set(metas, "log_L2_reg", log_L2_reg_scale + np.ones(N_weights))

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        L2_reg = np.exp(hyperparser.get(meta_params, "log_L2_reg"))
        return loss_fun(x, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg)

    def meta_loss_fun(x, meta_params):  # To be optimized in the outer loop.
        L2_reg = np.exp(hyperparser.get(meta_params, "log_L2_reg"))
        log_prior = -meta_L2_reg * np.dot(L2_reg.ravel(), L2_reg.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):  # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, metas)
        learning_curve = results["learning_curve"]
        validation_loss = results["M_final"]
        test_loss = test_loss_fun(results["x_final"])
        output.append(
            (
                learning_curve,
                validation_loss,
                test_loss,
                parser.get(results["x_final"], ("weights", 0)),
                parser.get(np.exp(hyperparser.get(metas, "log_L2_reg")), ("weights", 0)),
            )
        )
        metas -= results["dMd_meta"] * meta_stepsize
        print "Meta iteration {0} Validation loss {1} Test loss {2}".format(i, validation_loss, test_loss)
    return output
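# BatchList is assumed to be a list of index slices covering the first
# N_total examples in contiguous minibatches; a hypothetical sketch:
class BatchList(list):
    def __init__(self, N_total, N_batch):
        start = 0
        while start < N_total:
            self.append(slice(start, min(start + N_batch, N_total)))
            start += N_batch
        self.all_idxs = slice(0, N_total)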
def test_sgd2():
    N_weights = 5
    W0 = 0.1 * npr.randn(N_weights)
    V0 = 0.1 * npr.randn(N_weights)
    N_data = 12
    batch_size = 4
    num_epochs = 3
    batch_idxs = BatchList(N_data, batch_size)
    N_iter = num_epochs * len(batch_idxs)
    alphas = 0.1 * npr.rand(len(batch_idxs) * num_epochs)
    betas = 0.5 + 0.2 * npr.rand(len(batch_idxs) * num_epochs)
    meta = 0.1 * npr.randn(N_weights * 2)
    A = npr.randn(N_data, N_weights)

    def loss_fun(W, meta, idxs):
        sub_A = A[idxs, :]
        return np.dot(np.dot(W + meta[:N_weights] + meta[N_weights:],
                             np.dot(sub_A.T, sub_A)), W)

    def meta_loss_fun(w, meta):
        return np.dot(w, w) + np.dot(meta, meta)

    def full_loss(W0, V0, alphas, betas, meta):
        result = sgd2(loss_fun, meta_loss_fun, batch_idxs, N_iter,
                      W0, V0, alphas, betas, meta)
        return result['L_final']

    def meta_loss(W0, V0, alphas, betas, meta):
        result = sgd2(loss_fun, meta_loss_fun, batch_idxs, N_iter,
                      W0, V0, alphas, betas, meta)
        return result['M_final']

    result = sgd2(loss_fun, meta_loss_fun, batch_idxs, N_iter,
                  W0, V0, alphas, betas, meta)

    # Check the analytic gradients of the training loss against numerical ones.
    d_an = (result['dLd_x'], result['dLd_v'], result['dLd_alphas'],
            result['dLd_betas'], result['dLd_meta'])
    d_num = nd(full_loss, W0, V0, alphas, betas, meta)
    for i, (an, num) in enumerate(zip(d_an, d_num)):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)
        print "Type {0}, diffs are: {1}".format(i, an - num)

    # Check the hypergradients of the meta-loss the same way.
    d_an = (result['dMd_x'], result['dMd_v'], result['dMd_alphas'],
            result['dMd_betas'], result['dMd_meta'])
    d_num = nd(meta_loss, W0, V0, alphas, betas, meta)
    for i, (an, num) in enumerate(zip(d_an, d_num)):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)
        print "Type {0}, diffs are: {1}".format(i, an - num)
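# A central-finite-difference sketch of the nd helper used in the checks
# above (hypothetical; assumed to return one gradient array per argument):
def nd(fun, *args):
    eps = 1e-4
    grads = []
    for i, arg in enumerate(args):
        arg = np.array(arg, dtype=float)
        grad = np.zeros(arg.shape)
        for ix in np.ndindex(*arg.shape):
            perturbed = list(args)
            bumped = arg.copy()
            bumped[ix] += eps / 2
            perturbed[i] = bumped
            f_plus = fun(*perturbed)
            bumped[ix] -= eps            # Now at arg[ix] - eps/2.
            f_minus = fun(*perturbed)
            grad[ix] = (f_plus - f_minus) / eps
        grads.append(grad)
    return grads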
def run():
    val_images, val_labels, test_images, test_labels, _ = load_data(normalize=True)
    val_images = val_images[:N_val_data, :]
    val_labels = val_labels[:N_val_data, :]
    true_data_scale = np.std(val_images)
    test_images = test_images[:N_test_data, :]
    test_labels = test_labels[:N_test_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N

    fake_data = npr.randn(*(val_images[:N_fake_data, :].shape)) * init_fake_data_scale
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes, N_classes)  # One of each.

    def indexed_loss_fun(x, meta_params, idxs):   # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])

    def meta_loss_fun(x, meta_params):            # To be optimized in the outer loop.
        log_prior = -fake_data_L2_reg * np.dot(meta_params.ravel(), meta_params.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):                         # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, fake_data)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        fake_data_scale = np.std(fake_data) / true_data_scale
        test_loss = test_loss_fun(results['x_final'])
        output.append((learning_curve, validation_loss, test_loss,
                       fake_data, fake_data_scale))
        fake_data -= results['dMd_meta'] * data_stepsize  # Update data with one gradient step.
        print "Meta iteration {0} Validation loss {1} Test loss {2}"\
            .format(i, validation_loss, test_loss)
    return output
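# Hypothetical driver for the experiment above, assuming the module-level
# constants (N_val_data, N_iters, N_meta_iter, etc.) are already configured:
if __name__ == '__main__':
    results = run()
    curve, val_loss, test_loss, _, _ = results[-1]
    print "Final validation loss:", val_loss
    print "Final test loss:", test_loss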
def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_real_data, :]
    train_labels = train_labels[:N_real_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N

    # fake_data = npr.randn(*(train_images[:N_fake_data, :].shape))
    fake_data = np.zeros(train_images[:N_fake_data, :].shape)

    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(0, 10)), 10)  # One of each label.

    def indexed_loss_fun(x, meta_params, idxs):   # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])

    def meta_loss_fun(x):                         # To be optimized in the outer loop.
        return loss_fun(x, X=train_images, T=train_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, fake_data)
        learning_curve = results['learning_curve']
        output.append((learning_curve, fake_data))
        fake_data -= results['dMd_meta'] * data_stepsize  # Update data with one gradient step.
    return output
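# A forward-only sketch of the momentum SGD that sgd2 is assumed to run
# before its reverse pass (an assumption based on the update form used at
# the meta level below; the real sgd2 also returns exact hypergradients):
import itertools
from autograd import grad   # Any gradient of loss_fun w.r.t. x would do.

def sgd_forward(loss_fun, batch_idxs, N_iters, x0, v0, alphas, betas, meta):
    x, v = x0.copy(), v0.copy()
    loss_grad = grad(loss_fun)           # Gradient w.r.t. the weights x.
    learning_curve = []
    batches = itertools.cycle(batch_idxs)
    for i in range(N_iters):
        idxs = next(batches)
        g = loss_grad(x, meta, idxs)
        v = betas[i] * v - (1.0 - betas[i]) * g   # Momentum update.
        x = x + alphas[i] * v
        learning_curve.append(loss_fun(x, meta, idxs))
    return x, v, learning_curve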
def run():
    val_images, val_labels, test_images, test_labels, _ = load_data(normalize=True)
    val_images = val_images[:N_val_data, :]
    val_labels = val_labels[:N_val_data, :]
    true_data_scale = np.std(val_images)
    test_images = test_images[:N_test_data, :]
    test_labels = test_labels[:N_test_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = len(parser.vect)

    npr.seed(0)
    init_fake_data = npr.randn(*(val_images[:N_fake_data, :].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes, N_classes)  # One of each.

    hyperparser = WeightsParser()
    hyperparser.add_weights('log_L2_reg', (1,))
    hyperparser.add_weights('fake_data', init_fake_data.shape)
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N
    hyperparser.set(metas, 'log_L2_reg', init_log_L2_reg)
    hyperparser.set(metas, 'fake_data', init_fake_data)

    def indexed_loss_fun(x, meta_params, idxs):   # To be optimized by SGD.
        L2_reg = np.exp(hyperparser.get(meta_params, 'log_L2_reg')[0])
        fake_data = hyperparser.get(meta_params, 'fake_data')
        return loss_fun(x, X=fake_data[idxs], T=fake_labels[idxs], L2_reg=L2_reg)

    def meta_loss_fun(x, meta_params):            # To be optimized in the outer loop.
        fake_data = hyperparser.get(meta_params, 'fake_data')
        log_prior = -fake_data_L2_reg * np.dot(fake_data.ravel(), fake_data.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):                         # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)

    output = []
    velocity = np.zeros(hyperparser.N)
    for i in range(N_meta_iter):
        print "L2 reg is ", np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), "| ",
        npr.seed(0)
        v0 = npr.randn(N_weights) * velocity_scale
        x0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, metas)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        test_err = frac_err(results['x_final'], test_images, test_labels)
        fake_data_scale = np.std(hyperparser.get(metas, 'fake_data')) / true_data_scale
        test_loss = test_loss_fun(results['x_final'])
        output.append((learning_curve, validation_loss, test_loss, fake_data_scale,
                       np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), test_err))

        # Do meta-SGD with momentum.
        g = results['dMd_meta']
        velocity = meta_momentum * velocity - (1.0 - meta_momentum) * g
        metas += velocity * meta_stepsize
        print "Meta iteration {0} Validation loss {1} Test loss {2} Test err {3}"\
            .format(i, validation_loss, test_loss, test_err)
    return output, hyperparser.get(metas, 'fake_data')
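# The meta-level momentum step above, factored into a standalone helper for
# clarity (a sketch; names mirror the hyperparameters used in run()):
def momentum_step(params, velocity, grad, momentum, stepsize):
    """One heavy-ball update on the hyperparameter vector."""
    velocity = momentum * velocity - (1.0 - momentum) * grad
    return params + stepsize * velocity, velocity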
def run():
    val_images, val_labels, test_images, test_labels, _ = load_data(normalize=True)
    val_images = val_images[:N_val_data, :]
    val_labels = val_labels[:N_val_data, :]
    true_data_scale = np.std(val_images)
    test_images = test_images[:N_test_data, :]
    test_labels = test_labels[:N_test_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = len(parser.vect)

    npr.seed(0)
    init_fake_data = npr.randn(*(val_images[:N_fake_data, :].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes, N_classes)  # One of each.

    hyperparser = WeightsParser()
    hyperparser.add_weights('log_L2_reg', (1,))
    hyperparser.add_weights('fake_data', init_fake_data.shape)
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N
    hyperparser.set(metas, 'log_L2_reg', init_log_L2_reg)
    hyperparser.set(metas, 'fake_data', init_fake_data)

    def indexed_loss_fun(x, meta_params, idxs):   # To be optimized by SGD.
        L2_reg = np.exp(hyperparser.get(meta_params, 'log_L2_reg')[0])
        fake_data = hyperparser.get(meta_params, 'fake_data')
        return loss_fun(x, X=fake_data[idxs], T=fake_labels[idxs], L2_reg=L2_reg)

    def meta_loss_fun(x, meta_params):            # To be optimized in the outer loop.
        fake_data = hyperparser.get(meta_params, 'fake_data')
        log_prior = -fake_data_L2_reg * np.dot(fake_data.ravel(), fake_data.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):                         # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)

    output = []
    for i in range(N_meta_iter):
        print "L2 reg is ", np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), "| ",
        npr.seed(0)
        v0 = npr.randn(N_weights) * velocity_scale
        x0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, metas)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        test_err = frac_err(results['x_final'], test_images, test_labels)
        fake_data_scale = np.std(hyperparser.get(metas, 'fake_data')) / true_data_scale
        test_loss = test_loss_fun(results['x_final'])
        output.append((learning_curve, validation_loss, test_loss, fake_data_scale,
                       np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), test_err))
        metas -= results['dMd_meta'] * meta_stepsize  # Plain meta-gradient step.
        print "Meta iteration {0} Validation loss {1} Test loss {2} Test err {3}"\
            .format(i, validation_loss, test_loss, test_err)
    return output, hyperparser.get(metas, 'fake_data')