def test_sgd_parser():
    N_weights = 6
    W0 = 0.1 * npr.randn(N_weights)
    N_data = 12
    batch_size = 4
    num_epochs = 4
    batch_idxs = BatchList(N_data, batch_size)
    parser = VectorParser()
    parser.add_shape('first',  [2,])
    parser.add_shape('second', [1,])
    parser.add_shape('third',  [3,])
    N_weight_types = 3
    alphas = 0.1 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    betas = 0.5 + 0.2 * npr.rand(len(batch_idxs) * num_epochs, N_weight_types)
    meta = 0.1 * npr.randn(N_weights * 2)
    A = npr.randn(N_data, N_weights)

    def loss_fun(W, meta, i=None):
        idxs = batch_idxs.all_idxs if i is None else batch_idxs[i % len(batch_idxs)]
        sub_A = A[idxs, :]
        return np.dot(np.dot(W + meta[:N_weights] + meta[N_weights:],
                             np.dot(sub_A.T, sub_A)), W)

    def full_loss(params):
        (W0, alphas, betas, meta) = params
        result = sgd_parsed(grad(loss_fun), kylist(W0, alphas, betas, meta),
                            parser)
        return loss_fun(result, meta)

    d_num = nd(full_loss, (W0, alphas, betas, meta))
    d_an_fun = grad(full_loss)
    d_an = d_an_fun([W0, alphas, betas, meta])
    for i, (an, num) in enumerate(zip(d_an, d_num[0])):
        assert np.allclose(an, num, rtol=1e-3, atol=1e-4), \
            "Type {0}, diffs are: {1}".format(i, an - num)
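# For reference, a minimal sketch of the kind of central-difference checker
# that `nd` is assumed to be above (hypothetical stand-in; the real `nd` is
# imported from this repo's test utilities and may differ in signature).
def numeric_grad_sketch(fun, args, eps=1e-4):
    """Central-difference gradients of fun(args) w.r.t. each array in args."""
    args = [np.array(a, dtype=float) for a in args]
    grads = []
    for a in args:
        g = np.zeros_like(a)
        for idx in np.ndindex(a.shape):
            a[idx] += eps
            f_plus = fun(args)
            a[idx] -= 2 * eps
            f_minus = fun(args)
            a[idx] += eps                      # restore the perturbed entry
            g[idx] = (f_plus - f_minus) / (2 * eps)
        grads.append(g)
    return grads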
def make_parabola(d):
    parser = VectorParser()
    parser.add_shape('weights', d)
    dimscale = np.exp(np.linspace(-3, 3, d))
    offset = npr.randn(d)

    def loss(w, X=0.0, T=0.0, L2_reg=0.0):
        return np.dot((w - offset) * dimscale, (w - offset))

    return parser, loss
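# Usage sketch for make_parabola, assuming `grad` is the autograd-style
# gradient operator already imported in this file: plain gradient descent on
# the axis-scaled quadratic, which is minimized at the hidden `offset`.
parser, parabola_loss = make_parabola(5)
parabola_grad = grad(parabola_loss)
w = np.zeros(5)
for _ in range(500):
    w = w - 0.04 * parabola_grad(w)
# parabola_loss(w) is now far below its starting value; w approaches `offset`,
# most slowly along the smallest-scale coordinates.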
def make_toy_funs():
    parser = VectorParser()
    parser.add_shape('weights', 2)

    def rosenbrock(x):
        return sum(100.0 * (x[1:] - x[:-1]**2.0)**2.0 + (1 - x[:-1])**2.0)

    def loss(W_vect, X=0.0, T=0.0, L2_reg=0.0):
        return 500 * logit(rosenbrock(W_vect) / 500)

    return parser, loss
def make_toy_funs():
    parser = VectorParser()
    parser.add_shape('weights', 2)

    def rosenbrock(w):
        x = w[1:]
        y = w[:-1]
        return sum(100.0 * (x - y**2.0)**2.0 + (1 - y)**2.0 + 200.0 * y)

    def loss(W_vect, X=0.0, T=0.0, L2_reg=0.0):
        return 800 * logit(rosenbrock(W_vect) / 500)

    return parser, loss
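# Usage sketch for make_toy_funs, assuming this repo's `logit` is the
# logistic sigmoid: the loss is then a saturating squash of the (shifted)
# Rosenbrock objective, bounded between 0 and the 800 prefactor.
parser, toy_loss = make_toy_funs()
toy_grad = grad(toy_loss)
w = npr.randn(2)
print(toy_loss(w))   # scalar loss value
print(toy_grad(w))   # gradient w.r.t. the two weights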
        m = b1t * g + (1 - b1t) * m               # first-moment estimate
        v = b2 * (g**2) + (1 - b2) * v            # second-moment estimate
        mhat = m / (1 - (1 - b1)**(i + 1))        # bias corrections
        vhat = v / (1 - (1 - b2)**(i + 1))
        x -= step_size * mhat / (np.sqrt(vhat) + eps)
    return x

# --
# Make NN functions
parser = VectorParser()
for i, shape in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
    parser.add_shape(('weights', i), shape)
    parser.add_shape(('biases', i), (1, shape[1]))

def pred_fun(W_vect, X):
    """Outputs normalized log-probabilities."""
    W = parser.new_vect(W_vect)
    cur_units = X
    N_iter = len(layer_sizes) - 1
    for i in range(N_iter):
        cur_W = W[('weights', i)]
        cur_B = W[('biases', i)]
        cur_units = np.dot(cur_units, cur_W) + cur_B
        if i == (N_iter - 1):
            # Final layer: subtract logsumexp to normalize.
            cur_units = cur_units - logsumexp(cur_units, axis=1)
        else:
            cur_units = np.tanh(cur_units)  # hidden nonlinearity (tanh assumed,
                                            # matching this repo's other nets)
    return cur_units
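# Illustrative smoke test for the pieces above (hypothetical helper, not part
# of the original file). It assumes the repo's logsumexp keeps its reduced
# axis (keepdims) so the subtraction in pred_fun broadcasts row-wise; rows of
# exp(pred_fun(...)) should then sum to one.
def smoke_test_pred_fun():
    # Total parameter count: one (m x n) weight matrix plus an n-vector of
    # biases per layer, matching the shapes registered with the parser.
    N_weights = sum((m + 1) * n
                    for m, n in zip(layer_sizes[:-1], layer_sizes[1:]))
    W = 0.1 * npr.randn(N_weights)
    X = npr.randn(5, layer_sizes[0])
    log_probs = pred_fun(W, X)   # shape (5, layer_sizes[-1])
    assert np.allclose(np.exp(log_probs).sum(axis=1), 1.0)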