def test_numpy(X):
    dsize = X.shape[1]
    cov = X @ X.T
    # cov = cov/np.max(cov)
    precision = np.linalg.inv(cov)
    n = cov.shape[0]

    B = np.zeros((4, 4))
    lr = 0.01
    losses = []
    for i in range(100):
        R = B @ X - X
        G = 2 * R @ X.T
        np.fill_diagonal(G, 0)
        resvar = np.asarray([np.linalg.norm(r)**2 for r in R])
        losses.append(np.sum(resvar))

        D2 = np.diag(1 / resvar)
        precision2 = D2 @ (np.identity(n) - B)
        err = precision2 - precision
        loss2 = np.trace(err @ err.T)

        B = B - lr * G
        print(loss2)

    test_points = 10
    losses = np.asarray(losses)[:test_points]
    target_losses = [
        118., 41.150800000000004, 33.539355199999996, 29.747442032320002,
        27.450672271574934, 25.95846376879459, 24.917943341139274,
        24.139761502111114, 23.519544126307142, 22.998235729589265
    ]
    u.check_equal(losses[:test_points], target_losses[:test_points])
    print('mismatch is ', np.max(losses - target_losses))
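# Hedged aside (added for clarity, not part of the original test): test_numpy relies on the
# identity that if each row x_i of X is regressed exactly on the remaining rows (B has zero
# diagonal, residuals R = X - B @ X), then (I - B) @ X @ X.T = diag(||r_i||^2), so
# diag(1 / ||r_i||^2) @ (I - B) recovers inv(X @ X.T). The gradient loop above approaches this
# fixed point. A minimal standalone numpy sketch of the identity follows; the function name
# and variables are illustrative assumptions, not part of the repo.
import numpy as np


def _precision_from_regressions_sketch():
    rng = np.random.RandomState(0)
    n, T = 4, 50
    X = rng.randn(n, T)

    # exact least-squares regression of each row on all the other rows
    B = np.zeros((n, n))
    for i in range(n):
        others = [j for j in range(n) if j != i]
        coef, *_ = np.linalg.lstsq(X[others].T, X[i], rcond=None)
        B[i, others] = coef

    R = X - B @ X
    resvar = np.sum(R * R, axis=1)          # squared residual norm per row
    precision2 = np.diag(1.0 / resvar) @ (np.eye(n) - B)
    assert np.allclose(precision2, np.linalg.inv(X @ X.T))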
def test_diagonal_hessian():
    u.seed_random(1)
    A, model = create_toy_model()

    activations = {}

    def save_activations(layer, a, _):
        if layer != model.layers[0]:
            return
        activations[layer] = a

    with autograd_lib.module_hook(save_activations):
        Y = model(A.t())
        loss = torch.sum(Y * Y) / 2

    hess = [0]

    def compute_hess(layer, _, B):
        if layer != model.layers[0]:
            return
        A = activations[layer]
        hess[0] += torch.einsum("ni,nj->ij", B * B, A * A).reshape(-1)

    with autograd_lib.module_hook(compute_hess):
        autograd_lib.backprop_identity(Y, retain_graph=True)

    # check against autograd
    hess0 = u.hessian(loss, model.layers[0].weight).reshape([4, 4])
    u.check_equal(hess[0], torch.diag(hess0))

    # check against manual solution
    u.check_equal(hess[0], [425., 225., 680., 360.])
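# Hedged aside (added for clarity, not part of autograd_lib): the diagonal-Hessian recipe in
# test_diagonal_hessian uses the Gauss-Newton identity
#     diag(H)[o, i] = sum_{n, k} B[n, k, o]^2 * A[n, i]^2,
# where A are the layer inputs and B[n, k] is the backprop of output unit k. A minimal
# standalone check with plain torch.autograd; all names below are illustrative assumptions.
import torch


def _diag_hessian_sketch():
    torch.manual_seed(0)
    n, d_in, d_hid, d_out = 3, 2, 2, 2
    X = torch.randn(n, d_in)
    layer1 = torch.nn.Linear(d_in, d_hid, bias=False)
    layer2 = torch.nn.Linear(d_hid, d_out, bias=False)

    A = X                     # activations entering layer1
    H1 = layer1(X)            # layer1 output, kept so backprops can be read off it
    Y = layer2(H1)            # linear network, loss = sum(Y*Y)/2

    # accumulate sum_{n,k} B^2 * A^2, backpropagating one output unit k at a time
    diag = torch.zeros(d_hid, d_in)
    for k in range(d_out):
        one_hot = torch.zeros_like(Y)
        one_hot[:, k] = 1.0
        (B,) = torch.autograd.grad(Y, H1, grad_outputs=one_hot, retain_graph=True)
        diag += torch.einsum('no,ni->oi', B * B, A * A)

    # reference: full autograd Hessian of the loss with respect to layer1's weight
    def loss_of_w1(w1):
        return torch.sum(layer2(X @ w1.t()) ** 2) / 2

    H_full = torch.autograd.functional.hessian(loss_of_w1, layer1.weight.detach())
    H_full = H_full.reshape(d_hid * d_in, d_hid * d_in)
    assert torch.allclose(diag.reshape(-1), torch.diagonal(H_full))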
def test_full_hessian_multibatch():
    A, model = create_toy_model()
    data = A.t()
    data = data.repeat(3, 1)
    n = float(len(data))

    activations = {}
    hess = defaultdict(float)

    def save_activations(layer, a, _):
        activations[layer] = a

    def compute_hessian(layer, _, B):
        A = activations[layer]
        BA = torch.einsum("nl,ni->nli", B, A)
        hess[layer] += torch.einsum('nli,nkj->likj', BA, BA)

    for x in data:
        with autograd_lib.module_hook(save_activations):
            y = model(x)
            loss = torch.sum(y * y) / 2
        with autograd_lib.module_hook(compute_hessian):
            autograd_lib.backprop_identity(y)

    result = hess[model.layers[0]]

    # check result against autograd
    loss = u.least_squares(model(data), aggregation='sum')
    hess0 = u.hessian(loss, model.layers[0].weight)
    u.check_equal(hess0, result)
def test_kfac_hessian():
    A, model = create_toy_model()
    data = A.t()
    data = data.repeat(7, 1)
    n = float(len(data))

    activations = {}
    hess = defaultdict(lambda: AttrDefault(float))

    def save_activations(layer, a, _):
        activations[layer] = a

    def compute_hessian(layer, _, B):
        A = activations[layer]
        hess[layer].AA += torch.einsum("ni,nj->ij", A, A)
        hess[layer].BB += torch.einsum("ni,nj->ij", B, B)

    for x in data:
        with autograd_lib.module_hook(save_activations):
            y = model(x)
            o = y.shape[1]
            loss = torch.sum(y * y) / 2
        with autograd_lib.module_hook(compute_hessian):
            autograd_lib.backprop_identity(y)

    hess0 = hess[model.layers[0]]
    result = u.kron(hess0.BB / n, hess0.AA / o)

    # check result against autograd
    loss = u.least_squares(model(data), aggregation='sum')
    hess0 = u.hessian(loss, model.layers[0].weight).reshape(4, 4)
    u.check_equal(hess0, result)
def test_kfac_jacobian_mnist():
    u.seed_random(1)

    data_width = 3
    d = [data_width**2, 8, 10]
    model: u.SimpleMLP = u.SimpleMLP(d, nonlin=False)
    autograd_lib.register(model)

    batch_size = 4
    stats_steps = 2
    n = batch_size * stats_steps
    dataset = u.TinyMNIST(dataset_size=n, data_width=data_width,
                          original_targets=True)
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                              shuffle=False)
    train_iter = iter(trainloader)
    loss_fn = torch.nn.CrossEntropyLoss()

    activations = {}
    jacobians = defaultdict(lambda: AttrDefault(float))
    total_data = []

    # sum up statistics over n examples
    for train_step in range(stats_steps):
        data, targets = next(train_iter)
        total_data.append(data)

        activations = {}

        def save_activations(layer, A, _):
            activations[layer] = A
            jacobians[layer].AA += torch.einsum("ni,nj->ij", A, A)

        with autograd_lib.module_hook(save_activations):
            output = model(data)
            loss = loss_fn(output, targets)

        def compute_jacobian(layer, _, B):
            A = activations[layer]
            jacobians[layer].BB += torch.einsum("ni,nj->ij", B, B)
            jacobians[layer].diag += torch.einsum("ni,nj->ij", B * B, A * A)

        with autograd_lib.module_hook(compute_jacobian):
            autograd_lib.backward_jacobian(output)

    for layer in model.layers:
        jacobian0 = jacobians[layer]
        jacobian_full = torch.einsum('kl,ij->kilj', jacobian0.BB / n,
                                     jacobian0.AA / n)
        jacobian_diag = jacobian0.diag / n

        J = u.jacobian(model(torch.cat(total_data)), layer.weight)
        J_autograd = torch.einsum('noij,nokl->ijkl', J, J) / n

        u.check_equal(jacobian_full, J_autograd)
        u.check_equal(jacobian_diag, torch.einsum('ikik->ik', J_autograd))
def test_gradient_norms():
    """Per-example gradient norms."""
    u.seed_random(1)
    A, model = create_toy_model()

    activations = {}

    def save_activations(layer, a, _):
        if layer != model.layers[0]:
            return
        activations[layer] = a

    with autograd_lib.module_hook(save_activations):
        Y = model(A.t())
        loss = torch.sum(Y * Y) / 2

    norms = {}

    def compute_norms(layer, _, b):
        if layer != model.layers[0]:
            return
        a = activations[layer]
        del activations[layer]  # release memory kept by activations
        norms[layer] = (a * a).sum(dim=1) * (b * b).sum(dim=1)

    with autograd_lib.module_hook(compute_norms):
        loss.backward()

    u.check_equal(norms[model.layers[0]], [3493250, 9708800])
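# Hedged aside (added for clarity, not part of autograd_lib): test_gradient_norms uses the
# fact that a linear layer's per-example gradient is the outer product b_n a_n', so its
# squared Frobenius norm factors as ||a_n||^2 * ||b_n||^2. A minimal standalone check with
# plain torch; names below are illustrative assumptions.
import torch


def _per_example_grad_norm_sketch():
    torch.manual_seed(0)
    n, d_in, d_out = 4, 3, 2
    X = torch.randn(n, d_in)
    layer = torch.nn.Linear(d_in, d_out, bias=False)

    for i in range(n):
        x = X[i:i + 1]
        y = layer(x)
        loss = torch.sum(y * y) / 2
        (g,) = torch.autograd.grad(loss, layer.weight)

        a, b = x, y.detach()  # activations and backprops (dloss/dy = y for this loss)
        norm_sq_factored = (a * a).sum() * (b * b).sum()
        assert torch.allclose((g * g).sum(), norm_sq_factored)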
def vargroup_test():
    sess = tf.InteractiveSession()
    v1 = tf.Variable(1.)
    v2 = tf.Variable(1.)
    a = VarList([v1, v2])
    b = a.copy()
    sess.run(tf.global_variables_initializer())
    sess.run(a.sub(b, weight=1))
    u.check_equal(v1.eval(), 0)
def test_kfac_fisher_mnist():
    u.seed_random(1)

    data_width = 3
    d = [data_width**2, 8, 10]
    model: u.SimpleMLP = u.SimpleMLP(d, nonlin=False)
    autograd_lib.register(model)

    batch_size = 4
    stats_steps = 2
    n = batch_size * stats_steps
    dataset = u.TinyMNIST(dataset_size=n, data_width=data_width,
                          original_targets=True)
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                              shuffle=False)
    train_iter = iter(trainloader)
    loss_fn = torch.nn.CrossEntropyLoss()

    activations = {}
    fishers = defaultdict(lambda: AttrDefault(float))
    total_data = []

    # sum up statistics over n examples
    for train_step in range(stats_steps):
        data, targets = next(train_iter)
        total_data.append(data)

        activations = {}

        def save_activations(layer, A, _):
            activations[layer] = A
            fishers[layer].AA += torch.einsum("ni,nj->ij", A, A)

        with autograd_lib.module_hook(save_activations):
            output = model(data)
            loss = loss_fn(output, targets) * len(data)  # remove data normalization

        def compute_fisher(layer, _, B):
            A = activations[layer]
            fishers[layer].BB += torch.einsum("ni,nj->ij", B, B)
            fishers[layer].diag += torch.einsum("ni,nj->ij", B * B, A * A)

        with autograd_lib.module_hook(compute_fisher):
            autograd_lib.backward_jacobian(output)

    for layer in model.layers:
        fisher0 = fishers[layer]
        fisher_full = torch.einsum('kl,ij->kilj', fisher0.BB / n, fisher0.AA / n)
        fisher_diag = fisher0.diag / n
        u.check_equal(torch.einsum('ikik->ik', fisher_full), fisher_diag)
def test_full_hessian_xent_kfac2():
    """Test with uneven layers."""
    u.seed_random(1)
    torch.set_default_dtype(torch.float64)

    batch_size = 1
    d = [3, 2]
    o = d[-1]
    n = batch_size
    train_steps = 1

    model: u.SimpleModel = u.SimpleFullyConnected2(d, nonlin=True, bias=False)
    autograd_lib.register(model)
    loss_fn = torch.nn.CrossEntropyLoss()

    data = u.to_logits(torch.tensor([[0.7, 0.2, 0.1]]))
    targets = torch.tensor([0])

    data = data.repeat([3, 1])
    targets = targets.repeat([3])
    n = len(data)

    activations = {}
    hess = defaultdict(lambda: AttrDefault(float))

    for i in range(n):
        def save_activations(layer, A, _):
            activations[layer] = A
            hess[layer].AA += torch.einsum("ni,nj->ij", A, A)

        with autograd_lib.module_hook(save_activations):
            data_batch = data[i:i + 1]
            targets_batch = targets[i:i + 1]
            Y = model(data_batch)
            o = Y.shape[1]
            loss = loss_fn(Y, targets_batch)

        def compute_hess(layer, _, B):
            hess[layer].BB += torch.einsum("ni,nj->ij", B, B)

        with autograd_lib.module_hook(compute_hess):
            autograd_lib.backward_hessian(Y, loss='CrossEntropy')

    # expand
    hess_factored = hess[model.layers[0]]
    hess0 = torch.einsum('kl,ij->kilj', hess_factored.BB / n,
                         hess_factored.AA / o)  # hess for sum loss
    hess0 /= n  # hess for mean loss

    # check against autograd
    # 0.1459
    Y = model(data)
    loss = loss_fn(Y, targets)
    hess_autograd = u.hessian(loss, model.layers[0].weight)
    u.check_equal(hess_autograd, hess0)
def test_full_hessian_xent_multibatch():
    u.seed_random(1)
    torch.set_default_dtype(torch.float64)

    batch_size = 1
    d = [2, 2]
    o = d[-1]
    n = batch_size
    train_steps = 1

    model: u.SimpleModel = u.SimpleFullyConnected2(d, nonlin=True, bias=True)
    model.layers[0].weight.data.copy_(torch.eye(2))
    autograd_lib.register(model)
    loss_fn = torch.nn.CrossEntropyLoss()

    data = u.to_logits(torch.tensor([[0.7, 0.3]]))
    targets = torch.tensor([0])

    data = data.repeat([3, 1])
    targets = targets.repeat([3])
    n = len(data)

    activations = {}
    hess = defaultdict(float)

    def save_activations(layer, a, _):
        activations[layer] = a

    for i in range(n):
        with autograd_lib.module_hook(save_activations):
            data_batch = data[i:i + 1]
            targets_batch = targets[i:i + 1]
            Y = model(data_batch)
            loss = loss_fn(Y, targets_batch)

        def compute_hess(layer, _, B):
            A = activations[layer]
            BA = torch.einsum("nl,ni->nli", B, A)
            hess[layer] += torch.einsum('nli,nkj->likj', BA, BA)

        with autograd_lib.module_hook(compute_hess):
            autograd_lib.backward_hessian(Y, loss='CrossEntropy')

    # check against autograd
    # 0.1459
    Y = model(data)
    loss = loss_fn(Y, targets)
    hess_autograd = u.hessian(loss, model.layers[0].weight)

    hess0 = hess[model.layers[0]] / n
    u.check_equal(hess_autograd, hess0)
def test_full_hessian():
    u.seed_random(1)

    A, model = create_toy_model()
    data = A.t()
    # data = data.repeat(3, 1)

    activations = {}
    hess = defaultdict(float)

    def save_activations(layer, a, _):
        activations[layer] = a

    with autograd_lib.module_hook(save_activations):
        Y = model(A.t())
        loss = torch.sum(Y * Y) / 2

    def compute_hess(layer, _, B):
        A = activations[layer]
        n = A.shape[0]
        di = A.shape[1]
        do = B.shape[1]

        BA = torch.einsum("nl,ni->nli", B, A)
        hess[layer] += torch.einsum('nli,nkj->likj', BA, BA)

    with autograd_lib.module_hook(compute_hess):
        autograd_lib.backprop_identity(Y, retain_graph=True)

    # check against autograd
    hess_autograd = u.hessian(loss, model.layers[0].weight)
    hess0 = hess[model.layers[0]]
    u.check_equal(hess_autograd, hess0)

    # check against manual solution
    u.check_equal(hess0.reshape(4, 4), [[425, -75, 170, -30],
                                        [-75, 225, -30, 90],
                                        [170, -30, 680, -120],
                                        [-30, 90, -120, 360]])
def simple_newton_kfac_test(): tf.reset_default_graph() X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter= ",") Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter= ",") W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter= ",")) assert W0f.shape == (8, 1) fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter= ",").astype(np.int32) n = len(fs)-2 # number of layers u.check_equal(fs, [10,2,2,2]) def f(i): return fs[i+1] # W[i] has shape f[i] x f[i-1] dsize = X0.shape[1] assert f(-1) == dsize # load W0f and do shape checks (can remove) W0s = u.unflatten_np(W0f, fs[1:]) # Wf doesn't have first layer (data matrix) W0s.insert(0, X0) Wf_holder = tf.placeholder(dtype, shape=W0f.shape) Wf = tf.Variable(Wf_holder, name="Wf") Wf_copy = tf.Variable(Wf_holder, name="Wf_copy") init_dict = {Wf_holder: W0f} # Create W's W = u.unflatten(Wf, fs[1:]) X = tf.constant(X0) Y = tf.constant(Y0) W.insert(0, X) for (numpy_W, tf_W) in zip(W0s, W): u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape)) # Create A's # A[1] == X A = [0]*(n+2) A[0] = u.Identity(dsize) for i in range(n+1): A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1)) assert W[0].get_shape() == X0.shape assert A[n+1].get_shape() == X0.shape assert A[1].get_shape() == X0.shape err = Y - A[n+1] loss = tf.reduce_sum(tf.square(err))/(2*dsize) lr = tf.Variable(0.5, dtype=dtype, name="learning_rate") # Create B's B = [0]*(n+1) B[n] = -err/dsize Bn = [0]*(n+1) # Newton-modified backprop Bn[n] = u.Identity(f(n)) for i in range(n-1, -1, -1): B[i] = t(W[i+1]) @ B[i+1] Bn[i] = t(W[i+1]) @ Bn[i+1] # inverse Hessian blocks iblocks = u.empty_grid(n+1, n+1) for i in range(1, n+1): for j in range(1, n+1): # reuse Hess tensor calculation in order to get off-diag block sizes dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize; if i == j: acov = A[i] @ t(A[j]) bcov = Bn[i] @ t(Bn[j]) / dsize; term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov)) else: term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype) iblocks[i][j]=term # remove leftmost blocks (those are with respect to W[0] which is input) del iblocks[0] for row in iblocks: del row[0] ihess = u.concat_blocks(iblocks) sess = tf.Session() sess.run(tf.global_variables_initializer(), feed_dict=init_dict) # create dW's dW = [0]*(n+1) for i in range(n+1): dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i)) del dW[0] # get rid of W[0] update dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0) Wf_new = Wf - lr * ihess @ dWf train_op1 = Wf_copy.assign(Wf_new) train_op2 = Wf.assign(Wf_copy) expected_losses = np.loadtxt("data/rotations_simple_newtonkfac_losses.csv", delimiter= ",") observed_losses = [] # from accompanying notebook # {0.0111498, 0.0000171591, 4.11445*10^-11, 2.33653*10^-22, # 6.88354*10^-33, for i in range(10): observed_losses.append(sess.run([loss])[0]) sess.run(train_op1) sess.run(train_op2) u.check_equal(observed_losses, expected_losses)
def sparse_autoencoder_cost_tf(theta, visible_size, hidden_size, lambda_, sparsity_param, beta, data): # TODO: get rid of b # def f(i): return fs[i+1] # W[i] has shape f[i] x f[i-1] # fs = [10000, 28*28, 196, 28*28] # n = len(fs)-2 # W = [None]*n W1 = theta[0:hidden_size * visible_size].reshape(hidden_size, visible_size, order='F') W2 = theta[hidden_size * visible_size:2 * hidden_size * visible_size].reshape(visible_size, hidden_size, order='F') b1 = theta[2 * hidden_size * visible_size:2 * hidden_size * visible_size + hidden_size] b2 = theta[2 * hidden_size * visible_size + hidden_size:] init_dict = {} def init_var(val, name): holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder") var = tf.Variable(holder, name=name + "_var") init_dict[holder] = val return var W1_ = init_var(W1, "W1") W2_ = init_var(W2, "W2") b1_ = init_var(u.v2c_np(b1), "b1") b2_ = init_var(u.v2c_np(b2), "b2") data_ = init_var(data, "data") sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer(), feed_dict=init_dict) # Number of training examples m = data.shape[1] a1 = data a1_ = data_ # Forward propagation z2 = W1.dot(a1) + np.tile(b1, (m, 1)).transpose() z2_ = tf.matmul(W1_, a1_) + b1_ a2 = sigmoid(z2) a2_ = tf.sigmoid(z2_) z3 = W2.dot(a2) + np.tile(b2, (m, 1)).transpose() z3_ = tf.matmul(W2_, a2_) + b2_ a3 = sigmoid(z3) a3_ = tf.sigmoid(z3_) # Sparsity rho_hat = np.sum(a2, axis=1) / m rho_hat_ = tf.reduce_sum(a2_, axis=1, keep_dims=True) / m rho = np.tile(sparsity_param, hidden_size) # ValueError: Shape must be rank 1 but is rank 0 for 'Tile_2' (op: 'Tile') with input shapes: [], []. rho_ = tf.constant(sparsity_param, dtype=dtype) #tf.ones((hidden_size, 1), dtype=dtype)*sparsity_param u.check_equal(sess.run(a3_), a3) u.check_equal(sess.run(a2_), a2) u.check_equal(sess.run(a1_), a1) u.check_equal( tf.reduce_sum(KL_divergence_tf(rho_, rho_hat_)).eval(), np.sum(KL_divergence(rho, rho_hat))) # Cost function cost = np.sum((a3 - a1) ** 2) / (2 * m) + \ (lambda_ / 2) * (np.sum(W1 ** 2) + np.sum(W2 ** 2)) + \ beta * np.sum(KL_divergence(rho, rho_hat)) cost_ = tf.reduce_sum((a3_ - a1_) ** 2) / (2 * m) + \ (lambda_ / 2) * (tf.reduce_sum(W1_ ** 2) + \ tf.reduce_sum(W2_ ** 2)) + \ beta * tf.reduce_sum(KL_divergence_tf(rho_, rho_hat_)) return sess.run(cost_)
def relu_gradient_test():
    tf.reset_default_graph()

    X0 = np.genfromtxt('data/rotations_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)
    fs = np.genfromtxt('data/rotations_relu_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers
    u.check_equal(fs, [4, 2, 2, 2])

    def f(i): return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0, name="X0")
    Y = tf.constant(Y0, name="Y0")
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        if i == 0:
            A[i + 1] = X
        else:
            A[i + 1] = tf.nn.relu(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = (-err / dsize) * u.relu_mask(A[n + 1])
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        if i > 0:  # there's no relu on first matrix
            B[i] = B[i] * u.relu_mask(A[i + 1])

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/rotations_relu_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []
    # From accompanying notebook
    # {0.407751, 0.0683822, 0.0138657, 0.0039221, 0.00203637, 0.00164892,
    #  0.00156137, 0.00153857, 0.00153051, 0.00152593}
    for i in range(10):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
def simple_newton_bd_test(): tf.reset_default_graph() X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter= ",") Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter= ",") W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter= ",")) assert W0f.shape == (8, 1) fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter= ",").astype(np.int32) n = len(fs)-2 # number of layers u.check_equal(fs, [10,2,2,2]) def f(i): return fs[i+1] # W[i] has shape f[i] x f[i-1] dsize = X0.shape[1] assert f(-1) == dsize # load W0f and do shape checks (can remove) W0s = u.unflatten_np(W0f, fs[1:]) # Wf doesn't have first layer (data matrix) W0s.insert(0, X0) Wf_holder = tf.placeholder(dtype, shape=W0f.shape) Wf = tf.Variable(Wf_holder, name="Wf") Wf_copy = tf.Variable(Wf_holder, name="Wf_copy") init_dict = {Wf_holder: W0f} # Create W's W = u.unflatten(Wf, fs[1:]) X = tf.constant(X0) Y = tf.constant(Y0) W.insert(0, X) for (numpy_W, tf_W) in zip(W0s, W): u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape)) # Create A's # A[1] == X A = [0]*(n+2) A[0] = u.Identity(dsize) for i in range(n+1): A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1)) assert W[0].get_shape() == X0.shape assert A[n+1].get_shape() == X0.shape assert A[1].get_shape() == X0.shape err = Y - A[n+1] loss = tf.reduce_sum(tf.square(err))/(2*dsize) lr = tf.Variable(0.5, dtype=dtype, name="learning_rate") # Create B's B = [0]*(n+1) B[n] = -err/dsize Bn = [0]*(n+1) # Newton-modified backprop Bn[n] = u.Identity(f(n)) for i in range(n-1, -1, -1): B[i] = t(W[i+1]) @ B[i+1] Bn[i] = t(W[i+1]) @ Bn[i+1] # Create U's U = [list(range(n+1)) for _ in range(n+1)] for bottom in range(n+1): for top in range(n+1): if bottom > top: prod = u.Identity(f(top)) else: prod = u.Identity(f(bottom-1)) for i in range(bottom, top+1): prod = prod@t(W[i]) U[bottom][top] = prod # Block i, j gives hessian block between layer i and layer j blocks = [list(range(n+1)) for _ in range(n+1)] for i in range(1, n+1): for j in range(1, n+1): term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize; if i == j: term2 = tf.zeros((f(i)*f(i-1), f(i)*f(i-1)), dtype=dtype) elif i < j: term2 = kr(A[i] @ t(B[j]), U[i+1][j-1]) else: term2 = kr(t(U[j+1][i-1]), B[i] @ t(A[j])) blocks[i][j]=term1 + term2 @ Kmat(f(j), f(j-1)) # remove leftmost blocks (those are with respect to W[0] which is input) del blocks[0] for row in blocks: del row[0] #hess = u.concat_blocks(blocks) ihess = u.concat_blocks(u.block_diagonal_inverse(blocks)) # ihess = u.pseudo_inverse(hess) sess = tf.Session() sess.run(tf.global_variables_initializer(), feed_dict=init_dict) # create dW's dW = [0]*(n+1) for i in range(n+1): dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i)) del dW[0] # get rid of W[0] update dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0) Wf_new = Wf - lr * ihess @ dWf train_op1 = Wf_copy.assign(Wf_new) train_op2 = Wf.assign(Wf_copy) expected_losses = np.loadtxt("data/rotations_simple_newtonbd_losses.csv", delimiter= ",") observed_losses = [] # from accompanying notebook # 0.0111498, 0.0000171591, 4.11445*10^-11, 2.33652*10^-22, # 1.21455*10^-32, for i in range(10): observed_losses.append(sess.run([loss])[0]) sess.run(train_op1) sess.run(train_op2) u.check_equal(observed_losses, expected_losses)
def compare(a, b, msg):
    infnorm = np.linalg.norm(a - b, np.inf)
    l2norm = np.linalg.norm(a - b)
    m = min(np.max(a), np.max(b))
    print("Dtype %s, %s, l_inf %s, l2 %s" % (dtype_name, msg,
                                             infnorm / eps, l2norm / eps))

assert len(autograds) == n + 1
for i in range(1, n + 1):
    op = autograds[i].op
    assert op.op_def.name == 'MatMul'
    assert op.get_attr("transpose_a") == False
    assert op.get_attr("transpose_b") == True
    autoB = op.inputs[0]
    autoA = op.inputs[1]
    u.check_equal(op.inputs[1], A[i], rtol=1e-5, atol=1e-7)
    u.check_equal(autograds[i], dW[i], rtol=1e-5, atol=1e-7)
    u.check_equal(op.inputs[0], B[i] / dsize, rtol=1e-6, atol=1e-7)
    compare(A[i].eval(), autoA.eval(), "A[%d]" % (i,))
    compare((B[i] / dsize).eval(), autoB.eval(), "B[%d]" % (i,))
    compare(dW[i].eval(), autograds[i].eval(), "dW[%d]" % (i,))

lr0, loss0 = sess.run([lr, loss])
save_params_op.run()
util.dump32(Wf_copy, "%s_param_%d" % (prefix, step))
util.dump32(grad, "%s_grad_%d" % (prefix, step))
util.dump32(pre_grad, "%s_pre_grad_%d" % (prefix, step))
# util.dump32(A[1], "%s_param_%d"%(prefix, step))

# regular inverse becomes unstable when grad norm exceeds 1
def simple_gradient_test():
    tf.reset_default_graph()

    X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)
    fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers
    u.check_equal(fs, [10, 2, 2, 2])

    def f(i): return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(1.0, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/rotations_simple_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []
    # from accompanying notebook
    # {0.0111498, 0.00694816, 0.00429464, 0.00248228, 0.00159361,
    #  0.000957424, 0.000651653, 0.000423802, 0.000306749, 0.00021772,
    for i in range(20):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
# load W0f and do shape checks (can remove)
W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
W0s.insert(0, X0)
Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
Wf = tf.Variable(Wf_holder, name="Wf")
Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
init_dict = {Wf_holder: W0f}

# Create W's
W = u.unflatten(Wf, fs[1:])
X = tf.constant(X0)
Y = tf.constant(Y0)
W.insert(0, X)
for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

# Create A's
# A[1] == X
A = [0] * (n + 2)
A[0] = u.Identity(dsize)
for i in range(n + 1):
    A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

assert W[0].get_shape() == X0.shape
assert A[n + 1].get_shape() == X0.shape
assert A[1].get_shape() == X0.shape

err = Y - A[n + 1]
loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
[v.initializer.run() for v in opt_vars]

losses = []
u.record_time()
for i in range(num_steps):
    loss0 = model.loss.eval()
    losses.append(loss0)
    print("Loss ", loss0)

    kfac.model.advance_batch()
    kfac.update_stats()

    model.advance_batch()
    grad.update()
    grad_new.update()
    train_op.run()
    u.record_time()

if len(sys.argv) > 1 and sys.argv[1] == 'record':
    u.dump(losses, prefix + "_losses.csv")
    sys.exit()

u.summarize_time()

targets = np.loadtxt("data/kfac_refactor_test7_losses.csv", delimiter=",")
print("Difference is ", np.linalg.norm(np.asarray(losses) - targets))
result = u.check_equal(losses, targets, rtol=1e-4)
print("Test passed: %s" % (result,))
def main(): np.random.seed(args.seed) tf.set_random_seed(args.seed) logger = u.TensorboardLogger(args.run) with u.timeit("init/session"): rewrite_options = None try: from tensorflow.core.protobuf import rewriter_config_pb2 rewrite_options = rewriter_config_pb2.RewriterConfig( disable_model_pruning=True, constant_folding=rewriter_config_pb2.RewriterConfig.OFF, memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL) except: pass optimizer_options = tf.OptimizerOptions( opt_level=tf.OptimizerOptions.L0) graph_options = tf.GraphOptions(optimizer_options=optimizer_options, rewrite_options=rewrite_options) gpu_options = tf.GPUOptions(allow_growth=False) config = tf.ConfigProto(graph_options=graph_options, gpu_options=gpu_options, log_device_placement=False) sess = tf.InteractiveSession(config=config) u.register_default_session( sess) # since default session is Thread-local with u.timeit("init/model_init"): model = model_creator(args.batch_size, name="main") model.initialize_global_vars(verbose=True) model.initialize_local_vars() kfac_lib.numeric_inverse = args.numeric_inverse with u.timeit("init/kfac_init"): kfac = Kfac(model_creator, args.kfac_batch_size) kfac.model.initialize_global_vars(verbose=False) kfac.model.initialize_local_vars() kfac.Lambda.set(args.Lambda) kfac.reset() # resets optimization variables (not model variables) if args.mode != 'run': opt = tf.train.AdamOptimizer(0.001) else: opt = tf.train.AdamOptimizer(args.lr) grads_and_vars = opt.compute_gradients(model.loss, var_list=model.trainable_vars) grad = IndexedGrad.from_grads_and_vars(grads_and_vars) grad_new = kfac.correct(grad) with u.capture_vars() as adam_vars: train_op = opt.apply_gradients(grad_new.to_grads_and_vars()) with u.timeit("init/adam"): sessrun([v.initializer for v in adam_vars]) losses = [] u.record_time() start_time = time.time() vloss0 = 0 # todo, unify the two data outputs outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda) start_time = time.time() if args.extra_kfac_batch_advance: kfac.model.advance_batch() # advance kfac batch if args.kfac_async: kfac.start_stats_runners() for step in range(args.num_steps): if args.validate_every_n and step % args.validate_every_n == 0: loss0, vloss0 = sessrun([model.loss, model.vloss]) else: loss0, = sessrun([model.loss]) losses.append(loss0) # TODO: remove this logger('loss/loss', loss0, 'loss/vloss', vloss0) elapsed = time.time() - start_time start_time = time.time() print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" % (elapsed * 1e3, step, loss0, vloss0)) if args.method == 'kfac' and not args.kfac_async: kfac.model.advance_batch() kfac.update_stats() with u.timeit("train"): model.advance_batch() with u.timeit("grad.update"): grad.update() with kfac.read_lock(): grad_new.update() u.run(train_op) u.record_time() logger.next_step() # TODO: use u.global_runs_dir # TODO: get rid of u.timeit? with open('timelines/graphdef.txt', 'w') as f: f.write(str(u.get_default_graph().as_graph_def())) u.summarize_time() if args.mode == 'record': u.dump_with_prompt(losses, release_test_fn) elif args.mode == 'test': targets = np.loadtxt('data/' + release_test_fn, delimiter=",") u.check_equal(losses, targets, rtol=1e-2) u.summarize_difference(losses, targets) assert u.last_time() < 800, "Expected 648 on GTX 1080"
def main(): np.random.seed(args.seed) tf.set_random_seed(args.seed) logger = u.TensorboardLogger(args.run) with u.timeit("init/session"): gpu_options = tf.GPUOptions(allow_growth=False) sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options)) u.register_default_session(sess) # since default session is Thread-local with u.timeit("init/model_init"): model = model_creator(args.batch_size, name="main") model.initialize_global_vars(verbose=True) model.initialize_local_vars() with u.timeit("init/kfac_init"): kfac = Kfac(model_creator, args.kfac_batch_size) kfac.model.initialize_global_vars(verbose=False) kfac.model.initialize_local_vars() kfac.Lambda.set(args.Lambda) kfac.reset() # resets optimization variables (not model variables) if args.mode != 'run': opt = tf.train.AdamOptimizer(0.001) else: opt = tf.train.AdamOptimizer(args.lr) grads_and_vars = opt.compute_gradients(model.loss, var_list=model.trainable_vars) grad = IndexedGrad.from_grads_and_vars(grads_and_vars) grad_new = kfac.correct(grad) with u.capture_vars() as adam_vars: train_op = opt.apply_gradients(grad_new.to_grads_and_vars()) with u.timeit("init/adam"): sessrun([v.initializer for v in adam_vars]) losses = [] u.record_time() start_time = time.time() vloss0 = 0 # todo, unify the two data outputs outfn = 'data/%s_%f_%f.csv'%(args.run, args.lr, args.Lambda) writer = u.BufferedWriter(outfn, 60) # get rid? start_time = time.time() if args.extra_kfac_batch_advance: kfac.model.advance_batch() # advance kfac batch if args.kfac_async: kfac.start_stats_runners() for step in range(args.num_steps): if args.validate_every_n and step%args.validate_every_n == 0: loss0, vloss0 = sessrun([model.loss, model.vloss]) else: loss0, = sessrun([model.loss]) losses.append(loss0) # TODO: remove this logger('loss/loss', loss0, 'loss/vloss', vloss0) elapsed = time.time()-start_time print("%d sec, step %d, loss %.2f, vloss %.2f" %(elapsed, step, loss0, vloss0)) writer.write('%d, %f, %f, %f\n'%(step, elapsed, loss0, vloss0)) if args.method=='kfac' and not args.kfac_async: kfac.model.advance_batch() kfac.update_stats() with u.timeit("train"): model.advance_batch() grad.update() with kfac.read_lock(): grad_new.update() train_op.run() u.record_time() logger.next_step() # TODO: use u.global_runs_dir # TODO: get rid of u.timeit? with open('timelines/graphdef.txt', 'w') as f: f.write(str(u.get_default_graph().as_graph_def())) u.summarize_time() if args.mode == 'record': u.dump_with_prompt(losses, release_test_fn) elif args.mode == 'test': targets = np.loadtxt('data/'+release_test_fn, delimiter=",") u.check_equal(losses, targets, rtol=1e-2) u.summarize_difference(losses, targets)
def main(): u.seed_random(1) logdir = u.create_local_logdir(args.logdir) run_name = os.path.basename(logdir) gl.event_writer = SummaryWriter(logdir) print(f"Logging to {run_name}") d1 = args.data_width ** 2 assert args.data_width == args.targets_width o = d1 n = args.stats_batch_size d = [d1, 30, 30, 30, 20, 30, 30, 30, d1] # small values for debugging # loss_type = 'LeastSquares' loss_type = 'CrossEntropy' args.wandb = 0 args.stats_steps = 10 args.train_steps = 10 args.stats_batch_size = 10 args.data_width = 2 args.targets_width = 2 args.nonlin = False d1 = args.data_width ** 2 d2 = 2 d3 = args.targets_width ** 2 if loss_type == 'CrossEntropy': d3 = 10 o = d3 n = args.stats_batch_size d = [d1, d2, d3] dsize = max(args.train_batch_size, args.stats_batch_size)+1 model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin) model = model.to(gl.device) try: # os.environ['WANDB_SILENT'] = 'true' if args.wandb: wandb.init(project='curv_train_tiny', name=run_name) wandb.tensorboard.patch(tensorboardX=False) wandb.config['train_batch'] = args.train_batch_size wandb.config['stats_batch'] = args.stats_batch_size wandb.config['method'] = args.method wandb.config['n'] = n except Exception as e: print(f"wandb crash with {e}") #optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9) optimizer = torch.optim.Adam(model.parameters(), lr=0.03) # make 10x smaller for least-squares loss dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, dataset_size=dsize, original_targets=True) train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True) train_iter = u.infinite_iter(train_loader) stats_iter = None if not args.full_batch: stats_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=False, drop_last=True) stats_iter = u.infinite_iter(stats_loader) test_dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, train=False, dataset_size=dsize, original_targets=True) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True) test_iter = u.infinite_iter(test_loader) if loss_type == 'LeastSquares': loss_fn = u.least_squares elif loss_type == 'CrossEntropy': loss_fn = nn.CrossEntropyLoss() autograd_lib.add_hooks(model) gl.token_count = 0 last_outer = 0 val_losses = [] for step in range(args.stats_steps): if last_outer: u.log_scalars({"time/outer": 1000*(time.perf_counter() - last_outer)}) last_outer = time.perf_counter() with u.timeit("val_loss"): test_data, test_targets = next(test_iter) test_output = model(test_data) val_loss = loss_fn(test_output, test_targets) print("val_loss", val_loss.item()) val_losses.append(val_loss.item()) u.log_scalar(val_loss=val_loss.item()) # compute stats if args.full_batch: data, targets = dataset.data, dataset.targets else: data, targets = next(stats_iter) # Capture Hessian and gradient stats autograd_lib.enable_hooks() autograd_lib.clear_backprops(model) autograd_lib.clear_hess_backprops(model) with u.timeit("backprop_g"): output = model(data) loss = loss_fn(output, targets) loss.backward(retain_graph=True) with u.timeit("backprop_H"): autograd_lib.backprop_hess(output, hess_type=loss_type) autograd_lib.disable_hooks() # TODO(y): use remove_hooks with u.timeit("compute_grad1"): autograd_lib.compute_grad1(model) with u.timeit("compute_hess"): autograd_lib.compute_hess(model) for (i, layer) in enumerate(model.layers): # input/output layers are unreasonably 
expensive if not using Kronecker factoring if d[i]>50 or d[i+1]>50: print(f'layer {i} is too big ({d[i],d[i+1]}), skipping stats') continue if args.skip_stats: continue s = AttrDefault(str, {}) # dictionary-like object for layer stats ############################# # Gradient stats ############################# A_t = layer.activations assert A_t.shape == (n, d[i]) # add factor of n because backprop takes loss averaged over batch, while we need per-example loss B_t = layer.backprops_list[0] * n assert B_t.shape == (n, d[i + 1]) with u.timeit(f"khatri_g-{i}"): G = u.khatri_rao_t(B_t, A_t) # batch loss Jacobian assert G.shape == (n, d[i] * d[i + 1]) g = G.sum(dim=0, keepdim=True) / n # average gradient assert g.shape == (1, d[i] * d[i + 1]) u.check_equal(G.reshape(layer.weight.grad1.shape), layer.weight.grad1) if args.autograd_check: u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad) u.check_close(g.reshape(d[i + 1], d[i]), layer.weight.saved_grad) s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel() # proportion of activations that are zero s.mean_activation = torch.mean(A_t) s.mean_backprop = torch.mean(B_t) # empirical Fisher with u.timeit(f'sigma-{i}'): efisher = G.t() @ G / n sigma = efisher - g.t() @ g s.sigma_l2 = u.sym_l2_norm(sigma) s.sigma_erank = torch.trace(sigma)/s.sigma_l2 lambda_regularizer = args.lmb * torch.eye(d[i + 1]*d[i]).to(gl.device) H = layer.weight.hess with u.timeit(f"invH-{i}"): invH = torch.cholesky_inverse(H+lambda_regularizer) with u.timeit(f"H_l2-{i}"): s.H_l2 = u.sym_l2_norm(H) s.iH_l2 = u.sym_l2_norm(invH) with u.timeit(f"norms-{i}"): s.H_fro = H.flatten().norm() s.iH_fro = invH.flatten().norm() s.grad_fro = g.flatten().norm() s.param_fro = layer.weight.data.flatten().norm() u.nan_check(H) if args.autograd_check: model.zero_grad() output = model(data) loss = loss_fn(output, targets) H_autograd = u.hessian(loss, layer.weight) H_autograd = H_autograd.reshape(d[i] * d[i + 1], d[i] * d[i + 1]) u.check_close(H, H_autograd) # u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}') def loss_direction(dd: torch.Tensor, eps): """loss improvement if we take step eps in direction dd""" return u.to_python_scalar(eps * (dd @ g.t()) - 0.5 * eps ** 2 * dd @ H @ dd.t()) def curv_direction(dd: torch.Tensor): """Curvature in direction dd""" return u.to_python_scalar(dd @ H @ dd.t() / (dd.flatten().norm() ** 2)) with u.timeit(f"pinvH-{i}"): pinvH = u.pinv(H) with u.timeit(f'curv-{i}'): s.grad_curv = curv_direction(g) ndir = g @ pinvH # newton direction s.newton_curv = curv_direction(ndir) setattr(layer.weight, 'pre', pinvH) # save Newton preconditioner s.step_openai = s.grad_fro**2 / s.grad_curv if s.grad_curv else 999 s.step_max = 2 / s.H_l2 s.step_min = torch.tensor(2) / torch.trace(H) s.newton_fro = ndir.flatten().norm() # frobenius norm of Newton update s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2) # replace with "quadratic_form" s.regret_gradient = loss_direction(g, s.step_openai) with u.timeit(f'rho-{i}'): p_sigma = u.lyapunov_svd(H, sigma) if u.has_nan(p_sigma) and args.compute_rho: # use expensive method print('using expensive method') import pdb; pdb.set_trace() H0, sigma0 = u.to_numpys(H, sigma) p_sigma = scipy.linalg.solve_lyapunov(H0, sigma0) p_sigma = torch.tensor(p_sigma).to(gl.device) if u.has_nan(p_sigma): # import pdb; pdb.set_trace() s.psigma_erank = H.shape[0] s.rho = 1 else: s.psigma_erank = u.sym_erank(p_sigma) s.rho = H.shape[0] / s.psigma_erank with u.timeit(f"batch-{i}"): s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t()) 
s.diversity = torch.norm(G, "fro") ** 2 / torch.norm(g) ** 2 / n # Faster approaches for noise variance computation # s.noise_variance = torch.trace(H.inverse() @ sigma) # try: # # this fails with singular sigma # s.noise_variance = torch.trace(torch.solve(sigma, H)[0]) # # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0]) # pass # except RuntimeError as _: s.noise_variance_pinv = torch.trace(pinvH @ sigma) s.H_erank = torch.trace(H) / s.H_l2 s.batch_jain_simple = 1 + s.H_erank s.batch_jain_full = 1 + s.rho * s.H_erank u.log_scalars(u.nest_stats(layer.name, s)) # gradient steps with u.timeit('inner'): for i in range(args.train_steps): optimizer.zero_grad() data, targets = next(train_iter) model.zero_grad() output = model(data) loss = loss_fn(output, targets) loss.backward() # u.log_scalar(train_loss=loss.item()) if args.method != 'newton': optimizer.step() if args.weight_decay: for group in optimizer.param_groups: for param in group['params']: param.data.mul_(1-args.weight_decay) else: for (layer_idx, layer) in enumerate(model.layers): param: torch.nn.Parameter = layer.weight param_data: torch.Tensor = param.data param_data.copy_(param_data - 0.1 * param.grad) if layer_idx != 1: # only update 1 layer with Newton, unstable otherwise continue u.nan_check(layer.weight.pre) u.nan_check(param.grad.flatten()) u.nan_check(u.v2r(param.grad.flatten()) @ layer.weight.pre) param_new_flat = u.v2r(param_data.flatten()) - u.v2r(param.grad.flatten()) @ layer.weight.pre u.nan_check(param_new_flat) param_data.copy_(param_new_flat.reshape(param_data.shape)) gl.token_count += data.shape[0] gl.event_writer.close()
if step % report_frequency == 0:
    print("Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f "
          "ratio %.2f grad norm: %.2f pregrad norm: %.2f" % (
              step, loss0, target_delta, actual_delta, slope_ratio,
              grad_norm.eval(), pre_grad_norm.eval()))

if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in:
    # shrink if wrong prediction, don't shrink if prediction is tiny
    if slope_ratio < alpha and abs(target_delta) > 1e-6 and adaptive_step:
        print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
        print("Slope optimality %.2f, shrinking learning rate to %.2f" % (
            slope_ratio, lr0 * beta))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})
    # grow learning rate, slope_ratio .99 worked best for gradient
    elif step > 0 and i % 50 == 0 and slope_ratio > 0.90 and adaptive_step:
        print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
        print("Growing learning rate to %.2f" % (lr0 * growth_rate))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * growth_rate})

u.record_time()

u.summarize_time()

if len(sys.argv) > 1 and sys.argv[1] == 'record':
    u.dump(losses, prefix + "_losses.csv")
    sys.exit()

targets = np.loadtxt("data/" + prefix + "_losses.csv", delimiter=",")
print("Difference is ", np.linalg.norm(np.asarray(losses) - targets))
u.check_equal(losses, targets, rtol=1e-5)
print("Test passed")
def test_conv_grad():
    """Test per-example gradient computation for conv layer."""
    u.seed_random(1)
    N, Xc, Xh, Xw = 3, 2, 3, 7
    dd = [Xc, 2]

    Kh, Kw = 2, 3
    Oh, Ow = Xh - Kh + 1, Xw - Kw + 1
    model = u.SimpleConvolutional(dd, kernel_size=(Kh, Kw), bias=True).double()

    weight_buffer = model.layers[0].weight.data
    # output channels, input channels, height, width
    assert weight_buffer.shape == (dd[1], dd[0], Kh, Kw)

    input_dims = N, Xc, Xh, Xw
    size = int(np.prod(input_dims))
    X = torch.arange(0, size).reshape(*input_dims).double()

    def loss_fn(data):
        err = data.reshape(len(data), -1)
        return torch.sum(err * err) / 2 / len(data)

    layer = model.layers[0]
    output = model(X)
    loss = loss_fn(output)
    loss.backward()

    u.check_equal(layer.activations, X)
    assert layer.backprops_list[0].shape == layer.output.shape
    assert layer.output.shape == (N, dd[1], Oh, Ow)

    out_unf = layer.weight.view(layer.weight.size(0), -1) @ unfold(layer.activations, (Kh, Kw))
    assert out_unf.shape == (N, dd[1], Oh * Ow)
    reshaped_bias = layer.bias.reshape(1, dd[1], 1)  # (Co,) -> (1, Co, 1)
    out_unf = out_unf + reshaped_bias

    u.check_equal(fold(out_unf, (Oh, Ow), (1, 1)), output)
    # two alternative ways of reshaping
    u.check_equal(out_unf.view(N, dd[1], Oh, Ow), output)

    # Unfold produces patches with output dimension merged, while in backprop they are not merged
    # Hence merge the output (width/height) dimension
    assert unfold(layer.activations, (Kh, Kw)).shape == (N, Xc * Kh * Kw, Oh * Ow)
    assert layer.backprops_list[0].shape == (N, dd[1], Oh, Ow)

    grads_bias = layer.backprops_list[0].sum(dim=(2, 3)) * N
    mean_grad_bias = grads_bias.sum(dim=0) / N
    u.check_equal(mean_grad_bias, layer.bias.grad)

    Bt = layer.backprops_list[0] * N  # remove factor of N applied during loss batch averaging
    assert Bt.shape == (N, dd[1], Oh, Ow)
    Bt = Bt.reshape(N, dd[1], Oh * Ow)
    At = unfold(layer.activations, (Kh, Kw))
    assert At.shape == (N, dd[0] * Kh * Kw, Oh * Ow)

    grad_unf = torch.einsum('ijk,ilk->ijl', Bt, At)
    assert grad_unf.shape == (N, dd[1], dd[0] * Kh * Kw)

    grads = grad_unf.reshape((N, dd[1], dd[0], Kh, Kw))
    u.check_equal(grads.mean(dim=0), layer.weight.grad)

    # compute per-example gradients using autograd, compare against manual computation
    for i in range(N):
        u.clear_backprops(model)
        output = model(X[i:i + 1, ...])
        loss = loss_fn(output)
        loss.backward()
        u.check_equal(grads[i], layer.weight.grad)
        u.check_equal(grads_bias[i], layer.bias.grad)
if len(sys.argv) > 1 and sys.argv[1] == 'maketest':
    print("Generating test data.")
    costs = []
    for i in range(10):
        cost0, _ = sess.run([cost, train_op])
        costs.append(cost0)
    open("data/train_losses.csv", "w").write(str(costs))
elif len(sys.argv) > 1 and sys.argv[1] == 'test':
    print("Running self test.")
    costs = []
    for i in range(10):
        cost0, _ = sess.run([cost, train_op])
        costs.append(cost0)
    u.check_equal(costs, eval(open("data/train_losses.csv").read()),
                  rtol=1e-3, atol=1e-5)
    print("Test passed")
else:
    print("Running training.")
    do_images = True
    u.reset_time()
    old_cost = sess.run(cost)
    old_i = 0
    frame_count = 0
    if do_images:
        os.system("rm pics/weights*.png")
    for i in range(1000):
        cost0, _ = sess.run([cost, train_op])
        if i % 100 == 0:
def test_explicit_hessian():
    """Check computation of hessian of loss(B'WA) from
    https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf
    """
    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())
    u.check_equal(loss, 12512.5)

    # PyTorch autograd computes Hessian with respect to row-vectorized parameters, whereas
    # autograd_lib uses math convention and does column-vectorized.
    # Commuting order of Kronecker product switches between two representations
    u.check_equal(hess1.commute(), hess0)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()  # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    u.check_equal(hess0, u.Kron(B @ B.t(), A @ A.t()))

    # compute newton step
    u.check_equal(u.Kron(A @ A.t(), B @ B.t()).pinv() @ u.vec(G),
                  u.v2c([-5, -2, 0, -6]))

    # compute Newton step using factored representation
    autograd_lib.add_hooks(model)

    Y = model(A.t())
    n = 2
    loss = torch.sum(Y * Y) / 2
    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron',
                              vecr_order=False, loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120],
                          [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
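# Hedged aside (added for clarity, not part of the original test): the identity exercised in
# test_explicit_hessian, H = kron(A A', B B') for loss = ||B' X A||^2 / 2 under column-major
# vectorization (equivalently kron(B B', A A') under PyTorch's row-major flattening), also
# holds for random matrices. A standalone numeric check using only torch, assuming torch.kron
# is available (PyTorch >= 1.8); all names below are illustrative assumptions.
import torch


def _kron_hessian_sketch():
    torch.manual_seed(0)
    d = 3
    A = torch.randn(d, d, dtype=torch.float64)
    B = torch.randn(d, d, dtype=torch.float64)

    def loss_fn(X):
        Y = B.t() @ X @ A
        return torch.sum(Y * Y) / 2

    X0 = torch.randn(d, d, dtype=torch.float64)
    H = torch.autograd.functional.hessian(loss_fn, X0)  # shape (d, d, d, d)
    H = H.reshape(d * d, d * d)                         # row-major flattening of X
    H_kron = torch.kron(B @ B.t(), A @ A.t())           # row-major convention
    assert torch.allclose(H, H_kron)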
    B[i] = t(W[i+1]) @ B[i+1]

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# manually created gradients
dW = [None] * (n + 1)
for i in range(n + 1):
    dW[i] = -(B[i] @ t(A[i])) / dsize

# automatic gradients
grads = tf.gradients(loss, W)
for i in range(n + 1):
    u.check_equal(dW[i].eval(), grads[i].eval())

# first backprop is special since it's right input of matmul
op = grads[0].op
assert op.op_def.name == 'MatMul'
assert op.get_attr("transpose_a") == True
assert op.get_attr("transpose_b") == False
u.check_equal(op.inputs[0], W[1])
u.check_equal(op.inputs[1], -B[1] / dsize)

for i in range(1, n + 1):
    op = grads[i].op
    assert op.op_def.name == 'MatMul'
    assert op.get_attr("transpose_a") == False
    assert op.get_attr("transpose_b") == True
    u.check_equal(op.inputs[0], -B[i] / dsize)
def rotations2_newton_kfac(): tf.reset_default_graph() # override kr with no-shape-inferring version def kr(A, B): return u.kronecker(A, B, do_shape_inference=False) X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",") Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",") W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=",")) fs = np.genfromtxt('data/large_rotations2_fs.csv', delimiter=",").astype(np.int32) n = len(fs) - 2 # number of layers def f(i): return fs[i + 1] # W[i] has shape f[i] x f[i-1] dsize = X0.shape[1] assert f(-1) == dsize def f(i): return fs[i + 1] # W[i] has shape f[i] x f[i-1] dsize = X0.shape[1] assert f(-1) == dsize # load W0f and do shape checks (can remove) W0s = u.unflatten_np(W0f, fs[1:]) # Wf doesn't have first layer (data matrix) W0s.insert(0, X0) Wf_holder = tf.placeholder(dtype, shape=W0f.shape) Wf = tf.Variable(Wf_holder, name="Wf") Wf_copy = tf.Variable(Wf_holder, name="Wf_copy") init_dict = {Wf_holder: W0f} # Create W's W = u.unflatten(Wf, fs[1:]) X = tf.constant(X0) Y = tf.constant(Y0) W.insert(0, X) for (numpy_W, tf_W) in zip(W0s, W): u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape)) # Create A's # A[1] == X A = [0] * (n + 2) A[0] = u.Identity(dsize) for i in range(n + 1): A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1)) assert W[0].get_shape() == X0.shape assert A[n + 1].get_shape() == X0.shape assert A[1].get_shape() == X0.shape err = Y - A[n + 1] loss = tf.reduce_sum(tf.square(err)) / (2 * dsize) lr = tf.Variable(0.1, dtype=dtype, name="learning_rate") # Create B's B = [0] * (n + 1) B[n] = -err / dsize Bn = [0] * (n + 1) # Newton-modified backprop Bn[n] = u.Identity(f(n)) for i in range(n - 1, -1, -1): B[i] = t(W[i + 1]) @ B[i + 1] Bn[i] = t(W[i + 1]) @ Bn[i + 1] # inverse Hessian blocks iblocks = u.empty_grid(n + 1, n + 1) for i in range(1, n + 1): for j in range(1, n + 1): # reuse Hess tensor calculation in order to get off-diag block sizes dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize if i == j: acov = A[i] @ t(A[j]) bcov = (Bn[i] @ t(Bn[j])) / dsize term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov)) else: term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype) iblocks[i][j] = term # remove leftmost blocks (those are with respect to W[0] which is input) del iblocks[0] for row in iblocks: del row[0] ihess = u.concat_blocks(iblocks) sess = tf.Session() sess.run(tf.global_variables_initializer(), feed_dict=init_dict) # create dW's dW = [0] * (n + 1) for i in range(n + 1): dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i)) del dW[0] # get rid of W[0] update dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0) Wf_new = Wf - lr * ihess @ dWf train_op1 = Wf_copy.assign(Wf_new) train_op2 = Wf.assign(Wf_copy) observed_losses = [] elapsed_times = [] u.reset_time() for i in range(10): loss0 = sess.run([loss])[0] print(loss0) observed_losses.append(loss0) sess.run(train_op1) sess.run(train_op2) u.record_time() u.summarize_time() u.summarize_graph()
print("Growing learning rate to %.2f"%(lr0*growth_rate)) sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0*growth_rate}) u.record_time() # check against expected loss if 'Apple' in sys.version: pass # u.dump(losses, "kfac_small_final_mac.csv") targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",") else: pass # u.dump(losses, "kfac_small_final_linux.csv") targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",") if len(sys.argv)>1 and sys.argv[1]=="test": # GPU losses are quite noisy, set rtol high u.check_equal(targets, losses[:len(targets)], rtol=1e-3) u.dump(losses, "%s_losses_%d.csv"%(prefix ,whitening_mode,)) u.dump(step_lengths, "%s_step_lengths_%d.csv"%(prefix, whitening_mode,)) u.dump(ratios, "%s_ratios_%d.csv"%(prefix, whitening_mode,)) u.dump(grad_norms, "%s_grad_norms_%d.csv"%(prefix, whitening_mode,)) u.dump(pre_grad_norms, "%s_pre_grad_norms_%d.csv"%(prefix, whitening_mode,)) u.dump(pre_grad_stable_norms, "%s_pre_grad_stable_norms_%d.csv"%(prefix, whitening_mode,)) u.dump(target_delta_list, "%s_target_delta_%d.csv"%(prefix, whitening_mode,)) u.dump(target_delta2_list, "%s_target_delta2_%d.csv"%(prefix, whitening_mode,)) u.dump(actual_delta_list, "%s_actual_delta_%d.csv"%(prefix, whitening_mode,)) u.summarize_time()
def rotations1_gradient_test():
    # https://www.wolframcloud.com/objects/ff6ecaf0-fccd-44e3-b26f-970d8fc2a57c
    tf.reset_default_graph()

    X0 = np.genfromtxt('data/large_rotations1_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations1_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations1_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations1_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i): return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr0 = np.genfromtxt('data/large_rotations1_gradient_lr.csv')
    lr = tf.Variable(lr0, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/large_rotations1_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []
    # from accompanying notebook
    # {0.102522, 0.028124, 0.00907214, 0.00418929, 0.00293379,
    for i in range(10):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
def test_main(): parser = argparse.ArgumentParser(description='PyTorch MNIST Example') parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)') parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='learning rate (default: 0.01)') parser.add_argument('--momentum', type=float, default=0.5, metavar='M', help='SGD momentum (default: 0.5)') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument( '--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser.add_argument('--save-model', action='store_true', default=False, help='For Saving the current Model') parser.add_argument('--wandb', type=int, default=1, help='log to weights and biases') parser.add_argument('--autograd_check', type=int, default=0, help='autograd correctness checks') parser.add_argument('--logdir', type=str, default='/temp/runs/curv_train_tiny/run') parser.add_argument('--train_batch_size', type=int, default=100) parser.add_argument('--stats_batch_size', type=int, default=60000) parser.add_argument('--dataset_size', type=int, default=60000) parser.add_argument('--train_steps', type=int, default=100, help="this many train steps between stat collection") parser.add_argument('--stats_steps', type=int, default=1000000, help="total number of curvature stats collections") parser.add_argument('--nonlin', type=int, default=1, help="whether to add ReLU nonlinearity between layers") parser.add_argument('--method', type=str, choices=['gradient', 'newton'], default='gradient', help="descent method, newton or gradient") parser.add_argument('--layer', type=int, default=-1, help="restrict updates to this layer") parser.add_argument('--data_width', type=int, default=28) parser.add_argument('--targets_width', type=int, default=28) parser.add_argument('--lmb', type=float, default=1e-3) parser.add_argument( '--hess_samples', type=int, default=1, help='number of samples when sub-sampling outputs, 0 for exact hessian' ) parser.add_argument('--hess_kfac', type=int, default=0, help='whether to use KFAC approximation for hessian') parser.add_argument('--compute_rho', type=int, default=1, help='use expensive method to compute rho') parser.add_argument('--skip_stats', type=int, default=0, help='skip all stats collection') parser.add_argument('--full_batch', type=int, default=0, help='do stats on the whole dataset') parser.add_argument('--weight_decay', type=float, default=1e-4) #args = parser.parse_args() args = AttrDict() args.lmb = 1e-3 args.compute_rho = 1 args.weight_decay = 1e-4 args.method = 'gradient' args.logdir = '/tmp' args.data_width = 2 args.targets_width = 2 args.train_batch_size = 10 args.full_batch = False args.skip_stats = False args.autograd_check = False u.seed_random(1) logdir = u.create_local_logdir(args.logdir) run_name = os.path.basename(logdir) #gl.event_writer = SummaryWriter(logdir) gl.event_writer = u.NoOp() # print(f"Logging to {run_name}") # small values for debugging # loss_type = 'LeastSquares' loss_type = 'CrossEntropy' args.wandb = 0 args.stats_steps = 10 args.train_steps = 10 args.stats_batch_size = 10 args.data_width = 2 args.targets_width = 2 args.nonlin = False d1 = args.data_width**2 d2 = 2 
    d3 = args.targets_width ** 2

    d1 = args.data_width ** 2
    assert args.data_width == args.targets_width
    o = d1
    n = args.stats_batch_size
    d = [d1, 30, 30, 30, 20, 30, 30, 30, d1]

    if loss_type == 'CrossEntropy':
        d3 = 10
        o = d3
        n = args.stats_batch_size
        d = [d1, d2, d3]
    dsize = max(args.train_batch_size, args.stats_batch_size) + 1

    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin)
    model = model.to(gl.device)

    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    # optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss

    dataset = u.TinyMNIST(data_width=args.data_width,
                          targets_width=args.targets_width,
                          dataset_size=dsize, original_targets=True)
    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=args.train_batch_size,
                                               shuffle=False, drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_iter = None
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(dataset,
                                                   batch_size=args.stats_batch_size,
                                                   shuffle=False, drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width,
                               targets_width=args.targets_width, train=False,
                               dataset_size=dsize, original_targets=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.train_batch_size,
                                              shuffle=False, drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    elif loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.token_count = 0
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars({"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
        # print("val_loss", val_loss.item())
        val_losses.append(val_loss.item())
        u.log_scalar(val_loss=val_loss.item())

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)
        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()  # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)

        for (i, layer) in enumerate(model.layers):
            # input/output layers are unreasonably expensive if not using Kronecker factoring
            if d[i] > 50 or d[i + 1] > 50:
                print(f'layer {i} is too big ({d[i], d[i + 1]}), skipping stats')
                continue
            if args.skip_stats:
                continue

            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # add factor of n because backprop takes loss averaged over batch,
            # while we need per-example loss
            B_t = layer.backprops_list[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            u.check_equal(G.reshape(layer.weight.grad1.shape), layer.weight.grad1)

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]), layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()  # proportion of activations that are zero
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma) / s.sigma_l2

            lambda_regularizer = args.lmb * torch.eye(d[i + 1] * d[i]).to(gl.device)
            H = layer.weight.hess

            with u.timeit(f"invH-{i}"):
                invH = torch.cholesky_inverse(H + lambda_regularizer)

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1], d[i] * d[i + 1])
                u.check_close(H, H_autograd)
                # u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')

            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) - 0.5 * eps ** 2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() / (dd.flatten().norm() ** 2))

            with u.timeit(f"pinvH-{i}"):
                pinvH = H.pinverse()

            with u.timeit(f'curv-{i}'):
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre', pinvH)  # save Newton preconditioner
                s.step_openai = s.grad_fro ** 2 / s.grad_curv if s.grad_curv else 999
                s.step_max = 2 / s.H_l2
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm()  # frobenius norm of Newton update
                s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2)  # replace with "quadratic_form"
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_spectral(H, sigma)
                discrepancy = torch.max(abs(p_sigma - p_sigma.t()) / p_sigma)
                s.psigma_erank = u.sym_erank(p_sigma)
                s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                s.diversity = torch.norm(G, "fro") ** 2 / torch.norm(g) ** 2 / n

                # Faster approaches for noise variance computation
                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()
                # u.log_scalar(train_loss=loss.item())

                if args.method != 'newton':
                    optimizer.step()
                    if args.weight_decay:
                        for group in optimizer.param_groups:
                            for param in group['params']:
                                param.data.mul_(1 - args.weight_decay)
                else:
                    for (layer_idx, layer) in enumerate(model.layers):
                        param: torch.nn.Parameter = layer.weight
                        param_data: torch.Tensor = param.data
                        param_data.copy_(param_data - 0.1 * param.grad)
                        if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                            continue
                        u.nan_check(layer.weight.pre)
                        u.nan_check(param.grad.flatten())
                        u.nan_check(u.v2r(param.grad.flatten()) @ layer.weight.pre)
                        param_new_flat = u.v2r(param_data.flatten()) - u.v2r(param.grad.flatten()) @ layer.weight.pre
                        u.nan_check(param_new_flat)
                        param_data.copy_(param_new_flat.reshape(param_data.shape))

                gl.token_count += data.shape[0]

    gl.event_writer.close()

    assert val_losses[0] > 2.4   # 2.4828238487243652
    assert val_losses[-1] < 2.25  # 2.20609712600708
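
# Sketch (not part of the original test suite): the Newton branch above updates a layer
# as vec(W) <- vec(W) - vec(grad) @ pinv(H), with pinv(H) saved earlier in layer.weight.pre.
# The hypothetical helper below illustrates that update on a plain least-squares problem,
# where one exact Newton step should land on the closed-form solution. It assumes a recent
# PyTorch with torch.kron and torch.linalg available; all names here are illustrative only.
def _newton_step_sketch():
    torch.manual_seed(0)
    d_out, d_in, num = 3, 4, 10
    A = torch.randn(num, d_in, dtype=torch.float64)
    Y = torch.randn(num, d_out, dtype=torch.float64)
    W = torch.zeros(d_out, d_in, dtype=torch.float64, requires_grad=True)

    # mean least-squares loss for a single linear layer
    loss = 0.5 * torch.sum((A @ W.t() - Y) ** 2) / num
    loss.backward()

    # Hessian w.r.t. the row-major flattened W is kron(I, A.T @ A / num)
    H = torch.kron(torch.eye(d_out, dtype=torch.float64), A.t() @ A / num)
    pinvH = torch.linalg.pinv(H)

    with torch.no_grad():
        w_new = W.flatten() - W.grad.flatten() @ pinvH
        W_new = w_new.reshape(d_out, d_in)

    # one full Newton step on a quadratic recovers the least-squares solution
    W_star = torch.linalg.lstsq(A, Y).solution.t()
    assert torch.allclose(W_new, W_star, atol=1e-8)
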
def rotations2_newton_bd():
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv', delimiter=",").astype(np.int32)

    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)), dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))

            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
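
# Sketch (hypothetical, not from the original file): the diagonal Hessian block above is
# term1 = kron(A[i] @ A[i].T, Bn[i] @ Bn[i].T) / dsize. For a single linear layer (Bn = I)
# with loss ||Y - W @ A||^2 / (2 * dsize), this is the exact Hessian; under numpy's
# row-major flattening the same matrix reads kron(I, A @ A.T) / dsize. The check below
# verifies that row-major form against a finite-difference Hessian on a tiny random problem.
def _single_layer_hessian_sketch():
    rng = np.random.RandomState(0)
    fout, fin, dsize = 2, 3, 5
    A = rng.randn(fin, dsize)
    Y = rng.randn(fout, dsize)
    W0 = rng.randn(fout, fin)

    def loss(wf):
        Wm = wf.reshape(fout, fin)
        err = Y - Wm @ A
        return np.sum(err * err) / (2 * dsize)

    analytic = np.kron(np.eye(fout), A @ A.T) / dsize

    # central-difference Hessian; exact up to roundoff since the loss is quadratic in W
    eps = 1e-3
    k = fout * fin
    numeric = np.zeros((k, k))
    w0 = W0.flatten()
    for p in range(k):
        for q in range(k):
            def shifted(sp, sq):
                w = w0.copy()
                w[p] += sp * eps
                w[q] += sq * eps
                return loss(w)
            numeric[p, q] = (shifted(1, 1) - shifted(1, -1)
                             - shifted(-1, 1) + shifted(-1, -1)) / (4 * eps ** 2)

    assert np.allclose(analytic, numeric, atol=1e-6)
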