def cost1(theta, visible_size, hidden_size, lambda_, sparsity_param, beta, data0):
    """Build the sparse-autoencoder loss as a TensorFlow tensor.

    ``theta`` is sliced into two weight matrices (reshaped in Fortran order)
    and two bias vectors. Each piece, and ``data0``, is wrapped in a variable
    whose initial value is supplied through a placeholder. An interactive
    session is opened and the variables are initialised before the loss ops
    are added; the session is left open.

    Returns:
        ``(init_dict, cost)``: the placeholder -> initial-value feed dict and
        the loss tensor.
    """
    n_weights = hidden_size * visible_size
    bias_start = 2 * n_weights

    enc_w0 = theta[0:n_weights].reshape(hidden_size, visible_size, order='F')
    dec_w0 = theta[n_weights:bias_start].reshape(visible_size, hidden_size, order='F')
    enc_b0 = theta[bias_start:bias_start + hidden_size]
    dec_b0 = theta[bias_start + hidden_size:]

    init_dict = {}

    def make_variable(value, label, trainable=True):
        # The initial value travels through a placeholder and is recorded in
        # init_dict so it can be fed to the initializer.
        feed = tf.placeholder(dtype, shape=value.shape, name=label + "_holder")
        init_dict[feed] = value
        return tf.Variable(feed, name=label + "_var", trainable=trainable)

    W1 = make_variable(enc_w0, "W1")
    W2 = make_variable(dec_w0, "W2")
    b1 = make_variable(u.v2c_np(enc_b0), "b1")
    b2 = make_variable(u.v2c_np(dec_b0), "b2")
    data = make_variable(data0, "data", False)

    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # Number of training examples
    # todo Rename data to x
    m = data0.shape[1]

    # Forward propagation
    hidden = tf.sigmoid(tf.matmul(W1, data) + b1)
    output = tf.sigmoid(tf.matmul(W2, hidden) + b2)

    # Sparsity
    rho_hat = tf.reduce_sum(hidden, axis=1, keep_dims=True) / m
    rho = tf.constant(sparsity_param, dtype=dtype)

    # Cost function: reconstruction error + weight decay + sparsity penalty,
    # summed in that order.
    reconstruction = tf.reduce_sum((output - data) ** 2) / (2 * m)
    weight_decay = (lambda_ / 2) * (tf.reduce_sum(W1 ** 2) + tf.reduce_sum(W2 ** 2))
    cost = reconstruction + weight_decay
    cost = cost + beta * tf.reduce_sum(KL_divergence_tf(rho, rho_hat))
    return init_dict, cost
def sparse_autoencoder_cost_tf(theta, visible_size, hidden_size, lambda_,
                               sparsity_param, beta, data):
    """Evaluate the sparse-autoencoder cost in TensorFlow, cross-checked with NumPy.

    ``theta`` is sliced into two weight matrices (Fortran-order reshape) and
    two bias vectors. The forward pass is computed twice, once with NumPy and
    once with TensorFlow (names ending in ``_``), and the activations and the
    KL sparsity term are compared with ``u.check_equal``.

    Returns:
        The value of the TensorFlow cost tensor as returned by ``sess.run``.
    """
    # TODO: get rid of b
    # def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
    # fs = [10000, 28*28, 196, 28*28]
    # n = len(fs)-2
    # W = [None]*n
    W1 = theta[0:hidden_size * visible_size].reshape(hidden_size, visible_size, order='F')
    W2 = theta[hidden_size * visible_size:2 * hidden_size * visible_size].reshape(
        visible_size, hidden_size, order='F')
    b1 = theta[2 * hidden_size * visible_size:2 * hidden_size * visible_size + hidden_size]
    b2 = theta[2 * hidden_size * visible_size + hidden_size:]

    init_dict = {}

    def init_var(val, name):
        # Initial values are supplied through placeholders collected in init_dict.
        holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder")
        var = tf.Variable(holder, name=name + "_var")
        init_dict[holder] = val
        return var

    W1_ = init_var(W1, "W1")
    W2_ = init_var(W2, "W2")
    b1_ = init_var(u.v2c_np(b1), "b1")
    b2_ = init_var(u.v2c_np(b2), "b2")
    data_ = init_var(data, "data")

    sess = tf.InteractiveSession()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

        # Number of training examples
        m = data.shape[1]
        a1 = data
        a1_ = data_

        # Forward propagation
        z2 = W1.dot(a1) + np.tile(b1, (m, 1)).transpose()
        z2_ = tf.matmul(W1_, a1_) + b1_
        a2 = sigmoid(z2)
        a2_ = tf.sigmoid(z2_)
        z3 = W2.dot(a2) + np.tile(b2, (m, 1)).transpose()
        z3_ = tf.matmul(W2_, a2_) + b2_
        a3 = sigmoid(z3)
        a3_ = tf.sigmoid(z3_)

        # Sparsity
        rho_hat = np.sum(a2, axis=1) / m
        rho_hat_ = tf.reduce_sum(a2_, axis=1, keep_dims=True) / m
        rho = np.tile(sparsity_param, hidden_size)
        # ValueError: Shape must be rank 1 but is rank 0 for 'Tile_2'
        # (op: 'Tile') with input shapes: [], [].
        rho_ = tf.constant(sparsity_param, dtype=dtype)
        # tf.ones((hidden_size, 1), dtype=dtype)*sparsity_param

        u.check_equal(sess.run(a3_), a3)
        u.check_equal(sess.run(a2_), a2)
        u.check_equal(sess.run(a1_), a1)
        # .eval() uses the InteractiveSession, which is the default session
        # until it is closed.
        u.check_equal(
            tf.reduce_sum(KL_divergence_tf(rho_, rho_hat_)).eval(),
            np.sum(KL_divergence(rho, rho_hat)))

        # Cost function (only the TensorFlow value is returned).
        cost_ = tf.reduce_sum((a3_ - a1_) ** 2) / (2 * m) + \
            (lambda_ / 2) * (tf.reduce_sum(W1_ ** 2) +
                             tf.reduce_sum(W2_ ** 2)) + \
            beta * tf.reduce_sum(KL_divergence_tf(rho_, rho_hat_))
        return sess.run(cost_)
    finally:
        # Each call opens a new InteractiveSession; close it so sessions do
        # not pile up across calls.
        sess.close()
def simple_newton_kfac_test():
    """Run 10 preconditioned steps on the 'rotations_simple' data and check losses.

    The preconditioner ``ihess`` is block diagonal: each diagonal block is
    ``kr(pinv(A[i] A[i]^T), pinv(Bn[i] Bn[i]^T / dsize))`` and every
    off-diagonal block is zero. The loss recorded before each step is compared
    with ``data/rotations_simple_newtonkfac_losses.csv`` via ``u.check_equal``.
    """
    tf.reset_default_graph()
    X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)

    fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers
    u.check_equal(fs, [10, 2, 2, 2])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = Bn[i] @ t(Bn[j]) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                # Block (i, j) pairs the f(i)*f(i-1) entries of W[i] with the
                # f(j)*f(j-1) entries of W[j]; state that shape directly
                # instead of building a Kronecker product just to read it.
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]
    ihess = u.concat_blocks(iblocks)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # The update is split into two ops (compute into the copy, then copy
    # back) that are run as separate session calls.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    expected_losses = np.loadtxt("data/rotations_simple_newtonkfac_losses.csv",
                                 delimiter=",")
    observed_losses = []

    # from accompanying notebook
    # {0.0111498, 0.0000171591, 4.11445*10^-11, 2.33653*10^-22,
    #  6.88354*10^-33,
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        for _ in range(10):
            observed_losses.append(sess.run([loss])[0])
            sess.run(train_op1)
            sess.run(train_op2)
    finally:
        # Release the session even if a step raises.
        sess.close()

    u.check_equal(observed_losses, expected_losses)
def relu_gradient_test():
    """Run 10 gradient steps on a ReLU network and check the losses.

    Every layer after the input applies ``tf.nn.relu``; the backprop matrices
    are masked with ``u.relu_mask`` accordingly. The loss recorded before each
    step is compared with ``data/rotations_relu_gradient_losses.csv`` via
    ``u.check_equal``.
    """
    tf.reset_default_graph()
    X0 = np.genfromtxt('data/rotations_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)

    fs = np.genfromtxt('data/rotations_relu_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers
    u.check_equal(fs, [4, 2, 2, 2])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0, name="X0")
    Y = tf.constant(Y0, name="Y0")
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        if i == 0:
            A[i + 1] = X
        else:
            A[i + 1] = tf.nn.relu(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = (-err / dsize) * u.relu_mask(A[n + 1])
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        if i > 0:  # there's no relu on first matrix
            B[i] = B[i] * u.relu_mask(A[i + 1])

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    # The update is split into two ops (compute into the copy, then copy
    # back) that are run as separate session calls.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    expected_losses = np.loadtxt("data/rotations_relu_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []

    # From accompanying notebook
    # {0.407751, 0.0683822, 0.0138657, 0.0039221, 0.00203637, 0.00164892,
    #  0.00156137, 0.00153857, 0.00153051, 0.00152593}
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        for _ in range(10):
            observed_losses.append(sess.run([loss])[0])
            sess.run(train_op1)
            sess.run(train_op2)
    finally:
        # Release the session even if a step raises.
        sess.close()

    u.check_equal(observed_losses, expected_losses)
def simple_newton_bd_test():
    """Run 10 preconditioned steps on the 'rotations_simple' data and check losses.

    Hessian blocks are assembled for every pair of layers and then passed
    through ``u.block_diagonal_inverse`` to form the preconditioner ``ihess``.
    The loss recorded before each step is compared with
    ``data/rotations_simple_newtonbd_losses.csv`` via ``u.check_equal``.
    """
    tf.reset_default_graph()
    X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)

    fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers
    u.check_equal(fs, [10, 2, 2, 2])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's: U[bottom][top] is the product t(W[bottom]) @ ... @ t(W[top]),
    # or an identity when bottom > top.
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)), dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))
            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    # The blocks go through u.block_diagonal_inverse; the alternative of
    # pseudo-inverting the full concatenated Hessian is not used here.
    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # The update is split into two ops (compute into the copy, then copy
    # back) that are run as separate session calls.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    expected_losses = np.loadtxt("data/rotations_simple_newtonbd_losses.csv",
                                 delimiter=",")
    observed_losses = []

    # from accompanying notebook
    # 0.0111498, 0.0000171591, 4.11445*10^-11, 2.33652*10^-22,
    # 1.21455*10^-32,
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        for _ in range(10):
            observed_losses.append(sess.run([loss])[0])
            sess.run(train_op1)
            sess.run(train_op2)
    finally:
        # Release the session even if a step raises.
        sess.close()

    u.check_equal(observed_losses, expected_losses)
def grad_update(new_val):
    """Return an op that stores ``new_val`` in ``Wf_copy`` and then assigns it to ``Wf``.

    ``Wf`` and ``Wf_copy`` are module-level names; in the visible code they
    are only bound inside the ``__main__`` block below, so this function is
    usable only after that block has run.
    """
    copy_op = Wf_copy.assign(new_val)
    # The assignment into Wf is created under a control dependency on the
    # copy op.
    with tf.control_dependencies([copy_op]):
        train_op = Wf.assign(Wf_copy)
    return train_op


if __name__ == '__main__':
    # Compare a set of algorithms on rotations problem
    # Inputs, targets, flattened initial weights and layer sizes are read
    # from CSV files under data/.
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter= ",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter= ",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter= ","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv', delimiter= ",").astype(np.int32)
    n = len(fs)-2    # number of layers

    def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    # f(-1) is fs[0]; it must match the number of columns of X0.
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    # Wf and Wf_copy are both initialised from the same placeholder.
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
def simple_gradient_test():
    """Run 20 plain gradient steps on the 'rotations_simple' data and check losses.

    The loss recorded before each step is compared with
    ``data/rotations_simple_gradient_losses.csv`` via ``u.check_equal``.
    """
    tf.reset_default_graph()
    X0 = np.genfromtxt('data/rotations_simple_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/rotations_simple_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv', delimiter=","))
    assert W0f.shape == (8, 1)

    fs = np.genfromtxt('data/rotations_simple_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers
    u.check_equal(fs, [10, 2, 2, 2])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(1.0, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    # The update is split into two ops (compute into the copy, then copy
    # back) that are run as separate session calls.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    expected_losses = np.loadtxt("data/rotations_simple_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []

    # from accompanying notebook
    # {0.0111498, 0.00694816, 0.00429464, 0.00248228, 0.00159361,
    #  0.000957424, 0.000651653, 0.000423802, 0.000306749, 0.00021772,
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        for _ in range(20):
            observed_losses.append(sess.run([loss])[0])
            sess.run(train_op1)
            sess.run(train_op2)
    finally:
        # Release the session even if a step raises.
        sess.close()

    u.check_equal(observed_losses, expected_losses)
def rotations2_natural_sampled_kfac(num_samples=1):
    """Run 20 preconditioned steps on the 'large_rotations2' data, printing losses.

    The preconditioner ``ifisher`` is block diagonal. Each diagonal block is
    ``kr(pinv(acov), pinv(bcov))`` where the covariances come from a second
    forward pass over the dataset replicated ``num_samples`` times (``A2``)
    and backprop of random normal output values (``B2``).

    Args:
        num_samples: number of times the dataset is replicated for the
            sampled covariance estimate.
    """
    tf.reset_default_graph()
    np.random.seed(0)
    tf.set_random_seed(0)

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    # NOTE(review): this variable is superseded by the zero-initialised
    # Wf_copy created further down; it is kept so the graph is unchanged.
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A2 = [0] * (n + 2)  # augmented forward props for natural gradient
    A[0] = u.Identity(dsize)
    A2[0] = u.Identity(dsize * num_samples)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
        if i == 0:
            # replicate dataset multiple times corresponding to number of samples
            A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
        else:
            A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match: rows of the last matrix by columns of the
    # input. The pair must be compared as a tuple; written as
    # `assert a, b == c`, the comparison is only the assertion message.
    output_shape = (W[-1].get_shape().as_list()[0], W[0].get_shape().as_list()[1])
    assert output_shape == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    # lower learning rate by 10x
    lr = tf.Variable(0.01, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B2 = [0] * (n + 1)
    B[n] = -err / dsize
    B2[n] = tf.random_normal((f(n), dsize * num_samples), 0, 1, seed=0, dtype=dtype)
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))
        B2[i] = tf.matmul(tf.transpose(W[i + 1]), B2[i + 1], name="B2" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race
    dW = [0] * (n + 1)
    dW2 = [0] * (n + 1)
    # NOTE(review): Wcopy and dW2 are built but not consumed by the update
    # below; they are kept so the graph is unchanged.
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        dW2[i] = tf.matmul(B2[i], tf.transpose(A2[i]), name="dW2" + str(i))
    del dW[0]  # get rid of W[0] update
    del dW2[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # todo: divide both activations and backprops by size for cov calc
    # Kronecker factored covariance blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
                bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]
    ifisher = u.concat_blocks(iblocks)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype, shape=Wf.shape, name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        u.reset_time()
        for _ in range(20):
            loss0 = sess.run(loss)
            print(loss0)
            observed_losses.append(loss0)
            sess.run(train_op1)
            sess.run(train_op2)
            u.record_time()
        u.summarize_time()
        u.summarize_graph()
    finally:
        # Release the session even if a step raises.
        sess.close()
def rotations2_natural_empirical():
    """Run 10 preconditioned steps on the 'large_rotations2' data, printing losses.

    The preconditioner is ``u.pseudo_inverse`` of
    ``grads @ grads^T / dsize``, where ``grads`` stacks
    ``u.khatri_rao(A[i], B[i])`` over the layers.
    """
    tf.reset_default_graph()

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    # NOTE(review): this variable is superseded by the zero-initialised
    # Wf_copy created further down; it is kept so the graph is unchanged.
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match: rows of the last matrix by columns of the
    # input. The pair must be compared as a tuple; written as
    # `assert a, b == c`, the comparison is only the assertion message.
    output_shape = (W[-1].get_shape().as_list()[0], W[0].get_shape().as_list()[1])
    assert output_shape == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.000001, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race
    dW = [0] * (n + 1)
    # NOTE(review): Wcopy is built but not consumed by the update below; it
    # is kept so the graph is unchanged.
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # inverse fisher preconditioner
    grads = tf.concat([u.khatri_rao(A[i], B[i]) for i in range(1, n + 1)], axis=0)
    fisher = grads @ tf.transpose(grads) / dsize
    ifisher = u.pseudo_inverse(fisher)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype, shape=Wf.shape, name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        u.reset_time()
        for _ in range(10):
            loss0 = sess.run(loss)
            print(loss0)
            observed_losses.append(loss0)
            sess.run(train_op1)
            sess.run(train_op2)
            u.record_time()
        u.summarize_time()
        u.summarize_graph()
    finally:
        # Release the session even if a step raises.
        sess.close()
def rotations2_newton_kfac():
    """Run 10 preconditioned steps on the 'large_rotations2' data, printing losses.

    The preconditioner ``ihess`` is block diagonal: each diagonal block is
    ``kr(pinv(A[i] A[i]^T), pinv(Bn[i] Bn[i]^T / dsize))`` and every
    off-diagonal block is zero.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                # Block (i, j) pairs the f(i)*f(i-1) entries of W[i] with the
                # f(j)*f(j-1) entries of W[j]. The shape is stated explicitly
                # because kr is built with shape inference disabled here, so
                # reading it off a Kronecker product is not dependable.
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]
    ihess = u.concat_blocks(iblocks)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # The update is split into two ops (compute into the copy, then copy
    # back) that are run as separate session calls.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        u.reset_time()
        for _ in range(10):
            loss0 = sess.run([loss])[0]
            print(loss0)
            observed_losses.append(loss0)
            sess.run(train_op1)
            sess.run(train_op2)
            u.record_time()
        u.summarize_time()
        u.summarize_graph()
    finally:
        # Release the session even if a step raises.
        sess.close()
def rotations2_newton_bd():
    """Run 20 preconditioned steps on the 'large_rotations2' data, printing losses.

    Hessian blocks are assembled for every pair of layers and then passed
    through ``u.block_diagonal_inverse`` to form the preconditioner ``ihess``.
    """
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's: U[bottom][top] is the product t(W[bottom]) @ ... @ t(W[top]),
    # or an identity when bottom > top.
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)), dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))
            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]
    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # The update is split into two ops (compute into the copy, then copy
    # back) that are run as separate session calls.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        u.reset_time()
        for _ in range(20):
            loss0 = sess.run([loss])[0]
            print(loss0)
            observed_losses.append(loss0)
            sess.run(train_op1)
            sess.run(train_op2)
            u.record_time()
        u.summarize_time()
        u.summarize_graph()
    finally:
        # Release the session even if a step raises.
        sess.close()
def rotations1_gradient_test():
    """Run 10 plain gradient steps on the 'large_rotations1' data and check losses.

    The learning rate is read from ``data/large_rotations1_gradient_lr.csv``.
    The loss recorded before each step is compared with
    ``data/large_rotations1_gradient_losses.csv`` via ``u.check_equal``.
    """
    # https://www.wolframcloud.com/objects/ff6ecaf0-fccd-44e3-b26f-970d8fc2a57c
    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations1_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations1_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations1_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations1_fs.csv', delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)

    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for numpy_W, tf_W in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr0 = np.genfromtxt('data/large_rotations1_gradient_lr.csv')
    lr = tf.Variable(lr0, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    # The update is split into two ops (compute into the copy, then copy
    # back) that are run as separate session calls.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    expected_losses = np.loadtxt("data/large_rotations1_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []

    # from accompanying notebook
    # {0.102522, 0.028124, 0.00907214, 0.00418929, 0.00293379,
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
        for _ in range(10):
            observed_losses.append(sess.run([loss])[0])
            sess.run(train_op1)
            sess.run(train_op2)
    finally:
        # Release the session even if a step raises.
        sess.close()

    u.check_equal(observed_losses, expected_losses)