def main():
    # Tiny fixed-weight KFAC sanity check: runs 10 preconditioned steps on a
    # hard-coded 2x2 problem and asserts the final loss matches a recorded value.
    # Shares state with loss_and_output_and_grad via module-level globals.
    global fs, X, n, f, dsize, lambda_
    np.random.seed(1)
    tf.set_random_seed(1)

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
    n = len(fs) - 2
    train_images = np.asarray([[0, 1], [2, 3]]).astype(dtype)
    X = tf.constant(train_images[:, :dsize].astype(dtype))
    # fixed deterministic initial weights
    W0_0 = np.asarray([[0., 1], [2, 3]]).astype(dtype) / 10
    W1_0 = np.asarray([[4., 5], [6, 7]]).astype(dtype) / 10
    W0f = u.flatten([W0_0, W1_0])
    Wf = tf.constant(W0f)
    losses = []
    for step in range(10):
        loss, output, grad, kfac_grad = loss_and_output_and_grad(Wf)
        loss0 = loss.numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        losses.append(loss0)
        Wf -= lr * kfac_grad  # lr is module-level — TODO confirm
        u.record_time()
    u.summarize_time()
    # NOTE(review): only the last `target` assignment takes effect; the earlier
    # ones appear to be kept as a record of values under other configurations.
    target = 1.252017617  # without random sampling
    target = 1.256854534  # with random sampling but fixed seed
    target = 0.000359572  # with random sampling and linear
    target = 1.251557469  # with random sampling
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
def main():
    """Train a bias-free two-layer sigmoid autoencoder on MNIST with SGD."""
    # Seed every RNG we may touch so runs are reproducible.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    batch = torch.Tensor(u.get_mnist_images().T)[:args.batch_size]
    if args.cuda:
        batch = batch.cuda()
    data = Variable(batch)

    class Net(nn.Module):
        """encoder -> sigmoid -> decoder -> sigmoid, no biases."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Linear(args.visible_size, args.hidden_size,
                                     bias=False)
            self.decoder = nn.Linear(args.hidden_size, args.visible_size,
                                     bias=False)

        def forward(self, input):
            flat = input.view(-1, args.visible_size)
            hidden = F.sigmoid(self.encoder(flat))
            recon = F.sigmoid(self.decoder(hidden))
            return recon.view_as(input)

    model = Net()
    # Overwrite the default init with ng_init weights; transposed because
    # nn.Linear stores weights as (out_features, in_features).
    enc_weight, dec_weight = list(model.parameters())
    enc_weight.data = torch.Tensor(
        u.ng_init(args.visible_size, args.hidden_size).T)
    dec_weight.data = torch.Tensor(
        u.ng_init(args.hidden_size, args.visible_size).T)
    if args.cuda:
        model.cuda()
    model.train()

    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    for step in range(args.iters):
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        loss0 = loss.data[0]
        loss.backward()
        optimizer.step()
        print("Step %3d loss %6.5f" % (step, loss0))
        u.record_time()
    u.summarize_time()
def do_run(train_op):
    """Run `train_op` for `do_run_iters` steps and return the loss history."""
    sess = setup_session()
    u.reset_time()
    history = []
    for _ in range(do_run_iters):
        current = sess.run(loss)
        print(current)
        history.append(current)
        sess.run(train_op)
        u.record_time()
    u.summarize_time()
    return history
def closure():
    """Optimizer closure: forward pass, optional logging, then backward.

    Advances the module-level `step` counter and records `final_loss`
    on the last iteration.
    """
    global step, final_loss
    optimizer.zero_grad()
    prediction = model(data)
    loss = F.mse_loss(prediction, data)
    if verbose:
        print("Step %3d loss %6.5f msec %6.3f" %
              (step, loss.data[0], u.last_time()))
    step += 1
    if step == iters:
        # remember the terminal loss for the caller's assertions
        final_loss = loss.data[0]
    loss.backward()
    u.record_time()
    return loss
def closure():
    # Optimizer closure variant that also accumulates per-step timings into
    # the module-level `times` list (only when `verbose` is on).
    global step, final_loss
    optimizer.zero_grad()
    output = model(data)
    loss = F.mse_loss(output, data)
    if verbose:
        loss0 = loss.data[0]
        times.append(u.last_time())
        print("Step %3d loss %6.5f msec %6.3f" % (step, loss0, u.last_time()))
    step += 1
    if step == iters:
        final_loss = loss.data[0]  # record terminal loss for the caller
    loss.backward()
    u.record_time()
    return loss
def benchmark_execute(dims, iters, dtype):
    """Time 10 runs of a group of `iters` Khatri-Rao products.

    Builds `iters` khatri_rao ops over two dims x dims random matrices,
    then executes them together 10 times, recording wall-clock times both
    locally and through u.record_time/u.summarize_time.
    """
    A = tf.random_uniform((dims, dims), dtype=dtype)
    B = tf.random_uniform((dims, dims), dtype=dtype)
    prods = [u.khatri_rao(A, B) for _ in range(iters)]
    sess = tf.Session()
    # Fix: elapsed_times was initialized twice (once before the Session was
    # created); a single initialization is sufficient.
    elapsed_times = []
    u.reset_time()
    for i in range(10):
        time0 = time.time()
        sess.run(tf.group(*prods))
        elapsed_times.append(time.time() - time0)
        u.record_time()
    u.summarize_time()
def benchmark_execute(dims, iters, dtype):
    """Time 10 executions of `iters` grouped Khatri-Rao products of two
    random dims x dims matrices (results reported via u.summarize_time)."""
    A = tf.random_uniform((dims, dims), dtype=dtype)
    B = tf.random_uniform((dims, dims), dtype=dtype)
    prods = []
    for i in range(iters):
        prods.append(u.khatri_rao(A, B))
    sess = tf.Session()
    elapsed_times = []  # fix: was initialized twice (first init was dead)
    u.reset_time()
    for i in range(10):
        time0 = time.time()
        sess.run(tf.group(*prods))
        elapsed_times.append(time.time() - time0)
        u.record_time()
    u.summarize_time()
def complex_train_test():
    # End-to-end run of the sparse autoencoder on 10k MNIST patches:
    # trains for 2000 steps, periodically dumps first-layer weight
    # visualizations to pics/, and writes the cost history to CSV.
    np.random.seed(0)
    do_images = True
    train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
    dsize = 10000
    patches = train_images[:, :dsize]
    fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
    cost, train_op = cost_and_grad(fs=fs, X0=patches, lambda_=3e-3, rho=0.1,
                                   beta=3, lr=0.1)
    sess = tf.get_default_session()
    u.reset_time()
    old_cost = sess.run(cost)
    old_i = 0
    frame_count = 0
    costs = []
    for i in range(2000):
        cost0, _ = sess.run([cost, train_op])
        costs.append(cost0)
        if i % 100 == 0:
            print(cost0)
        # Snapshot the weights when the cost dropped >5% since the last
        # snapshot, or at least every 50 steps.
        # filters are transposed in visualization
        if ((old_cost - cost0) / old_cost > 0.05 or i - old_i > 50) and do_images:
            Wf_ = sess.run("Wf_var/read:0")
            W1_ = u.unflatten_np(Wf_, fs[1:])[0]
            display_network.display_network(
                W1_.T, filename="pics/weights-%03d.png" % (frame_count, ))
            frame_count += 1
            old_cost = cost0
            old_i = i
        u.record_time()
    # u.dump(costs, "costs_adam.csv")
    u.dump(costs, "costs_adam_bn1.csv")
    u.summarize_time()
def main():
    # TF eager-mode autoencoder trained with plain gradient descent;
    # mirrors the PyTorch variant elsewhere in this file.
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    images = tf.constant(u.get_mnist_images().T)
    images = images[:args.batch_size]
    if args.cuda:
        images = images.as_gpu_tensor()
    data = images

    if args.cuda:
        device = '/gpu:0'
    else:
        device = ''
    with tf.device(device):
        encoder = tf.layers.Dense(units=args.hidden_size, use_bias=False,
                                  activation=tf.sigmoid)
        decoder = tf.layers.Dense(units=args.visible_size, use_bias=False,
                                  activation=tf.sigmoid)

        def loss_fn(inputs):
            # mean squared reconstruction error
            predictions = decoder(encoder(inputs))
            return tf.reduce_mean(tf.square(predictions - inputs))

        value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)

        # initialize weights (a forward pass forces layer variable creation)
        loss_fn(data)
        params1 = encoder.weights[0]
        params2 = decoder.weights[0]
        params1.assign(u.ng_init(args.visible_size, args.hidden_size))
        params2.assign(u.ng_init(args.hidden_size, args.visible_size))

        optimizer = tf.train.GradientDescentOptimizer(learning_rate=args.lr)
        for step in range(args.iters):
            value, grads_and_vars = value_and_gradients_fn(data)
            optimizer.apply_gradients(grads_and_vars)
            print("Step %3d loss %6.5f" % (step, value.numpy()))
            u.record_time()
        u.summarize_time()
def main():
    # Eager-mode KFAC on 1000 MNIST images: 10 preconditioned steps with a
    # fixed learning rate, then asserts the loss trajectory and timing.
    global fs, X, n, f, dsize, lambda_
    np.random.seed(0)
    tf.set_random_seed(0)

    train_images = u.get_mnist_images()
    dsize = 1000
    fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
    lambda_ = 3e-3

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
    n = len(fs) - 2
    X = tf.constant(train_images[:, :dsize].astype(dtype))

    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])
    Wf = tf.constant(W0f)
    assert Wf.dtype == tf.float32

    lr = tf.constant(0.2)
    losses = []
    for step in range(10):
        loss, grad, kfac_grad = loss_and_grad(Wf)
        loss0 = loss.numpy()
        print("Step %d loss %.2f" % (step, loss0))
        losses.append(loss0)
        Wf -= lr * kfac_grad
        if step >= 4:
            assert loss < 17.6  # sanity: loss must have dropped by step 4
        u.record_time()
    u.summarize_time()
    # final loss must land in a narrow recorded band
    assert losses[-1] < 0.8
    assert losses[-1] > 0.78
    assert 20e-3 < min(u.global_time_list) < 120e-3
def main():
    """Eager-mode KFAC loop (40 steps) with optional GPU placement.

    Relies on module-level train_images/dsize/fs/dtype and loss_and_grad;
    asserts the recorded final-loss band and step timing at the end.
    """
    global fs, X, n, f, dsize, lambda_
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    device = '/gpu:0' if args.cuda else '/cpu:0'
    # Fix: the original entered tf.device() via a bare __enter__() and never
    # exited it; a `with` block scopes the device placement properly.
    with tf.device(device):
        X = tf.constant(train_images[:, :dsize].astype(dtype))
        W0_0 = u.ng_init(fs[2], fs[3])
        W1_0 = u.ng_init(fs[3], fs[2])
        W0f = u.flatten([W0_0, W1_0])
        Wf = tf.constant(W0f)
        assert Wf.dtype == tf.float32

        lr = tf.constant(0.2)
        losses = []
        for step in range(40):
            loss, grad, kfac_grad = loss_and_grad(Wf)
            loss0 = loss.numpy()
            print("Step %3d loss %10.9f" % (step, loss0))
            losses.append(loss0)
            Wf -= lr * kfac_grad
            if step >= 4:
                assert loss < 17.6  # sanity: loss must drop quickly
            u.record_time()
        u.summarize_time()
        # final loss must land in a narrow recorded band
        assert losses[-1] < 0.59
        assert losses[-1] > 0.57
        assert 20e-3 < min(
            u.global_time_list) < 50e-3, "Time should be 30ms on 1080"
def main():
    """KFAC on a fixed 2x2 toy problem using captured forward/backward lists.

    Runs 10 steps: one backward pass on the true loss (for B), one on a
    synthetic-label loss (for B2, used to whiten), then applies the
    whitened gradient manually and checks the recorded final loss.
    """
    global forward_list, backward_list, DO_PRINT
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    data0 = np.array([[0., 1], [2, 3]]).astype(dtype)
    data = Variable(torch.from_numpy(data0))

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # fixed deterministic weights
            W0 = (np.array([[0., 1], [2, 3]])).astype(dtype) / 10
            W1 = (np.array([[4., 5], [6, 7]])).astype(dtype) / 10
            self.W0 = nn.Parameter(torch.from_numpy(W0))
            self.W1 = nn.Parameter(torch.from_numpy(W1))

        def forward(self, input):
            x = input.view(-1, 2)
            # my_matmul records activations/backprops into the global lists
            x = nonlin(my_matmul(self.W0, x))
            x = nonlin(my_matmul(self.W1, x))
            return x.view_as(input)

    model = Net()
    if args.cuda:
        model.cuda()
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    for step in range(10):
        optimizer.zero_grad()
        forward_list = []
        backward_list = []
        output = model(data)
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward(retain_graph=True)
        loss0 = loss.data[0]
        A = forward_list[:]
        B = backward_list[::-1]  # backprops arrive last-layer-first

        # second backward pass on synthetic (sampled) labels -> B2
        forward_list = []
        backward_list = []
        noise = torch.from_numpy(
            np.random.randn(*data.data.shape).astype(dtype))
        synthetic_data = Variable(output.data + noise)
        err2 = output - synthetic_data
        loss2 = torch.sum(err2 * err2) / 2 / dsize
        optimizer.zero_grad()
        backward_list = []
        loss2.backward()
        B2 = backward_list[::-1]

        # compute whitened gradient
        pre_dW = []
        n = len(A)
        assert len(B) == n
        assert len(B2) == n
        for i in range(n):
            covA = A[i] @ t(A[i]) / dsize
            covB2 = B2[i] @ t(B2[i]) / dsize
            covA_inv = regularized_inverse(covA)
            # Fix: covA_inv was computed and then regularized_inverse(covA)
            # was recomputed for whitened_A; reuse the cached inverse.
            # Also removed the dead covB = B[i]@t(B[i])/dsize computation.
            whitened_A = covA_inv @ A[i]
            whitened_B = regularized_inverse(covB2.data) @ B[i].data
            pre_dW.append(whitened_B @ t(whitened_A) / dsize)

        params = list(model.parameters())
        assert len(params) == len(pre_dW)
        for i in range(len(params)):
            params[i].data -= lr * pre_dW[i]

        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    target = 1.251557469
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
    u.summarize_time()
def main():
    # Full graph-mode KFAC training of a sparse MNIST autoencoder with an
    # adaptive step size. Builds the network, its backprops, synthetic-label
    # backprops (B2) and SVD-based whitening, then trains for num_steps and
    # checks the loss curve against a recorded CSV.
    np.random.seed(0)
    tf.set_random_seed(0)

    dtype = np.float32
    # 64-bit doesn't help much, search for 64-bit in
    # https://www.wolframcloud.com/objects/5f297f41-30f7-4b1b-972c-cac8d1f8d8e4
    u.default_dtype = dtype
    machine_epsilon = np.finfo(dtype).eps  # 1e-7 or 1e-16
    train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
    dsize = 10000
    patches = train_images[:, :dsize]
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0f = W_uniform(fs[2], fs[3])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]
    dsize = f(-1)
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: util.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            # NOTE(review): this asserts the function object itself (always
            # truthy); likely meant u.is_numeric(val) — confirm.
            assert u.is_numeric, "Unknown type"
            holder = tf.placeholder(dtype, shape=val.shape,
                                    name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")
    if purely_linear:  # need lower LR without sigmoids
        lr = init_var(.02, "lr")
    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)

    def sigmoid(x):
        if not purely_linear:
            return tf.sigmoid(x)
        else:
            return tf.identity(x)

    def d_sigmoid(y):
        if not purely_linear:
            return y * (1 - y)
        else:
            return 1

    def kl(x, y):
        # KL divergence between Bernoulli(x) and Bernoulli(y)
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)

    # A[0] is just for shape checks, assert fail on run
    # tf.assert always fails because of static assert
    # fail_node = tf.assert_equal(1, 0, message="too huge")
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live, "sampled_labels",
                              noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        if i == 1 and not drop_sparsity:
            # sparsity penalty only enters at the hidden layer
            backprop += beta * d_kl(rho, rho_hat)
            backprop2 += beta * d_kl(rho, rho_hat)
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        cov_A[i] = init_var(A[i] @ t(A[i]) / dsize, "cov_A%d" % (i, ))
        cov_B2[i] = init_var(B2[i] @ t(B2[i]) / dsize, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ))
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ))
        if use_tikhonov:
            whitened_A = u.regularized_inverse2(vars_svd_A[i], L=Lambda) @ A[i]
        else:
            whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
        # true backprops B whitened by the synthetic-label covariance B2
        if use_tikhonov:
            whitened_B2 = u.regularized_inverse2(vars_svd_B2[i],
                                                 L=Lambda) @ B[i]
        else:
            whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
        whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[i]) @ A[i]
        whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[i]) @ B[i]
        pre_dW[i] = (whitened_B2 @ t(whitened_A)) / dsize
        pre_dW_stable[i] = (whitened_B2_stable @
                            t(whitened_A_stable)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)
    sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
    # NOTE(review): W[1] appears twice; likely meant u.L2(W[1]) + u.L2(W[2])
    # — confirm before changing (the recorded loss targets depend on it).
    L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[1]))
    loss = reconstruction
    if not drop_l2:
        loss = loss + L2
    if not drop_sparsity:
        loss = loss + sparsity

    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    pre_grad_stable_live = u.flatten(
        pre_dW_stable[1:])  # sqrt fisher preconditioned grad
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")
    pre_grad_stable = init_var(pre_grad_stable_live, "pre_grad_stable")

    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    update_params_stable_op = Wf.assign(Wf - lr * pre_grad_stable).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    # NOTE(review): computed from pre_grad, not pre_grad_stable — looks like
    # a copy/paste slip; likely meant tf.reduce_sum(pre_grad_stable * grad).
    pre_grad_stable_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)
    pre_grad_stable_norm = u.L2(pre_grad_stable)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        # uses `sess` from the enclosing scope (bound later, at call time)
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            util.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            util.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            util.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            util.dump(s0, "As_%d_%d" % (i, step))
        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            util.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            util.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            util.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            util.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        # whitening_mode gates which SVDs get refreshed
        if whitening_mode > 1:
            vars_svd_A[2].update()
        if whitening_mode > 2:
            vars_svd_B2[2].update()
        if whitening_mode > 3:
            vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()
    # tf.get_default_graph().finalize()

    # disable graph rewriting so op timings stay comparable
    from tensorflow.core.protobuf import rewriter_config_pb2
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)
    #sess = tf.Session(config=config)
    sess = tf.InteractiveSession(config=config)
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()

    step_lengths = []  # keep track of learning rates
    losses = []
    ratios = []  # actual loss decrease / expected decrease
    grad_norms = []
    pre_grad_norms = []  # preconditioned grad norm squared
    pre_grad_stable_norms = []  # sqrt preconditioned grad norms squared
    target_delta_list = []  # predicted decrease linear approximation
    target_delta2_list = []  # predicted decrease quadratic approximation
    actual_delta_list = []  # actual decrease

    # adaptive line search parameters
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    if whitening_mode > 0:
        vars_svd_A[1].update()

    # compute t(delta).H.delta/2
    def hessian_quadratic(delta):
        # update_covariances()
        W = u.unflatten(delta, fs[1:])
        W.insert(0, None)
        total = 0
        for l in range(1, n + 1):
            decrement = tf.trace(t(W[l]) @ cov_B2[l] @ W[l] @ cov_A[l])
            total += decrement
        return (total / 2).eval()

    # compute t(delta).H^-1.delta/2
    def hessian_quadratic_inv(delta):
        # update_covariances()
        W = u.unflatten(delta, fs[1:])
        W.insert(0, None)
        total = 0
        for l in range(1, n + 1):
            invB2 = u.pseudo_inverse2(vars_svd_B2[l])
            invA = u.pseudo_inverse2(vars_svd_A[l])
            decrement = tf.trace(t(W[l]) @ invB2 @ W[l] @ invA)
            total += decrement
        return (total / 2).eval()

    # do line search, dump values as csv
    def line_search(initial_value, direction, step, num_steps):
        saved_val = tf.Variable(Wf)
        sess.run(saved_val.initializer)
        pl = tf.placeholder(dtype, shape=(), name="linesearch_p")
        assign_op = Wf.assign(initial_value - direction * step * pl)
        vals = []
        for i in range(num_steps):
            sess.run(assign_op, feed_dict={pl: i})
            vals.append(loss.eval())
        sess.run(Wf.assign(saved_val))  # restore original value
        return vals

    for step in range(num_steps):
        update_covariances()
        if step % whiten_every_n_steps == 0:
            update_svds()

        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)

        lr0, loss0 = sess.run([lr, loss])
        save_params_op.run()

        # regular inverse becomes unstable when grad norm exceeds 1
        stabilized_mode = grad_norm.eval() < 1

        if stabilized_mode and not use_tikhonov:
            update_params_stable_op.run()
        else:
            update_params_op.run()

        loss1 = loss.eval()
        advance_batch()

        # line search stuff
        # NOTE(review): the stabilized branch applies the *stable* update but
        # reads the *non-stable* dot product (and vice versa) — possibly
        # swapped; confirm against the intended slope definition.
        target_slope = (-pre_grad_dot_grad.eval() if stabilized_mode else
                        -pre_grad_stable_dot_grad.eval())
        target_delta = lr0 * target_slope
        target_delta_list.append(target_delta)

        # second order prediction of target delta
        # TODO: the sign is wrong, debug this
        # https://www.wolframcloud.com/objects/8f287f2f-ceb7-42f7-a599-1c03fda18f28
        if local_quadratics:
            x0 = Wf_copy.eval()
            x_opt = x0 - pre_grad.eval()
            # computes t(x)@H^-1 @(x)/2
            y_opt = loss0 - hessian_quadratic_inv(grad)
            # computes t(x)@H @(x)/2
            y_expected = hessian_quadratic(Wf - x_opt) + y_opt
            target_delta2 = y_expected - loss0
            target_delta2_list.append(target_delta2)

        actual_delta = loss1 - loss0
        actual_slope = actual_delta / lr0
        slope_ratio = actual_slope / target_slope  # between 0 and 1.01
        actual_delta_list.append(actual_delta)

        if do_line_search:
            vals1 = line_search(Wf_copy, pre_grad, lr / 100, 40)
            vals2 = line_search(Wf_copy, grad, lr / 100, 40)
            # NOTE(review): `i` here is a stale loop variable from graph
            # construction; `step` was probably intended.
            u.dump(vals1, "line1-%d" % (i, ))
            u.dump(vals2, "line2-%d" % (i, ))

        losses.append(loss0)
        step_lengths.append(lr0)
        ratios.append(slope_ratio)
        grad_norms.append(grad_norm.eval())
        pre_grad_norms.append(pre_grad_norm.eval())
        pre_grad_stable_norms.append(pre_grad_stable_norm.eval())

        if step % report_frequency == 0:
            print(
                "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"
                % (step, loss0, target_delta, actual_delta, slope_ratio,
                   grad_norm.eval(), pre_grad_norm.eval()))

        if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in:
            # shrink if wrong prediction, don't shrink if prediction is tiny
            if slope_ratio < alpha and abs(
                    target_delta) > 1e-6 and adaptive_step:
                print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
                print(
                    "Slope optimality %.2f, shrinking learning rate to %.2f" % (
                        slope_ratio,
                        lr0 * beta,
                    ))
                sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})
            # grow learning rate, slope_ratio .99 worked best for gradient
            # NOTE(review): `i % 50` uses the stale construction-loop variable;
            # `step % 50` was probably intended.
            elif step > 0 and i % 50 == 0 and slope_ratio > 0.90 and adaptive_step:
                print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
                print("Growing learning rate to %.2f" % (lr0 * growth_rate))
                sess.run(vard[lr].setter,
                         feed_dict={vard[lr].p: lr0 * growth_rate})

        u.record_time()

    # check against expected loss
    if 'Apple' in sys.version:
        pass
        # u.dump(losses, "kfac_small_final_mac.csv")
        targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
    else:
        pass
        # u.dump(losses, "kfac_small_final_linux.csv")
        targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")
    u.check_equal(targets, losses[:len(targets)], rtol=1e-1)
    u.summarize_time()
    print("Test passed")
def main():
    """KFAC training driver with optional async stats runners.

    Sets up the model and a KFAC-corrected Adam optimizer, trains for
    args.num_steps while logging losses to TensorBoard and a CSV, then
    (in 'record'/'test' mode) dumps or checks the loss trajectory.
    """
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):
        gpu_options = tf.GPUOptions(allow_growth=False)
        sess = tf.InteractiveSession(config=tf.ConfigProto(
            gpu_options=gpu_options))
        u.register_default_session(sess)  # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()  # resets optimization variables (not model variables)

    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    vloss0 = 0

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)
    writer = u.BufferedWriter(outfn, 60)  # get rid?

    # Fix: start_time was assigned twice in a row; keep a single assignment.
    start_time = time.time()

    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):
        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)
        elapsed = time.time() - start_time
        print("%d sec, step %d, loss %.2f, vloss %.2f" % (elapsed, step,
                                                          loss0, vloss0))
        writer.write('%d, %f, %f, %f\n' % (step, elapsed, loss0, vloss0))

        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            grad.update()
            with kfac.read_lock():
                grad_new.update()
            train_op.run()
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)
    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
def lbfgs(opfunc, x, config, state, do_verbose):
    """port of lbfgs.lua, using TensorFlow eager mode.

    Args:
      opfunc: callable returning (loss, gradient) at a point.
      x: starting point (tensor); updated in place via +=.
      config: options object (maxIter, maxEval, tolFun, tolX, nCorrection,
        lineSearch, lineSearchOptions, learningRate, verbose).
      state: mutable state carried across calls (nIter, funcEval, ...).
      do_verbose: print per-step loss/timing when true.

    Returns:
      (x, f_hist, currentFuncEval).
    """
    global final_loss, times

    maxIter = config.maxIter or 20
    maxEval = config.maxEval or maxIter * 1.25
    tolFun = config.tolFun or 1e-5
    tolX = config.tolX or 1e-9
    nCorrection = config.nCorrection or 100
    lineSearch = config.lineSearch
    lineSearchOpts = config.lineSearchOptions
    learningRate = config.learningRate or 1
    isverbose = config.verbose or False

    # verbose function
    if isverbose:
        verbose = verbose_func
    else:
        verbose = lambda x: None

    # evaluate initial f(x) and df/dx
    f, g = opfunc(x)

    f_hist = [f]
    currentFuncEval = 1
    state.funcEval = state.funcEval + 1

    # check optimality of initial point
    tmp1 = tf.abs(g)
    if tf.reduce_sum(tmp1) <= tolFun:
        verbose("optimality condition below tolFun")
        return x, f_hist

    # optimize for a max of maxIter iterations
    nIter = 0
    times = []
    while nIter < maxIter:
        # keep track of nb of iterations
        nIter = nIter + 1
        state.nIter = state.nIter + 1

        ############################################################
        # compute gradient descent direction
        ############################################################
        if state.nIter == 1:
            d = -g
            old_dirs = []
            old_stps = []
            Hdiag = 1
        else:
            # do lbfgs update (update memory)
            y = g - g_old
            s = d * t
            ys = dot(y, s)

            if ys > 1e-10:
                # updating memory
                if len(old_dirs) == nCorrection:
                    # shift history by one (limited-memory)
                    del old_dirs[0]
                    del old_stps[0]

                # store new direction/step
                old_dirs.append(s)
                old_stps.append(y)

                # update scale of initial Hessian approximation
                Hdiag = ys / dot(y, y)

            # compute the approximate (L-BFGS) inverse Hessian
            # multiplied by the gradient
            k = len(old_dirs)

            # need to be accessed element-by-element, so don't re-type tensor:
            ro = [0] * nCorrection
            for i in range(k):
                ro[i] = 1 / dot(old_stps[i], old_dirs[i])

            # iteration in L-BFGS loop collapsed to use just one buffer
            # need to be accessed element-by-element, so don't re-type tensor:
            al = [0] * nCorrection

            q = -g
            for i in range(k - 1, -1, -1):
                al[i] = dot(old_dirs[i], q) * ro[i]
                q = q - al[i] * old_stps[i]

            # multiply by initial Hessian
            r = q * Hdiag
            for i in range(k):
                be_i = dot(old_stps[i], r) * ro[i]
                r += (al[i] - be_i) * old_dirs[i]

            d = r
            # final direction is in r/d (same object)

        g_old = g
        f_old = f

        ############################################################
        # compute step length
        ############################################################
        # directional derivative
        gtd = dot(g, d)

        # check that progress can be made along that direction
        if gtd > -tolX:
            verbose("Can not make progress along direction.")
            break

        # reset initial guess for step size
        if state.nIter == 1:
            tmp1 = tf.abs(g)
            t = min(1, 1 / tf.reduce_sum(tmp1))
        else:
            t = learningRate

        # optional line search: user function
        lsFuncEval = 0
        # Bug fix: was `isinstance(lineSearch) == types.FunctionType`, which
        # raises TypeError whenever a line-search callable is supplied
        # (isinstance requires two arguments).
        if lineSearch and isinstance(lineSearch, types.FunctionType):
            # perform line search, using user function
            f, g, x, t, lsFuncEval = lineSearch(opfunc, x, t, d, f, g, gtd,
                                                lineSearchOpts)
            f_hist.append(f)
        else:
            # no line search, simply move with fixed-step
            x += t * d

            if nIter != maxIter:
                # re-evaluate function only if not in last iteration
                # the reason we do this: in a stochastic setting,
                # no use to re-evaluate that function here
                f, g = opfunc(x)

                lsFuncEval = 1
                f_hist.append(f)

        # update func eval
        currentFuncEval = currentFuncEval + lsFuncEval
        state.funcEval = state.funcEval + lsFuncEval

        ############################################################
        # check conditions
        ############################################################
        if nIter == maxIter:
            break

        if currentFuncEval >= maxEval:
            # max nb of function evals
            verbose('max nb of function evals')
            break

        tmp1 = tf.abs(g)
        if tf.reduce_sum(tmp1) <= tolFun:
            # check optimality
            verbose('optimality condition below tolFun')
            break

        tmp1 = tf.abs(d * t)
        if tf.reduce_sum(tmp1) <= tolX:
            # step size below tolX
            verbose('step size below tolX')
            break

        if tf.abs(f - f_old) < tolX:
            # function value changing less than tolX
            verbose('function value changing less than tolX' +
                    str(tf.abs(f - f_old)))
            break

        if do_verbose:
            print("Step %3d loss %6.5f msec %6.3f" % (nIter, f.numpy(),
                                                      u.last_time()))
            u.record_time()
            times.append(u.last_time())

        if nIter == maxIter - 1:
            final_loss = f.numpy()

    # save state
    state.old_dirs = old_dirs
    state.old_stps = old_stps
    state.Hdiag = Hdiag
    state.g_old = g_old
    state.f_old = f_old
    state.t = t
    state.d = d

    return x, f_hist, currentFuncEval
def main():
    # KFAC training of an MNIST autoencoder where the custom kfac_matmul
    # changes behavior based on the module-level `mode` flag:
    #   'standard' - plain forward pass
    #   'capture'  - record activations/backprops into forward/backward lists
    #   'kfac'     - apply the precomputed whitening inverses during backprop
    global mode
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # feature sizes
    fs = [dsize, 28 * 28, 196, 28 * 28]
    # number of layers
    n = len(fs) - 2
    matmul = kfac_matmul

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # W1 = (np.array([[0., 1], [2, 3]])).astype(dtype)/10
            # W2 = (np.array([[4., 5], [6, 7]])).astype(dtype)/10
            # self.W1 = nn.Parameter(torch.from_numpy(W1))
            # self.W2 = nn.Parameter(torch.from_numpy(W2))
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)

    model = Net()
    if args.cuda:
        model.cuda()
    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    noise = torch.Tensor(*data.data.shape).type(torch_dtype)
    covA_inv_saved = [None] * n  # cache for the (static) first-layer inverse

    for step in range(10):
        mode = 'standard'
        output = model(data)

        # capture pass on synthetic labels to collect A and B statistics
        mode = 'capture'
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        del forward_inv[:]
        del backward_inv[:]
        noise.normal_()
        output_hat = Variable(output.data + noise)
        output = model(data)
        err_hat = output_hat - output
        loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
        loss_hat.backward(retain_graph=True)
        backward.reverse()
        forward.reverse()
        assert len(backward) == n
        assert len(forward) == n
        A = forward[:]
        B = backward[:]

        # compute inverses
        for i in range(n):
            # first layer doesn't change so only compute once
            if i == 0 and covA_inv_saved[i] is not None:
                covA_inv = covA_inv_saved[i]
            else:
                covA_inv = regularized_inverse(A[i] @ t(A[i]) / dsize)
                covA_inv_saved[i] = covA_inv
            forward_inv.append(covA_inv)
            covB_inv = regularized_inverse(B[i] @ t(B[i]) / dsize)
            backward_inv.append(covB_inv)

        # real update: kfac_matmul now applies the whitening inverses
        mode = 'kfac'
        optimizer.zero_grad()
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()

        loss0 = loss.data.cpu().numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    # recorded regression targets (CPU vs GPU differ slightly)
    if args.cuda:
        target = 2.337120533
    else:
        target = 2.335612774

    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
def main():
    """Train the MNIST autoencoder with explicitly whitened KFAC updates.

    Unlike the optimizer-integrated variant, this version computes the
    preconditioned gradient ``pre_dW`` by hand (inverse activation
    covariance on the left, inverse synthetic-backprop covariance on the
    right) and applies it directly to ``params[i].data``.

    Relies on module-level state: ``mode``, ``forward``, ``backward``,
    ``args``, ``lr``, ``dsize``, ``dtype``, ``torch_dtype``, ``nonlin``,
    ``kfac_matmul``, ``regularized_inverse``, ``t`` and util module ``u``.
    """
    # global forward, backward, DO_PRINT
    global mode, covA_inv, covB_inv
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # feature sizes
    fs = [args.batch_size, 28 * 28, 196, 28 * 28]
    # number of layers
    n = len(fs) - 2

    # todo, move to more elegant backprop
    matmul = kfac_matmul

    class Net(nn.Module):
        # Stack of n weight matrices W1..Wn; matmul is the capture-aware op.
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            # Columns are examples: x is (784, batch).
            x = input.view(784, -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)

    model = Net()
    if args.cuda:
        model.cuda()
    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    losses = []

    covA = [None] * n
    covA_inv = [None] * n
    covB_inv = [None] * n
    noise = torch.Tensor(*data.data.shape).type(torch_dtype)

    # TODO:
    # only do 2 passes like in eager mode
    # integrate with optimizer/same results
    # scale to deep autoencoder

    for step in range(10):
        # Backward pass 1: true loss, captures activations A and backprops B.
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        output = model(data)
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward(retain_graph=True)
        backward.reverse()
        loss0 = loss.data[0]

        A = forward[:]
        B = backward[:]
        assert len(B) == n
        del forward[:]
        del backward[:]

        # Backward pass 2: synthetic (noise-perturbed) targets give B2,
        # which defines the Fisher-like curvature estimate.
        noise.normal_()
        synthetic_data = Variable(output.data + noise)
        err2 = output - synthetic_data
        loss2 = torch.sum(err2 * err2) / 2 / dsize
        optimizer.zero_grad()
        loss2.backward()
        B2 = backward[::-1]
        assert len(B2) == n

        # mode = 'kfac'
        # compute whitened gradient
        pre_dW = []
        for i in range(n):
            # only compute first activation once
            if i > 0:
                covA[i] = A[i] @ t(A[i]) / dsize
                covA_inv[i] = regularized_inverse(covA[i])
            else:
                if covA[i] is None:
                    covA[i] = A[i] @ t(A[i]) / dsize
                    covA_inv[i] = regularized_inverse(covA[i])
                # else:
            covB2 = B2[i] @ t(B2[i]) / dsize
            covB = B[i] @ t(B[i]) / dsize  # todo: remove
            # Inverse comes from synthetic backprops, applied to true ones.
            covB_inv[i] = regularized_inverse(covB2.data)
            whitened_A = covA_inv[i] @ A[i]
            whitened_B = covB_inv[i] @ B[i].data
            pre_dW.append(whitened_B @ t(whitened_A) / dsize)

        # Manual SGD step with the preconditioned gradient.
        params = list(model.parameters())
        assert len(params) == len(pre_dW)
        for i in range(len(params)):
            params[i].data -= lr * pre_dW[i]

        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    loss0 = loss.data.cpu().numpy()  # [0]
    # Historical regression targets; later measurements override earlier
    # ones within each platform branch.
    # NOTE(review): grouping of the overrides under the `if`s is
    # reconstructed from a whitespace-mangled original — verify against VCS.
    target = 2.360062122
    if 'Apple' in sys.version:
        target = 2.360126972
        target = 2.335654736  # after changing to torch.randn
    if args.cuda:
        target = 2.337174654
        target = 2.337215662  # switching to numpy inverse

    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
def kfac_optimizer(model_creator):
    """Build and run a KFAC-preconditioned optimizer over a TF1 model.

    ``model_creator(batch_size)`` must return ``(model, loss, labels)``.
    For every trainable variable registered in ``matmul_registry`` this
    builds activation/backprop covariance factors, their SVD wrappers, and
    both a plain and a numerically-stabilized (sqrt-Fisher) preconditioned
    gradient, then runs a training loop with a simple slope-ratio line
    search that reports progress every ``report_frequency`` steps.

    Relies on module-level state: ``matmul_registry``, ``A``/``B``/``B2``,
    ``cov_A``/``cov_B2``, ``vars_svd_A``/``vars_svd_B2``, ``pre_dW``/
    ``pre_dW_stable``, the various ``*_update_ops`` dicts, ``sess``,
    ``dsize``, ``num_steps``, ``whitened_every_n_steps``,
    ``report_frequency`` and the util module ``u``.
    """
    stats_batch_size = 10000
    main_batch_size = 10000

    stats_model, loss, labels = model_creator(stats_batch_size)
    # replace labels_node with synthetic labels
    main_model, _, _ = model_creator(main_batch_size)

    opt = tf.GradientDescentOptimizer(0.2)
    grads_and_vars = opt.compute_gradients(loss)

    trainable_vars = tf.trainable_variables()

    # create SVD and preconditioning variables for matmul vars
    for var in trainable_vars:
        if var not in matmul_registry:
            continue
        # NOTE(review): dW is first bound to the extracted gradient tensor
        # and then indexed like a dict below — looks inconsistent; verify
        # against u.extract_grad's return type.
        dW = u.extract_grad(grads_and_vars, var)
        A[var] = get_activations(var)
        B[var] = get_backprops(var)
        B2[var] = get_backprops2(var)  # get backprops with synthetic labels
        dW[var] = B[var] @ t(A[var])  # todo: sort out dsize division
        cov_A[var] = init_var(A[var] @ t(A[var]) / dsize,
                              "cov_A_%s" % (var.name, ))
        cov_B2[var] = init_var(B2[var] @ t(B2[var]) / dsize,
                               "cov_B2_%s" % (var.name, ))
        # BUGFIX: was "%d" on var.name (a string), which raises
        # "TypeError: %d format: a number is required" — use %s like the
        # cov_* names above.
        vars_svd_A[var] = SvdWrapper(cov_A[var], "svd_A_%s" % (var.name, ))
        vars_svd_B2[var] = SvdWrapper(cov_B2[var], "svd_B2_%s" % (var.name, ))
        # Whiten real backprops B with inverses estimated from synthetic B2.
        whitened_A = u.pseudo_inverse2(vars_svd_A[var]) @ A[var]
        whitened_B2 = u.pseudo_inverse2(vars_svd_B2[var]) @ B[var]
        whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[var]) @ A[var]
        whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[var]) @ B[var]
        pre_dW[var] = (whitened_B2 @ t(whitened_A)) / dsize
        pre_dW_stable[var] = (
            whitened_B2_stable @ t(whitened_A_stable)) / dsize
        dW[var] = (B[var] @ t(A[var])) / dsize

    # create update params ops
    # new_grads_and_vars = []
    # for grad, var in grads_and_vars:
    #   if var in kfac_registry:
    #     pre_A, pre_B = kfac_registry[var]
    #     new_grad_live = pre_B @ grad @ t(pre_A)
    #     new_grads_and_vars.append((new_grad, var))
    #     print("Preconditioning %s"%(var.name))
    #   else:
    #     new_grads_and_vars.append((grad, var))
    # train_op = opt.apply_gradients(new_grads_and_vars)

    # Each variable has an associated gradient, pre_gradient, variable save op
    def update_grad():
        # Refresh the stored (unpreconditioned) gradients.
        ops = [grad_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def update_pre_grad():
        # Refresh the KFAC-preconditioned gradients.
        ops = [pre_grad_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def update_pre_grad2():
        # Refresh the stabilized (sqrt-Fisher) preconditioned gradients.
        ops = [pre_grad2_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def save_params():
        # Snapshot current parameter values (for the two-phase update).
        ops = [var_save_ops[var] for var in trainable_vars]
        sess.run(ops)

    for step in range(num_steps):
        update_covariances()
        if step % whitened_every_n_steps == 0:
            update_svds()
        update_grad()
        update_pre_grad()   # perf todo: update one of these
        update_pre_grad2()  # stable alternative

        lr0, loss0 = sess.run([lr, loss])
        save_params()

        # when grad norm<1, Fisher is unstable, switch to Sqrt(Fisher)
        # TODO: switch to per-matrix normalization
        stabilized_mode = grad_norm.eval() < 1
        if stabilized_mode:
            update_params2()
        else:
            update_params()
        loss1 = loss.eval()
        advance_batch()

        # line search stuff
        target_slope = (-pre_grad_dot_grad.eval()
                        if stabilized_mode else -pre_grad_stable_dot_grad.eval())
        target_delta = lr0 * target_slope
        actual_delta = loss1 - loss0
        actual_slope = actual_delta / lr0
        slope_ratio = actual_slope / target_slope  # between 0 and 1.01

        losses.append(loss0)
        step_lengths.append(lr0)
        ratios.append(slope_ratio)

        if step % report_frequency == 0:
            print(
                "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"
                % (step, loss0, target_delta, actual_delta, slope_ratio,
                   grad_norm.eval(), pre_grad_norm.eval()))

        u.record_time()
def main():
    """Train a 784-196-784 MNIST autoencoder in TF1 graph mode with KFAC.

    Builds the whole forward/backward computation symbolically: activations
    ``A[i]``, true backprops ``B[i]``, synthetic-label backprops ``B2[i]``,
    regularized covariance factors with SVD-based inverses, and a
    preconditioned flattened gradient ``pre_grad`` used by
    ``update_params_op``.  Runs 40 steps and asserts the final loss and
    per-step timing fall in the expected range.

    Depends on util module ``u`` and module-level ``t`` (transpose helper).
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    dtype = np.float32
    train_images = u.get_mnist_images()

    dsize = 10000
    patches = train_images[:, :dsize].astype(dtype)
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    # rho/beta are the UFLDL sparsity hyperparameters; presumably kept from
    # the sparse-autoencoder version — unused below (as are kl/d_kl).
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = f(-1)
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: u.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        # Wrap a numpy array or TF tensor in a tf.Variable; numpy initial
        # values are fed through a placeholder recorded in init_dict, and a
        # setter op is recorded in vard for later in-place updates.
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            assert u.is_numeric, "Unknown type"
            holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")
    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)

    def sigmoid(x):
        return tf.sigmoid(x)

    def d_sigmoid(y):
        # derivative of sigmoid expressed in terms of its output y
        return y * (1 - y)

    def kl(x, y):
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)

    # A[0] must never be evaluated; the control dependency on a Print op
    # makes any accidental evaluation loudly visible.
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        # Tikhonov-regularized covariance factors (ridge term lambda_).
        cov_op = A[i] @ t(A[i]) / dsize + lambda_ * u.Identity(A[i].shape[0])
        cov_A[i] = init_var(cov_op, "cov_A%d" % (i, ))
        cov_op = B2[i] @ t(B2[i]) / dsize + lambda_ * u.Identity(
            B2[i].shape[0])
        cov_B2[i] = init_var(cov_op, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ),
                                     do_inverses=True)
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ),
                                      do_inverses=True)
        # Whiten real backprops B with inverses from synthetic stats B2.
        whitened_A = vars_svd_A[i].inv @ A[i]
        whitened_B = vars_svd_B2[i].inv @ B[i]
        pre_dW[i] = (whitened_B @ t(whitened_A)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)
    loss = reconstruction

    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")

    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            u.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            u.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            u.dump(s0, "As_%d_%d" % (i, step))
        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            u.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            u.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            u.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        # Re-running the initializers recomputes the covariance ops.
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        # Layer-1 activation SVD is updated once, outside the loop.
        vars_svd_A[2].update()
        vars_svd_B2[2].update()
        vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()

    # Disable graph rewriting/optimization so op timing is predictable.
    from tensorflow.core.protobuf import rewriter_config_pb2
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)
    sess = tf.InteractiveSession(config=config)

    # Initialization order matters: params and data first, then stats.
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()

    step_lengths = []  # keep track of learning rates
    losses = []

    # adaptive line search parameters
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    vars_svd_A[1].update()

    for step in range(40):
        update_covariances()
        update_svds()
        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)
        lr0, loss0 = sess.run([lr, loss])
        update_params_op.run()
        advance_batch()
        losses.append(loss0)
        step_lengths.append(lr0)
        print("Step %d loss %.2f" % (step, loss0))
        u.record_time()

    # Regression checks on final loss and per-step wall-clock time.
    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    assert 20e-3 < min(
        u.global_time_list) < 50e-3, "Time should be 40ms on 1080"
    u.summarize_time()
    print("Test passed")
def train(optimizer='sgd', kfac=True, iters=10, verbose=True):
    """Train a deep MNIST autoencoder with optional KFAC preconditioning.

    Args:
      optimizer: 'sgd' or 'adam' (name is rebound to the optimizer object).
      kfac: if True, gradients are whitened by the captured covariance
        inverses during the final backward pass; otherwise plain gradients.
      iters: number of training steps.
      verbose: print per-step loss when True.

    Returns:
      List of per-step training losses (floats).

    Relies on module-level state: ``mode``, ``As``/``Bs``/``As_inv``/
    ``Bs_inv`` capture lists, ``args``, ``lr``, ``dsize``, ``dtype``,
    ``torch_dtype``, ``nonlin``, ``kfac_matmul``, ``regularized_inverse``
    and util module ``u``.
    """
    global mode
    torch.manual_seed(1)
    np.random.seed(1)
    if args.cuda:
        torch.cuda.manual_seed(1)

    # feature sizes at each layer
    fs = [dsize, 28*28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28*28]
    n = len(fs) - 2   # number of matmuls

    class Net(nn.Module):
        # Stack of n weight matrices W1..Wn; kfac_matmul is capture-aware.
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n+1):
                W0 = u.ng_init(fs[i+1], fs[i])
                setattr(self, 'W'+str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            # Columns are examples: x is (features, batch).
            x = input.view(fs[1], -1)
            for i in range(1, n+1):
                W = getattr(self, 'W'+str(i))
                x = nonlin(kfac_matmul(W, x))
            return x.view_as(input)

    model = Net()
    if args.cuda:
        model.cuda()
    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        assert False, 'unknown optimizer '+optimizer

    noise = torch.Tensor(*data.data.shape).type(torch_dtype)
    # Layer-0 activation covariance inverse is cached: the input never changes.
    covA_inv_saved = [None]*n
    losses = []
    # BUGFIX: loop was hard-coded to range(10), silently ignoring the
    # `iters` parameter; default behavior (iters=10) is unchanged.
    for step in range(iters):
        # Pass 1: plain forward to get current reconstruction.
        mode = 'standard'
        output = model(data)

        # Pass 2: backward against noise-perturbed targets; the capture-mode
        # backward hook records per-layer activations (As) and backprops (Bs).
        mode = 'capture'
        optimizer.zero_grad()
        del As[:], Bs[:], As_inv[:], Bs_inv[:]
        noise.normal_()
        output_hat = Variable(output.data+noise)
        err_hat = output_hat - output
        loss_hat = torch.sum(err_hat*err_hat)/2/dsize
        loss_hat.backward(retain_graph=True)

        # compute inverses
        for i in range(n):
            # first layer activations don't change, only compute once
            if i == 0 and covA_inv_saved[i] is not None:
                covA_inv = covA_inv_saved[i]
            else:
                covA_inv = regularized_inverse(As[i] @ As[i].t()/dsize)
                covA_inv_saved[i] = covA_inv
            As_inv.append(covA_inv)
            covB = (Bs[i]@Bs[i].t())*dsize
            # alternative formula: slower but numerically better result
            # covB = (Bs[i]*dsize)@(Bs[i].t()*dsize)/dsize
            covB_inv = regularized_inverse(covB)
            Bs_inv.append(covB_inv)

        # Pass 3: real loss; 'kfac' mode consumes the inverses to whiten grads.
        if kfac:
            mode = 'kfac'
        else:
            mode = 'standard'
        optimizer.zero_grad()
        err = output - data
        loss = torch.sum(err*err)/2/dsize
        loss.backward()
        optimizer.step()

        loss0 = loss.data.cpu().numpy()[0]
        losses.append(loss0)
        if verbose:
            print("Step %3d loss %10.9f"%(step, loss0))
        u.record_time()
    return losses
def rotations2_newton_bd():
    """Run block-diagonal Newton's method on the large_rotations2 dataset.

    Loads data/weights from CSV, builds the full per-layer Hessian blocks
    (Kronecker-factored, including the off-diagonal coupling terms), then
    inverts only the block diagonal and takes 20 preconditioned steps.
    The two-op update (copy then write-back) avoids in-place data races.

    Depends on module-level ``dtype``, ``t``, ``Kmat``, ``v2c_np`` and the
    util module ``u``.
    """
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv',
                               delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's (linear network: no nonlinearity between layers)
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's: U[bottom][top] is the product of transposed weights
    # W[bottom]^T ... W[top]^T (identity when the range is empty).
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
            for i in range(bottom, top + 1):
                prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # term1: Gauss-Newton part; term2: off-diagonal coupling,
            # commuted through the commutation matrix Kmat below.
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)),
                                 dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))
            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    # invert only the block diagonal (hence "bd" in the name)
    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update
    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # two-phase update: compute into copy, then write back
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
def train(optimizer='sgd', nonlin=torch.sigmoid, kfac=True, iters=10, lr=0.2,
          newton_matrix='stochastic', eval_every_n_steps=1,
          print_interval=200):
    """Train on first 10k MNIST examples, evaluate on second 10k.

    Self-contained variant: the capture lists, the custom autograd
    ``KfacAddmm`` function and ``kfac_matmul`` are all local to this call;
    the nested functions see ``mode`` and the lists through closures.

    Args:
      optimizer: 'sgd' or 'adam'.
      nonlin: elementwise nonlinearity applied after each matmul.
      kfac: whiten gradients with captured covariance inverses when True.
      iters: number of training steps.
      lr: learning rate.
      newton_matrix: 'stochastic' (Gaussian-noise targets) or 'exact'
        (frozen identity-padded targets) for curvature estimation.
      eval_every_n_steps: validation-loss frequency.
      print_interval: loss-printing frequency.

    Returns:
      (losses, vlosses): per-step train losses and periodic test losses.
    """
    u.reset_time()
    dsize = 10000

    # model options
    dtype = np.float32
    torch_dtype = 'torch.FloatTensor'
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch_dtype = 'torch.cuda.FloatTensor'
    INVERSE_METHOD = 'numpy'  # numpy, gpu

    # Capture lists shared with KfacAddmm.backward via closure.
    As = []
    Bs = []
    As_inv = []
    Bs_inv = []
    mode = 'capture'  # 'capture', 'kfac', 'standard'

    class KfacAddmm(Function):
        # addmm whose backward behavior depends on the enclosing `mode`:
        #   'capture'  - record activations/backprops, return no weight grad
        #   'kfac'     - return covariance-whitened weight gradient
        #   'standard' - plain addmm gradient
        @staticmethod
        def _get_output(ctx, arg, inplace=False):
            if inplace:
                ctx.mark_dirty(arg)
                return arg
            else:
                return arg.new().resize_as_(arg)

        @staticmethod
        def forward(ctx, add_matrix, matrix1, matrix2, beta=1, alpha=1,
                    inplace=False):
            ctx.save_for_backward(matrix1, matrix2)
            output = KfacAddmm._get_output(ctx, add_matrix, inplace=inplace)
            return torch.addmm(beta, add_matrix, alpha, matrix1, matrix2,
                               out=output)

        @staticmethod
        def backward(ctx, grad_output):
            matrix1, matrix2 = ctx.saved_variables
            grad_matrix1 = grad_matrix2 = None
            if mode == 'capture':
                # insert(0, ...) because backward visits layers last-first;
                # the lists end up ordered first-layer-first.
                Bs.insert(0, grad_output.data)
                As.insert(0, matrix2.data)
            elif mode == 'kfac':
                B = grad_output.data
                A = matrix2.data
                # pop() consumes inverses from the end, matching the
                # last-layer-first order of the backward pass.
                kfac_A = As_inv.pop() @ A
                kfac_B = Bs_inv.pop() @ B
                grad_matrix1 = Variable(torch.mm(kfac_B, kfac_A.t()))
            elif mode == 'standard':
                grad_matrix1 = torch.mm(grad_output, matrix2.t())
            else:
                assert False, 'unknown mode ' + mode
            if ctx.needs_input_grad[2]:
                grad_matrix2 = torch.mm(matrix1.t(), grad_output)
            return None, grad_matrix1, grad_matrix2, None, None, None

    def kfac_matmul(mat1, mat2):
        # matmul expressed as addmm with beta=0 so KfacAddmm handles backward
        output = Variable(mat1.data.new(mat1.data.size(0),
                                        mat2.data.size(1)))
        return KfacAddmm.apply(output, mat1, mat2, 0, 1, True)

    torch.manual_seed(1)
    np.random.seed(1)
    if use_cuda:
        torch.cuda.manual_seed(1)

    # feature sizes at each layer
    fs = [dsize, 28 * 28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28 * 28]
    n = len(fs) - 2  # number of matmuls

    class Net(nn.Module):
        # Stack of n weight matrices W1..Wn using the capture-aware matmul.
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i),
                        nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            # Columns are examples: x is (features, batch).
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(kfac_matmul(W, x))
            return x.view_as(input)

    model = Net()
    if use_cuda:
        model.cuda()
    images = u.get_mnist_images()
    train_data0 = images[:, :dsize].astype(dtype)
    train_data = Variable(torch.from_numpy(train_data0))
    test_data0 = images[:, dsize:2 * dsize].astype(dtype)
    test_data = Variable(torch.from_numpy(test_data0))
    if use_cuda:
        train_data = train_data.cuda()
        test_data = test_data.cuda()

    model.train()
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        assert False, 'unknown optimizer ' + optimizer

    noise = torch.Tensor(*train_data.data.shape).type(torch_dtype)
    # 'exact' newton matrix: identity padded with zeros to the batch width.
    assert fs[-1] <= dsize
    padding = dsize - fs[-1]
    zero_mat = torch.zeros((fs[-1], padding))
    frozen = torch.cat([torch.eye(fs[-1]), zero_mat], 1).type(torch_dtype)

    # Layer-0 activation covariance inverse is cached: input never changes.
    covA_inv_saved = [None] * n
    losses = []
    vlosses = []
    for step in range(iters):
        # Pass 1: plain forward to get current reconstruction.
        mode = 'standard'
        output = model(train_data)
        if kfac:
            # Pass 2: backward against perturbed targets to capture stats.
            mode = 'capture'
            optimizer.zero_grad()
            del As[:], Bs[:], As_inv[:], Bs_inv[:]
            if newton_matrix == 'stochastic':
                noise.normal_()
                err_add = noise
            elif newton_matrix == 'exact':
                err_add = frozen
            else:
                assert False, ('unknown method for newton matrix ' +
                               newton_matrix)
            output_hat = Variable(output.data + err_add)
            err_hat = output_hat - output
            loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
            loss_hat.backward(retain_graph=True)

            # compute inverses
            for i in range(n):
                # first layer activations don't change, only compute once
                if i == 0 and covA_inv_saved[i] is not None:
                    covA_inv = covA_inv_saved[i]
                else:
                    covA_inv = regularized_inverse(As[i] @ As[i].t() / dsize)
                    covA_inv_saved[i] = covA_inv
                As_inv.append(covA_inv)
                covB = (Bs[i] @ Bs[i].t()) * dsize
                # alternative formula: slower but numerically better result
                # covB = (Bs[i]*dsize)@(Bs[i].t()*dsize)/dsize
                covB_inv = regularized_inverse(covB)
                Bs_inv.append(covB_inv)
            mode = 'kfac'
        else:
            mode = 'standard'

        # Periodic validation in 'standard' mode so nothing is captured.
        if step % eval_every_n_steps == 0:
            old_mode = mode
            mode = 'standard'
            test_output = model(test_data)
            test_err = test_data - test_output
            test_loss = torch.sum(test_err * test_err) / 2 / dsize
            vloss0 = test_loss.data.cpu().numpy()[0]
            vlosses.append(vloss0)
            mode = old_mode

        # Pass 3: real loss; in 'kfac' mode backward whitens the gradients.
        optimizer.zero_grad()
        err = output - train_data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()

        loss0 = loss.data.cpu().numpy()[0]
        losses.append(loss0)
        if step % print_interval == 0:
            print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    return losses, vlosses
def rotations2_newton_kfac():
    """Run KFAC-style Newton's method on the large_rotations2 dataset.

    Same setup as ``rotations2_newton_bd`` but the inverse Hessian is
    approximated block-diagonally with Kronecker factors:
    pinv(A A^T) (x) pinv(Bn Bn^T / dsize) per layer, zeros elsewhere.
    Takes 10 preconditioned steps, printing loss each step.

    Depends on module-level ``dtype``, ``t``, ``v2c_np`` and util module ``u``.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv',
                               delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    # FIX: this f/dsize/assert triple appeared twice verbatim; the duplicate
    # was removed (no behavior change).
    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's (linear network: no nonlinearity between layers)
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                # NOTE(review): acov has no /dsize while bcov does — the
                # overall 1/dsize scaling appears to be carried by bcov only;
                # confirm against the derivation.
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update
    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # two-phase update: compute into copy, then write back
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    # (removed unused local `elapsed_times`)
    u.reset_time()
    for i in range(10):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
def main():
    """Driver for the KFAC library: build model + Kfac corrector, train.

    Builds a session with graph rewriting disabled, a main model and a
    separate KFAC stats model, corrects Adam's gradients through
    ``kfac.correct`` and runs ``args.num_steps`` steps with periodic
    validation and Tensorboard logging.  In 'record'/'test' modes the loss
    trace is dumped/compared against the release targets.

    Relies on module-level: ``args``, ``model_creator``, ``Kfac``,
    ``kfac_lib``, ``IndexedGrad``, ``sessrun``, ``release_test_fn`` and
    util module ``u``.
    """
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):
        rewrite_options = None
        try:
            from tensorflow.core.protobuf import rewriter_config_pb2
            rewrite_options = rewriter_config_pb2.RewriterConfig(
                disable_model_pruning=True,
                constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
                memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
        # BUGFIX: was a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; only the optional-import failure
        # (older TF without rewriter_config_pb2) should be tolerated.
        except ImportError:
            pass
        optimizer_options = tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)
        graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                        rewrite_options=rewrite_options)
        gpu_options = tf.GPUOptions(allow_growth=False)
        config = tf.ConfigProto(graph_options=graph_options,
                                gpu_options=gpu_options,
                                log_device_placement=False)
        sess = tf.InteractiveSession(config=config)
        u.register_default_session(
            sess)  # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    kfac_lib.numeric_inverse = args.numeric_inverse
    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()  # resets optimization variables (not model variables)

    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    start_time = time.time()
    vloss0 = 0

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):
        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        start_time = time.time()
        print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
              (elapsed * 1e3, step, loss0, vloss0))

        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            with u.timeit("grad.update"):
                grad.update()
            with kfac.read_lock():
                grad_new.update()
            u.run(train_op)
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)
    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
        assert u.last_time() < 800, "Expected 648 on GTX 1080"
def rotations2_natural_empirical():
    """Natural gradient with the empirical Fisher on large_rotations2.

    Builds the full (non-factored) empirical Fisher from khatri-rao
    products of activations and backprops, pseudo-inverts it, and takes
    10 preconditioned steps using a two-op copy/write-back update to avoid
    in-place data races.

    Depends on module-level ``dtype``, ``vec``, ``v2c_np`` and util
    module ``u``.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv',
                               delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match
    # BUGFIX: this was `assert expr1, expr2 == Y0.shape`, the classic
    # assert-on-a-tuple mistake: it only asserted expr1 and used the
    # comparison as the assert *message*, so the shape check never ran.
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.000001, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1],
                         name="B" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race
    dW = [0] * (n + 1)
    updates1 = [0] * (n + 1)  # compute updated value into Wcopy
    updates2 = [0] * (n + 1)  # copy value back into W
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape,
                           name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))

    del dW[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # inverse fisher preconditioner: empirical Fisher from per-example
    # gradients (khatri-rao of activations and backprops)
    grads = tf.concat([u.khatri_rao(A[i], B[i]) for i in range(1, n + 1)],
                      axis=0)
    fisher = grads @ tf.transpose(grads) / dsize
    ifisher = u.pseudo_inverse(fisher)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype, shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
# NOTE(review): this is the tail of a training loop whose enclosing `def`
# and `for` are not visible in this chunk; names like step, loss0, loss1,
# slope_ratio, lr0, alpha, beta, growth_rate, vard, lr, sess come from that
# unseen scope.  Adaptive learning-rate adjustment followed by the release
# regression check.
if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in:
    # shrink if wrong prediction, don't shrink if prediction is tiny
    if slope_ratio < alpha and abs(target_delta) > 1e-6 and adaptive_step:
        print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
        print("Slope optimality %.2f, shrinking learning rate to %.2f" % (
            slope_ratio, lr0 * beta,))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})
    # grow learning rate, slope_ratio .99 worked best for gradient
    # NOTE(review): `i % 50` uses `i`, not `step` — looks like it should be
    # `step % 50`; verify against the enclosing loop variables.
    elif step > 0 and i % 50 == 0 and slope_ratio > 0.90 and adaptive_step:
        print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
        print("Growing learning rate to %.2f" % (lr0 * growth_rate))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * growth_rate})

u.record_time()

# check against expected loss
if 'Apple' in sys.version:
    pass
    # u.dump(losses, "kfac_small_final_mac.csv")
    targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
else:
    pass
    # u.dump(losses, "kfac_small_final_linux.csv")
    targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")

if len(sys.argv) > 1 and sys.argv[1] == "test":
    # GPU losses are quite noisy, set rtol high
    u.check_equal(targets, losses[:len(targets)], rtol=1e-3)
def rotations2_natural_sampled_kfac(num_samples=1):
  """KFAC with a natural-gradient preconditioner estimated from sampled
  backprops, on the large_rotations2 dataset.

  Loads X/Y/initial weights/layer sizes from data/large_rotations2_*.csv,
  builds an n-layer linear network, estimates the Kronecker-factored inverse
  Fisher from `num_samples` random backprop passes, and runs 20
  preconditioned gradient steps, printing the loss each step.

  Args:
    num_samples: number of sampled backprop passes used to estimate the
      Fisher factors (the dataset is replicated this many times).
  """
  tf.reset_default_graph()
  np.random.seed(0)
  tf.set_random_seed(0)

  # override kr with no-shape-inferring version
  def kr(A, B):
    return u.kronecker(A, B, do_shape_inference=False)

  X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
  Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
  W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
  fs = np.genfromtxt('data/large_rotations2_fs.csv',
                     delimiter=",").astype(np.int32)
  n = len(fs) - 2  # number of layers

  def f(i):
    return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

  dsize = X0.shape[1]
  assert f(-1) == dsize

  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  # NOTE(review): this Wf_copy binding is shadowed by the zero-initialized
  # Wf_copy created further down; kept to preserve the original graph.
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # initialize data + layers
  # W[0] is input matrix (X), W[n] is last matrix
  # A[1] has activations for W[1], equal to W[0]=X
  # A[n+1] has predictions
  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)

  A = [0] * (n + 2)
  A2 = [0] * (n + 2)  # augmented forward props for natural gradient
  A[0] = u.Identity(dsize)
  A2[0] = u.Identity(dsize * num_samples)
  for i in range(n + 1):
    # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
    A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
    if i == 0:
      # replicate dataset multiple times corresponding to number of samples
      A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
    else:
      A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

  # input dimensions match
  assert W[0].get_shape() == X0.shape
  # output dimensions match
  # BUG FIX: the original read `assert W[-1].get_shape()[0],
  # W[0].get_shape()[1] == Y0.shape`, which parses as `assert <expr>, <msg>`
  # — the shape comparison was the assert *message* and never ran.
  assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
  assert A[n + 1].get_shape() == Y0.shape

  err = Y - A[n + 1]
  loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
  # lower learning rate by 10x
  lr = tf.Variable(0.01, dtype=dtype)

  # create backprop matrices
  # B[i] has backprop for matrix i
  B = [0] * (n + 1)
  B2 = [0] * (n + 1)  # sampled backprops used for Fisher estimation
  B[n] = -err / dsize
  B2[n] = tf.random_normal((f(n), dsize * num_samples), 0, 1, seed=0,
                           dtype=dtype)
  for i in range(n - 1, -1, -1):
    B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))
    B2[i] = tf.matmul(tf.transpose(W[i + 1]), B2[i + 1], name="B2" + str(i))

  # Create gradient update. Make copy of variables and split update into
  # two run calls. Using a single set of variables gives updates that
  # occasionally produce wrong results/NaN's because of data race
  dW = [0] * (n + 1)
  dW2 = [0] * (n + 1)
  updates1 = [0] * (n + 1)  # compute updated value into Wcopy
  updates2 = [0] * (n + 1)  # copy value back into W
  Wcopy = [0] * (n + 1)
  for i in range(n + 1):
    Wi_name = "Wcopy" + str(i)
    Wi_shape = (fs[i + 1], fs[i])
    Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
    Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    dW2[i] = tf.matmul(B2[i], tf.transpose(A2[i]), name="dW2" + str(i))

  del dW[0]   # get rid of W[0] update
  del dW2[0]  # get rid of W[0] update

  # construct flattened gradient update vector
  dWf = tf.concat([vec(grad) for grad in dW], axis=0)

  # todo: divide both activations and backprops by size for cov calc
  # Kronecker factored covariance blocks (block-diagonal approximation:
  # off-diagonal blocks are zero)
  iblocks = u.empty_grid(n + 1, n + 1)
  for i in range(1, n + 1):
    for j in range(1, n + 1):
      if i == j:
        acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
        bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)), dtype=dtype)
      iblocks[i][j] = term

  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]

  ifisher = u.concat_blocks(iblocks)

  Wf_copy = tf.Variable(tf.zeros(dtype=dtype, shape=Wf.shape,
                                 name="Wf_copy_init"),
                        name="Wf_copy")
  new_val_matrix = Wf - lr * (ifisher @ dWf)
  # two-phase update to avoid the read/write race described above
  train_op1 = Wf_copy.assign(new_val_matrix)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  observed_losses = []
  u.reset_time()
  for i in range(20):
    loss0 = sess.run(loss)
    print(loss0)
    observed_losses.append(loss0)
    sess.run(train_op1)
    sess.run(train_op2)
    u.record_time()

  u.summarize_time()
  u.summarize_graph()
def lbfgs(opfunc, x, config, state, do_verbose):
  """Port of lbfgs.lua, using TensorFlow eager mode.

  Args:
    opfunc: callable returning (f, g) — loss and flat gradient — at a point.
    x: starting point (flat tensor); updated in place via `+=`.
    config: options object (maxIter, maxEval, tolFun, tolX, nCorrection,
      lineSearch, lineSearchOptions, learningRate, verbose).
    state: mutable state object carried across calls (nIter, funcEval, ...).
    do_verbose: if true, print loss/timing each iteration.

  Returns:
    (x, f_hist, currentFuncEval): final point, list of losses, eval count.
  """
  global final_loss, times

  # Lua-style defaulting kept from the original port: falsy config values
  # fall back to the defaults on the right.
  maxIter = config.maxIter or 20
  maxEval = config.maxEval or maxIter*1.25
  tolFun = config.tolFun or 1e-5
  tolX = config.tolX or 1e-9
  nCorrection = config.nCorrection or 100
  lineSearch = config.lineSearch
  lineSearchOpts = config.lineSearchOptions
  learningRate = config.learningRate or 1
  isverbose = config.verbose or False

  # verbose function
  if isverbose:
    verbose = verbose_func
  else:
    verbose = lambda x: None

  # evaluate initial f(x) and df/dx
  f, g = opfunc(x)
  f_hist = [f]
  currentFuncEval = 1
  state.funcEval = state.funcEval + 1
  p = g.shape[0]

  # check optimality of initial point
  tmp1 = tf.abs(g)
  if tf.reduce_sum(tmp1) <= tolFun:
    verbose("optimality condition below tolFun")
    return x, f_hist

  # optimize for a max of maxIter iterations
  nIter = 0
  times = []
  while nIter < maxIter:
    start_time = time.time()

    # keep track of nb of iterations
    nIter = nIter + 1
    state.nIter = state.nIter + 1

    ############################################################
    ## compute gradient descent direction
    ############################################################
    if state.nIter == 1:
      d = -g
      old_dirs = []
      old_stps = []
      Hdiag = 1
    else:
      # do lbfgs update (update memory)
      y = g - g_old
      s = d*t
      ys = dot(y, s)
      if ys > 1e-10:
        # updating memory
        if len(old_dirs) == nCorrection:
          # shift history by one (limited-memory)
          del old_dirs[0]
          del old_stps[0]
        # store new direction/step
        old_dirs.append(s)
        old_stps.append(y)
        # update scale of initial Hessian approximation
        Hdiag = ys/dot(y, y)

      # compute the approximate (L-BFGS) inverse Hessian
      # multiplied by the gradient
      k = len(old_dirs)

      # need to be accessed element-by-element, so don't re-type tensor:
      ro = [0]*nCorrection
      for i in range(k):
        ro[i] = 1/dot(old_stps[i], old_dirs[i])

      # iteration in L-BFGS loop collapsed to use just one buffer
      # need to be accessed element-by-element, so don't re-type tensor:
      al = [0]*nCorrection

      q = -g
      for i in range(k-1, -1, -1):
        al[i] = dot(old_dirs[i], q) * ro[i]
        q = q - al[i]*old_stps[i]

      # multiply by initial Hessian
      r = q*Hdiag
      for i in range(k):
        be_i = dot(old_stps[i], r) * ro[i]
        r += (al[i]-be_i)*old_dirs[i]

      d = r  # final direction is in r/d (same object)

    g_old = g
    f_old = f

    ############################################################
    ## compute step length
    ############################################################
    # directional derivative
    gtd = dot(g, d)

    # check that progress can be made along that direction
    if gtd > -tolX:
      verbose("Can not make progress along direction.")
      break

    # reset initial guess for step size
    if state.nIter == 1:
      tmp1 = tf.abs(g)
      t = min(1, 1/tf.reduce_sum(tmp1))
    else:
      t = learningRate

    # optional line search: user function
    lsFuncEval = 0
    # BUG FIX: the original tested `isinstance(lineSearch) ==
    # types.FunctionType` — isinstance requires two arguments, so this
    # raised TypeError whenever a line-search function was supplied.
    if lineSearch and isinstance(lineSearch, types.FunctionType):
      # perform line search, using user function
      f, g, x, t, lsFuncEval = lineSearch(opfunc, x, t, d, f, g, gtd,
                                          lineSearchOpts)
      f_hist.append(f)
    else:
      # no line search, simply move with fixed-step
      x += t*d
      if nIter != maxIter:
        # re-evaluate function only if not in last iteration
        # the reason we do this: in a stochastic setting,
        # no use to re-evaluate that function here
        f, g = opfunc(x)
        lsFuncEval = 1
        f_hist.append(f)

    # update func eval
    currentFuncEval = currentFuncEval + lsFuncEval
    state.funcEval = state.funcEval + lsFuncEval

    ############################################################
    ## check conditions
    ############################################################
    if nIter == maxIter:
      break

    if currentFuncEval >= maxEval:
      # max nb of function evals
      verbose('max nb of function evals')
      break

    tmp1 = tf.abs(g)
    if tf.reduce_sum(tmp1) <= tolFun:
      # check optimality
      verbose('optimality condition below tolFun')
      break

    tmp1 = tf.abs(d*t)
    if tf.reduce_sum(tmp1) <= tolX:
      # step size below tolX
      verbose('step size below tolX')
      break

    if tf.abs(f-f_old) < tolX:
      # function value changing less than tolX
      verbose('function value changing less than tolX'+str(tf.abs(f-f_old)))
      break

    if do_verbose:
      print("Step %3d loss %6.5f msec %6.3f"%(nIter, f.numpy(),
                                              u.last_time()))
      u.record_time()
      times.append(u.last_time())

    if nIter == maxIter - 1:
      final_loss = f.numpy()

  # save state
  state.old_dirs = old_dirs
  state.old_stps = old_stps
  state.Hdiag = Hdiag
  state.g_old = g_old
  state.f_old = f_old
  state.t = t
  state.d = d

  return x, f_hist, currentFuncEval
expected_slope = -grad2_norm_op.eval() # ratio of best possible slope to actual slope # don't divide by actual slope because that can be 0 slope_ratio = abs(actual_slope)/abs(expected_slope) costs.append(cost0) step_lengths.append(lr0) ratios.append(slope_ratio) if i%10 == 0: print("Learning rate: %f"% (lr0,)) print("Cost %.2f, expected decrease %.2f, actual decrease, %.2f ratio %.2f"%(cost0, expected_delta, actual_delta, slope_ratio)) # don't shrink learning rate once results are very close to minimum if slope_ratio < alpha and abs(target_delta)>1e-6: print("%.2f %.2f %.2f"%(cost0, cost1, slope_ratio)) print("Slope optimality %.2f, shrinking learning rate to %.2f"%(slope_ratio, lr0*beta,)) sess.run(lr_set, feed_dict={lr_p: lr0*beta}) else: # see if our learning rate got too conservative, and increase it if i>0 and i%10 == 0 and slope_ratio>0.99: print("%.2f %.2f %.2f"%(cost0, cost1, slope_ratio)) print("Growing learning rate to %.2f"%(lr0*growth_rate)) sess.run(lr_set, feed_dict={lr_p: lr0*growth_rate}) u.record_time() u.dump(step_lengths, "step_lengths_ada.csv") # u.dump(costs, "costs_ada.csv") # u.dump(ratios, "ratios_ada.csv")