def opt_step():
    with tf.GradientTape() as g2nd:     # outer tape for second-order derivatives
        with tf.GradientTape() as g1st: # inner tape for first-order derivatives
            cost = f()
        grads = g1st.gradient(cost, xyz)              # gradients
        vs = [tf.random.normal(w.shape) for w in xyz] # random probe vectors
    hess_vs = g2nd.gradient(grads, xyz, vs)           # Hessian-vector products
    # update the sparse LU (SPLU) preconditioner with the (vs, hess_vs) pair
    [old.assign(new) for (old, new) in zip([L12, l3, U12, u3],
                                           psgd.update_precond_splu(L12, l3, U12, u3, vs, hess_vs, step=0.1))]
    pre_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads) # preconditioned gradients
    [w.assign_sub(0.1*g) for (w, g) in zip(xyz, pre_grads)]     # update parameters
    return cost
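# A minimal driver sketch for opt_step above, assuming a toy objective; the
# module name follows the PSGD demos, while f, the shapes, and the order r
# chosen below are illustrative assumptions, not part of the original.
import tensorflow as tf
import preconditioned_stochastic_gradient_descent as psgd

xyz = [tf.Variable(tf.random.normal([10, 1])) for _ in range(3)]  # trainable parameters

def f():  # any scalar loss of xyz works here
    return sum(tf.reduce_sum(w*w + tf.cos(w)) for w in xyz)

num_para = sum(int(tf.size(w)) for w in xyz)
r = 5  # SPLU preconditioner order (assumed value)
L12 = tf.Variable(tf.concat([tf.eye(r), tf.zeros([num_para - r, r])], axis=0), trainable=False)
l3 = tf.Variable(tf.ones([num_para - r, 1]), trainable=False)
U12 = tf.Variable(tf.concat([tf.eye(r), tf.zeros([r, num_para - r])], axis=1), trainable=False)
u3 = tf.Variable(tf.ones([num_para - r, 1]), trainable=False)

for _ in range(100):
    cost = opt_step()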
U12 = torch.cat([torch.eye(r), torch.zeros([r, num_para - r])], dim=1)
u3 = torch.ones([num_para - r, 1])

# begin iteration here
step_size = 0.02
grad_norm_clip_thr = 1.0
Loss = []
for num_iter in range(10000):
    x, y = get_batches()
    # calculate loss and gradient
    loss = train_criterion(Ws, x, y)
    grads = grad(loss, Ws, create_graph=True)
    Loss.append(loss.item())
    # update preconditioners
    delta = [torch.randn(W.size()) for W in Ws]
    grad_delta = sum([torch.sum(g*d) for (g, d) in zip(grads, delta)])
    hess_delta = grad(grad_delta, Ws)  # Hessian-vector products via double backprop
    with torch.no_grad():
        L12, l3, U12, u3 = psgd.update_precond_splu(L12, l3, U12, u3, delta, hess_delta)
        # update Ws
        pre_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads)
        grad_norm = torch.sqrt(sum([torch.sum(g*g) for g in pre_grads]))
        step_adjust = min(grad_norm_clip_thr/(grad_norm + 1.2e-38), 1.0)
        for i in range(len(Ws)):
            Ws[i] -= step_adjust*step_size*pre_grads[i]
    if num_iter % 100 == 0:
        print('training loss: {}'.format(Loss[-1]))

plt.semilogy(Loss)
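# For reference: the loop above reads L12 and l3 before the fragment defines
# them. A sketch of the initialization it appears to assume, mirroring the
# other SPLU demos in this section (r is the user-chosen preconditioner
# order); the import supplies the `grad` used inside the loop:
from torch.autograd import grad
num_para = sum(W.numel() for W in Ws)
L12 = torch.cat([torch.eye(r), torch.zeros([num_para - r, r])], dim=0)  # [L1; L2] of the lower factor
l3 = torch.ones([num_para - r, 1])                                      # its diagonal tail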
l3 = 0.1*np.ones([num_para - r, 1])
U12 = np.concatenate([np.eye(r), np.zeros([r, num_para - r])], axis=1)
u3 = np.ones([num_para - r, 1])

splu_Loss = []
splu_Times = []
splu_Iter = np.linspace(1, 5000, 5000).tolist()
t0 = time.time()
for num_iter in range(5000):
    loss, grads = trd_cost_grad(T, [x, y, z])
    splu_Loss.append(loss)
    t1 = time.time()
    splu_Times.append(t1 - t0)
    # probe the Hessian with a small random perturbation (finite differences)
    dx, dy, dz = sqrt_eps*np.random.randn(R, I), sqrt_eps*np.random.randn(R, J), sqrt_eps*np.random.randn(R, K)
    _, perturbed_grads = trd_cost_grad(T, [x + dx, y + dy, z + dz])
    L12, l3, U12, u3 = psgd.update_precond_splu(L12, l3, U12, u3, [dx, dy, dz],
                                                [perturbed_grads[0] - grads[0],
                                                 perturbed_grads[1] - grads[1],
                                                 perturbed_grads[2] - grads[2]])
    pre_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads)
    x -= 0.5*pre_grads[0]
    y -= 0.5*pre_grads[1]
    z -= 0.5*pre_grads[2]

#plt.subplot(121)
#plt.loglog(splu_Loss)
#plt.subplot(122)
#plt.loglog(splu_Times, splu_Loss)
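# Why the update above needs no autodiff: for a small perturbation delta,
# grad(params + delta) - grad(params) ~= Hessian @ delta, so the pair (delta,
# gradient difference) stands in for an exact Hessian-vector product. A
# self-contained check on a quadratic (A, x0, dx below are made up for
# illustration):
import numpy as np
rng = np.random.default_rng(0)
A = rng.standard_normal((5, 5))
A = A @ A.T                                          # SPD Hessian of f(x) = 0.5*x'Ax
x0 = rng.standard_normal(5)
dx = np.sqrt(np.finfo(np.float64).eps) * rng.standard_normal(5)
grad_diff = A @ (x0 + dx) - A @ x0                   # perturbed gradient minus gradient
assert np.allclose(grad_diff, A @ dx)                # equals H @ dx (exactly, since f is quadratic)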
    l3 = 0.1 * torch.ones(num_para - r, 1)
    # upper triangular matrix is [U1, U2; 0, diag(u3)]; U12 is [U1, U2]
    U12 = 0.1 * torch.cat([torch.eye(r), torch.zeros(r, num_para - r)], dim=1)
    u3 = 0.1 * torch.ones(num_para - r, 1)
    for _ in range(200):
        loss = f()
        f_values.append(loss.item())
        grads = torch.autograd.grad(loss, xyz, create_graph=True)
        vs = [torch.randn_like(w) for w in xyz]    # random probe vectors
        Hvs = torch.autograd.grad(grads, xyz, vs)  # Hessian-vector products
        with torch.no_grad():
            L12, l3, U12, u3 = psgd.update_precond_splu(L12, l3, U12, u3, vs, Hvs, step=0.1)
            pre_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads)
            [w.subtract_(0.1 * g) for (w, g) in zip(xyz, pre_grads)]  # update parameters
elif demo_case == 'Kronecker_product_preconditioner':
    # # example 1
    # Qs = [[0.1*torch.eye(R), torch.stack([torch.ones(I), torch.zeros(I)], dim=0)],       # (dense, normalization) format
    #       [0.1*torch.ones(1, R), torch.eye(J)],                                          # (scaling, dense) format
    #       [0.1*torch.ones(1, R), torch.stack([torch.ones(K), torch.zeros(K)], dim=0)],]  # (scaling, normalization) format
    # example 2
    Qs = [
        [
u3 = tf.Variable(tf.ones([num_para - r, 1], dtype=dtype), trainable=False)

train_loss = train_criterion(Ws)
grads = tf.gradients(train_loss, Ws)
precond_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads)
grad_norm = tf.sqrt(tf.reduce_sum([tf.reduce_sum(g*g) for g in precond_grads]))
step_size_adjust = tf.minimum(1.0, grad_norm_clip_thr/(grad_norm + 1.2e-38))
new_Ws = [W - (step_size_adjust*step_size)*g for (W, g) in zip(Ws, precond_grads)]
update_Ws = [tf.assign(W, new_W) for (W, new_W) in zip(Ws, new_Ws)]

delta_Ws = [tf.random_normal(W.shape, dtype=dtype) for W in Ws]
grad_deltaw = tf.reduce_sum([tf.reduce_sum(g*v) for (g, v) in zip(grads, delta_Ws)]) # dot(grads, delta_Ws)
hess_deltaw = tf.gradients(grad_deltaw, Ws) # Hessian * delta_Ws
new_L12, new_l3, new_U12, new_u3 = psgd.update_precond_splu(L12, l3, U12, u3, delta_Ws, hess_deltaw)
update_Q = [tf.assign(L12, new_L12), tf.assign(l3, new_l3),
            tf.assign(U12, new_U12), tf.assign(u3, new_u3)]

test_loss = test_criterion(Ws)

sess.run(tf.global_variables_initializer())
avg_train_loss = 0.0
TrainLoss = list()
TestLoss = list()
Time = list()
for num_iter in range(20000):
    _train_inputs, _train_outputs = get_batches()
    t0 = time.time()
    _train_loss, _, _ = sess.run([train_loss, update_Ws, update_Q],