Code example #1
def opt_step():
    with tf.GradientTape() as g2nd: # second order derivative
        with tf.GradientTape() as g1st: # first order derivative
            cost = f()
        grads = g1st.gradient(cost, xyz) # gradient
        vs = [tf.random.normal(w.shape) for w in xyz] # a random vector
    hess_vs = g2nd.gradient(grads, xyz, vs) # Hessian-vector products
    [old.assign(new) for (old, new) in zip([L12, l3, U12, u3], psgd.update_precond_splu(L12, l3, U12, u3, vs, hess_vs, step=0.1))] # update the SPLU preconditioner
    pre_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads) # preconditioned gradients
    [w.assign_sub(0.1*g) for (w, g) in zip(xyz, pre_grads)] # update parameters
    return cost
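The step function above returns the current loss, so it can be driven from a plain eager-mode loop. A minimal driver sketch, assuming `opt_step`, `f`, `xyz` and the SPLU factors are defined as in the snippet (the iteration count below is arbitrary and not part of the original code):

losses = []
for _ in range(500):  # iteration count chosen only for illustration
    losses.append(opt_step().numpy())  # opt_step returns the cost as a tf.Tensor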
Code example #2
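This PyTorch snippet starts after the SPLU preconditioner has been partially created; the lines below are a sketch of the initialization it assumes for `L12` and `l3`, reconstructed from the pattern visible in examples #3 and #4 (the exact initial scale varies between the demos, so treat these as an assumption rather than part of the original file):

# lower triangular factor is [L1, 0; L2, diag(l3)]; L12 stacks [L1; L2]
L12 = torch.cat([torch.eye(r), torch.zeros([num_para - r, r])], dim=0)
l3 = torch.ones([num_para - r, 1])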
U12 = torch.cat([torch.eye(r), torch.zeros([r, num_para - r])], dim=1)
u3 = torch.ones([num_para - r, 1])
# begin iteration here
step_size = 0.02
grad_norm_clip_thr = 1.0
Loss = []
for num_iter in range(10000):
    x, y = get_batches()
    
    # calculate loss and gradient
    loss = train_criterion(Ws, x, y)
    grads = grad(loss, Ws, create_graph=True)
    Loss.append(loss.item())
    
    # update preconditioners
    delta = [torch.randn(W.size()) for W in Ws]
    grad_delta = sum([torch.sum(g*d) for (g, d) in zip(grads, delta)])
    hess_delta = grad(grad_delta, Ws)
    with torch.no_grad():
        L12, l3, U12, u3 = psgd.update_precond_splu(L12, l3, U12, u3, delta, hess_delta) 
        # update Ws
        pre_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads)
        grad_norm = torch.sqrt(sum([torch.sum(g*g) for g in pre_grads]))
        step_adjust = min(grad_norm_clip_thr/(grad_norm + 1.2e-38), 1.0)
        for i in range(len(Ws)):
            Ws[i] -= step_adjust*step_size*pre_grads[i]
            
        if num_iter % 100 == 0:
            print('training loss: {}'.format(Loss[-1]))
    
plt.semilogy(Loss)
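Note that `create_graph=True` on the first `grad` call is what makes the second call possible: differentiating the scalar `grad_delta = <grads, delta>` with respect to `Ws` yields the Hessian-vector products `H·delta` without ever forming the Hessian, and `step_adjust` simply clips the preconditioned-gradient norm to `grad_norm_clip_thr` before the parameter update.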
Code example #3
File: demo_psgd_splu.py  Project: lixilinx/psgd_np
l3 = 0.1*np.ones([num_para - r, 1])
U12 = np.concatenate([np.eye(r), np.zeros([r, num_para - r])], axis=1)
u3 = np.ones([num_para - r, 1])
splu_Loss = []
splu_Times = []
splu_Iter = np.linspace(1, 5000, 5000).tolist()

t0 = time.time()
for num_iter in range(5000):
    loss, grads = trd_cost_grad(T, [x, y, z])
    splu_Loss.append(loss)
    t1 = time.time()
    splu_Times.append(t1-t0)
    dx, dy, dz = sqrt_eps*np.random.randn(R, I), sqrt_eps*np.random.randn(R, J), sqrt_eps*np.random.randn(R, K)
    _, perturbed_grads = trd_cost_grad(T, [x + dx, y + dy, z + dz])
    L12, l3, U12, u3 = psgd.update_precond_splu(L12, l3, U12, u3, [dx, dy, dz],                          
                                                [perturbed_grads[0] - grads[0],                              
                                                 perturbed_grads[1] - grads[1],                           
                                                 perturbed_grads[2] - grads[2]])
    pre_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads)
    x -= 0.5*pre_grads[0]
    y -= 0.5*pre_grads[1]
    z -= 0.5*pre_grads[2]

#plt.subplot(121)
#plt.loglog(splu_Loss)
#plt.subplot(122)
#plt.loglog(splu_Times, splu_Loss)
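Unlike the autograd-based examples, this NumPy version approximates the Hessian-vector products by finite differences: the factors are perturbed by small random directions `dx, dy, dz` (scaled by `sqrt_eps`), and the resulting gradient differences `perturbed_grads - grads` are passed to `update_precond_splu` in place of exact Hessian-vector products.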

Code example #4
    l3 = 0.1 * torch.ones(num_para - r, 1)
    # upper triangular matrix is [U1, U2; 0, diag(u3)]; U12 is [U1, U2]
    U12 = 0.1 * torch.cat([torch.eye(r), torch.zeros(r, num_para - r)], dim=1)
    u3 = 0.1 * torch.ones(num_para - r, 1)

    for _ in range(200):
        loss = f()
        f_values.append(loss.item())
        grads = torch.autograd.grad(loss, xyz, create_graph=True)
        vs = [torch.randn_like(w) for w in xyz]
        Hvs = torch.autograd.grad(grads, xyz, vs)
        with torch.no_grad():
            L12, l3, U12, u3 = psgd.update_precond_splu(L12,
                                                        l3,
                                                        U12,
                                                        u3,
                                                        vs,
                                                        Hvs,
                                                        step=0.1)
            pre_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads)
            [w.subtract_(0.1 * g) for (w, g) in zip(xyz, pre_grads)]

elif demo_case == 'Kronecker_product_preconditioner':
    # # example 1
    # Qs = [[0.1*torch.eye(R), torch.stack([torch.ones(I), torch.zeros(I)], dim=0)], # (dense, normalization) format
    #       [0.1*torch.ones(1, R), torch.eye(J)], # (scaling, dense) format
    #       [0.1*torch.ones(1, R), torch.stack([torch.ones(K), torch.zeros(K)], dim=0)],] # (scaling, normalization) format

    # example 2
    Qs = [
        [
Code example #5
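This last example uses the TensorFlow 1.x graph API: the preconditioned parameter updates (`update_Ws`) and the preconditioner updates (`update_Q`) are built once as ops and then executed together with the training loss via `sess.run` inside the loop.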
u3 = tf.Variable(tf.ones([num_para - r, 1], dtype=dtype), trainable=False)

train_loss = train_criterion(Ws)
grads = tf.gradients(train_loss, Ws)

precond_grads = psgd.precond_grad_splu(L12, l3, U12, u3, grads)
grad_norm = tf.sqrt(tf.reduce_sum([tf.reduce_sum(g*g) for g in precond_grads]))
step_size_adjust = tf.minimum(1.0, grad_norm_clip_thr/(grad_norm + 1.2e-38))
new_Ws = [W - (step_size_adjust*step_size)*g for (W, g) in zip(Ws, precond_grads)]
update_Ws = [tf.assign(W, new_W) for (W, new_W) in zip(Ws, new_Ws)]

delta_Ws = [tf.random_normal(W.shape, dtype=dtype) for W in Ws]
grad_deltaw = tf.reduce_sum([tf.reduce_sum(g*v) for (g, v) in zip(grads, delta_Ws)]) # dot(grads, delta_Ws)
hess_deltaw = tf.gradients(grad_deltaw, Ws) # Hessian * delta_Ws

new_L12, new_l3, new_U12, new_u3 = psgd.update_precond_splu(L12, l3, U12, u3, delta_Ws, hess_deltaw)
update_Q = [tf.assign(L12, new_L12), tf.assign(l3, new_l3),
            tf.assign(U12, new_U12), tf.assign(u3, new_u3)]

test_loss = test_criterion(Ws)

sess.run(tf.global_variables_initializer())
avg_train_loss = 0.0
TrainLoss = list()
TestLoss = list()
Time = list()
for num_iter in range(20000):
    _train_inputs, _train_outputs = get_batches()

    t0 = time.time()
    _train_loss, _, _ = sess.run([train_loss, update_Ws, update_Q],