def test_numpy(X):
    dsize = X.shape[1]
    cov = X @ X.T
    #  cov = cov/np.max(cov)
    precision = np.linalg.inv(cov)

    n = cov.shape[0]
    B = np.zeros((n, n))
    lr = 0.01
    losses = []
    for i in range(100):
        R = B @ X - X
        G = 2 * R @ X.T
        np.fill_diagonal(G, 0)

        resvar = np.asarray([np.linalg.norm(r)**2 for r in R])
        losses.append(np.sum(resvar))
        D2 = np.diag(1 / resvar)
        precision2 = D2 @ (np.identity(n) - B)

        err = (precision2 - precision)
        loss2 = np.trace(err @ err.T)
        B = B - lr * G
        print(loss2)

    test_points = 10
    losses = np.asarray(losses)[:test_points]
    target_losses = [
        118., 41.150800000000004, 33.539355199999996, 29.747442032320002,
        27.450672271574934, 25.95846376879459, 24.917943341139274,
        24.139761502111114, 23.519544126307142, 22.998235729589265
    ]

    u.check_equal(losses[:test_points], target_losses[:test_points])
    print('mismatch is ', np.max(losses - target_losses))
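
# A minimal stand-alone sketch of the identity test_numpy converges to: B is learned by
# gradient descent under the constraint diag(B) = 0, and diag(1/resvar) @ (I - B) should
# recover inv(X @ X.T). The same identity can be checked exactly by solving each
# zero-diagonal regression in closed form; the helper below is illustrative only.
import numpy as np

def regression_precision(X):
    """Recover inv(X @ X.T) from per-row regressions with zero self-weight."""
    C = X @ X.T
    n = C.shape[0]
    B = np.zeros((n, n))
    resvar = np.zeros(n)
    for i in range(n):
        idx = [j for j in range(n) if j != i]
        b = np.linalg.solve(C[np.ix_(idx, idx)], C[idx, i])  # regress row i on the others
        B[i, idx] = b
        resvar[i] = C[i, i] - C[i, idx] @ b                  # residual sum of squares
    return np.diag(1 / resvar) @ (np.eye(n) - B)

_rng = np.random.default_rng(0)
_X = _rng.standard_normal((4, 50))
assert np.allclose(regression_precision(_X), np.linalg.inv(_X @ _X.T))
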
def test_diagonal_hessian():
    u.seed_random(1)
    A, model = create_toy_model()

    activations = {}

    def save_activations(layer, a, _):
        if layer != model.layers[0]:
            return
        activations[layer] = a

    with autograd_lib.module_hook(save_activations):
        Y = model(A.t())
        loss = torch.sum(Y * Y) / 2

    hess = [0]

    def compute_hess(layer, _, B):
        if layer != model.layers[0]:
            return
        A = activations[layer]
        hess[0] += torch.einsum("ni,nj->ij", B * B, A * A).reshape(-1)

    with autograd_lib.module_hook(compute_hess):
        autograd_lib.backprop_identity(Y, retain_graph=True)

    # check against autograd
    hess0 = u.hessian(loss, model.layers[0].weight).reshape([4, 4])
    u.check_equal(hess[0], torch.diag(hess0))

    # check against manual solution
    u.check_equal(hess[0], [425., 225., 680., 360.])
def test_full_hessian_multibatch():
    A, model = create_toy_model()
    data = A.t()
    data = data.repeat(3, 1)
    n = float(len(data))

    activations = {}
    hess = defaultdict(float)

    def save_activations(layer, a, _):
        activations[layer] = a

    def compute_hessian(layer, _, B):
        A = activations[layer]
        BA = torch.einsum("nl,ni->nli", B, A)
        hess[layer] += torch.einsum('nli,nkj->likj', BA, BA)

    for x in data:
        with autograd_lib.module_hook(save_activations):
            y = model(x)
            loss = torch.sum(y * y) / 2

        with autograd_lib.module_hook(compute_hessian):
            autograd_lib.backprop_identity(y)

    result = hess[model.layers[0]]

    # check result against autograd
    loss = u.least_squares(model(data), aggregation='sum')
    hess0 = u.hessian(loss, model.layers[0].weight)
    u.check_equal(hess0, result)
def test_kfac_hessian():
    A, model = create_toy_model()
    data = A.t()
    data = data.repeat(7, 1)
    n = float(len(data))

    activations = {}
    hess = defaultdict(lambda: AttrDefault(float))

    def save_activations(layer, a, _):
        activations[layer] = a

    def compute_hessian(layer, _, B):
        A = activations[layer]
        hess[layer].AA += torch.einsum("ni,nj->ij", A, A)
        hess[layer].BB += torch.einsum("ni,nj->ij", B, B)

    for x in data:
        with autograd_lib.module_hook(save_activations):
            y = model(x)
            o = y.shape[1]
            loss = torch.sum(y * y) / 2

        with autograd_lib.module_hook(compute_hessian):
            autograd_lib.backprop_identity(y)

    hess0 = hess[model.layers[0]]
    result = u.kron(hess0.BB / n, hess0.AA / o)

    # check result against autograd
    loss = u.least_squares(model(data), aggregation='sum')
    hess0 = u.hessian(loss, model.layers[0].weight).reshape(4, 4)
    u.check_equal(hess0, result)
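
# A minimal stand-alone sketch of the Kronecker structure that the BB/AA factoring above
# exploits, for the special case of a single linear layer with summed square loss: the exact
# Hessian w.r.t. the row-vectorized weight is kron(I, A^T A). Illustrative only, nothing here
# comes from autograd_lib.
import torch
torch.set_default_dtype(torch.float64)
_do, _di, _n = 2, 3, 5
_A = torch.randn(_n, _di)                                 # activations
_W = torch.randn(_do, _di)

def _sum_square_loss(w):
    return 0.5 * (_A @ w.t()).pow(2).sum()

_H = torch.autograd.functional.hessian(_sum_square_loss, _W).reshape(_do * _di, _do * _di)
assert torch.allclose(_H, torch.kron(torch.eye(_do), _A.t() @ _A))
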
def test_kfac_jacobian_mnist():
    u.seed_random(1)

    data_width = 3
    d = [data_width**2, 8, 10]
    model: u.SimpleMLP = u.SimpleMLP(d, nonlin=False)
    autograd_lib.register(model)

    batch_size = 4
    stats_steps = 2
    n = batch_size * stats_steps

    dataset = u.TinyMNIST(dataset_size=n,
                          data_width=data_width,
                          original_targets=True)
    trainloader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=False)
    train_iter = iter(trainloader)

    loss_fn = torch.nn.CrossEntropyLoss()

    activations = {}
    jacobians = defaultdict(lambda: AttrDefault(float))
    total_data = []

    # sum up statistics over n examples
    for train_step in range(stats_steps):
        data, targets = next(train_iter)
        total_data.append(data)

        activations = {}

        def save_activations(layer, A, _):
            activations[layer] = A
            jacobians[layer].AA += torch.einsum("ni,nj->ij", A, A)

        with autograd_lib.module_hook(save_activations):
            output = model(data)
            loss = loss_fn(output, targets)

        def compute_jacobian(layer, _, B):
            A = activations[layer]
            jacobians[layer].BB += torch.einsum("ni,nj->ij", B, B)
            jacobians[layer].diag += torch.einsum("ni,nj->ij", B * B, A * A)

        with autograd_lib.module_hook(compute_jacobian):
            autograd_lib.backward_jacobian(output)

    for layer in model.layers:
        jacobian0 = jacobians[layer]
        jacobian_full = torch.einsum('kl,ij->kilj', jacobian0.BB / n,
                                     jacobian0.AA / n)
        jacobian_diag = jacobian0.diag / n

        J = u.jacobian(model(torch.cat(total_data)), layer.weight)
        J_autograd = torch.einsum('noij,nokl->ijkl', J, J) / n
        u.check_equal(jacobian_full, J_autograd)

        u.check_equal(jacobian_diag, torch.einsum('ikik->ik', J_autograd))
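
# The diagonal comparison above rests on diag(P ⊗ Q) = outer(diag(P), diag(Q)); a quick
# stand-alone check of that identity (illustrative, not part of the test):
import torch
_P, _Q = torch.randn(3, 3), torch.randn(4, 4)
_full = torch.einsum('kl,ij->kilj', _P, _Q)               # Kronecker product kept as a 4-d tensor
assert torch.allclose(torch.einsum('kiki->ki', _full),
                      torch.outer(torch.diag(_P), torch.diag(_Q)))
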
def test_gradient_norms():
    """Per-example gradient norms."""
    u.seed_random(1)
    A, model = create_toy_model()

    activations = {}

    def save_activations(layer, a, _):
        if layer != model.layers[0]:
            return
        activations[layer] = a

    with autograd_lib.module_hook(save_activations):
        Y = model(A.t())
        loss = torch.sum(Y * Y) / 2

    norms = {}

    def compute_norms(layer, _, b):
        if layer != model.layers[0]:
            return
        a = activations[layer]
        del activations[layer]  # release memory kept by activations
        norms[layer] = (a * a).sum(dim=1) * (b * b).sum(dim=1)

    with autograd_lib.module_hook(compute_norms):
        loss.backward()

    u.check_equal(norms[model.layers[0]], [3493250, 9708800])
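
# compute_norms above uses the fact that the per-example gradient of a linear layer is the
# outer product b_n a_n^T, so its squared norm factors into (a_n . a_n)(b_n . b_n). A minimal
# single-example check with plain autograd (illustrative, independent of autograd_lib):
import torch
torch.set_default_dtype(torch.float64)
_a = torch.randn(3)
_w = torch.randn(2, 3, requires_grad=True)
_y = _w @ _a
(0.5 * (_y * _y).sum()).backward()
_b = _y.detach()                                          # dloss/dy equals y for this loss
assert torch.allclose(_w.grad, torch.outer(_b, _a))
assert torch.allclose(_w.grad.norm() ** 2, (_a @ _a) * (_b @ _b))
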
Example #7
def vargroup_test():
  sess = tf.InteractiveSession()
  v1 = tf.Variable(1.)
  v2 = tf.Variable(1.)
  a = VarList([v1, v2])
  b = a.copy()
  sess.run(tf.global_variables_initializer())
  sess.run(a.sub(b, weight=1))
  u.check_equal(v1.eval(), 0)
def test_kfac_fisher_mnist():
    u.seed_random(1)

    data_width = 3
    d = [data_width**2, 8, 10]
    model: u.SimpleMLP = u.SimpleMLP(d, nonlin=False)
    autograd_lib.register(model)

    batch_size = 4
    stats_steps = 2
    n = batch_size * stats_steps

    dataset = u.TinyMNIST(dataset_size=n,
                          data_width=data_width,
                          original_targets=True)
    trainloader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              shuffle=False)
    train_iter = iter(trainloader)

    loss_fn = torch.nn.CrossEntropyLoss()

    activations = {}
    fishers = defaultdict(lambda: AttrDefault(float))
    total_data = []

    # sum up statistics over n examples
    for train_step in range(stats_steps):
        data, targets = next(train_iter)
        total_data.append(data)

        activations = {}

        def save_activations(layer, A, _):
            activations[layer] = A
            fishers[layer].AA += torch.einsum("ni,nj->ij", A, A)

        with autograd_lib.module_hook(save_activations):
            output = model(data)
            loss = loss_fn(output, targets) * len(data)  # undo the per-batch averaging

        def compute_fisher(layer, _, B):
            A = activations[layer]
            fishers[layer].BB += torch.einsum("ni,nj->ij", B, B)
            fishers[layer].diag += torch.einsum("ni,nj->ij", B * B, A * A)

        with autograd_lib.module_hook(compute_fisher):
            autograd_lib.backward_jacobian(output)

    for layer in model.layers:
        fisher0 = fishers[layer]
        fisher_full = torch.einsum('kl,ij->kilj', fisher0.BB / n,
                                   fisher0.AA / n)
        fisher_diag = fisher0.diag / n

        u.check_equal(torch.einsum('ikik->ik', fisher_full), fisher_diag)
def test_full_hessian_xent_kfac2():
    """Test with uneven layers."""
    u.seed_random(1)
    torch.set_default_dtype(torch.float64)

    batch_size = 1
    d = [3, 2]
    o = d[-1]
    n = batch_size
    train_steps = 1

    model: u.SimpleModel = u.SimpleFullyConnected2(d, nonlin=True, bias=False)
    autograd_lib.register(model)
    loss_fn = torch.nn.CrossEntropyLoss()

    data = u.to_logits(torch.tensor([[0.7, 0.2, 0.1]]))
    targets = torch.tensor([0])

    data = data.repeat([3, 1])
    targets = targets.repeat([3])
    n = len(data)

    activations = {}
    hess = defaultdict(lambda: AttrDefault(float))

    for i in range(n):

        def save_activations(layer, A, _):
            activations[layer] = A
            hess[layer].AA += torch.einsum("ni,nj->ij", A, A)

        with autograd_lib.module_hook(save_activations):
            data_batch = data[i:i + 1]
            targets_batch = targets[i:i + 1]
            Y = model(data_batch)
            o = Y.shape[1]
            loss = loss_fn(Y, targets_batch)

        def compute_hess(layer, _, B):
            hess[layer].BB += torch.einsum("ni,nj->ij", B, B)

        with autograd_lib.module_hook(compute_hess):
            autograd_lib.backward_hessian(Y, loss='CrossEntropy')

    # expand
    hess_factored = hess[model.layers[0]]
    hess0 = torch.einsum('kl,ij->kilj', hess_factored.BB / n,
                         hess_factored.AA / o)  # hess for sum loss
    hess0 /= n  # hess for mean loss

    # check against autograd
    # 0.1459
    Y = model(data)
    loss = loss_fn(Y, targets)
    hess_autograd = u.hessian(loss, model.layers[0].weight)
    u.check_equal(hess_autograd, hess0)
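
# backward_hessian(Y, loss='CrossEntropy') above presumably propagates the Hessian of the
# cross-entropy loss w.r.t. the logits, which is diag(p) - p p^T for softmax probabilities p.
# A quick check of that formula with plain autograd (illustrative, not from autograd_lib):
import torch
import torch.nn.functional as F
torch.set_default_dtype(torch.float64)
_logits = torch.randn(1, 3)
_target = torch.tensor([0])
_H = torch.autograd.functional.hessian(lambda z: F.cross_entropy(z, _target), _logits)
_p = torch.softmax(_logits, dim=1).flatten()
assert torch.allclose(_H.reshape(3, 3), torch.diag(_p) - torch.outer(_p, _p))
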
def test_full_hessian_xent_multibatch():
    u.seed_random(1)
    torch.set_default_dtype(torch.float64)

    batch_size = 1
    d = [2, 2]
    o = d[-1]
    n = batch_size
    train_steps = 1

    model: u.SimpleModel = u.SimpleFullyConnected2(d, nonlin=True, bias=True)
    model.layers[0].weight.data.copy_(torch.eye(2))
    autograd_lib.register(model)
    loss_fn = torch.nn.CrossEntropyLoss()

    data = u.to_logits(torch.tensor([[0.7, 0.3]]))
    targets = torch.tensor([0])

    data = data.repeat([3, 1])
    targets = targets.repeat([3])
    n = len(data)

    activations = {}
    hess = defaultdict(float)

    def save_activations(layer, a, _):
        activations[layer] = a

    for i in range(n):
        with autograd_lib.module_hook(save_activations):
            data_batch = data[i:i + 1]
            targets_batch = targets[i:i + 1]
            Y = model(data_batch)
            loss = loss_fn(Y, targets_batch)

        def compute_hess(layer, _, B):
            A = activations[layer]
            BA = torch.einsum("nl,ni->nli", B, A)
            hess[layer] += torch.einsum('nli,nkj->likj', BA, BA)

        with autograd_lib.module_hook(compute_hess):
            autograd_lib.backward_hessian(Y, loss='CrossEntropy')

    # check against autograd
    # 0.1459
    Y = model(data)
    loss = loss_fn(Y, targets)
    hess_autograd = u.hessian(loss, model.layers[0].weight)
    hess0 = hess[model.layers[0]] / n
    u.check_equal(hess_autograd, hess0)
def test_full_hessian():
    u.seed_random(1)
    A, model = create_toy_model()
    data = A.t()
    #    data = data.repeat(3, 1)
    activations = {}

    hess = defaultdict(float)

    def save_activations(layer, a, _):
        activations[layer] = a

    with autograd_lib.module_hook(save_activations):
        Y = model(A.t())
        loss = torch.sum(Y * Y) / 2

    def compute_hess(layer, _, B):
        A = activations[layer]
        n = A.shape[0]

        di = A.shape[1]
        do = B.shape[1]

        BA = torch.einsum("nl,ni->nli", B, A)
        hess[layer] += torch.einsum('nli,nkj->likj', BA, BA)

    with autograd_lib.module_hook(compute_hess):
        autograd_lib.backprop_identity(Y, retain_graph=True)

    # check against autograd
    hess_autograd = u.hessian(loss, model.layers[0].weight)
    hess0 = hess[model.layers[0]]
    u.check_equal(hess_autograd, hess0)

    # check against manual solution
    u.check_equal(hess0.reshape(4, 4),
                  [[425, -75, 170, -30], [-75, 225, -30, 90],
                   [170, -30, 680, -120], [-30, 90, -120, 360]])
Example #12
def simple_newton_kfac_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}
  
  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]
    
  # inverse Hessian blocks
  iblocks = u.empty_grid(n+1, n+1)
  for i in range(1, n+1):
    for j in range(1, n+1):
      # reuse Hess tensor calculation in order to get off-diag block sizes
      dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
      if i == j:
        acov = A[i] @ t(A[j])
        bcov = Bn[i] @ t(Bn[j]) / dsize
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
      iblocks[i][j] = term
        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]
    
  ihess = u.concat_blocks(iblocks)
  
  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  
  expected_losses = np.loadtxt("data/rotations_simple_newtonkfac_losses.csv",
                               delimiter= ",")
  observed_losses = []

  # from accompanying notebook
  #  {0.0111498, 0.0000171591, 4.11445*10^-11, 2.33653*10^-22, 
  # 6.88354*10^-33,
 
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
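
# The block inverse above applies u.pseudo_inverse to each Kronecker factor separately, which
# is justified by (P ⊗ Q)^+ = P^+ ⊗ Q^+. A quick NumPy check of that identity (illustrative,
# not from the util module):
import numpy as np
_rng = np.random.default_rng(0)
_P = _rng.standard_normal((3, 3)); _P = _P @ _P.T         # symmetric PSD, like the A/B covariances
_Q = _rng.standard_normal((2, 2)); _Q = _Q @ _Q.T
assert np.allclose(np.linalg.pinv(np.kron(_P, _Q)),
                   np.kron(np.linalg.pinv(_P), np.linalg.pinv(_Q)))
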
Example #13
def sparse_autoencoder_cost_tf(theta, visible_size, hidden_size, lambda_,
                               sparsity_param, beta, data):

    # TODO: get rid of b
    #  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
    #  fs = [10000, 28*28, 196, 28*28]
    #  n = len(fs)-2
    #  W = [None]*n

    W1 = theta[0:hidden_size * visible_size].reshape(hidden_size,
                                                     visible_size,
                                                     order='F')
    W2 = theta[hidden_size * visible_size:2 * hidden_size *
               visible_size].reshape(visible_size, hidden_size, order='F')
    b1 = theta[2 * hidden_size * visible_size:2 * hidden_size * visible_size +
               hidden_size]
    b2 = theta[2 * hidden_size * visible_size + hidden_size:]

    init_dict = {}

    def init_var(val, name):
        holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder")
        var = tf.Variable(holder, name=name + "_var")
        init_dict[holder] = val
        return var

    W1_ = init_var(W1, "W1")
    W2_ = init_var(W2, "W2")
    b1_ = init_var(u.v2c_np(b1), "b1")
    b2_ = init_var(u.v2c_np(b2), "b2")
    data_ = init_var(data, "data")
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # Number of training examples
    m = data.shape[1]
    a1 = data
    a1_ = data_

    # Forward propagation
    z2 = W1.dot(a1) + np.tile(b1, (m, 1)).transpose()
    z2_ = tf.matmul(W1_, a1_) + b1_

    a2 = sigmoid(z2)
    a2_ = tf.sigmoid(z2_)
    z3 = W2.dot(a2) + np.tile(b2, (m, 1)).transpose()
    z3_ = tf.matmul(W2_, a2_) + b2_
    a3 = sigmoid(z3)
    a3_ = tf.sigmoid(z3_)

    # Sparsity
    rho_hat = np.sum(a2, axis=1) / m
    rho_hat_ = tf.reduce_sum(a2_, axis=1, keep_dims=True) / m
    rho = np.tile(sparsity_param, hidden_size)
    # ValueError: Shape must be rank 1 but is rank 0 for 'Tile_2' (op: 'Tile') with input shapes: [], [].
    rho_ = tf.constant(sparsity_param, dtype=dtype)
    #tf.ones((hidden_size, 1), dtype=dtype)*sparsity_param

    u.check_equal(sess.run(a3_), a3)
    u.check_equal(sess.run(a2_), a2)
    u.check_equal(sess.run(a1_), a1)
    u.check_equal(
        tf.reduce_sum(KL_divergence_tf(rho_, rho_hat_)).eval(),
        np.sum(KL_divergence(rho, rho_hat)))

    # Cost function
    cost = np.sum((a3 - a1) ** 2) / (2 * m) + \
           (lambda_ / 2) * (np.sum(W1 ** 2) + np.sum(W2 ** 2)) + \
           beta * np.sum(KL_divergence(rho, rho_hat))
    cost_ = tf.reduce_sum((a3_ - a1_) ** 2) / (2 * m) + \
            (lambda_ / 2) * (tf.reduce_sum(W1_ ** 2) +
                             tf.reduce_sum(W2_ ** 2)) + \
            beta * tf.reduce_sum(KL_divergence_tf(rho_, rho_hat_))
    return sess.run(cost_)
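
# KL_divergence / KL_divergence_tf are not defined in this snippet; presumably they implement
# the element-wise Bernoulli KL used as the sparse-autoencoder sparsity penalty. A minimal
# NumPy version for reference (an assumption about the helper, not the original code):
import numpy as np

def _bernoulli_kl(rho, rho_hat):
    """Element-wise KL(Bernoulli(rho) || Bernoulli(rho_hat))."""
    return rho * np.log(rho / rho_hat) + (1 - rho) * np.log((1 - rho) / (1 - rho_hat))

assert np.isclose(_bernoulli_kl(0.05, 0.05), 0.0)
assert _bernoulli_kl(0.05, 0.5) > 0                       # penalty grows as rho_hat drifts from rho
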
Example #14
def relu_gradient_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_relu_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [4,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0, name="X0")
  Y = tf.constant(Y0, name="Y0")
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    if i == 0:
      A[i+1] = X
    else:
      A[i+1] = tf.nn.relu(tf.matmul(W[i], A[i], name="A"+str(i+1)))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.1, dtype=dtype)
  
  # Create B's
  B = [0]*(n+1)
  B[n] = (-err/dsize)*u.relu_mask(A[n+1])
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    if i > 0:  # there's no relu on first matrix
      B[i] = B[i]*u.relu_mask(A[i+1])

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
  
  expected_losses = np.loadtxt("data/rotations_relu_gradient_losses.csv",
                               delimiter= ",")
  observed_losses = []
  
  # From accompanying notebook
  #  {0.407751, 0.0683822, 0.0138657, 0.0039221, 0.00203637, 0.00164892,
  #    0.00156137, 0.00153857, 0.00153051, 0.00152593}
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
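
# u.relu_mask is not defined in this snippet; the manual backprop above only needs
# d relu(z)/dz = 1 where the unit fired and 0 elsewhere, presumably what relu_mask returns:
import numpy as np
_act = np.maximum(np.array([-1.0, 0.5, 2.0]), 0.0)        # post-ReLU activations
_mask = (_act > 0).astype(_act.dtype)
assert np.allclose(np.array([3.0, 3.0, 3.0]) * _mask, [0.0, 3.0, 3.0])
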
Example #15
def simple_newton_bd_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}
  
  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]

  # Create U's
  U = [list(range(n+1)) for _ in range(n+1)]
  for bottom in range(n+1):
    for top in range(n+1):
      if bottom > top:
        prod = u.Identity(f(top))
      else:
        prod = u.Identity(f(bottom-1))
        for i in range(bottom, top+1):
          prod = prod@t(W[i])
      U[bottom][top] = prod

  # Block i, j gives hessian block between layer i and layer j
  blocks = [list(range(n+1)) for _ in range(n+1)]
  for i in range(1, n+1):
    for j in range(1, n+1):
      term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
      if i == j:
        term2 = tf.zeros((f(i)*f(i-1), f(i)*f(i-1)), dtype=dtype)
      elif i < j:
        term2 = kr(A[i] @ t(B[j]), U[i+1][j-1])
      else:
        term2 = kr(t(U[j+1][i-1]), B[i] @ t(A[j]))

      blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j-1))

        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del blocks[0]
  for row in blocks:
    del row[0]
    
  #hess = u.concat_blocks(blocks)
  ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))
  #  ihess = u.pseudo_inverse(hess)
  
  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  
  expected_losses = np.loadtxt("data/rotations_simple_newtonbd_losses.csv",
                               delimiter= ",")
  observed_losses = []
  
  # from accompanying notebook
  # 0.0111498, 0.0000171591, 4.11445*10^-11, 2.33652*10^-22, 
  # 1.21455*10^-32,
 
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
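
# Kmat is not defined in this snippet; it presumably builds the commutation matrix K_{m,n}
# satisfying K @ vec(A) = vec(A.T) under column-major vectorization, which is what lets the
# off-diagonal Hessian blocks above be written with a single Kronecker product. A NumPy
# sketch of that object (an assumption about the helper, not the original code):
import numpy as np

def _commutation_matrix(m, n):
    """K such that K @ vec(A) = vec(A.T) for an m x n matrix A (column-major vec)."""
    K = np.zeros((m * n, m * n))
    for i in range(m):
        for j in range(n):
            K[i * n + j, j * m + i] = 1.0                  # vec(A.T) reads entry (i, j) of A
    return K

_A = np.arange(6.0).reshape(2, 3)
assert np.allclose(_commutation_matrix(2, 3) @ _A.flatten(order='F'), _A.T.flatten(order='F'))
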
        def compare(a, b, msg):
            infnorm = np.linalg.norm(a - b, np.inf)
            l2norm = np.linalg.norm(a - b)
            m = min(np.max(a), np.max(b))
            print("Dtype %s, %s, l_inf %s, l2 %s" %
                  (dtype_name, msg, infnorm / eps, l2norm / eps))

        assert len(autograds) == n + 1
        for i in range(1, n + 1):
            op = autograds[i].op
            assert op.op_def.name == 'MatMul'
            assert op.get_attr("transpose_a") == False
            assert op.get_attr("transpose_b") == True
            autoB = op.inputs[0]
            autoA = op.inputs[1]
            u.check_equal(op.inputs[1], A[i], rtol=1e-5, atol=1e-7)
            u.check_equal(autograds[i], dW[i], rtol=1e-5, atol=1e-7)
            u.check_equal(op.inputs[0], B[i] / dsize, rtol=1e-6, atol=1e-7)
            compare(A[i].eval(), autoA.eval(), "A[%d]" % (i, ))
            compare((B[i] / dsize).eval(), autoB.eval(), "B[%d]" % (i, ))
            compare(dW[i].eval(), autograds[i].eval(), "dW[%d]" % (i, ))

        lr0, loss0 = sess.run([lr, loss])
        save_params_op.run()
        util.dump32(Wf_copy, "%s_param_%d" % (prefix, step))
        util.dump32(grad, "%s_grad_%d" % (prefix, step))
        util.dump32(pre_grad, "%s_pre_grad_%d" % (prefix, step))

        #    util.dump32(A[1], "%s_param_%d"%(prefix, step))

        # regular inverse becomes unstable when grad norm exceeds 1
Example #17
def simple_gradient_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))


  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(1.0, dtype=dtype)
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
  
  expected_losses = np.loadtxt("data/rotations_simple_gradient_losses.csv",
                               delimiter= ",")
  observed_losses = []
  # from accompanying notebook
  # {0.0111498, 0.00694816, 0.00429464, 0.00248228, 0.00159361,
  #  0.000957424, 0.000651653, 0.000423802, 0.000306749, 0.00021772,
  for i in range(20):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
Example #19
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    [v.initializer.run() for v in opt_vars]

    losses = []
    u.record_time()

    for i in range(num_steps):
        loss0 = model.loss.eval()
        losses.append(loss0)
        print("Loss ", loss0)

        kfac.model.advance_batch()
        kfac.update_stats()

        model.advance_batch()
        grad.update()
        grad_new.update()
        train_op.run()

        u.record_time()

    if len(sys.argv) > 1 and sys.argv[1] == 'record':
        u.dump(losses, prefix + "_losses.csv")
        sys.exit()

    u.summarize_time()
    targets = np.loadtxt("data/kfac_refactor_test7_losses.csv", delimiter=",")
    print("Difference is ", np.linalg.norm(np.asarray(losses) - targets))
    result = u.check_equal(losses, targets, rtol=1e-4)
    print("Test passed: %s" % (result, ))
Example #20
def main():
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):

        rewrite_options = None
        try:
            from tensorflow.core.protobuf import rewriter_config_pb2
            rewrite_options = rewriter_config_pb2.RewriterConfig(
                disable_model_pruning=True,
                constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
                memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
        except:
            pass

        optimizer_options = tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)
        graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                        rewrite_options=rewrite_options)
        gpu_options = tf.GPUOptions(allow_growth=False)
        config = tf.ConfigProto(graph_options=graph_options,
                                gpu_options=gpu_options,
                                log_device_placement=False)

        sess = tf.InteractiveSession(config=config)
        u.register_default_session(
            sess)  # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    kfac_lib.numeric_inverse = args.numeric_inverse
    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()  # resets optimization variables (not model variables)

    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    start_time = time.time()
    vloss0 = 0

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):

        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        start_time = time.time()
        print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
              (elapsed * 1e3, step, loss0, vloss0))

        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            with u.timeit("grad.update"):
                grad.update()
            with kfac.read_lock():
                grad_new.update()
            u.run(train_op)
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)

    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
        assert u.last_time() < 800, "Expected 648 on GTX 1080"
Example #21
def main():
  np.random.seed(args.seed)
  tf.set_random_seed(args.seed)

  logger = u.TensorboardLogger(args.run)
  
  with u.timeit("init/session"):
    gpu_options = tf.GPUOptions(allow_growth=False)
    sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
    u.register_default_session(sess)   # since default session is Thread-local

  with u.timeit("init/model_init"):
    model = model_creator(args.batch_size, name="main")
    model.initialize_global_vars(verbose=True)
    model.initialize_local_vars()
  
  with u.timeit("init/kfac_init"):
    kfac = Kfac(model_creator, args.kfac_batch_size) 
    kfac.model.initialize_global_vars(verbose=False)
    kfac.model.initialize_local_vars()
    kfac.Lambda.set(args.Lambda)
    kfac.reset()    # resets optimization variables (not model variables)

  if args.mode != 'run':
    opt = tf.train.AdamOptimizer(0.001)
  else:
    opt = tf.train.AdamOptimizer(args.lr)
  grads_and_vars = opt.compute_gradients(model.loss,
                                         var_list=model.trainable_vars)
      
  grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
  grad_new = kfac.correct(grad)
  with u.capture_vars() as adam_vars:
    train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
  with u.timeit("init/adam"):
    sessrun([v.initializer for v in adam_vars])
  
  losses = []
  u.record_time()

  start_time = time.time()
  vloss0 = 0

  # todo, unify the two data outputs
  outfn = 'data/%s_%f_%f.csv'%(args.run, args.lr, args.Lambda)
  writer = u.BufferedWriter(outfn, 60)   # get rid?

  start_time = time.time()
  if args.extra_kfac_batch_advance:
    kfac.model.advance_batch()  # advance kfac batch

  if args.kfac_async:
    kfac.start_stats_runners()
    
  for step in range(args.num_steps):
    
    if args.validate_every_n and step%args.validate_every_n == 0:
      loss0, vloss0 = sessrun([model.loss, model.vloss])
    else:
      loss0, = sessrun([model.loss])
    losses.append(loss0)  # TODO: remove this

    logger('loss/loss', loss0, 'loss/vloss', vloss0)
    
    elapsed = time.time()-start_time
    print("%d sec, step %d, loss %.2f, vloss %.2f" %(elapsed, step, loss0,
                                                     vloss0))
    writer.write('%d, %f, %f, %f\n'%(step, elapsed, loss0, vloss0))

    if args.method=='kfac' and not args.kfac_async:
      kfac.model.advance_batch()
      kfac.update_stats()

    with u.timeit("train"):
      model.advance_batch()
      grad.update()
      with kfac.read_lock():
        grad_new.update()
      train_op.run()
      u.record_time()

    logger.next_step()

  # TODO: use u.global_runs_dir
  # TODO: get rid of u.timeit?
  
  with open('timelines/graphdef.txt', 'w') as f:
    f.write(str(u.get_default_graph().as_graph_def()))

  u.summarize_time()
  
  if args.mode == 'record':
    u.dump_with_prompt(losses, release_test_fn)

  elif args.mode == 'test':
    targets = np.loadtxt('data/'+release_test_fn, delimiter=",")
    u.check_equal(losses, targets, rtol=1e-2)
    u.summarize_difference(losses, targets)
Example #22
def main():

    u.seed_random(1)
    logdir = u.create_local_logdir(args.logdir)
    run_name = os.path.basename(logdir)
    gl.event_writer = SummaryWriter(logdir)
    print(f"Logging to {run_name}")

    d1 = args.data_width ** 2
    assert args.data_width == args.targets_width
    o = d1
    n = args.stats_batch_size
    d = [d1, 30, 30, 30, 20, 30, 30, 30, d1]

    # small values for debugging
    # loss_type = 'LeastSquares'
    loss_type = 'CrossEntropy'

    args.wandb = 0
    args.stats_steps = 10
    args.train_steps = 10
    args.stats_batch_size = 10
    args.data_width = 2
    args.targets_width = 2
    args.nonlin = False
    d1 = args.data_width ** 2
    d2 = 2
    d3 = args.targets_width ** 2

    if loss_type == 'CrossEntropy':
        d3 = 10
    o = d3
    n = args.stats_batch_size
    d = [d1, d2, d3]
    dsize = max(args.train_batch_size, args.stats_batch_size)+1

    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin)
    model = model.to(gl.device)

    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    #optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss
    dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, dataset_size=dsize, original_targets=True)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_iter = None
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(dataset, batch_size=args.stats_batch_size, shuffle=False, drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width, targets_width=args.targets_width, train=False, dataset_size=dsize, original_targets=True)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.train_batch_size, shuffle=False, drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    elif loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.token_count = 0
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars({"time/outer": 1000*(time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            print("val_loss", val_loss.item())
            val_losses.append(val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)
        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()   # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)

        for (i, layer) in enumerate(model.layers):

            # input/output layers are unreasonably expensive if not using Kronecker factoring
            if d[i]>50 or d[i+1]>50:
                print(f'layer {i} is too big ({d[i],d[i+1]}), skipping stats')
                continue

            if args.skip_stats:
                continue

            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # add factor of n because backprop takes loss averaged over batch, while we need per-example loss
            B_t = layer.backprops_list[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            u.check_equal(G.reshape(layer.weight.grad1.shape), layer.weight.grad1)

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]), layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()  # proportion of activations that are zero
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma)/s.sigma_l2

            lambda_regularizer = args.lmb * torch.eye(d[i + 1]*d[i]).to(gl.device)
            H = layer.weight.hess

            with u.timeit(f"invH-{i}"):
                invH = torch.cholesky_inverse(H+lambda_regularizer)

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1], d[i] * d[i + 1])
                u.check_close(H, H_autograd)

            #  u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')
            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) - 0.5 * eps ** 2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() / (dd.flatten().norm() ** 2))

            with u.timeit(f"pinvH-{i}"):
                pinvH = u.pinv(H)

            with u.timeit(f'curv-{i}'):
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre', pinvH)  # save Newton preconditioner
                s.step_openai = s.grad_fro**2 / s.grad_curv if s.grad_curv else 999
                s.step_max = 2 / s.H_l2
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm()  # frobenius norm of Newton update
                s.regret_newton = u.to_python_scalar(g @ pinvH @ g.t() / 2)   # replace with "quadratic_form"
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_svd(H, sigma)
                if u.has_nan(p_sigma) and args.compute_rho:  # use expensive method
                    print('using expensive method')
                    import pdb; pdb.set_trace()
                    H0, sigma0 = u.to_numpys(H, sigma)
                    p_sigma = scipy.linalg.solve_lyapunov(H0, sigma0)
                    p_sigma = torch.tensor(p_sigma).to(gl.device)

                if u.has_nan(p_sigma):
                    # import pdb; pdb.set_trace()
                    s.psigma_erank = H.shape[0]
                    s.rho = 1
                else:
                    s.psigma_erank = u.sym_erank(p_sigma)
                    s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                s.diversity = torch.norm(G, "fro") ** 2 / torch.norm(g) ** 2 / n

                # Faster approaches for noise variance computation
                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                #            u.log_scalar(train_loss=loss.item())

                if args.method != 'newton':
                    optimizer.step()
                    if args.weight_decay:
                        for group in optimizer.param_groups:
                            for param in group['params']:
                                param.data.mul_(1-args.weight_decay)
                else:
                    for (layer_idx, layer) in enumerate(model.layers):
                        param: torch.nn.Parameter = layer.weight
                        param_data: torch.Tensor = param.data
                        param_data.copy_(param_data - 0.1 * param.grad)
                        if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                            continue
                        u.nan_check(layer.weight.pre)
                        u.nan_check(param.grad.flatten())
                        u.nan_check(u.v2r(param.grad.flatten()) @ layer.weight.pre)
                        param_new_flat = u.v2r(param_data.flatten()) - u.v2r(param.grad.flatten()) @ layer.weight.pre
                        u.nan_check(param_new_flat)
                        param_data.copy_(param_new_flat.reshape(param_data.shape))

                gl.token_count += data.shape[0]

    gl.event_writer.close()
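
# The curvature-based step sizes collected in the stats loop above (step_openai, step_max = 2 / H_l2)
# come from the local quadratic model of the loss: for an update theta -> theta - eta * g the model
# m(eta) = -eta * (g.g) + 0.5 * eta^2 * (g.H.g) is minimized at eta* = (g.g) / (g.H.g). A tiny
# autograd check of that stationarity condition (illustrative only):
import torch
torch.set_default_dtype(torch.float64)
_H = torch.tensor([[2.0, 0.3], [0.3, 1.0]])
_g = torch.tensor([1.0, -2.0])
_eta = ((_g @ _g) / (_g @ _H @ _g)).detach().clone().requires_grad_(True)
_m = -_eta * (_g @ _g) + 0.5 * _eta ** 2 * (_g @ _H @ _g)
_m.backward()
assert torch.allclose(_eta.grad, torch.zeros(()))         # gradient of the model vanishes at eta*
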
Example #23

    if step % report_frequency == 0:
      print("Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"%(step, loss0, target_delta, actual_delta, slope_ratio, grad_norm.eval(), pre_grad_norm.eval()))
    
    if adaptive_step_frequency and adaptive_step and step>adaptive_step_burn_in:
      # shrink if wrong prediction, don't shrink if prediction is tiny
      if slope_ratio < alpha and abs(target_delta)>1e-6 and adaptive_step:
        print("%.2f %.2f %.2f"%(loss0, loss1, slope_ratio))
        print("Slope optimality %.2f, shrinking learning rate to %.2f"%(slope_ratio, lr0*beta,))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0*beta})
        
      # grow learning rate, slope_ratio .99 worked best for gradient
      elif step>0 and i%50 == 0 and slope_ratio>0.90 and adaptive_step:
          print("%.2f %.2f %.2f"%(loss0, loss1, slope_ratio))
          print("Growing learning rate to %.2f"%(lr0*growth_rate))
          sess.run(vard[lr].setter, feed_dict={vard[lr].p:
                                               lr0*growth_rate})

    u.record_time()
  u.summarize_time()
  if len(sys.argv)>1 and sys.argv[1]=='record':
    u.dump(losses, prefix+"_losses.csv")
    sys.exit()
    
  targets = np.loadtxt("data/"+prefix+"_losses.csv", delimiter=",")
  print("Difference is ", np.linalg.norm(np.asarray(losses)-targets))
  u.check_equal(losses, targets, rtol=1e-5)
  print("Test passed")

Example #24
def test_conv_grad():
    """Test per-example gradient computation for conv layer.

    """

    u.seed_random(1)
    N, Xc, Xh, Xw = 3, 2, 3, 7
    dd = [Xc, 2]

    Kh, Kw = 2, 3
    Oh, Ow = Xh - Kh + 1, Xw - Kw + 1
    model = u.SimpleConvolutional(dd, kernel_size=(Kh, Kw), bias=True).double()

    weight_buffer = model.layers[0].weight.data

    # output channels, input channels, height, width
    assert weight_buffer.shape == (dd[1], dd[0], Kh, Kw)

    input_dims = N, Xc, Xh, Xw
    size = int(np.prod(input_dims))
    X = torch.arange(0, size).reshape(*input_dims).double()

    def loss_fn(data):
        err = data.reshape(len(data), -1)
        return torch.sum(err * err) / 2 / len(data)

    layer = model.layers[0]
    output = model(X)
    loss = loss_fn(output)
    loss.backward()

    u.check_equal(layer.activations, X)

    assert layer.backprops_list[0].shape == layer.output.shape
    assert layer.output.shape == (N, dd[1], Oh, Ow)

    out_unf = layer.weight.view(layer.weight.size(0), -1) @ unfold(layer.activations, (Kh, Kw))
    assert out_unf.shape == (N, dd[1], Oh * Ow)
    reshaped_bias = layer.bias.reshape(1, dd[1], 1)  # (Co,) -> (1, Co, 1)
    out_unf = out_unf + reshaped_bias

    u.check_equal(fold(out_unf, (Oh, Ow), (1, 1)), output)  # two alternative ways of reshaping
    u.check_equal(out_unf.view(N, dd[1], Oh, Ow), output)

    # Unfold produces patches with output dimension merged, while in backprop they are not merged
    # Hence merge the output (width/height) dimension
    assert unfold(layer.activations, (Kh, Kw)).shape == (N, Xc * Kh * Kw, Oh * Ow)
    assert layer.backprops_list[0].shape == (N, dd[1], Oh, Ow)

    grads_bias = layer.backprops_list[0].sum(dim=(2, 3)) * N
    mean_grad_bias = grads_bias.sum(dim=0) / N
    u.check_equal(mean_grad_bias, layer.bias.grad)

    Bt = layer.backprops_list[0] * N   # remove factor of N applied during loss batch averaging
    assert Bt.shape == (N, dd[1], Oh, Ow)
    Bt = Bt.reshape(N, dd[1], Oh*Ow)
    At = unfold(layer.activations, (Kh, Kw))
    assert At.shape == (N, dd[0] * Kh * Kw, Oh*Ow)

    grad_unf = torch.einsum('ijk,ilk->ijl', Bt, At)
    assert grad_unf.shape == (N, dd[1], dd[0] * Kh * Kw)

    grads = grad_unf.reshape((N, dd[1], dd[0], Kh, Kw))
    u.check_equal(grads.mean(dim=0), layer.weight.grad)

    # compute per-example gradients using autograd, compare against manual computation
    for i in range(N):
        u.clear_backprops(model)
        output = model(X[i:i + 1, ...])
        loss = loss_fn(output)
        loss.backward()
        u.check_equal(grads[i], layer.weight.grad)
        u.check_equal(grads_bias[i], layer.bias.grad)
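
# The unfold-based gradient computation above relies on conv2d being a matrix multiply over
# unfolded input patches. A minimal stand-alone check of that identity (illustrative only):
import torch
import torch.nn.functional as F
torch.set_default_dtype(torch.float64)
_N, _Ci, _H, _W, _Co, _Kh, _Kw = 2, 3, 5, 6, 4, 2, 3
_x = torch.randn(_N, _Ci, _H, _W)
_w = torch.randn(_Co, _Ci, _Kh, _Kw)
_out = F.conv2d(_x, _w)                                   # (N, Co, H - Kh + 1, W - Kw + 1)
_patches = F.unfold(_x, (_Kh, _Kw))                       # (N, Ci*Kh*Kw, #patches)
_out_unf = _w.view(_Co, -1) @ _patches                    # matmul over unfolded patches
assert torch.allclose(_out_unf.view_as(_out), _out)
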
Example #25
    if len(sys.argv) > 1 and sys.argv[1] == 'maketest':
        print("Generating test data.")
        costs = []
        for i in range(10):
            cost0, _ = sess.run([cost, train_op])
            costs.append(cost0)
        open("data/train_losses.csv", "w").write(str(costs))

    elif len(sys.argv) > 1 and sys.argv[1] == 'test':
        print("Running self test.")
        costs = []
        for i in range(10):
            cost0, _ = sess.run([cost, train_op])
            costs.append(cost0)
        u.check_equal(costs,
                      eval(open("data/train_losses.csv").read()),
                      rtol=1e-3,
                      atol=1e-5)
        print("Test passed")

    else:
        print("Running training.")
        do_images = True
        u.reset_time()
        old_cost = sess.run(cost)
        old_i = 0
        frame_count = 0
        if do_images:
            os.system("rm pics/weights*.png")
        for i in range(1000):
            cost0, _ = sess.run([cost, train_op])
            if i % 100 == 0:
Example #26
def test_explicit_hessian():
    """Check computation of hessian of loss(B'WA) from https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf


    """

    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())

    u.check_equal(loss, 12512.5)

    # PyTorch autograd computes the Hessian with respect to row-vectorized parameters, whereas
    # autograd_lib follows the math convention and column-vectorizes.
    # Commuting the order of the Kronecker product switches between the two representations.
    u.check_equal(hess1.commute(), hess0)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()    # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    u.check_equal(hess0, u.Kron(B @ B.t(), A @ A.t()))

    # compute newton step
    u.check_equal(u.Kron(A @ A.t(), B @ B.t()).pinv() @ u.vec(G), u.v2c([-5, -2, 0, -6]))

    # compute Newton step using factored representation
    autograd_lib.add_hooks(model)

    Y = model(A.t())
    n = 2
    loss = torch.sum(Y * Y) / 2
    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron', vecr_order=False, loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120], [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column-major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row-major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
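
# A minimal standalone sketch (torch only) of the vectorization convention checked
# above: for Y = B' X A and loss = |Y|^2 / 2, the column-vectorized ("math
# convention") Hessian is kron(A A', B B') while PyTorch autograd's row-vectorized
# Hessian is kron(B B', A A'); commuting the Kronecker factors switches between the two.
import torch

def vec(M):    # column-major vectorization
    return M.t().reshape(-1)

def vecr(M):   # row-major vectorization
    return M.reshape(-1)

A = torch.tensor([[-1., 4], [3, 0]])
B = torch.tensor([[-4., 3], [2, 6]])
X = torch.tensor([[-5., 0], [-2, -6]])

G = B @ (B.t() @ X @ A) @ A.t()              # gradient of the loss w.r.t. X
hess_col = torch.kron(A @ A.t(), B @ B.t())  # acts on vec(X); loss is quadratic, so H vec(X) = vec(G)
hess_row = torch.kron(B @ B.t(), A @ A.t())  # acts on vecr(X)
assert torch.allclose(hess_col @ vec(X), vec(G))
assert torch.allclose(hess_row @ vecr(X), vecr(G))
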
Example #27
    B[i] = t(W[i+1]) @ B[i+1]
    
  sess = tf.InteractiveSession()
  sess.run(tf.global_variables_initializer())

  # manually created gradients
  dW = [None]*(n+1)
  for i in range(n+1):
    dW[i] = -(B[i] @ t(A[i]))/dsize


  # automatic gradients
  grads = tf.gradients(loss, W)

  for i in range(n+1):
    u.check_equal(dW[i].eval(), grads[i].eval())

  # the first backprop is special since it's the right input of the matmul
  op = grads[0].op
  assert op.op_def.name == 'MatMul'
  assert op.get_attr("transpose_a") == True
  assert op.get_attr("transpose_b") == False
  u.check_equal(op.inputs[0], W[1])
  u.check_equal(op.inputs[1], -B[1]/dsize)
    
  for i in range(1, n+1):
    op = grads[i].op
    assert op.op_def.name == 'MatMul'
    assert op.get_attr("transpose_a") == False
    assert op.get_attr("transpose_b") == True
    u.check_equal(op.inputs[0], -B[i]/dsize)
def rotations2_newton_kfac():
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    elapsed_times = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
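
# A minimal numpy sketch (standalone, no u.* helpers) of the identity behind the
# factored KFAC preconditioner above: the (pseudo-)inverse of a Kronecker product is
# the Kronecker product of the (pseudo-)inverses, and kron(P, Q) @ vec(M) equals
# vec(Q @ M @ P.T) under column-major vec, so kron(pinv(acov), pinv(bcov)) @ vec(dW)
# can be applied as pinv(bcov) @ dW @ pinv(acov) without forming the full matrix.
import numpy as np

rng = np.random.default_rng(0)
f_in, f_out = 3, 2
acov = rng.standard_normal((f_in, f_in)); acov = acov @ acov.T    # stands in for A[i] A[i]'
bcov = rng.standard_normal((f_out, f_out)); bcov = bcov @ bcov.T  # stands in for Bn[i] Bn[i]' / dsize
dW = rng.standard_normal((f_out, f_in))

def vec(M):  # column-major vectorization
    return M.reshape(-1, order='F')

big = np.kron(np.linalg.pinv(acov), np.linalg.pinv(bcov)) @ vec(dW)
small = np.linalg.pinv(bcov) @ dW @ np.linalg.pinv(acov)
assert np.allclose(big, vec(small))
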
          print("Growing learning rate to %.2f"%(lr0*growth_rate))
          sess.run(vard[lr].setter, feed_dict={vard[lr].p:
                                               lr0*growth_rate})

    u.record_time()

  # check against expected loss
  if 'Apple' in sys.version:
    pass
    #    u.dump(losses, "kfac_small_final_mac.csv")
    targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
  else:
    pass
    #    u.dump(losses, "kfac_small_final_linux.csv")
    targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")

  if len(sys.argv)>1 and sys.argv[1]=="test":
    # GPU losses are quite noisy, set rtol high
    u.check_equal(targets, losses[:len(targets)], rtol=1e-3)
    
  u.dump(losses, "%s_losses_%d.csv" % (prefix, whitening_mode))
  u.dump(step_lengths, "%s_step_lengths_%d.csv"%(prefix, whitening_mode,))
  u.dump(ratios, "%s_ratios_%d.csv"%(prefix, whitening_mode,))
  u.dump(grad_norms, "%s_grad_norms_%d.csv"%(prefix, whitening_mode,))
  u.dump(pre_grad_norms, "%s_pre_grad_norms_%d.csv"%(prefix, whitening_mode,))
  u.dump(pre_grad_stable_norms, "%s_pre_grad_stable_norms_%d.csv"%(prefix, whitening_mode,))
  u.dump(target_delta_list, "%s_target_delta_%d.csv"%(prefix, whitening_mode,))
  u.dump(target_delta2_list, "%s_target_delta2_%d.csv"%(prefix, whitening_mode,))
  u.dump(actual_delta_list, "%s_actual_delta_%d.csv"%(prefix, whitening_mode,))
  u.summarize_time()
def rotations1_gradient_test():
    #  https://www.wolframcloud.com/objects/ff6ecaf0-fccd-44e3-b26f-970d8fc2a57c
    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations1_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations1_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations1_W0f.csv', delimiter=","))

    fs = np.genfromtxt('data/large_rotations1_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr0 = np.genfromtxt('data/large_rotations1_gradient_lr.csv')
    lr = tf.Variable(lr0, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/large_rotations1_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []
    # from accompanying notebook
    # {0.102522, 0.028124, 0.00907214, 0.00418929, 0.00293379,
    for i in range(10):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
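
# A minimal numpy sketch (standalone; indexing simplified, without the
# data-matrix-as-W[0] / identity trick used in the rotations tests) of the manual
# backprop recursion above: with A[i+1] = W[i] A[i], B[n] = -(Y - A[n+1]) / dsize and
# B[i] = W[i+1]' B[i+1], the layer gradients of loss = |Y - A[n+1]|^2 / (2 dsize)
# are dW[i] = B[i] A[i]'.
import numpy as np

rng = np.random.default_rng(1)
dsize, dims = 4, [3, 3, 3]
X = rng.standard_normal((dims[0], dsize))
Y = rng.standard_normal((dims[-1], dsize))
W = [rng.standard_normal((dims[i + 1], dims[i])) for i in range(len(dims) - 1)]
n = len(W) - 1

A = [X]
for i in range(n + 1):
    A.append(W[i] @ A[i])
err = Y - A[n + 1]

B = [None] * (n + 1)
B[n] = -err / dsize
for i in range(n - 1, -1, -1):
    B[i] = W[i + 1].T @ B[i + 1]
dW = [B[i] @ A[i].T for i in range(n + 1)]

# spot-check one entry of dW[0] against a central finite difference
eps, (r, c) = 1e-6, (0, 0)

def loss_at(w_rc):
    Wp = [w.copy() for w in W]
    Wp[0][r, c] = w_rc
    out = X
    for w in Wp:
        out = w @ out
    return np.sum((Y - out) ** 2) / (2 * dsize)

numeric = (loss_at(W[0][r, c] + eps) - loss_at(W[0][r, c] - eps)) / (2 * eps)
assert np.allclose(dW[0][r, c], numeric, atol=1e-5)
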
def test_main():

    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')

    parser.add_argument('--wandb',
                        type=int,
                        default=1,
                        help='log to weights and biases')
    parser.add_argument('--autograd_check',
                        type=int,
                        default=0,
                        help='autograd correctness checks')
    parser.add_argument('--logdir',
                        type=str,
                        default='/temp/runs/curv_train_tiny/run')

    parser.add_argument('--train_batch_size', type=int, default=100)
    parser.add_argument('--stats_batch_size', type=int, default=60000)
    parser.add_argument('--dataset_size', type=int, default=60000)
    parser.add_argument('--train_steps',
                        type=int,
                        default=100,
                        help="this many train steps between stat collection")
    parser.add_argument('--stats_steps',
                        type=int,
                        default=1000000,
                        help="total number of curvature stats collections")
    parser.add_argument('--nonlin',
                        type=int,
                        default=1,
                        help="whether to add ReLU nonlinearity between layers")
    parser.add_argument('--method',
                        type=str,
                        choices=['gradient', 'newton'],
                        default='gradient',
                        help="descent method, newton or gradient")
    parser.add_argument('--layer',
                        type=int,
                        default=-1,
                        help="restrict updates to this layer")
    parser.add_argument('--data_width', type=int, default=28)
    parser.add_argument('--targets_width', type=int, default=28)
    parser.add_argument('--lmb', type=float, default=1e-3)
    parser.add_argument(
        '--hess_samples',
        type=int,
        default=1,
        help='number of samples when sub-sampling outputs, 0 for exact hessian'
    )
    parser.add_argument('--hess_kfac',
                        type=int,
                        default=0,
                        help='whether to use KFAC approximation for hessian')
    parser.add_argument('--compute_rho',
                        type=int,
                        default=1,
                        help='use expensive method to compute rho')
    parser.add_argument('--skip_stats',
                        type=int,
                        default=0,
                        help='skip all stats collection')
    parser.add_argument('--full_batch',
                        type=int,
                        default=0,
                        help='do stats on the whole dataset')
    parser.add_argument('--weight_decay', type=float, default=1e-4)

    #args = parser.parse_args()
    args = AttrDict()
    args.lmb = 1e-3
    args.compute_rho = 1
    args.weight_decay = 1e-4
    args.method = 'gradient'
    args.logdir = '/tmp'
    args.data_width = 2
    args.targets_width = 2
    args.train_batch_size = 10
    args.full_batch = False
    args.skip_stats = False
    args.autograd_check = False

    u.seed_random(1)
    logdir = u.create_local_logdir(args.logdir)
    run_name = os.path.basename(logdir)
    #gl.event_writer = SummaryWriter(logdir)
    gl.event_writer = u.NoOp()
    # print(f"Logging to {run_name}")

    # small values for debugging
    # loss_type = 'LeastSquares'
    loss_type = 'CrossEntropy'

    args.wandb = 0
    args.stats_steps = 10
    args.train_steps = 10
    args.stats_batch_size = 10
    args.data_width = 2
    args.targets_width = 2
    args.nonlin = False
    d1 = args.data_width**2
    d2 = 2
    d3 = args.targets_width**2

    d1 = args.data_width**2
    assert args.data_width == args.targets_width
    o = d1
    n = args.stats_batch_size
    d = [d1, 30, 30, 30, 20, 30, 30, 30, d1]

    if loss_type == 'CrossEntropy':
        d3 = 10
    o = d3
    n = args.stats_batch_size
    d = [d1, d2, d3]
    dsize = max(args.train_batch_size, args.stats_batch_size) + 1

    model = u.SimpleFullyConnected2(d, bias=True, nonlin=args.nonlin)
    model = model.to(gl.device)

    try:
        # os.environ['WANDB_SILENT'] = 'true'
        if args.wandb:
            wandb.init(project='curv_train_tiny', name=run_name)
            wandb.tensorboard.patch(tensorboardX=False)
            wandb.config['train_batch'] = args.train_batch_size
            wandb.config['stats_batch'] = args.stats_batch_size
            wandb.config['method'] = args.method
            wandb.config['n'] = n
    except Exception as e:
        print(f"wandb crash with {e}")

    # optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.03)  # make 10x smaller for least-squares loss
    dataset = u.TinyMNIST(data_width=args.data_width,
                          targets_width=args.targets_width,
                          dataset_size=dsize,
                          original_targets=True)

    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.train_batch_size,
        shuffle=False,
        drop_last=True)
    train_iter = u.infinite_iter(train_loader)

    stats_iter = None
    if not args.full_batch:
        stats_loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=args.stats_batch_size,
            shuffle=False,
            drop_last=True)
        stats_iter = u.infinite_iter(stats_loader)

    test_dataset = u.TinyMNIST(data_width=args.data_width,
                               targets_width=args.targets_width,
                               train=False,
                               dataset_size=dsize,
                               original_targets=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.train_batch_size,
                                              shuffle=False,
                                              drop_last=True)
    test_iter = u.infinite_iter(test_loader)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    elif loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.token_count = 0
    last_outer = 0
    val_losses = []
    for step in range(args.stats_steps):
        if last_outer:
            u.log_scalars(
                {"time/outer": 1000 * (time.perf_counter() - last_outer)})
        last_outer = time.perf_counter()

        with u.timeit("val_loss"):
            test_data, test_targets = next(test_iter)
            test_output = model(test_data)
            val_loss = loss_fn(test_output, test_targets)
            # print("val_loss", val_loss.item())
            val_losses.append(val_loss.item())
            u.log_scalar(val_loss=val_loss.item())

        # compute stats
        if args.full_batch:
            data, targets = dataset.data, dataset.targets
        else:
            data, targets = next(stats_iter)

        # Capture Hessian and gradient stats
        autograd_lib.enable_hooks()
        autograd_lib.clear_backprops(model)
        autograd_lib.clear_hess_backprops(model)
        with u.timeit("backprop_g"):
            output = model(data)
            loss = loss_fn(output, targets)
            loss.backward(retain_graph=True)
        with u.timeit("backprop_H"):
            autograd_lib.backprop_hess(output, hess_type=loss_type)
        autograd_lib.disable_hooks()  # TODO(y): use remove_hooks

        with u.timeit("compute_grad1"):
            autograd_lib.compute_grad1(model)
        with u.timeit("compute_hess"):
            autograd_lib.compute_hess(model)

        for (i, layer) in enumerate(model.layers):

            # input/output layers are unreasonably expensive if not using Kronecker factoring
            if d[i] > 50 or d[i + 1] > 50:
                print(
                    f'layer {i} is too big ({d[i], d[i + 1]}), skipping stats')
                continue

            if args.skip_stats:
                continue

            s = AttrDefault(str, {})  # dictionary-like object for layer stats

            #############################
            # Gradient stats
            #############################
            A_t = layer.activations
            assert A_t.shape == (n, d[i])

            # multiply by n because backprop used the batch-averaged loss, while we need per-example quantities
            B_t = layer.backprops_list[0] * n
            assert B_t.shape == (n, d[i + 1])

            with u.timeit(f"khatri_g-{i}"):
                G = u.khatri_rao_t(B_t, A_t)  # batch loss Jacobian
            assert G.shape == (n, d[i] * d[i + 1])
            g = G.sum(dim=0, keepdim=True) / n  # average gradient
            assert g.shape == (1, d[i] * d[i + 1])

            u.check_equal(G.reshape(layer.weight.grad1.shape),
                          layer.weight.grad1)

            if args.autograd_check:
                u.check_close(B_t.t() @ A_t / n, layer.weight.saved_grad)
                u.check_close(g.reshape(d[i + 1], d[i]),
                              layer.weight.saved_grad)

            s.sparsity = torch.sum(layer.output <= 0) / layer.output.numel()  # proportion of activations that are zero
            s.mean_activation = torch.mean(A_t)
            s.mean_backprop = torch.mean(B_t)

            # empirical Fisher
            with u.timeit(f'sigma-{i}'):
                efisher = G.t() @ G / n
                sigma = efisher - g.t() @ g
                s.sigma_l2 = u.sym_l2_norm(sigma)
                s.sigma_erank = torch.trace(sigma) / s.sigma_l2

            lambda_regularizer = args.lmb * torch.eye(d[i + 1] * d[i]).to(
                gl.device)
            H = layer.weight.hess

            with u.timeit(f"invH-{i}"):
                invH = torch.cholesky_inverse(H + lambda_regularizer)

            with u.timeit(f"H_l2-{i}"):
                s.H_l2 = u.sym_l2_norm(H)
                s.iH_l2 = u.sym_l2_norm(invH)

            with u.timeit(f"norms-{i}"):
                s.H_fro = H.flatten().norm()
                s.iH_fro = invH.flatten().norm()
                s.grad_fro = g.flatten().norm()
                s.param_fro = layer.weight.data.flatten().norm()

            u.nan_check(H)
            if args.autograd_check:
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                H_autograd = u.hessian(loss, layer.weight)
                H_autograd = H_autograd.reshape(d[i] * d[i + 1],
                                                d[i] * d[i + 1])
                u.check_close(H, H_autograd)

            #  u.dump(sigma, f'/tmp/sigmas/H-{step}-{i}')
            def loss_direction(dd: torch.Tensor, eps):
                """loss improvement if we take step eps in direction dd"""
                return u.to_python_scalar(eps * (dd @ g.t()) -
                                          0.5 * eps**2 * dd @ H @ dd.t())

            def curv_direction(dd: torch.Tensor):
                """Curvature in direction dd"""
                return u.to_python_scalar(dd @ H @ dd.t() /
                                          (dd.flatten().norm()**2))

            with u.timeit(f"pinvH-{i}"):
                pinvH = H.pinverse()

            with u.timeit(f'curv-{i}'):
                s.grad_curv = curv_direction(g)
                ndir = g @ pinvH  # newton direction
                s.newton_curv = curv_direction(ndir)
                setattr(layer.weight, 'pre',
                        pinvH)  # save Newton preconditioner
                s.step_openai = s.grad_fro**2 / s.grad_curv if s.grad_curv else 999
                s.step_max = 2 / s.H_l2
                s.step_min = torch.tensor(2) / torch.trace(H)

                s.newton_fro = ndir.flatten().norm(
                )  # frobenius norm of Newton update
                s.regret_newton = u.to_python_scalar(
                    g @ pinvH @ g.t() / 2)  # replace with "quadratic_form"
                s.regret_gradient = loss_direction(g, s.step_openai)

            with u.timeit(f'rho-{i}'):
                p_sigma = u.lyapunov_spectral(H, sigma)

                discrepancy = torch.max(abs(p_sigma - p_sigma.t()) / p_sigma)

                s.psigma_erank = u.sym_erank(p_sigma)
                s.rho = H.shape[0] / s.psigma_erank

            with u.timeit(f"batch-{i}"):
                s.batch_openai = torch.trace(H @ sigma) / (g @ H @ g.t())
                s.diversity = torch.norm(G, "fro")**2 / torch.norm(g)**2 / n

                # Faster approaches for noise variance computation
                # s.noise_variance = torch.trace(H.inverse() @ sigma)
                # try:
                #     # this fails with singular sigma
                #     s.noise_variance = torch.trace(torch.solve(sigma, H)[0])
                #     # s.noise_variance = torch.trace(torch.lstsq(sigma, H)[0])
                #     pass
                # except RuntimeError as _:
                s.noise_variance_pinv = torch.trace(pinvH @ sigma)

                s.H_erank = torch.trace(H) / s.H_l2
                s.batch_jain_simple = 1 + s.H_erank
                s.batch_jain_full = 1 + s.rho * s.H_erank

            u.log_scalars(u.nest_stats(layer.name, s))

        # gradient steps
        with u.timeit('inner'):
            for i in range(args.train_steps):
                optimizer.zero_grad()
                data, targets = next(train_iter)
                model.zero_grad()
                output = model(data)
                loss = loss_fn(output, targets)
                loss.backward()

                #            u.log_scalar(train_loss=loss.item())

                if args.method != 'newton':
                    optimizer.step()
                    if args.weight_decay:
                        for group in optimizer.param_groups:
                            for param in group['params']:
                                param.data.mul_(1 - args.weight_decay)
                else:
                    for (layer_idx, layer) in enumerate(model.layers):
                        param: torch.nn.Parameter = layer.weight
                        param_data: torch.Tensor = param.data
                        param_data.copy_(param_data - 0.1 * param.grad)
                        if layer_idx != 1:  # only update 1 layer with Newton, unstable otherwise
                            continue
                        u.nan_check(layer.weight.pre)
                        u.nan_check(param.grad.flatten())
                        u.nan_check(
                            u.v2r(param.grad.flatten()) @ layer.weight.pre)
                        param_new_flat = u.v2r(param_data.flatten()) - u.v2r(
                            param.grad.flatten()) @ layer.weight.pre
                        u.nan_check(param_new_flat)
                        param_data.copy_(
                            param_new_flat.reshape(param_data.shape))

                gl.token_count += data.shape[0]

    gl.event_writer.close()

    assert val_losses[0] > 2.4  # 2.4828238487243652
    assert val_losses[-1] < 2.25  # 2.20609712600708
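
# A minimal torch-only sketch (khatri_rao_t below is a local stand-in, not
# necessarily identical to u.khatri_rao_t) of the per-example gradient identity used
# in the stats loop above: for a linear layer, each example's weight gradient is the
# outer product of its backprop with its activation, so the batch loss Jacobian G
# can be formed as a row-wise (transposed Khatri-Rao) product of B_t and A_t.
# (In the training code above the backprops are additionally multiplied by n to undo
# batch averaging; here the loss is summed, so no correction is needed.)
import torch

def khatri_rao_t(B, A):
    # row i of the result is kron(B[i], A[i])
    return torch.einsum('nl,ni->nli', B, A).reshape(B.shape[0], -1)

n, d_in, d_out = 5, 3, 2
A_t = torch.randn(n, d_in)
W = torch.randn(d_out, d_in, requires_grad=True)
Y = A_t @ W.t()
loss = torch.sum(Y * Y) / 2
loss.backward()

B_t = Y.detach()             # dloss/dY for this quadratic loss
G = khatri_rao_t(B_t, A_t)   # (n, d_out * d_in) batch loss Jacobian
assert torch.allclose(G.sum(dim=0).reshape(W.shape), W.grad, atol=1e-5)
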
def rotations2_newton_bd():
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)),
                                 dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))

            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
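
# A minimal numpy sketch (standalone; block_diag below is a local helper, not a u.*
# function) of the block-diagonal approximation used in rotations2_newton_bd:
# keeping only the diagonal Hessian blocks lets each block be inverted on its own,
# and assembling those inverses equals the inverse of the block-diagonal matrix.
import numpy as np

def block_diag(*Ms):
    size = sum(M.shape[0] for M in Ms)
    out = np.zeros((size, size))
    i = 0
    for M in Ms:
        k = M.shape[0]
        out[i:i + k, i:i + k] = M
        i += k
    return out

rng = np.random.default_rng(2)

def spd(k):  # random symmetric positive-definite block
    M = rng.standard_normal((k, k))
    return M @ M.T + k * np.eye(k)

blocks = [spd(2), spd(3)]
inv_blockwise = block_diag(*[np.linalg.inv(b) for b in blocks])
assert np.allclose(inv_blockwise, np.linalg.inv(block_diag(*blocks)))
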