def line_search(directionv, start, end, steps=10):
            """Takes steps between start and end, returns steps+1 loss entries"""
            param0 = param.data.clone()
            param0v = u.vec(param0).t()
            losses = []
            for i in range(steps + 1):
                output = model(
                    stats_data)  # use last saved data batch for backprop
                loss = compute_loss(output, stats_targets)
                losses.append(loss)
                offset = start + i * ((end - start) / steps)
                param1v = param0v + offset * directionv

                param1 = u.unvec(param1v.t(), param.data.shape[0])
                param.data.copy_(param1)

            output = model(
                stats_data)  # use last saved data batch for backprop
            loss = compute_loss(output, stats_targets)
            losses.append(loss)

            param.data.copy_(param0)
            return losses
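
A minimal standalone sketch (toy scalar objective, not from the source) of the indexing above: the loss is recorded before each offset is applied, so the returned list has steps+2 entries, with entry 0 at the unmodified parameters and entry i (i >= 1) at offset start + (i-1)*(end-start)/steps. This is why the caller further below finds the full Newton step (offset 1.0) at index 6 rather than index 5.

def toy_line_search(f, x0, direction, start, end, steps=10):
    # same bookkeeping as line_search above, on a plain scalar function
    losses = [f(x0)]                    # entry 0: unmodified point
    for i in range(steps + 1):
        offset = start + i * ((end - start) / steps)
        losses.append(f(x0 + offset * direction))
    return losses

losses = toy_line_search(lambda x: x * x, 1.0, -1.0, 0, 2, steps=10)
assert len(losses) == 12     # steps + 2 entries
assert losses[6] == 0.0      # offset 1.0 (the full step) lands at index 6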
Example 2
def relu_gradient_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_relu_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [4,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0, name="X0")
  Y = tf.constant(Y0, name="Y0")
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    if i == 0:
      A[i+1] = X
    else:
      A[i+1] = tf.nn.relu(tf.matmul(W[i], A[i], name="A"+str(i+1)))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.1, dtype=dtype)
  
  # Create B's
  B = [0]*(n+1)
  B[n] = (-err/dsize)*u.relu_mask(A[n+1])
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    if i > 0:  # there's no relu on first matrix
      B[i] = B[i]*u.relu_mask(A[i+1])

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
  
  expected_losses = np.loadtxt("data/rotations_relu_gradient_losses.csv",
                               delimiter= ",")
  observed_losses = []
  
  # From accompanying notebook
  #  {0.407751, 0.0683822, 0.0138657, 0.0039221, 0.00203637, 0.00164892,
  #    0.00156137, 0.00153857, 0.00153051, 0.00152593}
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
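
A minimal NumPy sketch (random toy data, independent of the CSV files above) checking the manual ReLU backprop used here, B[i] = t(W[i+1]) @ B[i+1] * relu_mask(A[i+1]) with dW[i] = B[i] @ t(A[i]), against a finite-difference derivative of the same loss:

import numpy as np

rng = np.random.RandomState(0)
dsize = 5
X = rng.randn(3, dsize); Y = rng.randn(2, dsize)
W1 = rng.randn(4, 3); W2 = rng.randn(2, 4)

def forward(W1, W2):
    A2 = np.maximum(W1 @ X, 0)           # relu activations
    A3 = np.maximum(W2 @ A2, 0)          # relu on output, as in the test above
    return A2, A3

def loss(W1, W2):
    A3 = forward(W1, W2)[1]
    return np.sum((Y - A3)**2) / (2 * dsize)

A2, A3 = forward(W1, W2)
err = Y - A3
B2 = (-err / dsize) * (A3 > 0)           # B[n] with relu mask
B1 = (W2.T @ B2) * (A2 > 0)              # backprop recursion
dW1, dW2 = B1 @ X.T, B2 @ A2.T

eps = 1e-6
W1p = W1.copy(); W1p[0, 0] += eps
assert abs((loss(W1p, W2) - loss(W1, W2)) / eps - dW1[0, 0]) < 1e-4
W2p = W2.copy(); W2p[0, 0] += eps
assert abs((loss(W1, W2p) - loss(W1, W2)) / eps - dW2[0, 0]) < 1e-4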
Example 3
def simple_gradient_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))


  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(1.0, dtype=dtype)
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
  
  expected_losses = np.loadtxt("data/rotations_simple_gradient_losses.csv",
                               delimiter= ",")
  observed_losses = []
  # from accompanying notebook
  # {0.0111498, 0.00694816, 0.00429464, 0.00248228, 0.00159361,
  #  0.000957424, 0.000651653, 0.000423802, 0.000306749, 0.00021772,
  for i in range(20):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
Example 4
  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)

  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update

  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  lr_holder = tf.placeholder(dtype=dtype, shape=())
  lr = tf.Variable(lr_holder, dtype=dtype)

  # run tests
  do_run_iters = 5
  result = newton(1.0)
  expected_result = [8.9023744225439743e-05, 0.060120791316053412, 0.0059295249954177918, 1.9856240803246437e-05, 2.7125563957575423e-10]
  u.check_equal(result, expected_result)
  
  # 720 ms per step
  # result = newton(1.0)
  # np.savetxt("data/newton.csv", result, delimiter=',')
  # sys.exit()
  # natural_empirical(0.000000002)
Example 5
def rotations2_natural_sampled_kfac(num_samples=1):
    tf.reset_default_graph()
    np.random.seed(0)
    tf.set_random_seed(0)

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A2 = [0] * (n + 2)  # augmented forward props for natural gradient
    A[0] = u.Identity(dsize)
    A2[0] = u.Identity(dsize * num_samples)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
        if i == 0:
            # replicate dataset multiple times corresponding to number of samples
            A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
        else:
            A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)

    # lower learning rate by 10x
    lr = tf.Variable(0.01, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B2 = [0] * (n + 1)
    B[n] = -err / dsize
    B2[n] = tf.random_normal((f(n), dsize * num_samples),
                             0,
                             1,
                             seed=0,
                             dtype=dtype)
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))
        B2[i] = tf.matmul(tf.transpose(W[i + 1]),
                          B2[i + 1],
                          name="B2" + str(i))

    # Create gradient update. Make a copy of the variables and split the
    # update into two run calls; using a single set of variables occasionally
    # produces wrong results/NaNs because of a data race.

    dW = [0] * (n + 1)
    dW2 = [0] * (n + 1)
    updates1 = [0] * (n + 1)  # compute updated value into Wcopy
    updates2 = [0] * (n + 1)  # copy value back into W
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        dW2[i] = tf.matmul(B2[i], tf.transpose(A2[i]), name="dW2" + str(i))

    del dW[0]  # get rid of W[0] update
    del dW2[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # todo: divide both activations and backprops by size for cov calc

    # Kronecker factored covariance blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
                bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)),
                                dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ifisher = u.concat_blocks(iblocks)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
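
The diagonal-block inversion above relies on the Kronecker identity pinv(A ⊗ B) = pinv(A) ⊗ pinv(B), which is what lets KFAC invert the two small covariance factors instead of the full f(i)*f(i-1)-dimensional block. A minimal NumPy sketch of that identity (toy matrices, not from the source):

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(3, 2); A = A @ A.T         # rank-deficient "activation" covariance
B = rng.randn(4, 4); B = B @ B.T         # full-rank "backprop" covariance

lhs = np.linalg.pinv(np.kron(A, B))
rhs = np.kron(np.linalg.pinv(A), np.linalg.pinv(B))
assert np.allclose(lhs, rhs, atol=1e-8)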
Example 6
def rotations2_natural_empirical():
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.000001, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))

    # Create gradient update. Make a copy of the variables and split the
    # update into two run calls; using a single set of variables occasionally
    # produces wrong results/NaNs because of a data race.

    dW = [0] * (n + 1)
    updates1 = [0] * (n + 1)  # compute updated value into Wcopy
    updates2 = [0] * (n + 1)  # copy value back into W
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))

    del dW[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # inverse fisher preconditioner
    grads = tf.concat([u.khatri_rao(A[i], B[i]) for i in range(1, n + 1)],
                      axis=0)
    fisher = grads @ tf.transpose(grads) / dsize
    ifisher = u.pseudo_inverse(fisher)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
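
The empirical Fisher construction above works because each column of u.khatri_rao(A[i], B[i]) is the column-major vectorization of one per-example gradient: for a single example with activation column a and backprop column b, the parameter gradient is b a' and vec(b a') = a ⊗ b. A minimal NumPy sketch of that correspondence (toy vectors, assuming this column convention for khatri_rao):

import numpy as np

rng = np.random.RandomState(0)
a = rng.randn(3)                  # activation column for one example
b = rng.randn(2)                  # backprop column for one example
gW = np.outer(b, a)               # per-example gradient of W, shape (2, 3)
vec_gW = gW.flatten(order='F')    # column-major vectorization
assert np.allclose(vec_gW, np.kron(a, b))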
Example 7
def rotations2_newton_kfac():
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    elapsed_times = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Example 8
def rotations1_gradient_test():
    #  https://www.wolframcloud.com/objects/ff6ecaf0-fccd-44e3-b26f-970d8fc2a57c
    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations1_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations1_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations1_W0f.csv', delimiter=","))

    fs = np.genfromtxt('data/large_rotations1_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr0 = np.genfromtxt('data/large_rotations1_gradient_lr.csv')
    lr = tf.Variable(lr0, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/large_rotations1_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []
    # from accompanying notebook
    # {0.102522, 0.028124, 0.00907214, 0.00418929, 0.00293379,
    for i in range(10):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
Example 9
def test_explicit_hessian():
    """Check computation of hessian of loss(B'WA) from https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf


    """

    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())

    u.check_equal(loss, 12512.5)

    # PyTorch autograd computes the Hessian with respect to row-vectorized
    # parameters, whereas autograd_lib uses the math convention and
    # column-vectorizes. Commuting the order of the Kronecker product
    # switches between the two representations.
    u.check_equal(hess1.commute(), hess0)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()    # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    u.check_equal(hess0, u.Kron(B @ B.t(), A @ A.t()))

    # compute newton step
    u.check_equal(u.Kron(A @ A.t(), B @ B.t()).pinv() @ u.vec(G), u.v2c([-5, -2, 0, -6]))

    # compute Newton step using factored representation
    autograd_lib.add_hooks(model)

    Y = model(A.t())
    n = 2
    loss = torch.sum(Y * Y) / 2
    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron', vecr_order=False, loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120], [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column-major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row-major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
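
The core identity this test exercises, in plain NumPy: for loss(X) = ||B' X A||^2 / 2 the gradient is B B' X A A', which is linear in X, so the column-major Hessian is (A A') ⊗ (B B') and the row-major one has the factors commuted. A minimal sketch using the same matrices as above:

import numpy as np

A = np.array([[-1., 4], [3, 0]])
B = np.array([[-4., 3], [2, 6]])
X = np.array([[-5., 0], [-2, -6]])

grad = B @ B.T @ X @ A @ A.T       # gradient of ||B' X A||^2 / 2
# gradient is linear in X, so vec(grad) = H @ vec(X) exactly
assert np.allclose(grad.flatten(order='F'),
                   np.kron(A @ A.T, B @ B.T) @ X.flatten(order='F'))  # column-major
assert np.allclose(grad.flatten(),
                   np.kron(B @ B.T, A @ A.T) @ X.flatten())           # row-major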
Example 10
    def compute_layer_stats(layer):
        refreeze = False
        if hasattr(layer, 'frozen') and layer.frozen:
            u.unfreeze(layer)
            refreeze = True

        s = AttrDefault(str, {})
        n = args.stats_batch_size
        param = u.get_param(layer)
        _d = len(param.flatten())  # dimensionality of parameters
        layer_idx = model.layers.index(layer)
        # TODO: get layer type, include it in name
        assert layer_idx >= 0
        assert stats_data.shape[0] == n

        def backprop_loss():
            model.zero_grad()
            output = model(
                stats_data)  # use last saved data batch for backprop
            loss = compute_loss(output, stats_targets)
            loss.backward()
            return loss, output

        def backprop_output():
            model.zero_grad()
            output = model(stats_data)
            output.backward(gradient=torch.ones_like(output))
            return output

        # per-example gradients, n, d
        _loss, _output = backprop_loss()
        At = layer.data_input
        Bt = layer.grad_output * n
        G = u.khatri_rao_t(At, Bt)
        g = G.sum(dim=0, keepdim=True) / n
        u.check_close(g, u.vec(param.grad).t())

        s.diversity = torch.norm(G, "fro")**2 / g.flatten().norm()**2
        s.grad_fro = g.flatten().norm()
        s.param_fro = param.data.flatten().norm()
        pos_activations = torch.sum(layer.data_output > 0)
        neg_activations = torch.sum(layer.data_output <= 0)
        s.a_sparsity = neg_activations.float() / (
            pos_activations + neg_activations)  # 1 sparsity means all 0's
        activation_size = len(layer.data_output.flatten())
        s.a_magnitude = torch.sum(layer.data_output) / activation_size

        _output = backprop_output()
        B2t = layer.grad_output
        J = u.khatri_rao_t(At, B2t)  # batch output Jacobian
        H = J.t() @ J / n

        s.hessian_l2 = u.l2_norm(H)
        s.jacobian_l2 = u.l2_norm(J)
        J1 = J.sum(dim=0) / n  # single output Jacobian
        s.J1_l2 = J1.norm()

        # newton decrement
        def loss_direction(direction, eps):
            """loss improvement if we take step eps in direction dir"""
            return u.to_python_scalar(eps * (direction @ g.t()) - 0.5 *
                                      eps**2 * direction @ H @ direction.t())

        s.regret_newton = u.to_python_scalar(g @ u.pinv(H) @ g.t() / 2)

        # TODO: gradient diversity is stuck at 1
        # TODO: newton/gradient angle
        # TODO: newton step magnitude
        s.grad_curvature = u.to_python_scalar(
            g @ H @ g.t())  # curvature in direction of g
        s.step_openai = u.to_python_scalar(
            s.grad_fro**2 / s.grad_curvature) if s.grad_curvature else 999

        s.regret_gradient = loss_direction(g, s.step_openai)

        if refreeze:
            u.freeze(layer)
        return s
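
A minimal NumPy sketch of the s.regret_newton quantity computed above: on the local quadratic model loss(w + d) ≈ loss(w) + g d' + d H d' / 2, the full Newton step d* = -g pinv(H) improves the loss by exactly g pinv(H) g' / 2 (toy PSD curvature, not from the source):

import numpy as np

rng = np.random.RandomState(0)
M = rng.randn(4, 4); H = M @ M.T        # PSD curvature
g = rng.randn(4)                        # gradient at the current point

quad = lambda d: g @ d + 0.5 * d @ H @ d     # modeled change in loss
d_star = -np.linalg.pinv(H) @ g              # Newton step
assert np.allclose(-quad(d_star), g @ np.linalg.pinv(H) @ g / 2)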
Example 11
def _test_explicit_hessian_refactored():
    """Check computation of hessian of loss(B'WA) from https://github.com/yaroslavvb/kfac_pytorch/blob/master/derivation.pdf"""

    torch.set_default_dtype(torch.float64)
    A = torch.tensor([[-1., 4], [3, 0]])
    B = torch.tensor([[-4., 3], [2, 6]])
    X = torch.tensor([[-5., 0], [-2, -6]], requires_grad=True)

    Y = B.t() @ X @ A
    u.check_equal(Y, [[-52, 64], [-81, -108]])
    loss = torch.sum(Y * Y) / 2
    hess0 = u.hessian(loss, X).reshape([4, 4])
    hess1 = u.Kron(A @ A.t(), B @ B.t())

    u.check_equal(loss, 12512.5)

    # Do a test using Linear layers instead of matrix multiplies
    model: u.SimpleFullyConnected2 = u.SimpleFullyConnected2([2, 2, 2], bias=False)
    model.layers[0].weight.data.copy_(X)

    # Transpose to match previous results, layers treat dim0 as batch dimension
    u.check_equal(model.layers[0](A.t()).t(), [[5, -20], [-16, -8]])  # XA = (A'X0)'

    model.layers[1].weight.data.copy_(B.t())
    u.check_equal(model(A.t()).t(), Y)

    Y = model(A.t()).t()    # transpose to data-dimension=columns
    loss = torch.sum(Y * Y) / 2
    loss.backward()

    u.check_equal(model.layers[0].weight.grad, [[-2285, -105], [-1490, -1770]])
    G = B @ Y @ A.t()
    u.check_equal(model.layers[0].weight.grad, G)

    autograd_lib.register(model)
    activations_dict = autograd_lib.ModuleDict()  # todo(y): make save_activations ctx manager automatically create A
    with autograd_lib.save_activations(activations_dict):
        Y = model(A.t())

    Acov = autograd_lib.ModuleDict(autograd_lib.SecondOrderCov)
    for layer, activations in activations_dict.items():
        print(layer, activations)
        Acov[layer].accumulate(activations, activations)
    autograd_lib.set_default_activations(activations_dict)
    autograd_lib.set_default_Acov(Acov)

    B = autograd_lib.ModuleDict(autograd_lib.SymmetricFourthOrderCov)
    autograd_lib.backward_accum(Y, "identity", B, retain_graph=False)

    print(B[model.layers[0]])

    autograd_lib.backprop_hess(Y, hess_type='LeastSquares')
    autograd_lib.compute_hess(model, method='kron', attr_name='hess_kron', vecr_order=False, loss_aggregation='sum')
    param = model.layers[0].weight

    hess2 = param.hess_kron
    print(hess2)

    u.check_equal(hess2, [[425, 170, -75, -30], [170, 680, -30, -120], [-75, -30, 225, 90], [-30, -120, 90, 360]])

    # Gradient test
    model.zero_grad()
    loss.backward()
    u.check_close(u.vec(G).flatten(), u.Vec(param.grad))

    # Newton step test
    # Method 0: PyTorch native autograd
    newton_step0 = param.grad.flatten() @ torch.pinverse(hess0)
    newton_step0 = newton_step0.reshape(param.shape)
    u.check_equal(newton_step0, [[-5, 0], [-2, -6]])

    # Method 1: column-major order
    ihess2 = hess2.pinv()
    u.check_equal(ihess2.LL, [[1/16, 1/48], [1/48, 17/144]])
    u.check_equal(ihess2.RR, [[2/45, -(1/90)], [-(1/90), 1/36]])
    u.check_equal(torch.flatten(hess2.pinv() @ u.vec(G)), [-5, -2, 0, -6])
    newton_step1 = (ihess2 @ u.Vec(param.grad)).matrix_form()

    # Method 2: row-major order
    ihess2_rowmajor = ihess2.commute()
    newton_step2 = ihess2_rowmajor @ u.Vecr(param.grad)
    newton_step2 = newton_step2.matrix_form()

    u.check_equal(newton_step0, newton_step1)
    u.check_equal(newton_step0, newton_step2)
Example 12
def test_factored_hessian():
    """"Simple test to ensure Hessian computation is working.

    In a linear neural network with squared loss, Newton step will converge in one step.
    Compute stats after minimizing, pass sanity checks.
    """

    u.seed_random(1)
    loss_type = 'LeastSquares'

    data_width = 2
    n = 5
    d1 = data_width ** 2
    o = 10
    d = [d1, o]

    model = u.SimpleFullyConnected2(d, bias=False, nonlin=False)
    model = model.to(gl.device)
    print(model)

    dataset = u.TinyMNIST(data_width=data_width, dataset_size=n, loss_type=loss_type)
    stats_loader = torch.utils.data.DataLoader(dataset, batch_size=n, shuffle=False)
    stats_iter = u.infinite_iter(stats_loader)
    stats_data, stats_targets = next(stats_iter)

    if loss_type == 'LeastSquares':
        loss_fn = u.least_squares
    else:  # loss_type == 'CrossEntropy':
        loss_fn = nn.CrossEntropyLoss()

    autograd_lib.add_hooks(model)
    gl.reset_global_step()
    last_outer = 0

    data, targets = stats_data, stats_targets

    # Capture Hessian and gradient stats
    autograd_lib.enable_hooks()
    autograd_lib.clear_backprops(model)

    output = model(data)
    loss = loss_fn(output, targets)
    print(loss)
    loss.backward(retain_graph=True)
    layer = model.layers[0]

    autograd_lib.clear_hess_backprops(model)
    autograd_lib.backprop_hess(output, hess_type=loss_type)
    autograd_lib.disable_hooks()

    # compute Hessian using direct method, compare against PyTorch autograd
    hess0 = u.hessian(loss, layer.weight)
    autograd_lib.compute_hess(model)
    hess1 = layer.weight.hess
    print(hess1)
    u.check_close(hess0.reshape(hess1.shape), hess1, atol=1e-9, rtol=1e-6)

    # compute Hessian using factored method
    autograd_lib.compute_hess(model, method='kron', attr_name='hess2', vecr_order=True)
    # s.regret_newton = vecG.t() @ pinvH.commute() @ vecG.t() / 2  # TODO(y): figure out why needed transposes

    hess2 = layer.weight.hess2
    u.check_close(hess1, hess2, atol=1e-9, rtol=1e-6)

    # Newton step in regular notation
    g1 = layer.weight.grad.flatten()
    newton1 = hess1 @ g1

    g2 = u.Vecr(layer.weight.grad)
    newton2 = g2 @ hess2

    u.check_close(newton1, newton2, atol=1e-9, rtol=1e-6)

    # compute regret in factored notation, compare against actual drop in loss
    regret1 = g1 @ hess1.pinverse() @ g1 / 2
    regret2 = g2 @ hess2.pinv() @ g2 / 2
    u.check_close(regret1, regret2)

    current_weight = layer.weight.detach().clone()
    param: torch.nn.Parameter = layer.weight
    # param.data.sub_((hess1.pinverse() @ g1).reshape(param.shape))
    # output = model(data)
    # loss = loss_fn(output, targets)
    # print("result 1", loss)

    # param.data.sub_((hess1.pinverse() @ u.vec(layer.weight.grad)).reshape(param.shape))
    # output = model(data)
    # loss = loss_fn(output, targets)
    # print("result 2", loss)

    # param.data.sub_((u.vec(layer.weight.grad).t() @ hess1.pinverse()).reshape(param.shape))
    # output = model(data)
    # loss = loss_fn(output, targets)
    # print("result 3", loss)
    #

    del layer.weight.grad
    output = model(data)
    loss = loss_fn(output, targets)
    loss.backward()
    param.data.sub_(u.unvec(hess1.pinverse() @ u.vec(layer.weight.grad), layer.weight.shape[0]))
    output = model(data)
    loss = loss_fn(output, targets)
    print("result 4", loss)

    # param.data.sub_((g1 @ hess1.pinverse() @ g1).reshape(param.shape))

    print(loss)
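
The docstring's claim, that a linear network with squared loss is minimized by a single Newton step, in a standalone NumPy sketch (toy shapes, not the model above):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(5, 4)      # 5 examples, 4 input features
Y = rng.randn(5, 10)     # 10 outputs
w = rng.randn(4, 10)     # linear layer weights

# Newton step for loss = ||X w - Y||^2 / (2 * 5); the curvature factor X'X / 5
# and the 1/5 in the gradient cancel
w_new = w - np.linalg.pinv(X.T @ X) @ X.T @ (X @ w - Y)
# gradient vanishes after one step
assert np.allclose(X.T @ (X @ w_new - Y), 0, atol=1e-8)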
Example 13
def test_kron():
    """Test kron, vec and vecr identities"""
    torch.set_default_dtype(torch.float64)
    a = torch.tensor([1, 2, 3, 4]).reshape(2, 2)
    b = torch.tensor([5, 6, 7, 8]).reshape(2, 2)
    u.check_close(u.Kron(a, b).trace(), 65)

    a = torch.tensor([[2., 7, 9], [1, 9, 8], [2, 7, 5]])
    b = torch.tensor([[6., 6, 1], [10, 7, 7], [7, 10, 10]])
    Ck = u.Kron(a, b)
    u.check_close(a.flatten().norm() * b.flatten().norm(), Ck.frobenius_norm())

    u.check_close(Ck.frobenius_norm(), 4 * math.sqrt(11635.))

    Ci = [[
        0, 5 / 102, -(7 / 204), 0, -(70 / 561), 49 / 561, 0, 125 / 1122,
        -(175 / 2244)
    ],
          [
              1 / 20, -(53 / 1020), 8 / 255, -(7 / 55), 371 / 2805,
              -(224 / 2805), 5 / 44, -(265 / 2244), 40 / 561
          ],
          [
              -(1 / 20), 3 / 170, 3 / 170, 7 / 55, -(42 / 935), -(42 / 935),
              -(5 / 44), 15 / 374, 15 / 374
          ],
          [
              0, -(5 / 102), 7 / 204, 0, 20 / 561, -(14 / 561), 0, 35 / 1122,
              -(49 / 2244)
          ],
          [
              -(1 / 20), 53 / 1020, -(8 / 255), 2 / 55, -(106 / 2805),
              64 / 2805, 7 / 220, -(371 / 11220), 56 / 2805
          ],
          [
              1 / 20, -(3 / 170), -(3 / 170), -(2 / 55), 12 / 935, 12 / 935,
              -(7 / 220), 21 / 1870, 21 / 1870
          ], [0, 5 / 102, -(7 / 204), 0, 0, 0, 0, -(5 / 102), 7 / 204],
          [
              1 / 20, -(53 / 1020), 8 / 255, 0, 0, 0, -(1 / 20), 53 / 1020,
              -(8 / 255)
          ],
          [
              -(1 / 20), 3 / 170, 3 / 170, 0, 0, 0, 1 / 20, -(3 / 170),
              -(3 / 170)
          ]]
    C = Ck.expand()
    C0 = u.to_numpy(C)
    Ci = torch.tensor(Ci)
    u.check_close(C @ Ci @ C, C)

    u.check_close(Ck.inv().expand(), torch.inverse(Ck.expand()))
    u.check_close(Ck.inv().expand_vec(), torch.inverse(Ck.expand_vec()))
    u.check_close(Ck.pinv().expand(), torch.pinverse(Ck.expand()))

    u.check_close(linalg.pinv(C0), Ci, rtol=1e-5, atol=1e-6)
    u.check_close(torch.pinverse(C), Ci, rtol=1e-5, atol=1e-6)
    u.check_close(Ck.inv().expand(), Ci, rtol=1e-5, atol=1e-6)
    u.check_close(Ck.pinv().expand(), Ci, rtol=1e-5, atol=1e-6)

    Ck2 = u.Kron(b, 2 * a)
    u.check_close((Ck @ Ck2).expand(), Ck.expand() @ Ck2.expand())
    u.check_close((Ck @ Ck2).expand_vec(), Ck.expand_vec() @ Ck2.expand_vec())

    d2 = 3
    d1 = 2
    G = torch.randn(d2, d1)
    g = u.vec(G)
    H = u.Kron(u.random_cov(d1), u.random_cov(d2))

    Gt = G.t()
    gt = g.reshape(1, -1)

    vecX = u.Vec([1, 2, 3, 4], shape=(2, 2))
    K = u.Kron([[5, 6], [7, 8]], [[9, 10], [11, 12]])

    u.check_equal(vecX @ K, [644, 706, 748, 820])
    u.check_equal(K @ vecX, [543, 655, 737, 889])

    u.check_equal(u.matmul(vecX @ K, vecX), 7538)
    u.check_equal(vecX @ (vecX @ K), 7538)
    u.check_equal(vecX @ vecX, 30)

    vecX = u.Vec([1, 2], shape=(1, 2))
    K = u.Kron([[5]], [[9, 10], [11, 12]])

    u.check_equal(vecX.norm()**2, 5)

    # check kronecker rules
    X = torch.tensor([[1., 2], [3, 4]])
    A = torch.tensor([[5., 6], [7, 8]])
    B = torch.tensor([[9., 10], [11, 12]])
    x = u.Vec(X)

    # kron/vec/vecr identities
    u.check_equal(u.Vec(A @ X @ B), x @ u.Kron(B, A.t()))
    u.check_equal(u.Vec(A @ X @ B), u.Kron(B.t(), A) @ x)
    u.check_equal(u.Vecr(A @ X @ B), u.Kron(A, B.t()) @ u.Vecr(X))
    u.check_equal(u.Vecr(A @ X @ B), u.Vecr(X) @ u.Kron(A.t(), B))

    def extra_checks(A, X, B):
        x = u.Vec(X)
        u.check_equal(u.Vec(A @ X @ B), x @ u.Kron(B, A.t()))
        u.check_equal(u.Vec(A @ X @ B), u.Kron(B.t(), A) @ x)
        u.check_equal(u.Vecr(A @ X @ B), u.Kron(A, B.t()) @ u.Vecr(X))
        u.check_equal(u.Vecr(A @ X @ B), u.Vecr(X) @ u.Kron(A.t(), B))
        u.check_equal(u.Vecr(A @ X @ B),
                      u.Vecr(X) @ u.Kron(A.t(), B).normal_form())
        u.check_equal(u.Vecr(A @ X @ B),
                      u.matmul(u.Kron(A, B.t()).normal_form(), u.Vecr(X)))
        u.check_equal(u.Vec(A @ X @ B),
                      u.matmul(u.Kron(B.t(), A).normal_form(), x))
        u.check_equal(u.Vec(A @ X @ B), x @ u.Kron(B, A.t()).normal_form())
        u.check_equal(u.Vec(A @ X @ B),
                      x.normal_form() @ u.Kron(B, A.t()).normal_form())
        u.check_equal(u.Vec(A @ X @ B),
                      u.Kron(B.t(), A).normal_form() @ x.normal_form())
        u.check_equal(u.Vecr(A @ X @ B),
                      u.Kron(A, B.t()).normal_form() @ u.Vecr(X).normal_form())
        u.check_equal(u.Vecr(A @ X @ B),
                      u.Vecr(X).normal_form() @ u.Kron(A.t(), B).normal_form())

    # shape checks
    d1, d2 = 3, 4
    extra_checks(torch.ones((d1, d1)), torch.ones((d1, d2)),
                 torch.ones((d2, d2)))

    A = torch.rand(d1, d1)
    B = torch.rand(d2, d2)
    #x = torch.rand((d1*d2))
    #X = x.t().reshape(d1, d2)
    # X = torch.rand((d1, d2))
    # x = u.vec(X)
    x = torch.rand((d1 * d2))
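
The two vectorization identities test_kron exercises through u.Kron and u.Vec/u.Vecr, restated with plain NumPy (toy matrices; vec is column-major, vecr row-major):

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(3, 3); X = rng.randn(3, 4); B = rng.randn(4, 4)

vec = lambda M: M.flatten(order='F')    # column-major vectorization
vecr = lambda M: M.flatten()            # row-major vectorization

assert np.allclose(vec(A @ X @ B), np.kron(B.T, A) @ vec(X))
assert np.allclose(vecr(A @ X @ B), np.kron(A, B.T) @ vecr(X))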
Example 14
def simple_newton_bd_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}
  
  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]

  # Create U's
  U = [list(range(n+1)) for _ in range(n+1)]
  for bottom in range(n+1):
    for top in range(n+1):
      if bottom > top:
        prod = u.Identity(f(top))
      else:
        prod = u.Identity(f(bottom-1))
        for i in range(bottom, top+1):
          prod = prod@t(W[i])
      U[bottom][top] = prod

  # Block i, j gives hessian block between layer i and layer j
  blocks = [list(range(n+1)) for _ in range(n+1)]
  for i in range(1, n+1):
    for j in range(1, n+1):
      term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
      if i == j:
        term2 = tf.zeros((f(i)*f(i-1), f(i)*f(i-1)), dtype=dtype)
      elif i < j:
        term2 = kr(A[i] @ t(B[j]), U[i+1][j-1])
      else:
        term2 = kr(t(U[j+1][i-1]), B[i] @ t(A[j]))

      blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j-1))

  # remove leftmost blocks (those are with respect to W[0] which is input)
  del blocks[0]
  for row in blocks:
    del row[0]
    
  #hess = u.concat_blocks(blocks)
  ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))
  #  ihess = u.pseudo_inverse(hess)
  
  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  
  expected_losses = np.loadtxt("data/rotations_simple_newtonbd_losses.csv",
                               delimiter= ",")
  observed_losses = []
  
  # from accompanying notebook
  # 0.0111498, 0.0000171591, 4.11445*10^-11, 2.33652*10^-22, 
  # 1.21455*10^-32,
 
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
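
The off-diagonal Hessian blocks above are multiplied by Kmat(f(j), f(j-1)), which from context is the commutation matrix K(m, n) mapping the column-major vec of an m x n matrix to the vec of its transpose. A minimal NumPy sketch of that matrix and its defining property (assuming this convention for Kmat):

import numpy as np

def commutation(m, n):
    K = np.zeros((m * n, m * n))
    for i in range(m):
        for j in range(n):
            # A[i, j] sits at position j*m + i in vec(A) and at
            # position i*n + j in vec(A')
            K[i * n + j, j * m + i] = 1
    return K

rng = np.random.RandomState(0)
A = rng.randn(3, 2)
K = commutation(3, 2)
assert np.allclose(K @ A.flatten(order='F'), A.T.flatten(order='F'))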
Example 15
def rotations2_newton_bd():
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)),
                                 dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))

            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Example 16
def simple_newton_kfac_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}
  
  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]
    
  # inverse Hessian blocks
  iblocks = u.empty_grid(n+1, n+1)
  for i in range(1, n+1):
    for j in range(1, n+1):
      # reuse Hess tensor calculation in order to get off-diag block sizes
      dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
      if i == j:
        acov = A[i] @ t(A[j])
        bcov = Bn[i] @ t(Bn[j]) / dsize
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
      iblocks[i][j] = term
        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]
    
  ihess = u.concat_blocks(iblocks)
  
  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  
  expected_losses = np.loadtxt("data/rotations_simple_newtonkfac_losses.csv",
                               delimiter= ",")
  observed_losses = []

  # from accompanying notebook
  #  {0.0111498, 0.0000171591, 4.11445*10^-11, 2.33653*10^-22, 
  # 6.88354*10^-33,
 
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
Example 17
    def compute_layer_stats(layer):
        stats = AttrDefault(str, {})
        n = stats_batch_size
        param = u.get_param(layer)
        d = len(param.flatten())
        layer_idx = model.layers.index(layer)
        assert layer_idx >= 0
        assert stats_data.shape[0] == n

        def backprop_loss():
            model.zero_grad()
            output = model(
                stats_data)  # use last saved data batch for backprop
            loss = compute_loss(output, stats_targets)
            loss.backward()
            return loss, output

        def backprop_output():
            model.zero_grad()
            output = model(stats_data)
            output.backward(gradient=torch.ones_like(output))
            return output

        # per-example gradients, n, d
        loss, output = backprop_loss()
        At = layer.data_input
        Bt = layer.grad_output * n
        G = u.khatri_rao_t(At, Bt)
        g = G.sum(dim=0, keepdim=True) / n
        u.check_close(g, u.vec(param.grad).t())

        stats.diversity = torch.norm(G, "fro")**2 / g.flatten().norm()**2

        stats.gradient_norm = g.flatten().norm()
        stats.parameter_norm = param.data.flatten().norm()
        pos_activations = torch.sum(layer.data_output > 0)
        neg_activations = torch.sum(layer.data_output <= 0)
        stats.sparsity = pos_activations.float() / (pos_activations +
                                                    neg_activations)

        output = backprop_output()
        At2 = layer.data_input
        u.check_close(At, At2)
        B2t = layer.grad_output
        J = u.khatri_rao_t(At, B2t)
        H = J.t() @ J / n

        model.zero_grad()
        output = model(stats_data)  # use last saved data batch for backprop
        loss = compute_loss(output, stats_targets)
        hess = u.hessian(loss, param)

        hess = hess.transpose(2, 3).transpose(0, 1).reshape(d, d)
        u.check_close(hess, H)

        stats.hessian_norm = u.l2_norm(H)
        stats.jacobian_norm = u.l2_norm(J)
        Joutput = J.sum(dim=0) / n
        stats.jacobian_sensitivity = Joutput.norm()

        # newton decrement
        stats.loss_newton = u.to_python_scalar(g @ u.pinv(H) @ g.t() / 2)
        u.check_close(stats.loss_newton, loss)

        # do line-search to find optimal step
        def line_search(directionv, start, end, steps=10):
            """Takes steps between start and end, returns steps+1 loss entries"""
            param0 = param.data.clone()
            param0v = u.vec(param0).t()
            losses = []
            for i in range(steps + 1):
                output = model(
                    stats_data)  # use last saved data batch for backprop
                loss = compute_loss(output, stats_targets)
                losses.append(loss)
                offset = start + i * ((end - start) / steps)
                param1v = param0v + offset * directionv

                param1 = u.unvec(param1v.t(), param.data.shape[0])
                param.data.copy_(param1)

            output = model(
                stats_data)  # use last saved data batch for backprop
            loss = compute_loss(output, stats_targets)
            losses.append(loss)

            param.data.copy_(param0)
            return losses

        # try to take a newton step
        gradv = g
        line_losses = line_search(-gradv @ u.pinv(H), 0, 2, steps=10)
        u.check_equal(line_losses[0], loss)
        u.check_equal(line_losses[6], 0)
        assert line_losses[5] > line_losses[6]
        assert line_losses[7] > line_losses[6]
        return stats