def complex_train_test():
    """Run 2000 training steps on image patches and dump the cost curve.

    Builds the cost/train ops through ``cost_and_grad`` on the first 10000
    columns of the loaded image matrix, runs the train op 2000 times on the
    default session, occasionally renders the first weight matrix to
    ``pics/weights-NNN.png`` and finally writes the recorded costs to
    ``costs_adam_bn1.csv``.
    """
    np.random.seed(0)

    render_frames = True  # switch off to skip writing weight pictures

    num_examples = 10000
    all_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
    batch = all_images[:, :num_examples]
    fs = [num_examples, 28 * 28, 196, 28 * 28]
    cost, train_op = cost_and_grad(fs=fs,
                                   X0=batch,
                                   lambda_=3e-3,
                                   rho=0.1,
                                   beta=3,
                                   lr=0.1)

    session = tf.get_default_session()

    u.reset_time()
    last_frame_cost = session.run(cost)
    last_frame_step = 0
    frames_written = 0
    cost_history = []
    for step in range(2000):
        step_cost, _ = session.run([cost, train_op])
        cost_history.append(step_cost)
        if step % 100 == 0:
            print(step_cost)

        # Write a frame when the cost dropped by more than 5% relative to the
        # previous frame, or when more than 50 steps passed since that frame.
        relative_drop = (last_frame_cost - step_cost) / last_frame_cost
        if (relative_drop > 0.05
                or step - last_frame_step > 50) and render_frames:
            flat_weights = session.run("Wf_var/read:0")
            first_layer = u.unflatten_np(flat_weights, fs[1:])[0]
            frame_path = "pics/weights-%03d.png" % (frames_written, )
            # filters are transposed in visualization
            display_network.display_network(first_layer.T,
                                            filename=frame_path)
            frames_written += 1
            last_frame_cost = step_cost
            last_frame_step = step
        u.record_time()

    u.dump(cost_history, "costs_adam_bn1.csv")
    u.summarize_time()
Beispiel #2
0
def simple_newton_kfac_test():
  """Check Kronecker-factored Newton updates against recorded losses.

  Loads the ``rotations_simple`` CSV fixtures, builds a network made only of
  matrix products, and preconditions the flattened gradient with a
  block-diagonal matrix whose i-th diagonal block is
  ``kr(pinv(A[i] A[i]'), pinv(Bn[i] Bn[i]' / dsize))``.  Ten updates are run
  and the loss observed before each one must match
  ``data/rotations_simple_newtonkfac_losses.csv`` (via ``u.check_equal``).
  """
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}
  
  # Create W's; W[0] is the data matrix, W[1..n] are slices of Wf
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]
    
  # inverse Hessian blocks
  iblocks = u.empty_grid(n+1, n+1)
  for i in range(1, n+1):
    for j in range(1, n+1):
      if i == j:
        acov = A[i] @ t(A[j])
        bcov = Bn[i] @ t(Bn[j]) / dsize
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        # Off-diagonal blocks are zero.  Block (i, j) pairs vec(W[i]), which
        # has f(i)*f(i-1) entries, with vec(W[j]), which has f(j)*f(j-1)
        # entries, so the shape is written out directly instead of building
        # a throw-away Kronecker product just to read its shape.
        term = tf.zeros(shape=(f(i)*f(i-1), f(j)*f(j-1)), dtype=dtype)
      iblocks[i][j] = term
        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]
    
  ihess = u.concat_blocks(iblocks)
  
  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf

  # The new value is first written to Wf_copy and then copied into Wf in a
  # separate run call, so Wf is never read and written by the same op set.
  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  expected_losses = np.loadtxt("data/rotations_simple_newtonkfac_losses.csv",
                               delimiter= ",")
  observed_losses = []

  # from accompanying notebook
  #  {0.0111498, 0.0000171591, 4.11445*10^-11, 2.33653*10^-22, 
  # 6.88354*10^-33,
 
  for i in range(10):
    observed_losses.append(sess.run(loss))
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
Beispiel #3
0
def simple_newton_bd_test():
  """Check Newton updates with a block-diagonal inverse against recorded losses.

  Loads the ``rotations_simple`` CSV fixtures, builds a network made only of
  matrix products, assembles a grid of Hessian blocks between every pair of
  weight matrices, passes that grid through ``u.block_diagonal_inverse`` and
  uses the result to precondition the flattened gradient.  Ten updates are
  run and the loss observed before each one must match
  ``data/rotations_simple_newtonbd_losses.csv`` (via ``u.check_equal``).
  """
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}
  
  # Create W's; W[0] is the data matrix, W[1..n] are slices of Wf
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]

  # Create U's
  # U[bottom][top] is the product t(W[bottom]) @ ... @ t(W[top]); when
  # bottom > top the range is empty and an identity of size f(top) is used.
  # The integer lists are only placeholders that get overwritten below.
  U = [list(range(n+1)) for _ in range(n+1)]
  for bottom in range(n+1):
    for top in range(n+1):
      if bottom > top:
        prod = u.Identity(f(top))
      else:
        prod = u.Identity(f(bottom-1))
        for i in range(bottom, top+1):
          prod = prod@t(W[i])
      U[bottom][top] = prod

  # Block i, j gives hessian block between layer i and layer j
  # (row/column 0 hold integer placeholders and are deleted further down)
  blocks = [list(range(n+1)) for _ in range(n+1)]
  for i in range(1, n+1):
    for j in range(1, n+1):
      # term1 is built from the activations and the Newton-modified backprops
      term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize;
      # term2 involves the ordinary backprops B and is zero on the diagonal
      if i == j:
        term2 = tf.zeros((f(i)*f(i-1), f(i)*f(i-1)), dtype=dtype)
      elif i < j:
        term2 = kr(A[i] @ t(B[j]), U[i+1][j-1])
      else:
        term2 = kr(t(U[j+1][i-1]), B[i] @ t(A[j]))
        
      # NOTE(review): Kmat is defined elsewhere; presumably a commutation
      # matrix for an f(j) x f(j-1) matrix -- confirm against its definition.
      blocks[i][j]=term1 + term2 @ Kmat(f(j), f(j-1))

        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del blocks[0]
  for row in blocks:
    del row[0]
    
  #hess = u.concat_blocks(blocks)
  # NOTE(review): u.block_diagonal_inverse is opaque here; presumably it
  # inverts the diagonal blocks and discards the off-diagonal ones -- confirm.
  ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))
  #  ihess = u.pseudo_inverse(hess)
  
  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf 

  # The new value is written to Wf_copy first and copied into Wf by a second
  # run call.
  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  
  expected_losses = np.loadtxt("data/rotations_simple_newtonbd_losses.csv",
                               delimiter= ",")
  observed_losses = []
  
  # from accompanying notebook
  # 0.0111498, 0.0000171591, 4.11445*10^-11, 2.33652*10^-22, 
  # 1.21455*10^-32,
 
  # record the loss before each of the 10 updates
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
Beispiel #4
0
def relu_gradient_test():
  """Check plain gradient descent on a ReLU network against recorded losses.

  Loads the ``rotations`` CSV fixtures, applies ``tf.nn.relu`` after every
  weight matrix product, builds the backprop matrices by hand (masking with
  ``u.relu_mask``) and runs 10 updates with learning rate 0.1.  The loss
  observed before each update must match
  ``data/rotations_relu_gradient_losses.csv`` (via ``u.check_equal``).
  """
  tf.reset_default_graph()

  def read_csv(path):
    return np.genfromtxt(path, delimiter= ",")

  inputs_np = read_csv('data/rotations_X0.csv')
  targets_np = read_csv('data/rotations_Y0.csv')
  flat_init = v2c_np(read_csv('data/rotations_W0f.csv'))
  assert flat_init.shape == (8, 1)

  fs = read_csv('data/rotations_relu_fs.csv').astype(np.int32)
  n = len(fs) - 2    # number of layers
  u.check_equal(fs, [4, 2, 2, 2])

  def f(i):
    # W[i] has shape f[i] x f[i-1]
    return fs[i + 1]

  dsize = inputs_np.shape[1]
  assert f(-1) == dsize

  # numpy-side layer list, only needed for the shape checks below
  # (the flattened vector doesn't contain the first layer, i.e. the data)
  np_layers = u.unflatten_np(flat_init, fs[1:])
  np_layers.insert(0, inputs_np)

  holder = tf.placeholder(dtype, shape=flat_init.shape)
  Wf = tf.Variable(holder, name="Wf")
  Wf_copy = tf.Variable(holder, name="Wf_copy")
  feed = {holder: flat_init}

  # W[0] is the data matrix, W[1..n] come from the flattened variable
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(inputs_np, name="X0")
  Y = tf.constant(targets_np, name="Y0")
  W.insert(0, X)
  for np_layer, tf_layer in zip(np_layers, W):
    u.check_equal(np_layer.shape, u.fix_shape(tf_layer.shape))

  # Forward pass: A[1] is the data itself, every later entry is
  # relu(W[i] @ A[i]).
  A = [u.Identity(dsize), X]
  for i in range(1, n + 1):
    A.append(tf.nn.relu(tf.matmul(W[i], A[i], name="A" + str(i + 1))))

  assert W[0].get_shape() == inputs_np.shape
  assert A[n + 1].get_shape() == inputs_np.shape
  assert A[1].get_shape() == inputs_np.shape

  err = Y - A[n + 1]
  loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
  lr = tf.Variable(0.1, dtype=dtype)

  # Backward pass, from the last layer down to the data matrix.
  B = [0] * (n + 1)
  B[n] = (-err / dsize) * u.relu_mask(A[n + 1])
  for i in reversed(range(n)):
    backprop = t(W[i + 1]) @ B[i + 1]
    # there's no relu on first matrix
    B[i] = backprop * u.relu_mask(A[i + 1]) if i > 0 else backprop

  # Per-matrix gradients; entry 0 belongs to the data matrix and is dropped.
  all_dW = [tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
            for i in range(n + 1)]
  dW = all_dW[1:]

  dWf = tf.concat([u.vec(grad) for grad in dW], axis=0)
  Wf_new = Wf - lr * dWf

  # two-phase update: compute into the copy, then copy back
  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=feed)

  expected_losses = np.loadtxt("data/rotations_relu_gradient_losses.csv",
                               delimiter= ",")

  # From accompanying notebook
  #  {0.407751, 0.0683822, 0.0138657, 0.0039221, 0.00203637, 0.00164892,
  #    0.00156137, 0.00153857, 0.00153051, 0.00152593}
  observed_losses = []
  for _ in range(10):
    observed_losses.append(sess.run(loss))
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
Beispiel #5
0
def simple_gradient_test():
  """Check plain gradient descent on a matmul-only network.

  Loads the ``rotations_simple`` CSV fixtures, chains the weight matrices
  with ``tf.matmul`` (no nonlinearity), builds the backprop matrices by hand
  and runs 20 updates with learning rate 1.0.  The loss observed before each
  update must match ``data/rotations_simple_gradient_losses.csv`` (via
  ``u.check_equal``).
  """
  tf.reset_default_graph()

  def read_csv(path):
    return np.genfromtxt(path, delimiter= ",")

  data_np = read_csv('data/rotations_simple_X0.csv')
  labels_np = read_csv('data/rotations_simple_Y0.csv')
  flat_init = v2c_np(read_csv('data/rotations_simple_W0f.csv'))
  assert flat_init.shape == (8, 1)

  fs = read_csv('data/rotations_simple_fs.csv').astype(np.int32)
  n = len(fs) - 2    # number of layers
  u.check_equal(fs, [10, 2, 2, 2])

  def f(i):
    # W[i] has shape f[i] x f[i-1]
    return fs[i + 1]

  dsize = data_np.shape[1]
  assert f(-1) == dsize

  # numpy-side layer list for the shape checks below; the flattened vector
  # doesn't include the first layer (data matrix), so it is prepended here
  np_layers = u.unflatten_np(flat_init, fs[1:])
  np_layers.insert(0, data_np)

  holder = tf.placeholder(dtype, shape=flat_init.shape)
  Wf = tf.Variable(holder, name="Wf")
  Wf_copy = tf.Variable(holder, name="Wf_copy")
  feed = {holder: flat_init}

  # W[0] is the data matrix, W[1..n] come from the flattened variable
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(data_np)
  Y = tf.constant(labels_np)
  W.insert(0, X)
  for np_layer, tf_layer in zip(np_layers, W):
    u.check_equal(np_layer.shape, u.fix_shape(tf_layer.shape))

  # Forward pass: A[0] is an identity so that A[1] == X.
  A = [u.Identity(dsize)]
  for i in range(n + 1):
    A.append(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

  assert W[0].get_shape() == data_np.shape
  assert A[n + 1].get_shape() == data_np.shape
  assert A[1].get_shape() == data_np.shape

  err = Y - A[n + 1]
  loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
  lr = tf.Variable(1.0, dtype=dtype)

  # Backward pass, from the last layer down to the data matrix.
  B = [0] * (n + 1)
  B[n] = -err / dsize
  for i in reversed(range(n)):
    B[i] = t(W[i + 1]) @ B[i + 1]

  # Per-matrix gradients; entry 0 belongs to the data matrix and is dropped.
  all_dW = [tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
            for i in range(n + 1)]
  dW = all_dW[1:]

  dWf = tf.concat([u.vec(grad) for grad in dW], axis=0)
  Wf_new = Wf - lr * dWf

  # two-phase update: compute into the copy, then copy back
  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=feed)

  expected_losses = np.loadtxt("data/rotations_simple_gradient_losses.csv",
                               delimiter= ",")

  # from accompanying notebook
  # {0.0111498, 0.00694816, 0.00429464, 0.00248228, 0.00159361,
  #  0.000957424, 0.000651653, 0.000423802, 0.000306749, 0.00021772,
  observed_losses = []
  for _ in range(20):
    observed_losses.append(sess.run(loss))
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
Beispiel #6
0
        if slope_ratio < alpha and abs(target_delta) > 1e-6 and adaptive_step:
            print("%.2f %.2f %.2f" % (cost0, cost1, slope_ratio))
            print("Slope optimality %.2f, shrinking learning rate to %.2f" % (
                slope_ratio,
                lr0 * beta,
            ))
            sess.run(lr_set, feed_dict={lr_p: lr0 * beta})
        else:
            # see if our learning rate got too conservative, and increase it
            # 99 was ideal for gradient
            #      if i>0 and i%50 == 0 and slope_ratio>0.99:
            if i > 0 and i % 50 == 0 and slope_ratio > 0.90 and adaptive_step:
                print("%.2f %.2f %.2f" % (cost0, cost1, slope_ratio))
                print("Growing learning rate to %.2f" % (lr0 * growth_rate))
                sess.run(lr_set, feed_dict={lr_p: lr0 * growth_rate})

        if do_images and i > 0 and i % 100 == 0:
            Wf_ = sess.run("Wf_var/read:0")
            W1_ = u.unflatten_np(Wf_, fs[1:])[0]
            display_network.display_network(W1_.T,
                                            filename="pics/weights-%03d.png" %
                                            (frame_count, ))
            frame_count += 1
            old_cost = cost0
            old_i = i

        u.record_time()

    u.dump(costs, "new%d.csv" % (whitening_mode, ))
    u.summarize_time()
                     delimiter= ",")
  Y0 = np.genfromtxt('data/large_rotations2_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv',
                             delimiter= ","))
  fs = np.genfromtxt('data/large_rotations2_fs.csv',
                     delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]

  dsize = X0.shape[1]
  assert f(-1) == dsize

  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
def rotations2_natural_sampled_kfac(num_samples=1):
    """Run 20 preconditioned updates with a sampled, Kronecker-factored matrix.

    Loads the ``large_rotations2`` CSV fixtures and builds a matmul-only
    network.  Next to the ordinary forward/backward matrices (``A``/``B``) a
    second pair (``A2``/``B2``) is built on the data set replicated
    ``num_samples`` times, with ``B2`` seeded from ``tf.random_normal`` at the
    output layer.  The diagonal preconditioner blocks are
    ``kr(pinv(A2 A2' / N), pinv(B2 B2' / N))`` with ``N = dsize*num_samples``.
    The loss before each update is printed; timing and graph summaries are
    emitted through ``u``.

    Args:
        num_samples: how many times the data columns are replicated for the
            sampled ``A2``/``B2`` pass.
    """
    tf.reset_default_graph()
    np.random.seed(0)
    tf.set_random_seed(0)

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    # NOTE(review): this variable is rebound further down before it is used;
    # it is kept so the set of graph variables stays as it was.
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A2 = [0] * (n + 2)  # augmented forward props for natural gradient
    A[0] = u.Identity(dsize)
    A2[0] = u.Identity(dsize * num_samples)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
        if i == 0:
            # replicate dataset multiple times corresponding to number of samples
            A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
        else:
            A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match.  The tuple must be parenthesised: without the
    # parentheses Python reads "assert a, b == c" as "assert a" with
    # "b == c" as the failure message, so nothing was being compared.
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)

    # lower learning rate by 10x
    lr = tf.Variable(0.01, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B2 = [0] * (n + 1)
    B[n] = -err / dsize
    B2[n] = tf.random_normal((f(n), dsize * num_samples),
                             0,
                             1,
                             seed=0,
                             dtype=dtype)
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))
        B2[i] = tf.matmul(tf.transpose(W[i + 1]),
                          B2[i + 1],
                          name="B2" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race

    dW = [0] * (n + 1)
    dW2 = [0] * (n + 1)
    # NOTE(review): the per-layer Wcopy variables and dW2 are built but not
    # read by the update below; they are kept so the graph is unchanged.
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        dW2[i] = tf.matmul(B2[i], tf.transpose(A2[i]), name="dW2" + str(i))

    del dW[0]  # get rid of W[0] update
    del dW2[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # todo: divide both activations and backprops by size for cov calc

    # Kronecker factored covariance blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
                bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)),
                                dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ifisher = u.concat_blocks(iblocks)

    # Two-phase update: train_op1 writes the new value into Wf_copy and
    # train_op2 copies it back into Wf.
    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
def rotations2_natural_empirical():
    """Run 10 updates preconditioned by the pseudo-inverse of ``grads grads'``.

    Loads the ``large_rotations2`` CSV fixtures and builds a matmul-only
    network.  ``u.khatri_rao(A[i], B[i])`` is stacked over all layers into
    ``grads``; ``grads @ grads' / dsize`` is pseudo-inverted and applied to
    the flattened gradient.  The loss before each update is printed; timing
    and graph summaries are emitted through ``u``.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    # NOTE(review): kr is not called anywhere in this function.
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    # NOTE(review): this variable is rebound further down before it is used;
    # it is kept so the set of graph variables stays as it was.
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match.  The tuple must be parenthesised: without the
    # parentheses Python reads "assert a, b == c" as "assert a" with
    # "b == c" as the failure message, so nothing was being compared.
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.000001, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race

    dW = [0] * (n + 1)
    # NOTE(review): the per-layer Wcopy variables are built but not read by
    # the update below; they are kept so the graph is unchanged.
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))

    del dW[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # inverse fisher preconditioner
    grads = tf.concat([u.khatri_rao(A[i], B[i]) for i in range(1, n + 1)],
                      axis=0)
    fisher = grads @ tf.transpose(grads) / dsize
    ifisher = u.pseudo_inverse(fisher)

    # Two-phase update: train_op1 writes the new value into Wf_copy and
    # train_op2 copies it back into Wf.
    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Beispiel #10
0
def rotations2_newton_kfac():
    """Run 10 Kronecker-factored Newton updates on ``large_rotations2``.

    Loads the ``large_rotations2`` CSV fixtures, builds a matmul-only network
    and preconditions the flattened gradient with a block-diagonal matrix
    whose i-th diagonal block is
    ``kr(pinv(A[i] A[i]'), pinv(Bn[i] Bn[i]' / dsize))``.  The loss before
    each update is printed; timing and graph summaries are emitted through
    ``u``.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's; W[0] is the data matrix, W[1..n] are slices of Wf
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                # Off-diagonal blocks are zero.  Block (i, j) pairs vec(W[i])
                # (f(i)*f(i-1) entries) with vec(W[j]) (f(j)*f(j-1) entries).
                # The shape is given explicitly because the local kr skips
                # shape inference, so reading it off a Kronecker product is
                # not reliable (and would add an unused product per pair).
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)),
                                dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # Two-phase update: train_op1 writes the new value into Wf_copy and
    # train_op2 copies it back into Wf.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Beispiel #11
0
def rotations2_newton_bd():
    """Run 20 Newton updates with a block-diagonal inverse on ``large_rotations2``.

    Loads the ``large_rotations2`` CSV fixtures, builds a matmul-only network,
    assembles a grid of Hessian blocks between every pair of weight matrices,
    passes that grid through ``u.block_diagonal_inverse`` and uses the result
    to precondition the flattened gradient.  The loss before each update is
    printed; timing and graph summaries are emitted through ``u``.
    """
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's; W[0] is the data matrix, W[1..n] are slices of Wf
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's
    # U[bottom][top] is the product t(W[bottom]) @ ... @ t(W[top]); when
    # bottom > top the range is empty and an identity of size f(top) is used.
    # The integer lists are only placeholders that get overwritten below.
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    # (row/column 0 hold integer placeholders and are deleted further down)
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # term1 is built from the activations and the Newton-modified
            # backprops
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            # term2 involves the ordinary backprops B and is zero on the
            # diagonal
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)),
                                 dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))

            # NOTE(review): Kmat is defined elsewhere; presumably a
            # commutation matrix for an f(j) x f(j-1) matrix -- confirm.
            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    # NOTE(review): u.block_diagonal_inverse is opaque here; presumably it
    # inverts the diagonal blocks and discards the off-diagonal ones -- confirm.
    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # The new value is written to Wf_copy first and copied into Wf by a
    # second run call.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    # print and record the loss before each of the 20 updates
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Beispiel #12
0
def rotations1_gradient_test():
    """Check plain gradient descent on ``large_rotations1`` against a recording.

    Loads the ``large_rotations1`` CSV fixtures (including the learning rate),
    chains the weight matrices with ``tf.matmul`` (no nonlinearity), builds
    the backprop matrices by hand and runs 10 updates.  The loss observed
    before each update must match
    ``data/large_rotations1_gradient_losses.csv`` (via ``u.check_equal``).
    """
    #  https://www.wolframcloud.com/objects/ff6ecaf0-fccd-44e3-b26f-970d8fc2a57c
    tf.reset_default_graph()

    def read_csv(path):
        return np.genfromtxt(path, delimiter=",")

    data_np = read_csv('data/large_rotations1_X0.csv')
    labels_np = read_csv('data/large_rotations1_Y0.csv')
    flat_init = v2c_np(read_csv('data/large_rotations1_W0f.csv'))

    fs = read_csv('data/large_rotations1_fs.csv').astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        # W[i] has shape f[i] x f[i-1]
        return fs[i + 1]

    dsize = data_np.shape[1]
    assert f(-1) == dsize

    # numpy-side layer list for the shape checks below; the flattened vector
    # doesn't include the first layer (data matrix), so it is prepended here
    np_layers = u.unflatten_np(flat_init, fs[1:])
    np_layers.insert(0, data_np)

    holder = tf.placeholder(dtype, shape=flat_init.shape)
    Wf = tf.Variable(holder, name="Wf")
    Wf_copy = tf.Variable(holder, name="Wf_copy")
    feed = {holder: flat_init}

    # W[0] is the data matrix, W[1..n] come from the flattened variable
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(data_np)
    Y = tf.constant(labels_np)
    W.insert(0, X)
    for np_layer, tf_layer in zip(np_layers, W):
        u.check_equal(np_layer.shape, u.fix_shape(tf_layer.shape))

    # Forward pass: A[0] is an identity so that A[1] == X.
    A = [u.Identity(dsize)]
    for i in range(n + 1):
        A.append(tf.matmul(W[i], A[i], name="A" + str(i + 1)))

    assert W[0].get_shape() == data_np.shape
    assert A[n + 1].get_shape() == data_np.shape
    assert A[1].get_shape() == data_np.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    initial_lr = np.genfromtxt('data/large_rotations1_gradient_lr.csv')
    lr = tf.Variable(initial_lr, dtype=dtype)

    # Backward pass, from the last layer down to the data matrix.
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in reversed(range(n)):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # Per-matrix gradients; entry 0 belongs to the data matrix and is dropped.
    all_dW = [
        tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        for i in range(n + 1)
    ]
    dW = all_dW[1:]

    dWf = tf.concat([u.vec(grad) for grad in dW], axis=0)
    Wf_new = Wf - lr * dWf

    # two-phase update: compute into the copy, then copy back
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=feed)

    expected_losses = np.loadtxt("data/large_rotations1_gradient_losses.csv",
                                 delimiter=",")

    # from accompanying notebook
    # {0.102522, 0.028124, 0.00907214, 0.00418929, 0.00293379,
    observed_losses = []
    for _ in range(10):
        observed_losses.append(sess.run(loss))
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)