Example #1
0
def main():
    """Train a tiny 2x2 two-layer network for 10 steps with KFAC-preconditioned
    updates and check the final loss against a recorded target.

    Relies on module-level `lr`, `dtype`, `u` (utilities) and
    `loss_and_output_and_grad`; publishes fs/X/n/f/dsize/lambda_ as globals
    for those helpers to read.
    """
    global fs, X, n, f, dsize, lambda_

    np.random.seed(1)
    tf.set_random_seed(1)

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    n = len(fs) - 2
    train_images = np.asarray([[0, 1], [2, 3]]).astype(dtype)
    X = tf.constant(train_images[:, :dsize].astype(dtype))

    W0_0 = np.asarray([[0., 1], [2, 3]]).astype(dtype) / 10
    W1_0 = np.asarray([[4., 5], [6, 7]]).astype(dtype) / 10
    W0f = u.flatten([W0_0, W1_0])
    Wf = tf.constant(W0f)

    losses = []
    for step in range(10):
        loss, output, grad, kfac_grad = loss_and_output_and_grad(Wf)
        loss0 = loss.numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        losses.append(loss0)

        Wf -= lr * kfac_grad
        u.record_time()

    u.summarize_time()
    # Fix: `target` was assigned four times in a row; only the last value was
    # live.  Keep the live value and preserve the alternatives as comments.
    # target = 1.252017617  # without random sampling
    # target = 1.256854534  # with random sampling but fixed seed
    # target = 0.000359572  # with random sampling and linear
    target = 1.251557469  # with random sampling

    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
Example #2
0
def main():
    """Train a bias-free single-hidden-layer sigmoid autoencoder on one MNIST
    batch with plain SGD, printing the loss each step.

    Reads hyperparameters from module-level `args` (seed, cuda, batch_size,
    visible_size, hidden_size, lr, iters) and helpers from `u`.
    """
    # Seed every RNG that will be used so the run is reproducible.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Transpose so the leading axis can be sliced down to a batch of examples.
    images = torch.Tensor(u.get_mnist_images().T)
    images = images[:args.batch_size]
    if args.cuda:
        images = images.cuda()
    data = Variable(images)  # pre-0.4 PyTorch autograd wrapper

    class Net(nn.Module):
        """encoder -> sigmoid -> decoder -> sigmoid; both layers bias-free."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Linear(args.visible_size,
                                     args.hidden_size,
                                     bias=False)
            self.decoder = nn.Linear(args.hidden_size,
                                     args.visible_size,
                                     bias=False)

        def forward(self, input):
            x = input.view(-1, args.visible_size)
            x = self.encoder(x)
            x = F.sigmoid(x)
            x = self.decoder(x)
            x = F.sigmoid(x)
            return x.view_as(input)

    # initialize model and weights
    model = Net()
    params1, params2 = list(model.parameters())
    # Overwrite default init with u.ng_init values; the .T presumably matches
    # nn.Linear's (out_features, in_features) weight layout — confirm against
    # u.ng_init's shape convention.
    params1.data = torch.Tensor(
        u.ng_init(args.visible_size, args.hidden_size).T)
    params2.data = torch.Tensor(
        u.ng_init(args.hidden_size, args.visible_size).T)
    if args.cuda:
        model.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    for step in range(args.iters):
        optimizer.zero_grad()
        output = model(data)
        # Autoencoder objective: reconstruct the input batch.
        loss = F.mse_loss(output, data)
        loss0 = loss.data[0]  # pre-0.4 scalar accessor (today: loss.item())
        loss.backward()
        optimizer.step()

        print("Step %3d loss %6.5f" % (step, loss0))
        u.record_time()

    u.summarize_time()
def do_run(train_op):
  """Run `train_op` for `do_run_iters` steps, returning the loss observed
  before each update (module-level `loss`, `do_run_iters` and `u` are used).
  """
  session = setup_session()
  observed = []
  u.reset_time()
  for _ in range(do_run_iters):
    current = session.run(loss)
    print(current)
    observed.append(current)
    session.run(train_op)
    u.record_time()
  u.summarize_time()
  return observed
Example #4
0
 def closure():
     """One optimization step: forward, MSE loss, backward; returns the loss
     so LBFGS-style optimizers can re-evaluate it.  Updates the module-level
     `step` counter and records `final_loss` on the last iteration.
     """
     global step, final_loss
     optimizer.zero_grad()
     prediction = model(data)
     mse = F.mse_loss(prediction, data)
     if verbose:
         print("Step %3d loss %6.5f msec %6.3f" %
               (step, mse.data[0], u.last_time()))
     step += 1
     if step == iters:
         final_loss = mse.data[0]
     mse.backward()
     u.record_time()
     return mse
Example #5
0
 def closure():
   """One optimization step: forward, MSE loss, backward; returns the loss
   so LBFGS-style optimizers can re-evaluate it.  Advances the module-level
   `step` counter, appends per-step timings to `times`, and stores
   `final_loss` on the last iteration.
   """
   global step, final_loss
   optimizer.zero_grad()
   output = model(data)
   loss = F.mse_loss(output, data)
   if verbose:
     loss0 = loss.data[0]  # pre-0.4 scalar accessor (today: loss.item())
     times.append(u.last_time())
     print("Step %3d loss %6.5f msec %6.3f"%(step, loss0, u.last_time()))
   step+=1
   if step == iters:
     final_loss = loss.data[0]
   loss.backward()
   u.record_time()
   return loss
def benchmark_execute(dims, iters, dtype):
    """Benchmark executing `iters` Khatri-Rao products of two random
    dims x dims matrices.

    Builds the ops once, then runs the whole group 10 times in a fresh
    session, timing each run locally and via u.record_time().
    """
    A = tf.random_uniform((dims, dims), dtype=dtype)
    B = tf.random_uniform((dims, dims), dtype=dtype)
    # Idiom fix: build the product list with a comprehension.
    prods = [u.khatri_rao(A, B) for _ in range(iters)]
    sess = tf.Session()
    # Fix: `elapsed_times` was initialized twice (dead duplicate assignment).
    elapsed_times = []
    u.reset_time()
    for _ in range(10):
        time0 = time.time()
        sess.run(tf.group(*prods))
        elapsed_times.append(time.time() - time0)
        u.record_time()
    u.summarize_time()
def benchmark_execute(dims, iters, dtype):
  """Benchmark executing `iters` Khatri-Rao products of two random
  dims x dims matrices, running the whole group 10 times in a fresh session.
  """
  A = tf.random_uniform((dims, dims), dtype=dtype)
  B = tf.random_uniform((dims, dims), dtype=dtype)
  # Idiom fix: build the product list with a comprehension.
  prods = [u.khatri_rao(A, B) for _ in range(iters)]
  sess = tf.Session()
  # Fix: `elapsed_times` was initialized twice (dead duplicate assignment).
  elapsed_times = []
  u.reset_time()
  for _ in range(10):
    time0 = time.time()
    sess.run(tf.group(*prods))
    elapsed_times.append(time.time()-time0)
    u.record_time()
  u.summarize_time()
def complex_train_test():
    """Train a sparse autoencoder on MNIST for 2000 steps inside the default
    TF session, periodically dumping weight visualizations and finally
    writing the loss trajectory to CSV.

    Uses module-level helpers: load_MNIST, cost_and_grad, u,
    display_network.
    """

    np.random.seed(0)

    do_images = True  # gate for writing weight-visualization PNGs

    train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
    dsize = 10000
    patches = train_images[:, :dsize]
    # layer sizes: [batch, input, hidden, output]; values from the UFLDL
    # sparse-autoencoder setup, presumably — confirm against cost_and_grad.
    fs = [dsize, 28 * 28, 196, 28 * 28]
    cost, train_op = cost_and_grad(fs=fs,
                                   X0=patches,
                                   lambda_=3e-3,
                                   rho=0.1,
                                   beta=3,
                                   lr=0.1)

    # cost_and_grad is assumed to have installed a default session.
    sess = tf.get_default_session()

    u.reset_time()
    old_cost = sess.run(cost)
    old_i = 0
    frame_count = 0
    costs = []
    for i in range(2000):
        cost0, _ = sess.run([cost, train_op])
        costs.append(cost0)
        if i % 100 == 0:
            print(cost0)
            # filters are transposed in visualization
        # Dump a frame whenever cost dropped >5% relative, or 50+ steps
        # elapsed since the last frame.
        if ((old_cost - cost0) / old_cost > 0.05
                or i - old_i > 50) and do_images:
            Wf_ = sess.run("Wf_var/read:0")  # fetch flat weights by tensor name
            W1_ = u.unflatten_np(Wf_, fs[1:])[0]
            display_network.display_network(W1_.T,
                                            filename="pics/weights-%03d.png" %
                                            (frame_count, ))
            frame_count += 1
            old_cost = cost0
            old_i = i
        u.record_time()

    #  u.dump(costs, "costs_adam.csv")
    u.dump(costs, "costs_adam_bn1.csv")
    u.summarize_time()
Example #9
0
def main():
  """Train a bias-free sigmoid autoencoder on one MNIST batch using TF eager
  execution and vanilla gradient descent.

  Hyperparameters come from module-level `args`; `u` supplies data/timing
  helpers and `tfe` is the TF eager contrib module.
  """
  tf.set_random_seed(args.seed)
  np.random.seed(args.seed)

  # Transpose so the leading axis can be sliced down to a batch of examples.
  images = tf.constant(u.get_mnist_images().T)
  images = images[:args.batch_size]
  if args.cuda:
    images = images.as_gpu_tensor()  # old eager API for moving a tensor to GPU
  data = images

  if args.cuda:
    device='/gpu:0'
  else:
    device=''

  with tf.device(device):
    encoder = tf.layers.Dense(units=args.hidden_size, use_bias=False,
                            activation=tf.sigmoid)
    decoder = tf.layers.Dense(units=args.visible_size, use_bias=False,
                              activation=tf.sigmoid)
    def loss_fn(inputs):
      # Reconstruction MSE of the autoencoder.
      predictions = decoder(encoder(inputs))
      return tf.reduce_mean(tf.square(predictions-inputs))
    value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)

    # initialize weights (first call builds the layer variables)
    loss_fn(data)
    params1 = encoder.weights[0]
    params2 = decoder.weights[0]
    params1.assign(u.ng_init(args.visible_size, args.hidden_size))
    params2.assign(u.ng_init(args.hidden_size, args.visible_size))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=args.lr)
    for step in range(args.iters):
      value, grads_and_vars = value_and_gradients_fn(data)
      optimizer.apply_gradients(grads_and_vars)

      print("Step %3d loss %6.5f"%(step, value.numpy()))
      u.record_time()

    u.summarize_time()
Example #10
0
def main():
    """Eager-mode KFAC training of a two-layer autoencoder on 1000 MNIST
    examples; asserts the loss trajectory and per-step timing fall in
    recorded ranges.

    Relies on module-level `u`, `dtype`, `loss_and_grad`; publishes
    fs/X/n/f/dsize/lambda_ as globals for those helpers to read.
    """
    global fs, X, n, f, dsize, lambda_

    np.random.seed(0)
    tf.set_random_seed(0)

    train_images = u.get_mnist_images()
    dsize = 1000
    fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
    lambda_ = 3e-3  # L2 regularization coefficient, presumably — confirm in loss_and_grad

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    n = len(fs) - 2
    X = tf.constant(train_images[:, :dsize].astype(dtype))

    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])
    Wf = tf.constant(W0f)
    assert Wf.dtype == tf.float32
    lr = tf.constant(0.2)

    losses = []
    for step in range(10):
        loss, grad, kfac_grad = loss_and_grad(Wf)
        loss0 = loss.numpy()
        print("Step %d loss %.2f" % (step, loss0))
        losses.append(loss0)

        Wf -= lr * kfac_grad  # preconditioned gradient step
        if step >= 4:
            # regression guard: loss must have dropped by step 4
            assert loss < 17.6
        u.record_time()

    u.summarize_time()
    # Recorded final-loss band and a hardware-dependent timing band.
    assert losses[-1] < 0.8
    assert losses[-1] > 0.78
    assert 20e-3 < min(u.global_time_list) < 120e-3
def main():
    """KFAC training loop variant with CPU/GPU device selection and recorded
    regression asserts.

    NOTE(review): `train_images`, `fs` and `dsize` are read but never
    assigned here (only declared global) — they must be set at module level
    before this runs.  The device context is entered manually and never
    exited.
    """
    global fs, X, n, f, dsize, lambda_

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    if args.cuda:
        device = '/gpu:0'
    else:
        device = '/cpu:0'
    device_context = tf.device(device)
    device_context.__enter__()  # NOTE(review): no matching __exit__ anywhere

    X = tf.constant(train_images[:, :dsize].astype(dtype))

    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0, W1_0])
    Wf = tf.constant(W0f)
    assert Wf.dtype == tf.float32
    lr = tf.constant(0.2)

    losses = []
    for step in range(40):
        loss, grad, kfac_grad = loss_and_grad(Wf)
        loss0 = loss.numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        losses.append(loss0)

        Wf -= lr * kfac_grad  # preconditioned gradient step
        if step >= 4:
            # regression guard: loss must have dropped by step 4
            assert loss < 17.6
        u.record_time()

    u.summarize_time()
    # Recorded final-loss band and a hardware-dependent timing band.
    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    assert 20e-3 < min(
        u.global_time_list) < 50e-3, "Time should be 30ms on 1080"
def main():
  """KFAC-style whitened-gradient training of a tiny 2x2 two-layer net.

  Forward/backward activations are captured into the module-level
  `forward_list`/`backward_list` (filled by `my_matmul`/`nonlin` hooks,
  presumably — confirm), whitened with `regularized_inverse`, and used to
  precondition the parameter update.  The final loss is checked against a
  recorded target.

  Uses module-level: args, dtype, nonlin, my_matmul, regularized_inverse,
  t, lr, dsize, u.
  """
  global forward_list, backward_list, DO_PRINT

  torch.manual_seed(args.seed)
  np.random.seed(args.seed)
  if args.cuda:
    torch.cuda.manual_seed(args.seed)
  data0 = np.array([[0., 1], [2, 3]]).astype(dtype)
  data = Variable(torch.from_numpy(data0))

  class Net(nn.Module):
    """Two 2x2 weight matrices applied with my_matmul + nonlin."""

    def __init__(self):
      super(Net, self).__init__()
      W0 = (np.array([[0., 1], [2, 3]])).astype(dtype)/10
      W1 = (np.array([[4., 5], [6, 7]])).astype(dtype)/10
      self.W0 = nn.Parameter(torch.from_numpy(W0))
      self.W1 = nn.Parameter(torch.from_numpy(W1))

    def forward(self, input):
      x = input.view(-1, 2)
      x = nonlin(my_matmul(self.W0, x))
      x = nonlin(my_matmul(self.W1, x))
      return x.view_as(input)

  model = Net()
  if args.cuda:
    model.cuda()

  model.train()
  optimizer = optim.SGD(model.parameters(), lr=lr)
  losses = []
  for step in range(10):
    optimizer.zero_grad()
    forward_list = []
    backward_list = []
    output = model(data)
    err = output-data
    loss = torch.sum(err*err)/2/dsize
    loss.backward(retain_graph=True)
    loss0 = loss.data[0]  # pre-0.4 scalar accessor

    A = forward_list[:]       # activations, input-to-output order
    B = backward_list[::-1]   # backprops, reversed to match A's order

    forward_list = []
    backward_list = []

    # Synthetic targets: current output plus Gaussian noise; backprop of the
    # resulting loss gives the sampled backprops B2 used for whitening.
    noise = torch.from_numpy(np.random.randn(*data.data.shape).astype(dtype))
    synthetic_data = Variable(output.data+noise)
    err2 = output - synthetic_data
    loss2 = torch.sum(err2*err2)/2/dsize
    optimizer.zero_grad()
    backward_list = []
    loss2.backward()
    B2 = backward_list[::-1]

    # compute whitened gradient
    pre_dW = []
    n = len(A)
    assert len(B) == n
    assert len(B2) == n
    for i in range(n):
      covA = A[i] @ t(A[i])/dsize
      covB2 = B2[i]@t(B2[i])/dsize
      # Fix: the inverse of covA was computed twice (once into an unused
      # covA_inv, once inline); compute it once and reuse.  Also dropped an
      # unused dead store covB = B[i]@t(B[i])/dsize.
      covA_inv = regularized_inverse(covA)
      whitened_A = covA_inv@A[i]
      whitened_B = regularized_inverse(covB2.data)@B[i].data
      pre_dW.append(whitened_B @ t(whitened_A)/dsize)

    params = list(model.parameters())
    assert len(params) == len(pre_dW)
    for i in range(len(params)):
      params[i].data-=lr*pre_dW[i]

    print("Step %3d loss %10.9f"%(step, loss0))
    u.record_time()

  target = 1.251557469
  assert abs(loss0-target)<1e-9, abs(loss0-target)
  u.summarize_time()
Example #13
0
def main():
    """End-to-end KFAC training of a sparse autoencoder on MNIST patches.

    Builds the forward/backprop graph by hand (activations A, backprops B,
    sampled backprops B2), whitens per-layer gradients with SVD-based
    inverses, runs a training loop with an adaptive learning rate, and
    checks the loss trajectory against a recorded CSV.

    Relies on many module-level flags/helpers (u, t, W_uniform,
    purely_linear, drop_sparsity, drop_l2, use_tikhonov, Lambda,
    whitening_mode, whiten_every_n_steps, num_steps, report_frequency,
    adaptive_step*, do_line_search, local_quadratics, load_MNIST).
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    dtype = np.float32
    # 64-bit doesn't help much, search for 64-bit in
    # https://www.wolframcloud.com/objects/5f297f41-30f7-4b1b-972c-cac8d1f8d8e4
    u.default_dtype = dtype
    machine_epsilon = np.finfo(dtype).eps  # 1e-7 or 1e-16
    train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
    dsize = 10000
    patches = train_images[:, :dsize]
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0f = W_uniform(fs[2], fs[3])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    # NOTE(review): f(-1) == fs[0] == 10000, so this re-assigns the same
    # value dsize already holds.
    dsize = f(-1)
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: util.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        """Create a tf.Variable from a numpy value (via placeholder feed) or a
        TF tensor (initializer re-evaluates the tensor), plus a setter op."""
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            # NOTE(review): this asserts the function object itself (always
            # truthy); `u.is_numeric(val)` was probably intended.
            assert u.is_numeric, "Unknown type"
            holder = tf.placeholder(dtype,
                                    shape=val.shape,
                                    name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")
    if purely_linear:  # need lower LR without sigmoids
        lr = init_var(.02, "lr")

    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)  # W[0] holds the data so A[1] == X below

    def sigmoid(x):
        # Identity when running the purely-linear ablation.
        if not purely_linear:
            return tf.sigmoid(x)
        else:
            return tf.identity(x)

    def d_sigmoid(y):
        # Derivative expressed in terms of the sigmoid output y.
        if not purely_linear:
            return y * (1 - y)
        else:
            return 1

    def kl(x, y):
        # KL divergence between Bernoulli(x) and Bernoulli(y).
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        # Derivative of kl(x, y) with respect to y.
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)

    # A[0] is just for shape checks, assert fail on run
    # tf.assert always fails because of static assert
    # fail_node = tf.assert_equal(1, 0, message="too huge")
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live,
                              "sampled_labels",
                              noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        if i == 1 and not drop_sparsity:
            # sparsity penalty contributes only at the hidden layer
            backprop += beta * d_kl(rho, rho_hat)
            backprop2 += beta * d_kl(rho, rho_hat)
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        cov_A[i] = init_var(A[i] @ t(A[i]) / dsize, "cov_A%d" % (i, ))
        cov_B2[i] = init_var(B2[i] @ t(B2[i]) / dsize, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ))
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ))
        if use_tikhonov:
            whitened_A = u.regularized_inverse2(vars_svd_A[i], L=Lambda) @ A[i]
        else:
            whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
        if use_tikhonov:
            whitened_B2 = u.regularized_inverse2(vars_svd_B2[i],
                                                 L=Lambda) @ B[i]
        else:
            whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
        whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[i]) @ A[i]
        whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[i]) @ B[i]
        pre_dW[i] = (whitened_B2 @ t(whitened_A)) / dsize
        pre_dW_stable[i] = (whitened_B2_stable @ t(whitened_A_stable)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)
    sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
    # NOTE(review): W[1] appears twice; penalizing W[1] and W[2] was likely
    # intended.  Left unchanged because the recorded regression targets
    # below were produced with this expression.
    L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[1]))

    loss = reconstruction
    if not drop_l2:
        loss = loss + L2
    if not drop_sparsity:
        loss = loss + sparsity

    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    pre_grad_stable_live = u.flatten(
        pre_dW_stable[1:])  # sqrt fisher preconditioned grad
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")
    pre_grad_stable = init_var(pre_grad_stable_live, "pre_grad_stable")

    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    update_params_stable_op = Wf.assign(Wf - lr * pre_grad_stable).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    # NOTE(review): this duplicates the line above — `pre_grad_stable * grad`
    # was almost certainly intended for the "stable" dot product.
    pre_grad_stable_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)
    pre_grad_stable_norm = u.L2(pre_grad_stable)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            util.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            util.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            util.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            util.dump(s0, "As_%d_%d" % (i, step))

        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            util.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            util.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            util.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            util.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        # Re-running the initializer draws fresh random labels.
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        # Re-running the initializers re-evaluates the live cov tensors.
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        # whitening_mode controls how many of the SVDs are kept fresh.
        if whitening_mode > 1:
            vars_svd_A[2].update()
        if whitening_mode > 2:
            vars_svd_B2[2].update()
        if whitening_mode > 3:
            vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()
    #  tf.get_default_graph().finalize()

    from tensorflow.core.protobuf import rewriter_config_pb2

    # Disable graph rewrites so the hand-built graph runs as written.
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)
    #sess = tf.Session(config=config)
    sess = tf.InteractiveSession(config=config)
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()

    step_lengths = []  # keep track of learning rates
    losses = []
    ratios = []  # actual loss decrease / expected decrease
    grad_norms = []
    pre_grad_norms = []  # preconditioned grad norm squared
    pre_grad_stable_norms = []  # sqrt preconditioned grad norms squared
    target_delta_list = []  # predicted decrease linear approximation
    target_delta2_list = []  # predicted decrease quadratic appromation
    actual_delta_list = []  # actual decrease

    # adaptive line search parameters
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    if whitening_mode > 0:
        vars_svd_A[1].update()

    # compute t(delta).H.delta/2
    def hessian_quadratic(delta):
        #    update_covariances()
        W = u.unflatten(delta, fs[1:])
        W.insert(0, None)
        total = 0
        for l in range(1, n + 1):
            decrement = tf.trace(t(W[l]) @ cov_B2[l] @ W[l] @ cov_A[l])
            total += decrement
        return (total / 2).eval()

    # compute t(delta).H^-1.delta/2
    def hessian_quadratic_inv(delta):
        #    update_covariances()
        W = u.unflatten(delta, fs[1:])
        W.insert(0, None)
        total = 0
        for l in range(1, n + 1):
            invB2 = u.pseudo_inverse2(vars_svd_B2[l])
            invA = u.pseudo_inverse2(vars_svd_A[l])
            decrement = tf.trace(t(W[l]) @ invB2 @ W[l] @ invA)
            total += decrement
        return (total / 2).eval()

    # do line search, dump values as csv
    def line_search(initial_value, direction, step, num_steps):
        """Evaluate loss along Wf = initial_value - direction*step*i for
        i in [0, num_steps), restoring Wf afterwards."""
        saved_val = tf.Variable(Wf)
        sess.run(saved_val.initializer)
        pl = tf.placeholder(dtype, shape=(), name="linesearch_p")
        assign_op = Wf.assign(initial_value - direction * step * pl)
        vals = []
        for i in range(num_steps):
            sess.run(assign_op, feed_dict={pl: i})
            vals.append(loss.eval())
        sess.run(Wf.assign(saved_val))  # restore original value
        return vals

    for step in range(num_steps):
        update_covariances()
        if step % whiten_every_n_steps == 0:
            update_svds()

        # NOTE(review): pre_grad_stable.initializer is never re-run here, so
        # the "stable" update below may use a stale preconditioned gradient
        # — confirm intent.
        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)

        lr0, loss0 = sess.run([lr, loss])
        save_params_op.run()

        # regular inverse becomes unstable when grad norm exceeds 1
        stabilized_mode = grad_norm.eval() < 1

        if stabilized_mode and not use_tikhonov:
            update_params_stable_op.run()
        else:
            update_params_op.run()

        loss1 = loss.eval()
        advance_batch()

        # line search stuff
        # NOTE(review): the branch selection looks inverted relative to the
        # update chosen above (stabilized_mode took the *stable* step but
        # reads pre_grad_dot_grad here) — confirm.
        target_slope = (-pre_grad_dot_grad.eval() if stabilized_mode else
                        -pre_grad_stable_dot_grad.eval())
        target_delta = lr0 * target_slope
        target_delta_list.append(target_delta)

        # second order prediction of target delta
        # TODO: the sign is wrong, debug this
        # https://www.wolframcloud.com/objects/8f287f2f-ceb7-42f7-a599-1c03fda18f28
        if local_quadratics:
            x0 = Wf_copy.eval()
            x_opt = x0 - pre_grad.eval()
            # computes t(x)@H^-1 @(x)/2
            y_opt = loss0 - hessian_quadratic_inv(grad)
            # computes t(x)@H @(x)/2
            y_expected = hessian_quadratic(Wf - x_opt) + y_opt
            target_delta2 = y_expected - loss0
            target_delta2_list.append(target_delta2)

        actual_delta = loss1 - loss0
        actual_slope = actual_delta / lr0
        slope_ratio = actual_slope / target_slope  # between 0 and 1.01
        actual_delta_list.append(actual_delta)

        if do_line_search:
            vals1 = line_search(Wf_copy, pre_grad, lr / 100, 40)
            vals2 = line_search(Wf_copy, grad, lr / 100, 40)
            # NOTE(review): `i` here is stale (left over from an earlier
            # loop); `step` was probably intended in these filenames.
            u.dump(vals1, "line1-%d" % (i, ))
            u.dump(vals2, "line2-%d" % (i, ))

        losses.append(loss0)
        step_lengths.append(lr0)
        ratios.append(slope_ratio)
        grad_norms.append(grad_norm.eval())
        pre_grad_norms.append(pre_grad_norm.eval())
        pre_grad_stable_norms.append(pre_grad_stable_norm.eval())

        if step % report_frequency == 0:
            print(
                "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"
                % (step, loss0, target_delta, actual_delta, slope_ratio,
                   grad_norm.eval(), pre_grad_norm.eval()))

        if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in:
            # shrink if wrong prediction, don't shrink if prediction is tiny
            if slope_ratio < alpha and abs(
                    target_delta) > 1e-6 and adaptive_step:
                print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
                print(
                    "Slope optimality %.2f, shrinking learning rate to %.2f" %
                    (
                        slope_ratio,
                        lr0 * beta,
                    ))
                sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})

            # grow learning rate, slope_ratio .99 worked best for gradient
            # NOTE(review): `i % 50` uses a stale loop variable; `step % 50`
            # was probably intended.
            elif step > 0 and i % 50 == 0 and slope_ratio > 0.90 and adaptive_step:
                print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
                print("Growing learning rate to %.2f" % (lr0 * growth_rate))
                sess.run(vard[lr].setter,
                         feed_dict={vard[lr].p: lr0 * growth_rate})

        u.record_time()

    # check against expected loss (platform-specific recorded trajectories)
    if 'Apple' in sys.version:
        pass
        #    u.dump(losses, "kfac_small_final_mac.csv")
        targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
    else:
        pass
        #    u.dump(losses, "kfac_small_final_linux.csv")
        targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")

    u.check_equal(targets, losses[:len(targets)], rtol=1e-1)
    u.summarize_time()
    print("Test passed")
Example #14
0
def main():
  """Train a model with Adam on KFAC-corrected gradients, logging to
  Tensorboard and a CSV, with optional async stats runners and a
  record/test regression mode.

  Relies on module-level: args, u, model_creator, Kfac, IndexedGrad,
  sessrun, release_test_fn, time.
  """
  np.random.seed(args.seed)
  tf.set_random_seed(args.seed)

  logger = u.TensorboardLogger(args.run)

  with u.timeit("init/session"):
    gpu_options = tf.GPUOptions(allow_growth=False)
    sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
    u.register_default_session(sess)   # since default session is Thread-local

  with u.timeit("init/model_init"):
    model = model_creator(args.batch_size, name="main")
    model.initialize_global_vars(verbose=True)
    model.initialize_local_vars()

  with u.timeit("init/kfac_init"):
    # Separate model instance (possibly different batch size) for KFAC stats.
    kfac = Kfac(model_creator, args.kfac_batch_size)
    kfac.model.initialize_global_vars(verbose=False)
    kfac.model.initialize_local_vars()
    kfac.Lambda.set(args.Lambda)
    kfac.reset()    # resets optimization variables (not model variables)

  if args.mode != 'run':
    opt = tf.train.AdamOptimizer(0.001)
  else:
    opt = tf.train.AdamOptimizer(args.lr)
  grads_and_vars = opt.compute_gradients(model.loss,
                                         var_list=model.trainable_vars)

  # KFAC-correct the raw gradients before handing them to Adam.
  grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
  grad_new = kfac.correct(grad)
  with u.capture_vars() as adam_vars:
    train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
  with u.timeit("init/adam"):
    sessrun([v.initializer for v in adam_vars])

  losses = []
  u.record_time()

  start_time = time.time()
  vloss0 = 0  # last validation loss seen (stale between validations)

  # todo, unify the two data outputs
  outfn = 'data/%s_%f_%f.csv'%(args.run, args.lr, args.Lambda)
  writer = u.BufferedWriter(outfn, 60)   # get rid?

  start_time = time.time()
  if args.extra_kfac_batch_advance:
    kfac.model.advance_batch()  # advance kfac batch

  if args.kfac_async:
    kfac.start_stats_runners()

  for step in range(args.num_steps):

    # Only fetch the (expensive) validation loss every validate_every_n steps.
    if args.validate_every_n and step%args.validate_every_n == 0:
      loss0, vloss0 = sessrun([model.loss, model.vloss])
    else:
      loss0, = sessrun([model.loss])
    losses.append(loss0)  # TODO: remove this

    logger('loss/loss', loss0, 'loss/vloss', vloss0)

    elapsed = time.time()-start_time
    print("%d sec, step %d, loss %.2f, vloss %.2f" %(elapsed, step, loss0,
                                                     vloss0))
    writer.write('%d, %f, %f, %f\n'%(step, elapsed, loss0, vloss0))

    if args.method=='kfac' and not args.kfac_async:
      kfac.model.advance_batch()
      kfac.update_stats()

    with u.timeit("train"):
      model.advance_batch()
      grad.update()
      with kfac.read_lock():
        grad_new.update()
      train_op.run()
      u.record_time()

    logger.next_step()

  # TODO: use u.global_runs_dir
  # TODO: get rid of u.timeit?

  with open('timelines/graphdef.txt', 'w') as f:
    f.write(str(u.get_default_graph().as_graph_def()))

  u.summarize_time()

  if args.mode == 'record':
    u.dump_with_prompt(losses, release_test_fn)

  elif args.mode == 'test':
    targets = np.loadtxt('data/'+release_test_fn, delimiter=",")
    u.check_equal(losses, targets, rtol=1e-2)
    u.summarize_difference(losses, targets)
Example #15
0
def lbfgs(opfunc, x, config, state, do_verbose):
    """Port of lbfgs.lua, using TensorFlow eager mode.

    Args:
      opfunc: callable mapping the parameter tensor `x` to a
        (loss, gradient) pair.
      x: flat parameter tensor; updated in place via ``x += t * d``.
      config: options namespace -- maxIter, maxEval, tolFun, tolX,
        nCorrection, lineSearch, lineSearchOptions, learningRate, verbose.
        Falsy/missing entries fall back to the defaults below (mirrors
        Lua's `or`-default idiom in the original).
      state: mutable optimizer state shared across calls; nIter/funcEval
        are incremented, and the L-BFGS memory (old_dirs, old_stps,
        Hdiag, g_old, f_old, t, d) is stored back on return.
      do_verbose: when True, print per-step loss and record step times.

    Returns:
      (x, f_hist, currentFuncEval): final parameters, list of observed
      loss values, and the number of function evaluations this call.
    """

    global final_loss, times

    # Lua-style defaults: any falsy config entry is replaced.
    maxIter = config.maxIter or 20
    maxEval = config.maxEval or maxIter * 1.25
    tolFun = config.tolFun or 1e-5
    tolX = config.tolX or 1e-9
    nCorrection = config.nCorrection or 100
    lineSearch = config.lineSearch
    lineSearchOpts = config.lineSearchOptions
    learningRate = config.learningRate or 1
    isverbose = config.verbose or False

    # verbose function
    if isverbose:
        verbose = verbose_func
    else:
        verbose = lambda x: None

    # evaluate initial f(x) and df/dx
    f, g = opfunc(x)

    f_hist = [f]
    currentFuncEval = 1
    state.funcEval = state.funcEval + 1

    # check optimality of initial point
    tmp1 = tf.abs(g)
    if tf.reduce_sum(tmp1) <= tolFun:
        verbose("optimality condition below tolFun")
        return x, f_hist

    # optimize for a max of maxIter iterations
    nIter = 0
    times = []
    while nIter < maxIter:
        start_time = time.time()

        # keep track of nb of iterations
        nIter = nIter + 1
        state.nIter = state.nIter + 1

        ############################################################
        ## compute gradient descent direction
        ############################################################
        if state.nIter == 1:
            d = -g
            old_dirs = []
            old_stps = []
            Hdiag = 1
        else:
            # do lbfgs update (update memory)
            y = g - g_old
            s = d * t
            ys = dot(y, s)

            # only accept the pair when the curvature condition holds,
            # which keeps the inverse-Hessian approximation positive definite
            if ys > 1e-10:
                # updating memory
                if len(old_dirs) == nCorrection:
                    # shift history by one (limited-memory)
                    del old_dirs[0]
                    del old_stps[0]

                # store new direction/step
                old_dirs.append(s)
                old_stps.append(y)

                # update scale of initial Hessian approximation
                Hdiag = ys / dot(y, y)

            # compute the approximate (L-BFGS) inverse Hessian
            # multiplied by the gradient
            k = len(old_dirs)

            # need to be accessed element-by-element, so don't re-type tensor:
            ro = [0] * nCorrection
            for i in range(k):
                ro[i] = 1 / dot(old_stps[i], old_dirs[i])

            # iteration in L-BFGS loop collapsed to use just one buffer
            # need to be accessed element-by-element, so don't re-type tensor:
            al = [0] * nCorrection

            # two-loop recursion (Nocedal & Wright, Algorithm 7.4)
            q = -g
            for i in range(k - 1, -1, -1):
                al[i] = dot(old_dirs[i], q) * ro[i]
                q = q - al[i] * old_stps[i]

            # multiply by initial Hessian
            r = q * Hdiag
            for i in range(k):
                be_i = dot(old_stps[i], r) * ro[i]
                r += (al[i] - be_i) * old_dirs[i]

            d = r
            # final direction is in r/d (same object)

        g_old = g
        f_old = f

        ############################################################
        ## compute step length
        ############################################################
        # directional derivative
        gtd = dot(g, d)

        # check that progress can be made along that direction
        if gtd > -tolX:
            verbose("Can not make progress along direction.")
            break

        # reset initial guess for step size
        if state.nIter == 1:
            tmp1 = tf.abs(g)
            t = min(1, 1 / tf.reduce_sum(tmp1))
        else:
            t = learningRate

        # optional line search: user function
        lsFuncEval = 0
        # BUG FIX: the original wrote `isinstance(lineSearch) ==
        # types.FunctionType`, calling isinstance() with one argument --
        # this raised TypeError whenever a line-search function was given.
        if lineSearch and isinstance(lineSearch, types.FunctionType):
            # perform line search, using user function
            f, g, x, t, lsFuncEval = lineSearch(opfunc, x, t, d, f, g, gtd,
                                                lineSearchOpts)
            f_hist.append(f)
        else:
            # no line search, simply move with fixed-step
            x += t * d

            if nIter != maxIter:
                # re-evaluate function only if not in last iteration
                # the reason we do this: in a stochastic setting,
                # no use to re-evaluate that function here
                f, g = opfunc(x)

                lsFuncEval = 1
                f_hist.append(f)

        # update func eval
        currentFuncEval = currentFuncEval + lsFuncEval
        state.funcEval = state.funcEval + lsFuncEval

        ############################################################
        ## check conditions
        ############################################################
        if nIter == maxIter:
            break

        if currentFuncEval >= maxEval:
            # max nb of function evals
            verbose('max nb of function evals')
            break

        tmp1 = tf.abs(g)
        if tf.reduce_sum(tmp1) <= tolFun:
            # check optimality
            verbose('optimality condition below tolFun')
            break

        tmp1 = tf.abs(d * t)
        if tf.reduce_sum(tmp1) <= tolX:
            # step size below tolX
            verbose('step size below tolX')
            break

        if tf.abs(f - f_old) < tolX:
            # function value changing less than tolX
            verbose('function value changing less than tolX' +
                    str(tf.abs(f - f_old)))
            break

        if do_verbose:
            print("Step %3d loss %6.5f msec %6.3f" %
                  (nIter, f.numpy(), u.last_time()))
            u.record_time()
            times.append(u.last_time())

        if nIter == maxIter - 1:
            final_loss = f.numpy()

    # save state
    state.old_dirs = old_dirs
    state.old_stps = old_stps
    state.Hdiag = Hdiag
    state.g_old = g_old
    state.f_old = f_old
    state.t = t
    state.d = d

    return x, f_hist, currentFuncEval
def main():
    """Train a 2-matmul MNIST autoencoder for 10 steps with KFAC-
    preconditioned SGD and assert the final loss matches a recorded target.

    Uses the module-global `mode` flag to switch kfac_matmul between
    'standard' (plain matmul), 'capture' (record per-layer activations and
    backprops into the global forward/backward lists) and 'kfac'
    (presumably applies the precomputed whitening inverses during backprop
    -- confirm against kfac_matmul's implementation).

    Assumes module globals: args, dsize, dtype, torch_dtype, lr, nonlin,
    kfac_matmul, regularized_inverse, t, u, and the capture lists
    forward/backward/forward_inv/backward_inv.
    """
    global mode

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # feature sizes
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # number of layers
    n = len(fs) - 2

    matmul = kfac_matmul

    class Net(nn.Module):
        # Autoencoder of n bias-free weight matrices W1..Wn;
        # each layer computes nonlin(W @ x).
        def __init__(self):
            super(Net, self).__init__()
            # W1 = (np.array([[0., 1], [2, 3]])).astype(dtype)/10
            # W2 = (np.array([[4., 5], [6, 7]])).astype(dtype)/10
            # self.W1 = nn.Parameter(torch.from_numpy(W1))
            # self.W2 = nn.Parameter(torch.from_numpy(W2))
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            # Reshape to (features, batch); weights multiply from the left.
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)

    model = Net()

    if args.cuda:
        model.cuda()

    # First dsize MNIST images, laid out as (pixels, examples).
    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    noise = torch.Tensor(*data.data.shape).type(torch_dtype)
    # Cached inverse of the layer-0 activation covariance: layer-0
    # activations are the fixed input data, so compute once and reuse.
    covA_inv_saved = [None] * n

    for step in range(10):
        # Pass 1: plain forward pass to get the current reconstruction.
        mode = 'standard'
        output = model(data)

        # Pass 2: backprop against noise-perturbed targets while capturing
        # per-layer activations (forward) and backprops (backward) --
        # the factors of the KFAC approximation.
        mode = 'capture'
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        del forward_inv[:]
        del backward_inv[:]
        noise.normal_()
        output_hat = Variable(output.data + noise)
        output = model(data)
        err_hat = output_hat - output
        loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
        loss_hat.backward(retain_graph=True)

        # Capture lists are filled in reverse; flip into layer order.
        backward.reverse()
        forward.reverse()
        assert len(backward) == n
        assert len(forward) == n
        A = forward[:]
        B = backward[:]

        # compute inverses
        for i in range(n):
            # first layer doesn't change so only compute once
            if i == 0 and covA_inv_saved[i] is not None:
                covA_inv = covA_inv_saved[i]
            else:
                covA_inv = regularized_inverse(A[i] @ t(A[i]) / dsize)
                covA_inv_saved[i] = covA_inv
            forward_inv.append(covA_inv)

            covB_inv = regularized_inverse(B[i] @ t(B[i]) / dsize)
            backward_inv.append(covB_inv)

        # Pass 3: real reconstruction loss; in 'kfac' mode the backward
        # pass uses the inverses above, so optimizer.step() takes a
        # preconditioned step.
        mode = 'kfac'
        optimizer.zero_grad()
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()

        loss0 = loss.data.cpu().numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    # Regression targets recorded from previous runs (seed-dependent).
    if args.cuda:
        target = 2.337120533
    else:
        target = 2.335612774

    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
Exemple #17
0
def main():
    """Train a 2-matmul MNIST autoencoder for 10 steps, computing the
    KFAC-whitened gradient explicitly and applying it by hand (parameters
    updated via `.data`, bypassing optimizer.step()).

    Assumes module globals: args, dsize, dtype, torch_dtype, lr, sys,
    nonlin, kfac_matmul, regularized_inverse, t, u, and the capture lists
    forward/backward filled during the backward pass of kfac_matmul.
    """
    #  global forward, backward, DO_PRINT
    global mode, covA_inv, covB_inv

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # feature sizes
    fs = [args.batch_size, 28 * 28, 196, 28 * 28]
    # number of layers
    n = len(fs) - 2

    # todo, move to more elegant backprop
    matmul = kfac_matmul

    class Net(nn.Module):
        # n bias-free weight matrices W1..Wn; layer = nonlin(W @ x).
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            # (pixels, batch) layout; weights multiply from the left.
            x = input.view(784, -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)

    model = Net()
    if args.cuda:
        model.cuda()

    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    losses = []

    # Per-layer KFAC factors; covA[0] is cached because layer-0
    # activations (the fixed input data) never change.
    covA = [None] * n
    covA_inv = [None] * n
    covB_inv = [None] * n

    noise = torch.Tensor(*data.data.shape).type(torch_dtype)

    # TODO:
    # only do 2 passes like in eager mode
    # integrate with optimizer/same results
    # scale to deep autoencoder
    for step in range(10):
        # Pass 1: true-loss backward, capturing activations (forward)
        # and backprops (backward).
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        output = model(data)
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize

        loss.backward(retain_graph=True)
        backward.reverse()

        loss0 = loss.data[0]  # torch-0.3-era scalar extraction

        A = forward[:]
        B = backward[:]
        assert len(B) == n

        del forward[:]
        del backward[:]

        # Pass 2: backward against noise-perturbed synthetic targets;
        # its backprops (B2) give the Fisher's backward factor.
        noise.normal_()
        synthetic_data = Variable(output.data + noise)

        err2 = output - synthetic_data
        loss2 = torch.sum(err2 * err2) / 2 / dsize
        optimizer.zero_grad()
        loss2.backward()
        B2 = backward[::-1]
        assert len(B2) == n

        # mode = 'kfac'

        # compute whitened gradient
        pre_dW = []
        for i in range(n):
            # only compute first activation once
            if i > 0:
                covA[i] = A[i] @ t(A[i]) / dsize
                covA_inv[i] = regularized_inverse(covA[i])
            else:
                if covA[i] is None:
                    covA[i] = A[i] @ t(A[i]) / dsize
                    covA_inv[i] = regularized_inverse(covA[i])

            #      else:
            covB2 = B2[i] @ t(B2[i]) / dsize
            covB = B[i] @ t(B[i]) / dsize  # todo: remove

            covB_inv[i] = regularized_inverse(covB2.data)

            # whitened gradient: (covB2^-1 B) (covA^-1 A)^T / dsize
            whitened_A = covA_inv[i] @ A[i]
            whitened_B = covB_inv[i] @ B[i].data
            pre_dW.append(whitened_B @ t(whitened_A) / dsize)

        # Manual SGD step on the whitened gradient.
        params = list(model.parameters())
        assert len(params) == len(pre_dW)
        for i in range(len(params)):
            params[i].data -= lr * pre_dW[i]

        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    loss0 = loss.data.cpu().numpy()  #[0]
    # Regression targets recorded from earlier runs (platform-dependent;
    # later assignments deliberately override earlier ones).
    target = 2.360062122

    if 'Apple' in sys.version:
        target = 2.360126972
        target = 2.335654736  # after changing to torch.randn
    if args.cuda:
        target = 2.337174654
        target = 2.337215662  # switching to numpy inverse

    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
Exemple #18
0
def kfac_optimizer(model_creator):
    """Build per-variable KFAC whitening nodes and run the training loop.

    Creates a stats model and a main model from `model_creator`, wires
    covariance / SVD / preconditioning graph nodes for every trainable
    variable registered in `matmul_registry`, then runs `num_steps` of
    training with an adaptive switch between Fisher and sqrt(Fisher)
    preconditioning (stabilized mode) based on the gradient norm.

    Relies on module globals: matmul_registry, A, B, B2, dW, cov_A,
    cov_B2, vars_svd_A, vars_svd_B2, pre_dW, pre_dW_stable, sess, the
    *_update_ops dicts, losses/step_lengths/ratios accumulators, and the
    update_covariances/update_svds/update_params*/advance_batch helpers.
    """
    stats_batch_size = 10000
    main_batch_size = 10000

    stats_model, loss, labels = model_creator(stats_batch_size)
    # replace labels_node with synthetic labels

    main_model, _, _ = model_creator(main_batch_size)

    # BUG FIX: in TF1 the optimizer lives under tf.train, not tf.
    opt = tf.train.GradientDescentOptimizer(0.2)
    grads_and_vars = opt.compute_gradients(loss)

    trainable_vars = tf.trainable_variables()

    # create SVD and preconditioning variables for matmul vars
    for var in trainable_vars:
        if var not in matmul_registry:
            continue
        # BUG FIX: the original did `dW = u.extract_grad(grads_and_vars,
        # var)`, rebinding the dict that is indexed as dW[var] just below
        # to a tensor; the extracted tensor was never used, so the
        # rebinding is dropped.
        A[var] = get_activations(var)
        B[var] = get_backprops(var)
        B2[var] = get_backprops2(var)  # get backprops with synthetic labels
        dW[var] = B[var] @ t(A[var])  # todo: sort out dsize division
        cov_A[var] = init_var(A[var] @ t(A[var]) / dsize,
                              "cov_A_%s" % (var.name, ))
        cov_B2[var] = init_var(B2[var] @ t(B2[var]) / dsize,
                               "cov_B2_%s" % (var.name, ))

        # BUG FIX: var.name is a string, so the original "%d" formats
        # raised TypeError; use %s like the cov_* names above.
        vars_svd_A[var] = SvdWrapper(cov_A[var], "svd_A_%s" % (var.name, ))
        vars_svd_B2[var] = SvdWrapper(cov_B2[var], "svd_B2_%s" % (var.name, ))
        whitened_A = u.pseudo_inverse2(vars_svd_A[var]) @ A[var]
        whitened_B2 = u.pseudo_inverse2(vars_svd_B2[var]) @ B[var]
        whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[var]) @ A[var]
        whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[var]) @ B[var]

        pre_dW[var] = (whitened_B2 @ t(whitened_A)) / dsize
        pre_dW_stable[var] = (
            whitened_B2_stable @ t(whitened_A_stable)) / dsize
        dW[var] = (B[var] @ t(A[var])) / dsize

    # create update params ops

    # new_grads_and_vars = []
    # for grad, var in grads_and_vars:
    #   if var in kfac_registry:
    #     pre_A, pre_B = kfac_registry[var]
    #     new_grad_live = pre_B @ grad @ t(pre_A)
    #     new_grads_and_vars.append((new_grad, var))
    #     print("Preconditioning %s"%(var.name))
    #   else:
    #     new_grads_and_vars.append((grad, var))
    # train_op = opt.apply_gradients(new_grads_and_vars)

    # Each variable has an associated gradient, pre_gradient, variable save op
    def update_grad():
        # recompute raw gradients
        ops = [grad_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def update_pre_grad():
        # recompute Fisher-preconditioned gradients
        ops = [pre_grad_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def update_pre_grad2():
        # recompute sqrt(Fisher)-preconditioned (stable) gradients
        ops = [pre_grad2_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def save_params():
        # snapshot current parameter values (for restoring/line search)
        ops = [var_save_ops[var] for var in trainable_vars]
        sess.run(ops)

    for step in range(num_steps):
        update_covariances()
        if step % whitened_every_n_steps == 0:
            update_svds()

        update_grad()
        update_pre_grad()  # perf todo: update one of these
        update_pre_grad2()  # stable alternative

        lr0, loss0 = sess.run([lr, loss])
        save_params()

        # when grad norm<1, Fisher is unstable, switch to Sqrt(Fisher)
        # TODO: switch to per-matrix normalization
        stabilized_mode = grad_norm.eval() < 1

        if stabilized_mode:
            update_params2()
        else:
            update_params()

        loss1 = loss.eval()
        advance_batch()

        # line search stuff: compare predicted decrease (lr * slope)
        # against the observed decrease
        target_slope = (-pre_grad_dot_grad.eval() if stabilized_mode else
                        -pre_grad_stable_dot_grad.eval())
        target_delta = lr0 * target_slope
        actual_delta = loss1 - loss0
        actual_slope = actual_delta / lr0
        slope_ratio = actual_slope / target_slope  # between 0 and 1.01

        losses.append(loss0)
        step_lengths.append(lr0)
        ratios.append(slope_ratio)

        if step % report_frequency == 0:
            print(
                "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"
                % (step, loss0, target_delta, actual_delta, slope_ratio,
                   grad_norm.eval(), pre_grad_norm.eval()))

        u.record_time()
Exemple #19
0
def main():
    """KFAC-whitened training of an MNIST autoencoder in TF1 graph mode.

    Builds forward activations A[i], backprops B (true error) and B2
    (sampled labels, giving the Fisher factors), per-layer covariance
    variables (recomputed by re-running their initializers) and SVD-based
    inverses used to whiten the gradient. Runs 40 steps, then asserts on
    the final loss and the per-step timing.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    dtype = np.float32

    train_images = u.get_mnist_images()

    dsize = 10000
    patches = train_images[:, :dsize].astype(dtype)
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = f(-1)  # == fs[0]; re-derives the same batch size from fs
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: u.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        # noinit=True keeps the variable out of the global-variables
        # collection so global_variables_initializer() won't reset it.
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            # BUG FIX: the original `assert u.is_numeric` asserted the
            # function object itself (always truthy), so the type check
            # never fired; call it on the value instead.
            # NOTE(review): confirm u.is_numeric's signature in utils.
            assert u.is_numeric(val), "Unknown type"
            holder = tf.placeholder(dtype,
                                    shape=val.shape,
                                    name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")

    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)

    def sigmoid(x):
        return tf.sigmoid(x)

    def d_sigmoid(y):
        # sigmoid derivative expressed in terms of the sigmoid output
        return y * (1 - y)

    def kl(x, y):
        # KL divergence between Bernoulli(x) and Bernoulli(y)
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)

    # A[0] must never be evaluated; the control dependency on fail_node
    # makes any accidental evaluation print loudly.
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live,
                              "sampled_labels",
                              noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW (unused here)

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        # Tikhonov-regularized covariances; re-running the variable
        # initializer recomputes them from the live graph values.
        cov_op = A[i] @ t(A[i]) / dsize + lambda_ * u.Identity(A[i].shape[0])
        cov_A[i] = init_var(cov_op, "cov_A%d" % (i, ))
        cov_op = B2[i] @ t(B2[i]) / dsize + lambda_ * u.Identity(
            B2[i].shape[0])
        cov_B2[i] = init_var(cov_op, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i],
                                     "svd_A_%d" % (i, ),
                                     do_inverses=True)
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i],
                                      "svd_B2_%d" % (i, ),
                                      do_inverses=True)
        whitened_A = vars_svd_A[i].inv @ A[i]
        whitened_B = vars_svd_B2[i].inv @ B[i]
        pre_dW[i] = (whitened_B @ t(whitened_A)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)

    loss = reconstruction

    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")

    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            u.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            u.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            u.dump(s0, "As_%d_%d" % (i, step))

        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            u.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            u.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            u.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        vars_svd_A[2].update()
        vars_svd_B2[2].update()
        vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()

    from tensorflow.core.protobuf import rewriter_config_pb2

    # Disable graph rewrites so the manually-triggered covariance/SVD
    # recomputations aren't folded away or reordered.
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)

    sess = tf.InteractiveSession(config=config)
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()

    step_lengths = []  # keep track of learning rates
    losses = []

    # adaptive line search parameters (not used by the loop below)
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    vars_svd_A[1].update()

    for step in range(40):
        update_covariances()
        update_svds()

        # re-running the initializers recomputes grad/pre_grad from the
        # freshly updated covariances
        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)

        lr0, loss0 = sess.run([lr, loss])
        update_params_op.run()
        advance_batch()

        losses.append(loss0)
        step_lengths.append(lr0)

        print("Step %d loss %.2f" % (step, loss0))
        u.record_time()

    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    assert 20e-3 < min(
        u.global_time_list) < 50e-3, "Time should be 40ms on 1080"
    u.summarize_time()
    print("Test passed")
def train(optimizer='sgd', kfac=True, iters=10, verbose=True):
  """Train a deep (8-matmul) MNIST autoencoder, optionally with KFAC.

  Args:
    optimizer: 'sgd' or 'adam'.
    kfac: when True, take KFAC-preconditioned steps; otherwise plain steps.
    iters: number of training steps to run.
    verbose: print per-step loss when True.

  Returns:
    List of per-step loss values.

  Relies on module globals: mode, args, dsize, dtype, torch_dtype, lr,
  nonlin, kfac_matmul, regularized_inverse, u, and the capture lists
  As/Bs/As_inv/Bs_inv.
  """
  global mode

  torch.manual_seed(1)
  np.random.seed(1)
  if args.cuda:
    torch.cuda.manual_seed(1)

  # feature sizes at each layer
  fs = [dsize, 28*28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28*28]
  n = len(fs) - 2   # number of matmuls

  class Net(nn.Module):
    # Bias-free autoencoder: each layer computes nonlin(W @ x).
    def __init__(self):
      super(Net, self).__init__()
      for i in range(1, n+1):
        W0 = u.ng_init(fs[i+1], fs[i])
        setattr(self, 'W'+str(i), nn.Parameter(torch.from_numpy(W0)))

    def forward(self, input):
      # (pixels, batch) layout; weights multiply from the left
      x = input.view(fs[1], -1)
      for i in range(1, n+1):
        W = getattr(self, 'W'+str(i))
        x = nonlin(kfac_matmul(W, x))
      return x.view_as(input)

  model = Net()

  if args.cuda:
    model.cuda()

  data0 = u.get_mnist_images()
  data0 = data0[:, :dsize].astype(dtype)
  data = Variable(torch.from_numpy(data0))
  if args.cuda:
    data = data.cuda()

  model.train()
  if optimizer == 'sgd':
    optimizer = optim.SGD(model.parameters(), lr=lr)
  elif optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=lr)
  else:
    assert False, 'unknown optimizer '+optimizer

  noise = torch.Tensor(*data.data.shape).type(torch_dtype)
  # layer-0 activations are the fixed input data: invert once, reuse
  covA_inv_saved = [None]*n
  losses = []

  # BUG FIX: the loop previously hard-coded range(10), silently ignoring
  # the `iters` parameter; default behavior (iters=10) is unchanged.
  for step in range(iters):
    # pass 1: plain forward pass
    mode = 'standard'
    output = model(data)

    # pass 2: backprop against noise-perturbed targets while capturing
    # per-layer activations (As) and backprops (Bs) for the KFAC factors
    mode = 'capture'
    optimizer.zero_grad()
    del As[:], Bs[:], As_inv[:], Bs_inv[:]
    noise.normal_()

    output_hat = Variable(output.data+noise)
    err_hat = output_hat - output
    loss_hat = torch.sum(err_hat*err_hat)/2/dsize
    loss_hat.backward(retain_graph=True)

    # compute inverses
    for i in range(n):
      # first layer activations don't change, only compute once
      if i == 0 and covA_inv_saved[i] is not None:
        covA_inv = covA_inv_saved[i]
      else:
        covA_inv = regularized_inverse(As[i] @ As[i].t()/dsize)
        covA_inv_saved[i] = covA_inv
      As_inv.append(covA_inv)

      covB = (Bs[i]@Bs[i].t())*dsize
      # alternative formula: slower but numerically better result
      # covB = (Bs[i]*dsize)@(Bs[i].t()*dsize)/dsize

      covB_inv = regularized_inverse(covB)
      Bs_inv.append(covB_inv)

    # pass 3: real loss; in 'kfac' mode the backward pass applies the
    # whitening inverses, so optimizer.step() takes a preconditioned step
    if kfac:
      mode = 'kfac'
    else:
      mode = 'standard'
    optimizer.zero_grad()
    err = output - data
    loss = torch.sum(err*err)/2/dsize
    loss.backward()
    optimizer.step()

    loss0 = loss.data.cpu().numpy()[0]
    losses.append(loss0)
    if verbose:
      print("Step %3d loss %10.9f"%(step, loss0))
    u.record_time()

  return losses
def rotations2_newton_bd():
    """Newton's method with a block-diagonal inverse Hessian on the
    large_rotations2 dataset (TF1 graph mode, linear layers -- no
    nonlinearity in the A recurrence).

    Loads X/Y/W0f/fs from data/large_rotations2_*.csv, builds the exact
    per-layer Hessian blocks via Kronecker products, inverts only the
    diagonal blocks, and runs 20 Newton steps with a double-buffered
    parameter update.

    Assumes module globals: dtype, t (transpose), v2c_np, Kmat, u.
    """
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    # Linear recurrence: A[i+1] = W[i] @ A[i] (no activation function).
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    # B[i] = loss backprops; Bn[i] = Newton-modified backprops seeded
    # with the identity instead of the error.
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's
    # U[bottom][top] = t(W[bottom]) @ ... @ t(W[top]), the chain of
    # transposed weights between the two layers (identity when bottom > top).
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    # term1 is the Gauss-Newton part; term2 the cross-layer correction.
    # Kmat is presumably the commutation matrix K(m, n) -- confirm in utils.
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)),
                                 dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))

            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    # Block-diagonal approximation: invert only the diagonal blocks.
    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # Double-buffered update: write into the copy, then swap back, so the
    # Newton step is computed entirely from the old parameter values.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Exemple #22
0
def train(optimizer='sgd',
          nonlin=torch.sigmoid,
          kfac=True,
          iters=10,
          lr=0.2,
          newton_matrix='stochastic',
          eval_every_n_steps=1,
          print_interval=200):
    """Train on first 10k MNIST examples, evaluate on second 10k.

    Trains a deep autoencoder with a custom autograd matmul (KfacAddmm)
    that, depending on the enclosing `mode` variable, either captures
    per-layer activations/backprops or applies a KFAC-style preconditioner
    (Kronecker-factored inverse covariances) to the weight gradients.

    Args:
        optimizer: 'sgd' or 'adam' (the name is rebound to the optimizer
            object below).
        nonlin: elementwise nonlinearity applied after each matmul.
        kfac: if True, precondition gradients via captured KFAC statistics.
        iters: number of training steps.
        lr: learning rate passed to the optimizer.
        newton_matrix: 'stochastic' (random-normal perturbation) or 'exact'
            (identity-padded target) used to build curvature statistics.
        eval_every_n_steps: validation-loss frequency.
        print_interval: training-loss print frequency.

    Returns:
        (losses, vlosses): lists of training and validation losses.
    """

    u.reset_time()
    dsize = 10000  # number of training (and test) examples used

    # model options
    dtype = np.float32
    torch_dtype = 'torch.FloatTensor'

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch_dtype = 'torch.cuda.FloatTensor'

    # NOTE(review): not referenced below; appears to be a leftover option flag.
    INVERSE_METHOD = 'numpy'  # numpy, gpu

    # Per-layer statistics captured during backward passes.
    # As/Bs: activations and backprops; As_inv/Bs_inv: their covariance inverses.
    As = []
    Bs = []
    As_inv = []
    Bs_inv = []
    # `mode` is shared by closure with KfacAddmm.backward below; rebinding it
    # in this function switches the behavior of subsequent backward passes.
    mode = 'capture'  # 'capture', 'kfac', 'standard'

    class KfacAddmm(Function):
        # Custom addmm whose backward pass depends on the enclosing `mode`.
        @staticmethod
        def _get_output(ctx, arg, inplace=False):
            # Reuse `arg` as output when inplace, else allocate a same-shaped tensor.
            if inplace:
                ctx.mark_dirty(arg)
                return arg
            else:
                return arg.new().resize_as_(arg)

        @staticmethod
        def forward(ctx,
                    add_matrix,
                    matrix1,
                    matrix2,
                    beta=1,
                    alpha=1,
                    inplace=False):
            # Computes beta*add_matrix + alpha*(matrix1 @ matrix2).
            ctx.save_for_backward(matrix1, matrix2)
            output = KfacAddmm._get_output(ctx, add_matrix, inplace=inplace)
            return torch.addmm(beta,
                               add_matrix,
                               alpha,
                               matrix1,
                               matrix2,
                               out=output)

        @staticmethod
        def backward(ctx, grad_output):
            matrix1, matrix2 = ctx.saved_variables
            grad_matrix1 = grad_matrix2 = None

            if mode == 'capture':
                # Record backprops/activations (prepended: backward visits
                # layers last-to-first, so lists end up in layer order).
                Bs.insert(0, grad_output.data)
                As.insert(0, matrix2.data)
            elif mode == 'kfac':
                # Precondition the weight gradient with the captured
                # Kronecker-factored inverses: covB_inv @ G @ covA_inv.
                B = grad_output.data
                A = matrix2.data
                kfac_A = As_inv.pop() @ A
                kfac_B = Bs_inv.pop() @ B
                grad_matrix1 = Variable(torch.mm(kfac_B, kfac_A.t()))
            elif mode == 'standard':
                # Ordinary matmul gradient.
                grad_matrix1 = torch.mm(grad_output, matrix2.t())

            else:
                assert False, 'unknown mode ' + mode

            if ctx.needs_input_grad[2]:
                grad_matrix2 = torch.mm(matrix1.t(), grad_output)

            return None, grad_matrix1, grad_matrix2, None, None, None

    def kfac_matmul(mat1, mat2):
        # Plain matmul routed through KfacAddmm (beta=0 ignores the buffer contents).
        output = Variable(mat1.data.new(mat1.data.size(0), mat2.data.size(1)))
        return KfacAddmm.apply(output, mat1, mat2, 0, 1, True)

    torch.manual_seed(1)
    np.random.seed(1)
    if use_cuda:
        torch.cuda.manual_seed(1)

    # feature sizes at each layer
    fs = [dsize, 28 * 28, 1024, 1024, 1024, 196, 1024, 1024, 1024, 28 * 28]
    n = len(fs) - 2  # number of matmuls

    class Net(nn.Module):
        # Fully-connected autoencoder: n weight matrices W1..Wn, no biases.
        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            # Data is column-major: each column is one example.
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(kfac_matmul(W, x))
            return x.view_as(input)

    model = Net()

    if use_cuda:
        model.cuda()

    images = u.get_mnist_images()
    train_data0 = images[:, :dsize].astype(dtype)
    train_data = Variable(torch.from_numpy(train_data0))
    test_data0 = images[:, dsize:2 * dsize].astype(dtype)
    test_data = Variable(torch.from_numpy(test_data0))
    if use_cuda:
        train_data = train_data.cuda()
        test_data = test_data.cuda()

    model.train()
    if optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr)
    elif optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    else:
        assert False, 'unknown optimizer ' + optimizer

    # Buffers for the two curvature-target variants:
    # `noise` for 'stochastic', identity-padded `frozen` for 'exact'.
    noise = torch.Tensor(*train_data.data.shape).type(torch_dtype)
    assert fs[-1] <= dsize
    padding = dsize - fs[-1]
    zero_mat = torch.zeros((fs[-1], padding))
    frozen = torch.cat([torch.eye(fs[-1]), zero_mat], 1).type(torch_dtype)

    covA_inv_saved = [None] * n  # cache for layer-0 activation covariance inverse
    losses = []
    vlosses = []

    for step in range(iters):
        mode = 'standard'
        output = model(train_data)

        if kfac:
            # Phase 1: capture activations/backprops from a perturbed loss.
            mode = 'capture'
            optimizer.zero_grad()
            del As[:], Bs[:], As_inv[:], Bs_inv[:]

            if newton_matrix == 'stochastic':
                noise.normal_()
                err_add = noise
            elif newton_matrix == 'exact':
                err_add = frozen
            else:
                assert False, 'unknown method for newton matrix ' + newton_matrix

            # Synthetic target; detached so err_hat's gradient flows only
            # through `output`.
            output_hat = Variable(output.data + err_add)
            err_hat = output_hat - output

            loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
            loss_hat.backward(retain_graph=True)

            # compute inverses
            for i in range(n):
                # first layer activations don't change, only compute once
                if i == 0 and covA_inv_saved[i] is not None:
                    covA_inv = covA_inv_saved[i]
                else:
                    covA_inv = regularized_inverse(As[i] @ As[i].t() / dsize)
                    covA_inv_saved[i] = covA_inv
                As_inv.append(covA_inv)

                covB = (Bs[i] @ Bs[i].t()) * dsize
                # alternative formula: slower but numerically better result
                # covB = (Bs[i]*dsize)@(Bs[i].t()*dsize)/dsize

                covB_inv = regularized_inverse(covB)
                Bs_inv.append(covB_inv)
            # Phase 2: the real backward pass below will consume the inverses.
            mode = 'kfac'

        else:
            mode = 'standard'

        if step % eval_every_n_steps == 0:
            # Validation forward pass must not disturb KFAC state.
            old_mode = mode
            mode = 'standard'
            test_output = model(test_data)
            test_err = test_data - test_output
            test_loss = torch.sum(test_err * test_err) / 2 / dsize
            vloss0 = test_loss.data.cpu().numpy()[0]
            vlosses.append(vloss0)
            mode = old_mode

        optimizer.zero_grad()
        err = output - train_data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()

        loss0 = loss.data.cpu().numpy()[0]
        losses.append(loss0)
        if step % print_interval == 0:
            print("Step %3d loss %10.9f" % (step, loss0))

        u.record_time()

    return losses, vlosses
def rotations2_newton_kfac():
    """KFAC-flavored Newton method on the large_rotations2 dataset.

    Loads data/weights from CSV, builds forward activations A, standard
    backprops B and Newton-modified backprops Bn, approximates the inverse
    Hessian by inverting only the diagonal Kronecker-factored blocks, then
    runs 10 preconditioned update steps (each split into two session calls
    to avoid a read/write race on Wf).

    Fixes vs original: removed a duplicated `def f`/`dsize`/assert block
    and the unused `elapsed_times` local.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's; W[0] is the input matrix X
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's (forward activations)
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's: standard backprops B and Newton-modified backprops Bn
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks: invert only diagonal Kronecker blocks,
    # off-diagonal blocks are zero
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                # inverse of a Kronecker product is the Kronecker of inverses
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's (per-layer gradients dW[i] = B[i] @ A[i]^T)
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # two-phase update via Wf_copy avoids a data race on Wf
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Exemple #24
0
def main():
    """Train with KFAC-corrected gradients under an Adam optimizer.

    Builds the main model plus a KFAC helper model, corrects Adam's
    gradients with the KFAC preconditioner, logs losses to Tensorboard,
    and — depending on args.mode — records or tests the loss trajectory.

    Fixes vs original: bare `except:` narrowed to `except Exception:`,
    duplicated `start_time = time.time()` removed.
    """
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):

        # Disable graph rewriting so runtime behavior is predictable.
        rewrite_options = None
        try:
            from tensorflow.core.protobuf import rewriter_config_pb2
            rewrite_options = rewriter_config_pb2.RewriterConfig(
                disable_model_pruning=True,
                constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
                memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
        except Exception:
            # Best-effort: older TF builds lack this proto / these fields.
            pass

        optimizer_options = tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)
        graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                        rewrite_options=rewrite_options)
        gpu_options = tf.GPUOptions(allow_growth=False)
        config = tf.ConfigProto(graph_options=graph_options,
                                gpu_options=gpu_options,
                                log_device_placement=False)

        sess = tf.InteractiveSession(config=config)
        u.register_default_session(
            sess)  # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    kfac_lib.numeric_inverse = args.numeric_inverse
    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()  # resets optimization variables (not model variables)

    # Fixed small learning rate outside 'run' mode for reproducible traces.
    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    vloss0 = 0

    # todo, unify the two data outputs
    outfn = 'data/%s_%f_%f.csv' % (args.run, args.lr, args.Lambda)

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):

        # Validation loss is computed only every args.validate_every_n steps;
        # vloss0 keeps the last computed value in between.
        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        start_time = time.time()
        print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
              (elapsed * 1e3, step, loss0, vloss0))

        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            with u.timeit("grad.update"):
                grad.update()
            with kfac.read_lock():
                grad_new.update()
            u.run(train_op)
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)

    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
        assert u.last_time() < 800, "Expected 648 on GTX 1080"
def rotations2_natural_empirical():
    """Empirical natural gradient on the large_rotations2 dataset.

    Preconditions the flattened gradient with the pseudo-inverse of the
    empirical Fisher matrix (outer product of per-example gradients built
    via Khatri-Rao products) and runs 10 update steps, each split into two
    session calls to avoid a read/write race on Wf.

    Fixes vs original: the output-shape assertion used `assert a, b`
    (second expression was silently treated as the assert *message*, so
    the check never ran); dead locals (`kr`, `updates1`, `updates2`, the
    unused `Wcopy` variables) removed.
    """
    tf.reset_default_graph()

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match (fixed: previously `assert a, b` made the
    # comparison part of the assert message, so it was never checked)
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.000001, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race

    # per-layer gradients dW[i] = B[i] @ A[i]^T
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))

    del dW[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # inverse fisher preconditioner: columns of `grads` are per-example
    # gradients, so grads @ grads^T / dsize is the empirical Fisher
    grads = tf.concat([u.khatri_rao(A[i], B[i]) for i in range(1, n + 1)],
                      axis=0)
    fisher = grads @ tf.transpose(grads) / dsize
    ifisher = u.pseudo_inverse(fisher)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
    
    if adaptive_step_frequency and adaptive_step and step>adaptive_step_burn_in:
      # shrink if wrong prediction, don't shrink if prediction is tiny
      if slope_ratio < alpha and abs(target_delta)>1e-6 and adaptive_step:
        print("%.2f %.2f %.2f"%(loss0, loss1, slope_ratio))
        print("Slope optimality %.2f, shrinking learning rate to %.2f"%(slope_ratio, lr0*beta,))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0*beta})
        
      # grow learning rate, slope_ratio .99 worked best for gradient
      elif step>0 and i%50 == 0 and slope_ratio>0.90 and adaptive_step:
          print("%.2f %.2f %.2f"%(loss0, loss1, slope_ratio))
          print("Growing learning rate to %.2f"%(lr0*growth_rate))
          sess.run(vard[lr].setter, feed_dict={vard[lr].p:
                                               lr0*growth_rate})

    u.record_time()

  # check against expected loss
  if 'Apple' in sys.version:
    pass
    #    u.dump(losses, "kfac_small_final_mac.csv")
    targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
  else:
    pass
    #    u.dump(losses, "kfac_small_final_linux.csv")
    targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")

  if len(sys.argv)>1 and sys.argv[1]=="test":
    # GPU losses are quite noisy, set rtol high
    u.check_equal(targets, losses[:len(targets)], rtol=1e-3)
    
def rotations2_natural_sampled_kfac(num_samples=1):
    """Sampled KFAC natural gradient on the large_rotations2 dataset.

    Estimates the Fisher matrix from `num_samples` random-label backprops
    (A2/B2), inverts only the diagonal Kronecker-factored blocks, and runs
    20 preconditioned update steps, each split into two session calls to
    avoid a read/write race on Wf.

    Args:
        num_samples: number of sampled-label replicas used for the
            Fisher/covariance estimates.

    Fixes vs original: the output-shape assertion used `assert a, b`
    (second expression was silently treated as the assert *message*, so
    the check never ran); dead locals (`updates1`, `updates2`, the unused
    `Wcopy` variables, and the unused sampled gradients `dW2`) removed.
    """
    tf.reset_default_graph()
    np.random.seed(0)
    tf.set_random_seed(0)

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A2 = [0] * (n + 2)  # augmented forward props for natural gradient
    A[0] = u.Identity(dsize)
    A2[0] = u.Identity(dsize * num_samples)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
        if i == 0:
            # replicate dataset multiple times corresponding to number of samples
            A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
        else:
            A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match (fixed: previously `assert a, b` made the
    # comparison part of the assert message, so it was never checked)
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)

    # lower learning rate by 10x
    lr = tf.Variable(0.01, dtype=dtype)

    # create backprop matrices
    # B[i] has true-loss backprop, B2[i] has sampled-label backprop
    B = [0] * (n + 1)
    B2 = [0] * (n + 1)
    B[n] = -err / dsize
    B2[n] = tf.random_normal((f(n), dsize * num_samples),
                             0,
                             1,
                             seed=0,
                             dtype=dtype)
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))
        B2[i] = tf.matmul(tf.transpose(W[i + 1]),
                          B2[i + 1],
                          name="B2" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race

    # per-layer gradients dW[i] = B[i] @ A[i]^T
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))

    del dW[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # todo: divide both activations and backprops by size for cov calc

    # Kronecker factored covariance blocks: inverse of each diagonal block
    # is the Kronecker product of the factor inverses; off-diagonals zero
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
                bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)),
                                dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ifisher = u.concat_blocks(iblocks)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Exemple #28
0
def lbfgs(opfunc, x, config, state, do_verbose):
  """port of lbfgs.lua, using TensorFlow eager mode.

  Args:
    opfunc: callable returning (loss, gradient) at a point x.
    x: initial parameter tensor; updated in place via `x += t*d`.
    config: object with optional attributes maxIter, maxEval, tolFun, tolX,
      nCorrection, lineSearch, lineSearchOptions, learningRate, verbose.
    state: object persisting L-BFGS memory (old_dirs, old_stps, Hdiag, ...)
      across calls.
    do_verbose: if True, print per-step loss/timing and record step times.

  Returns:
    (x, f_hist, currentFuncEval): final point, loss history, and number of
    function evaluations performed in this call.

  Fixes vs original: `isinstance(lineSearch) == types.FunctionType` raised
  TypeError whenever a line-search callable was supplied (isinstance takes
  two arguments); the early return for an already-optimal starting point
  returned a 2-tuple while the normal return is a 3-tuple.
  """

  global final_loss, times

  maxIter = config.maxIter or 20
  maxEval = config.maxEval or maxIter*1.25
  tolFun = config.tolFun or 1e-5
  tolX = config.tolX or 1e-9
  nCorrection = config.nCorrection or 100
  lineSearch = config.lineSearch
  lineSearchOpts = config.lineSearchOptions
  learningRate = config.learningRate or 1
  isverbose = config.verbose or False

  # verbose function
  if isverbose:
    verbose = verbose_func
  else:
    verbose = lambda x: None

  # evaluate initial f(x) and df/dx
  f, g = opfunc(x)

  f_hist = [f]
  currentFuncEval = 1
  state.funcEval = state.funcEval + 1

  # check optimality of initial point
  tmp1 = tf.abs(g)
  if tf.reduce_sum(tmp1) <= tolFun:
    verbose("optimality condition below tolFun")
    # fixed: return the same 3-tuple shape as the final return below
    return x, f_hist, currentFuncEval

  # optimize for a max of maxIter iterations
  nIter = 0
  times = []
  while nIter < maxIter:

    # keep track of nb of iterations
    nIter = nIter + 1
    state.nIter = state.nIter + 1

    ############################################################
    ## compute gradient descent direction
    ############################################################
    if state.nIter == 1:
      d = -g
      old_dirs = []
      old_stps = []
      Hdiag = 1
    else:
      # do lbfgs update (update memory)
      y = g - g_old
      s = d*t
      ys = dot(y, s)

      # only update memory when curvature condition holds
      if ys > 1e-10:
        # updating memory
        if len(old_dirs) == nCorrection:
          # shift history by one (limited-memory)
          del old_dirs[0]
          del old_stps[0]

        # store new direction/step
        old_dirs.append(s)
        old_stps.append(y)

        # update scale of initial Hessian approximation
        Hdiag = ys/dot(y, y)

      # compute the approximate (L-BFGS) inverse Hessian
      # multiplied by the gradient
      k = len(old_dirs)

      # need to be accessed element-by-element, so don't re-type tensor:
      ro = [0]*nCorrection
      for i in range(k):
        ro[i] = 1/dot(old_stps[i], old_dirs[i])

      # iteration in L-BFGS loop collapsed to use just one buffer
      # need to be accessed element-by-element, so don't re-type tensor:
      al = [0]*nCorrection

      # two-loop recursion: backward pass
      q = -g
      for i in range(k-1, -1, -1):
        al[i] = dot(old_dirs[i], q) * ro[i]
        q = q - al[i]*old_stps[i]

      # multiply by initial Hessian, then forward pass
      r = q*Hdiag
      for i in range(k):
        be_i = dot(old_stps[i], r) * ro[i]
        r += (al[i]-be_i)*old_dirs[i]

      d = r
      # final direction is in r/d (same object)

    g_old = g
    f_old = f

    ############################################################
    ## compute step length
    ############################################################
    # directional derivative
    gtd = dot(g, d)

    # check that progress can be made along that direction
    if gtd > -tolX:
      verbose("Can not make progress along direction.")
      break

    # reset initial guess for step size
    if state.nIter == 1:
      tmp1 = tf.abs(g)
      t = min(1, 1/tf.reduce_sum(tmp1))
    else:
      t = learningRate

    # optional line search: user function
    lsFuncEval = 0
    if lineSearch and isinstance(lineSearch, types.FunctionType):
      # perform line search, using user function
      f,g,x,t,lsFuncEval = lineSearch(opfunc,x,t,d,f,g,gtd,lineSearchOpts)
      f_hist.append(f)
    else:
      # no line search, simply move with fixed-step
      x += t*d

      if nIter != maxIter:
        # re-evaluate function only if not in last iteration
        # the reason we do this: in a stochastic setting,
        # no use to re-evaluate that function here
        f, g = opfunc(x)

        lsFuncEval = 1
        f_hist.append(f)

    # update func eval
    currentFuncEval = currentFuncEval + lsFuncEval
    state.funcEval = state.funcEval + lsFuncEval

    ############################################################
    ## check conditions
    ############################################################
    if nIter == maxIter:
      break

    if currentFuncEval >= maxEval:
      # max nb of function evals
      verbose('max nb of function evals')
      break

    tmp1 = tf.abs(g)
    if tf.reduce_sum(tmp1) <= tolFun:
      # check optimality
      verbose('optimality condition below tolFun')
      break

    tmp1 = tf.abs(d*t)
    if tf.reduce_sum(tmp1) <= tolX:
      # step size below tolX
      verbose('step size below tolX')
      break

    if tf.abs(f-f_old) < tolX:
      # function value changing less than tolX
      verbose('function value changing less than tolX'+str(tf.abs(f-f_old)))
      break

    if do_verbose:
      print("Step %3d loss %6.5f msec %6.3f"%(nIter, f.numpy(), u.last_time()))
      u.record_time()
      times.append(u.last_time())

    if nIter == maxIter - 1:
      final_loss = f.numpy()

  # save state
  state.old_dirs = old_dirs
  state.old_stps = old_stps
  state.Hdiag = Hdiag
  state.g_old = g_old
  state.f_old = f_old
  state.t = t
  state.d = d

  return x, f_hist, currentFuncEval
    expected_slope = -grad2_norm_op.eval()

    # ratio of best possible slope to actual slope
    # don't divide by actual slope because that can be 0
    slope_ratio = abs(actual_slope)/abs(expected_slope)
    costs.append(cost0)
    step_lengths.append(lr0)
    ratios.append(slope_ratio)

    if i%10 == 0:
      print("Learning rate: %f"% (lr0,))
      print("Cost %.2f, expected decrease %.2f, actual decrease, %.2f ratio %.2f"%(cost0, expected_delta, actual_delta, slope_ratio))

    # don't shrink learning rate once results are very close to minimum
    if slope_ratio < alpha and abs(target_delta)>1e-6:
      print("%.2f %.2f %.2f"%(cost0, cost1, slope_ratio))
      print("Slope optimality %.2f, shrinking learning rate to %.2f"%(slope_ratio, lr0*beta,))
      sess.run(lr_set, feed_dict={lr_p: lr0*beta})
    else:
      # see if our learning rate got too conservative, and increase it
      if i>0 and i%10 == 0 and slope_ratio>0.99:
        print("%.2f %.2f %.2f"%(cost0, cost1, slope_ratio))
        print("Growing learning rate to %.2f"%(lr0*growth_rate))
        sess.run(lr_set, feed_dict={lr_p: lr0*growth_rate})

    u.record_time()

  u.dump(step_lengths, "step_lengths_ada.csv")
#  u.dump(costs, "costs_ada.csv")
#  u.dump(ratios, "ratios_ada.csv")