Exemple #1
0
def main():
    """Run ten KFAC updates on a fixed 2x2 toy problem and check the last loss.

    Publishes ``X``, ``n`` and the layer-size accessor ``f`` as module
    globals, reads ``fs``, ``dsize``, ``dtype`` and ``lr`` from module scope,
    and asserts that the loss of the final step matches a recorded reference
    value to within 1e-9.
    """
    global fs, X, n, f, dsize, lambda_

    np.random.seed(1)
    tf.set_random_seed(1)

    def f(i):
        # W[i] has shape f[i] x f[i-1]
        return fs[i + 1]

    n = len(fs) - 2
    toy_data = np.asarray([[0, 1], [2, 3]]).astype(dtype)
    X = tf.constant(toy_data[:, :dsize].astype(dtype))

    initial_weights = [
        np.asarray([[0., 1], [2, 3]]).astype(dtype) / 10,
        np.asarray([[4., 5], [6, 7]]).astype(dtype) / 10,
    ]
    Wf = tf.constant(u.flatten(initial_weights))

    losses = []
    for step in range(10):
        loss, _output, _grad, kfac_grad = loss_and_output_and_grad(Wf)
        losses.append(loss.numpy())
        print("Step %3d loss %10.9f" % (step, losses[-1]))

        Wf -= lr * kfac_grad
        u.record_time()

    u.summarize_time()

    # Reference values recorded for other configurations:
    #   1.252017617  without random sampling
    #   1.256854534  with random sampling but fixed seed
    #   0.000359572  with random sampling and linear
    # The value below ("with random sampling") is the one actually checked.
    target = 1.251557469

    deviation = abs(losses[-1] - target)
    assert deviation < 1e-9, deviation
Exemple #2
0
def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False):
    """Run L-BFGS on a tied-weight sigmoid autoencoder and return the loss.

    Args:
        batch_size: passed to ``u.get_mnist_images`` and used to slice the
            resulting tensor.
        iters: stored as ``config.maxIter`` for ``lbfgs``.
        seed: seed for both the TensorFlow and NumPy random generators.
        cuda: when true, copy the data to the GPU and run under ``/gpu:0``.
        history: stored as ``config.nCorrection`` for ``lbfgs``.
        verbose: forwarded to ``lbfgs``; also enables the timing summary.

    Returns:
        The module-level ``final_loss``. This function never assigns it, so
        it is presumably updated as a side effect of ``lbfgs`` -- TODO
        confirm.
    """
    global final_loss, W_flat
    tf.set_random_seed(seed)
    np.random.seed(seed)

    images = tf.constant(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.gpu()
    data = images

    device = '/gpu:0' if cuda else ''

    visible_size = 28 * 28
    hidden_size = 196

    # Scope the device placement with ``with`` so the context is released
    # when the benchmark finishes instead of staying entered (and stacking
    # up) across repeated calls.
    with tf.device(device):
        initial_val = tf.zeros([visible_size * hidden_size])
        if W_flat is None:
            W_flat = tfe.Variable(initial_val, name='W_flat')
        W_flat.assign(initial_val)

        def loss_fn(w_flat):
            # The same matrix is used for encoding and (transposed) decoding.
            w = tf.reshape(w_flat, [visible_size, hidden_size])
            x = tf.matmul(data, w)
            x = tf.sigmoid(x)
            x = tf.matmul(x, w, transpose_b=True)
            x = tf.sigmoid(x)
            return tf.reduce_mean(tf.square(x - data))

        value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)

        def opfunc(x):  # returns (value, gradient)
            value, grads = value_and_gradients_fn(x)
            return value, grads[0]

        # initialize weights
        W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())

        state = Struct()
        config = Struct()
        config.maxIter = iters
        config.nCorrection = history
        config.verbose = True
        lbfgs(opfunc, W_flat, config, state, verbose)

        if verbose:
            u.summarize_time()

        # NOTE(review): ``times`` is not defined inside this function; it has
        # to exist at module level or this line raises NameError -- confirm.
        s = ','.join(["%f" % (t,) for t in times[2:]])
        print('{', s, '}')

    return final_loss
def main():
    """Train with SGD plus KFAC for ten iterations and check the final loss.

    The expected value is 38.781795502 when ``args.cuda`` is set and 0
    otherwise; the last reported loss must match it to within 1e-9.
    """
    losses = train('sgd', kfac=True, iters=10, verbose=True)
    u.summarize_time()
    print(losses)

    final_loss = losses[-1]
    target = 38.781795502 if args.cuda else 0
    deviation = abs(final_loss - target)
    assert deviation < 1e-9, deviation
Exemple #4
0
def main():
    """Train a two-layer sigmoid autoencoder with plain SGD and time each step.

    Configuration comes from the module-level ``args`` (``seed``, ``cuda``,
    ``batch_size``, ``visible_size``, ``hidden_size``, ``lr``, ``iters``).
    Prints the loss of every step and a timing summary at the end.
    """
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    images = torch.Tensor(u.get_mnist_images().T)
    images = images[:args.batch_size]
    if args.cuda:
        images = images.cuda()
    data = Variable(images)

    class Net(nn.Module):
        """Bias-free encoder/decoder pair with sigmoid activations."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Linear(args.visible_size,
                                     args.hidden_size,
                                     bias=False)
            self.decoder = nn.Linear(args.hidden_size,
                                     args.visible_size,
                                     bias=False)

        def forward(self, input):
            x = input.view(-1, args.visible_size)
            x = self.encoder(x)
            x = torch.sigmoid(x)
            x = self.decoder(x)
            x = torch.sigmoid(x)
            return x.view_as(input)

    # initialize model and weights
    model = Net()
    params1, params2 = list(model.parameters())
    params1.data = torch.Tensor(
        u.ng_init(args.visible_size, args.hidden_size).T)
    params2.data = torch.Tensor(
        u.ng_init(args.hidden_size, args.visible_size).T)
    if args.cuda:
        model.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=args.lr)
    for step in range(args.iters):
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        # ``loss`` is a 0-dim tensor; ``.item()`` extracts the Python float
        # (indexing it with ``[0]`` raises on current PyTorch).
        loss0 = loss.item()
        loss.backward()
        optimizer.step()

        print("Step %3d loss %6.5f" % (step, loss0))
        u.record_time()

    u.summarize_time()
Exemple #5
0
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
    """Run L-BFGS on a tied-weight sigmoid autoencoder and return the loss.

    Args:
        batch_size: passed to ``u.get_mnist_images`` and used to slice the
            resulting tensor.
        iters: stored as ``config.maxIter`` for ``lbfgs``.
        seed: seed for both the TensorFlow and NumPy random generators.
        cuda: when true, copy the data to the GPU and run under ``/gpu:0``.
        verbose: forwarded to ``lbfgs``; also enables the timing summary.

    Returns:
        The module-level ``final_loss``. This function never assigns it, so
        it is presumably updated as a side effect of ``lbfgs`` -- TODO
        confirm.
    """
    global final_loss, W_flat
    tf.set_random_seed(seed)
    np.random.seed(seed)

    images = tf.constant(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.gpu()
    data = images

    device = '/gpu:0' if cuda else ''

    visible_size = 28 * 28
    hidden_size = 196

    # Use ``with`` so the device context is exited when the run is over
    # rather than remaining entered after the function returns.
    with tf.device(device):
        initial_val = tf.zeros([visible_size * hidden_size])
        if W_flat is None:
            W_flat = tfe.Variable(initial_val, name='W_flat')
        W_flat.assign(initial_val)

        def loss_fn(w_flat):
            # The same matrix is used for encoding and (transposed) decoding.
            w = tf.reshape(w_flat, [visible_size, hidden_size])
            x = tf.matmul(data, w)
            x = tf.sigmoid(x)
            x = tf.matmul(x, w, transpose_b=True)
            x = tf.sigmoid(x)
            return tf.reduce_mean(tf.square(x - data))

        value_and_gradients_fn = tfe.value_and_gradients_function(loss_fn)

        def opfunc(x):  # returns (value, gradient)
            value, grads = value_and_gradients_fn(x)
            return value, grads[0]

        # initialize weights
        W_flat.assign(u.ng_init(visible_size, hidden_size).flatten())

        state = Struct()
        config = Struct()
        config.maxIter = iters
        config.verbose = True
        lbfgs(opfunc, W_flat, config, state, verbose)

        if verbose:
            u.summarize_time()

    return final_loss
def do_run(train_op):
    """Alternate loss evaluation and ``train_op`` for ``do_run_iters`` rounds.

    Each round evaluates the module-level ``loss`` tensor, prints and stores
    the value, runs ``train_op`` and records a timing sample.

    Returns:
        List of the loss values observed before each training step.
    """
    sess = setup_session()
    u.reset_time()

    observed_losses = []
    for _ in range(do_run_iters):
        current_loss = sess.run(loss)
        print(current_loss)
        observed_losses.append(current_loss)
        sess.run(train_op)
        u.record_time()

    u.summarize_time()
    return observed_losses
Exemple #7
0
def main():
    """Train with SGD plus KFAC for ten iterations and check the final loss.

    The expected value is 38.781795502 when CUDA is available and 0
    otherwise; the last training loss must match it to within 1e-9.
    """
    losses, _vlosses = train(
        optimizer='sgd',
        kfac=True,
        nonlin=F.sigmoid,
        iters=10,
        print_interval=1,
        lr=0.2,
    )
    u.summarize_time()
    print(losses)
    final_loss = losses[-1]

    # NOTE(review): this call's result is never used; it looks like a
    # debugging leftover -- confirm whether it can be removed.
    Variable('asdf')

    target = 38.781795502 if torch.cuda.is_available() else 0
    deviation = abs(final_loss - target)
    assert deviation < 1e-9, deviation
def benchmark_execute(dims, iters, dtype):
    """Time the execution of ``iters`` Khatri-Rao products of random matrices.

    Args:
        dims: side length of the two square random-uniform operands.
        iters: number of ``u.khatri_rao`` ops added to the graph.
        dtype: dtype of the random operands.

    Runs the grouped products ten times, recording a timing sample after
    each run, then prints the timing summary. Returns nothing.
    """
    A = tf.random_uniform((dims, dims), dtype=dtype)
    B = tf.random_uniform((dims, dims), dtype=dtype)
    prods = [u.khatri_rao(A, B) for _ in range(iters)]

    elapsed_times = []
    # The context manager closes the session (and frees its resources) once
    # the measurements are done.
    with tf.Session() as sess:
        u.reset_time()
        for _ in range(10):
            time0 = time.time()
            sess.run(tf.group(*prods))
            elapsed_times.append(time.time() - time0)
            u.record_time()
    u.summarize_time()
def benchmark_execute(dims, iters, dtype):
    """Measure how long it takes to run a batch of Khatri-Rao products.

    Args:
        dims: side length of the two square random-uniform operands.
        iters: how many ``u.khatri_rao`` ops are created.
        dtype: dtype of the random operands.

    The grouped ops are executed ten times; every execution contributes one
    timing sample, and a summary is printed at the end. Returns nothing.
    """
    A = tf.random_uniform((dims, dims), dtype=dtype)
    B = tf.random_uniform((dims, dims), dtype=dtype)

    prods = []
    for _ in range(iters):
        prods.append(u.khatri_rao(A, B))

    elapsed_times = []
    # Close the session deterministically instead of leaking it.
    with tf.Session() as sess:
        u.reset_time()
        for _ in range(10):
            started = time.time()
            sess.run(tf.group(*prods))
            elapsed_times.append(time.time() - started)
            u.record_time()
    u.summarize_time()
def complex_train_test():
    """Train on 10000 MNIST columns for 2000 steps, saving weight snapshots.

    A picture of the first weight matrix is written whenever the cost has
    dropped by more than 5% relative to the last snapshot, or more than 50
    steps have passed since it. All costs are dumped to
    ``costs_adam_bn1.csv`` at the end.
    """
    np.random.seed(0)

    do_images = True

    train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
    dsize = 10000
    fs = [dsize, 28 * 28, 196, 28 * 28]
    cost, train_op = cost_and_grad(
        fs=fs,
        X0=train_images[:, :dsize],
        lambda_=3e-3,
        rho=0.1,
        beta=3,
        lr=0.1,
    )

    sess = tf.get_default_session()

    u.reset_time()
    snapshot_cost = sess.run(cost)
    snapshot_step = 0
    frames_written = 0
    costs = []
    for step in range(2000):
        current_cost, _ = sess.run([cost, train_op])
        costs.append(current_cost)
        if step % 100 == 0:
            print(current_cost)

        big_drop = (snapshot_cost - current_cost) / snapshot_cost > 0.05
        if (big_drop or step - snapshot_step > 50) and do_images:
            flat_weights = sess.run("Wf_var/read:0")
            first_layer = u.unflatten_np(flat_weights, fs[1:])[0]
            # filters are transposed in visualization
            picture_name = "pics/weights-%03d.png" % (frames_written, )
            display_network.display_network(first_layer.T,
                                            filename=picture_name)
            frames_written += 1
            snapshot_cost = current_cost
            snapshot_step = step
        u.record_time()

    u.dump(costs, "costs_adam_bn1.csv")
    u.summarize_time()
Exemple #11
0
def main():
    """Train a Dense-layer sigmoid autoencoder with gradient descent.

    Configuration comes from the module-level ``args`` (``seed``, ``cuda``,
    ``batch_size``, ``visible_size``, ``hidden_size``, ``lr``, ``iters``).
    Prints the loss of every step and a timing summary at the end.
    """
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    data = tf.constant(u.get_mnist_images().T)[:args.batch_size]
    if args.cuda:
        data = data.as_gpu_tensor()

    device = '/gpu:0' if args.cuda else ''

    with tf.device(device):
        encoder = tf.layers.Dense(units=args.hidden_size,
                                  use_bias=False,
                                  activation=tf.sigmoid)
        decoder = tf.layers.Dense(units=args.visible_size,
                                  use_bias=False,
                                  activation=tf.sigmoid)

        def loss_fn(inputs):
            reconstruction = decoder(encoder(inputs))
            return tf.reduce_mean(tf.square(reconstruction - inputs))

        value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)

        # Call the loss once before touching ``weights`` so the layers have
        # been applied to data, then overwrite the weights.
        loss_fn(data)
        encoder_kernel = encoder.weights[0]
        decoder_kernel = decoder.weights[0]
        encoder_kernel.assign(u.ng_init(args.visible_size, args.hidden_size))
        decoder_kernel.assign(u.ng_init(args.hidden_size, args.visible_size))

        optimizer = tf.train.GradientDescentOptimizer(learning_rate=args.lr)
        for step in range(args.iters):
            value, grads_and_vars = value_and_gradients_fn(data)
            optimizer.apply_gradients(grads_and_vars)

            print("Step %3d loss %6.5f"%(step, value.numpy()))
            u.record_time()

        u.summarize_time()
Exemple #12
0
def main():
    """Run ten KFAC steps on 1000 MNIST columns and check loss and timing.

    Publishes ``fs``, ``X``, ``n``, ``f``, ``dsize`` and ``lambda_`` as
    module globals. Asserts that the loss stays below 17.6 from step 4 on,
    that the final loss lies in (0.78, 0.8), and that the fastest recorded
    step took between 20 ms and 120 ms.
    """
    global fs, X, n, f, dsize, lambda_

    np.random.seed(0)
    tf.set_random_seed(0)

    mnist = u.get_mnist_images()
    dsize = 1000
    fs = [dsize, 28 * 28, 196, 28 * 28]  # layer sizes
    lambda_ = 3e-3

    def f(i):
        # W[i] has shape f[i] x f[i-1]
        return fs[i + 1]

    n = len(fs) - 2
    X = tf.constant(mnist[:, :dsize].astype(dtype))

    layer_inits = [u.ng_init(fs[2], fs[3]), u.ng_init(fs[3], fs[2])]
    Wf = tf.constant(u.flatten([w.flatten() for w in layer_inits]))
    assert Wf.dtype == tf.float32
    lr = tf.constant(0.2)

    losses = []
    for step in range(10):
        loss, _grad, kfac_grad = loss_and_grad(Wf)
        losses.append(loss.numpy())
        print("Step %d loss %.2f" % (step, losses[-1]))

        Wf -= lr * kfac_grad
        if step >= 4:
            assert loss < 17.6
        u.record_time()

    u.summarize_time()
    final_loss = losses[-1]
    assert final_loss < 0.8
    assert final_loss > 0.78
    assert 20e-3 < min(u.global_time_list) < 120e-3
def main():
    """Run forty KFAC steps on the configured device and check loss and timing.

    Reads ``args.seed`` and ``args.cuda`` plus the module-level
    ``train_images``, ``dsize``, ``fs`` and ``dtype``; publishes ``X`` as a
    module global. Asserts that the loss stays below 17.6 from step 4 on,
    that the final loss lies in (0.57, 0.59), and that the fastest recorded
    step took between 20 ms and 50 ms.
    """
    global fs, X, n, f, dsize, lambda_

    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    device = '/gpu:0' if args.cuda else '/cpu:0'

    # ``with`` guarantees the device context is exited again, including when
    # one of the assertions below fails.
    with tf.device(device):
        X = tf.constant(train_images[:, :dsize].astype(dtype))

        W0_0 = u.ng_init(fs[2], fs[3])
        W1_0 = u.ng_init(fs[3], fs[2])
        W0f = u.flatten([W0_0, W1_0])
        Wf = tf.constant(W0f)
        assert Wf.dtype == tf.float32
        lr = tf.constant(0.2)

        losses = []
        for step in range(40):
            loss, grad, kfac_grad = loss_and_grad(Wf)
            loss0 = loss.numpy()
            print("Step %3d loss %10.9f" % (step, loss0))
            losses.append(loss0)

            Wf -= lr * kfac_grad
            if step >= 4:
                assert loss < 17.6
            u.record_time()

        u.summarize_time()
        assert losses[-1] < 0.59
        assert losses[-1] > 0.57
        assert 20e-3 < min(
            u.global_time_list) < 50e-3, "Time should be 30ms on 1080"
          print("Growing learning rate to %.2f"%(lr0*growth_rate))
          sess.run(vard[lr].setter, feed_dict={vard[lr].p:
                                               lr0*growth_rate})

    u.record_time()

  # check against expected loss
  if 'Apple' in sys.version:
    pass
    #    u.dump(losses, "kfac_small_final_mac.csv")
    targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
  else:
    pass
    #    u.dump(losses, "kfac_small_final_linux.csv")
    targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")

  if len(sys.argv)>1 and sys.argv[1]=="test":
    # GPU losses are quite noisy, set rtol high
    u.check_equal(targets, losses[:len(targets)], rtol=1e-3)
    
  u.dump(losses, "%s_losses_%d.csv"%(prefix ,whitening_mode,))
  u.dump(step_lengths, "%s_step_lengths_%d.csv"%(prefix, whitening_mode,))
  u.dump(ratios, "%s_ratios_%d.csv"%(prefix, whitening_mode,))
  u.dump(grad_norms, "%s_grad_norms_%d.csv"%(prefix, whitening_mode,))
  u.dump(pre_grad_norms, "%s_pre_grad_norms_%d.csv"%(prefix, whitening_mode,))
  u.dump(pre_grad_stable_norms, "%s_pre_grad_stable_norms_%d.csv"%(prefix, whitening_mode,))
  u.dump(target_delta_list, "%s_target_delta_%d.csv"%(prefix, whitening_mode,))
  u.dump(target_delta2_list, "%s_target_delta2_%d.csv"%(prefix, whitening_mode,))
  u.dump(actual_delta_list, "%s_actual_delta_%d.csv"%(prefix, whitening_mode,))
  u.summarize_time()
Exemple #15
0
def benchmark(batch_size, iters, seed=1, cuda=True, verbose=False):
    """Optimise a tied-weight sigmoid autoencoder with ``optim.LBFGS``.

    Args:
        batch_size: passed to ``u.get_mnist_images`` and used to slice the
            resulting tensor.
        iters: ``max_iter`` for the optimiser; also the closure call count
            at which the loss is captured.
        seed: seed for the torch (CPU and, with ``cuda``, GPU) and NumPy
            random generators.
        cuda: when true, move data and model to the GPU.
        verbose: print per-evaluation progress and a timing summary.

    Returns:
        The loss seen on the ``iters``-th closure evaluation, or ``None`` if
        the closure was evaluated fewer than ``iters`` times. The value is
        also stored in the module-level ``final_loss``.
    """
    global step, final_loss

    step = 0
    final_loss = None

    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    visible_size = 28 * 28
    hidden_size = 196

    images = torch.Tensor(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.cuda()
    data = Variable(images)

    class Net(nn.Module):
        """Autoencoder whose decoder is the transpose of its encoder."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size))

        def forward(self, input):
            x = input.view(-1, visible_size)
            x = torch.sigmoid(torch.mm(x, self.encoder))
            x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1)))
            return x.view_as(input)

    # initialize model and weights
    model = Net()
    model.encoder.data = torch.Tensor(u.ng_init(visible_size, hidden_size))
    if cuda:
        model.cuda()

    model.train()
    optimizer = optim.LBFGS(model.parameters(),
                            max_iter=iters,
                            history_size=100,
                            lr=1.0)

    def closure():
        global step, final_loss
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        if verbose:
            # ``loss`` is a 0-dim tensor; ``.item()`` yields the Python float.
            loss0 = loss.item()
            print("Step %3d loss %6.5f msec %6.3f" %
                  (step, loss0, u.last_time()))
        step += 1
        if step == iters:
            final_loss = loss.item()
        loss.backward()
        u.record_time()
        return loss

    optimizer.step(closure)

    # The original re-evaluated the model here but never used the result;
    # that dead forward pass has been dropped.

    if verbose:
        u.summarize_time()

    return final_loss
Exemple #16
0
def main():
    """Run ten hand-rolled KFAC steps in PyTorch and check the final loss.

    Each step does one backward pass on the real loss (collecting
    activations ``A`` and backprops ``B`` from the module-level ``forward``
    and ``backward`` lists) and a second backward pass against targets
    perturbed with Gaussian noise (``B2``). The gradient of every layer is
    then whitened with the regularised inverses of the activation covariance
    and of the sampled-backprop covariance and applied directly to the
    parameters.

    Reads ``args``, ``dsize``, ``dtype``, ``torch_dtype``, ``lr``,
    ``nonlin``, ``kfac_matmul``, ``regularized_inverse`` and ``t`` from
    module scope; publishes ``covA_inv`` and ``covB_inv`` as globals.
    """
    #  global forward, backward, DO_PRINT
    global mode, covA_inv, covB_inv

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # feature sizes
    fs = [args.batch_size, 28 * 28, 196, 28 * 28]
    # number of layers
    n = len(fs) - 2

    # todo, move to more elegant backprop
    matmul = kfac_matmul

    class Net(nn.Module):
        """Stack of ``n`` weight matrices W1..Wn applied as nonlin(W @ x)."""

        def __init__(self):
            super(Net, self).__init__()
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            x = input.view(784, -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)

    model = Net()
    if args.cuda:
        model.cuda()

    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    losses = []

    covA = [None] * n
    covA_inv = [None] * n
    covB_inv = [None] * n

    noise = torch.Tensor(*data.data.shape).type(torch_dtype)

    # TODO:
    # only do 2 passes like in eager mode
    # integrate with optimizer/same results
    # scale to deep autoencoder
    for step in range(10):
        optimizer.zero_grad()
        del forward[:]
        del backward[:]
        output = model(data)
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize

        # Keep the graph: ``output`` is reused for the second backward pass.
        loss.backward(retain_graph=True)
        backward.reverse()

        # ``loss`` is a 0-dim tensor; ``.item()`` yields the Python float
        # (indexing it with ``[0]`` raises on current PyTorch).
        loss0 = loss.item()

        A = forward[:]
        B = backward[:]
        assert len(B) == n

        del forward[:]
        del backward[:]

        # Second pass: backprops for targets sampled around the model output.
        noise.normal_()
        synthetic_data = Variable(output.data + noise)

        err2 = output - synthetic_data
        loss2 = torch.sum(err2 * err2) / 2 / dsize
        optimizer.zero_grad()
        loss2.backward()
        B2 = backward[::-1]
        assert len(B2) == n

        # mode = 'kfac'

        # compute whitened gradient
        pre_dW = []
        for i in range(n):
            # only compute first activation once
            if i > 0:
                covA[i] = A[i] @ t(A[i]) / dsize
                covA_inv[i] = regularized_inverse(covA[i])
            else:
                if covA[i] is None:
                    covA[i] = A[i] @ t(A[i]) / dsize
                    covA_inv[i] = regularized_inverse(covA[i])

            #      else:
            covB2 = B2[i] @ t(B2[i]) / dsize
            covB = B[i] @ t(B[i]) / dsize  # todo: remove

            covB_inv[i] = regularized_inverse(covB2.data)

            whitened_A = covA_inv[i] @ A[i]
            whitened_B = covB_inv[i] @ B[i].data
            pre_dW.append(whitened_B @ t(whitened_A) / dsize)

        params = list(model.parameters())
        assert len(params) == len(pre_dW)
        for i in range(len(params)):
            params[i].data -= lr * pre_dW[i]

        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    loss0 = loss.data.cpu().numpy()  #[0]
    target = 2.360062122

    # Later assignments override earlier ones; the last applicable value is
    # the one that is checked.
    if 'Apple' in sys.version:
        target = 2.360126972
        target = 2.335654736  # after changing to torch.randn
    if args.cuda:
        target = 2.337174654
        target = 2.337215662  # switching to numpy inverse

    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
def rotations2_natural_sampled_kfac(num_samples=1):
    """Run 20 natural-gradient steps with a sampled, block-diagonal KFAC Fisher.

    Builds a linear multi-layer network from the ``large_rotations2`` CSV
    files, forms per-layer Kronecker-factored inverse covariance blocks from
    activations and from backprops seeded with random normal values, and
    applies ``Wf - lr * ifisher @ dWf`` through a copy variable in two
    separate ``run`` calls. Prints the loss before every step and a
    timing/graph summary at the end.

    Args:
        num_samples: number of times the dataset is replicated for the
            sampled forward/backward quantities ``A2``/``B2``.
    """
    tf.reset_default_graph()
    np.random.seed(0)
    tf.set_random_seed(0)

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    # NOTE(review): this first Wf_copy is rebound further down before it is
    # used; kept so that the set of graph variables stays unchanged.
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A2 = [0] * (n + 2)  # augmented forward props for natural gradient
    A[0] = u.Identity(dsize)
    A2[0] = u.Identity(dsize * num_samples)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
        if i == 0:
            # replicate dataset multiple times corresponding to number of samples
            A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
        else:
            A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match; the tuple must be parenthesised, otherwise the
    # comparison is parsed as the assertion *message* and never checked
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)

    # lower learning rate by 10x
    lr = tf.Variable(0.01, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B2 = [0] * (n + 1)
    B[n] = -err / dsize
    B2[n] = tf.random_normal((f(n), dsize * num_samples),
                             0,
                             1,
                             seed=0,
                             dtype=dtype)
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))
        B2[i] = tf.matmul(tf.transpose(W[i + 1]),
                          B2[i + 1],
                          name="B2" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race

    dW = [0] * (n + 1)
    dW2 = [0] * (n + 1)
    updates1 = [0] * (n + 1)  # compute updated value into Wcopy
    updates2 = [0] * (n + 1)  # copy value back into W
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        dW2[i] = tf.matmul(B2[i], tf.transpose(A2[i]), name="dW2" + str(i))

    del dW[0]  # get rid of W[0] update
    del dW2[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # todo: divide both activations and backprops by size for cov calc

    # Kronecker factored covariance blocks (only the diagonal is non-zero)
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
                bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)),
                                dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ifisher = u.concat_blocks(iblocks)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
def rotations2_natural_empirical():
    """Run 10 natural-gradient steps with the empirical Fisher matrix.

    Builds a linear multi-layer network from the ``large_rotations2`` CSV
    files, forms the Fisher matrix from the Khatri-Rao products of each
    layer's activations and backprops, pseudo-inverts it, and applies
    ``Wf - lr * ifisher @ dWf`` through a copy variable in two separate
    ``run`` calls. Prints the loss before every step and a timing/graph
    summary at the end.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    # NOTE(review): this first Wf_copy is rebound further down before it is
    # used; kept so that the set of graph variables stays unchanged.
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match; the tuple must be parenthesised, otherwise the
    # comparison is parsed as the assertion *message* and never checked
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.000001, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race

    dW = [0] * (n + 1)
    updates1 = [0] * (n + 1)  # compute updated value into Wcopy
    updates2 = [0] * (n + 1)  # copy value back into W
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))

    del dW[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # inverse fisher preconditioner
    grads = tf.concat([u.khatri_rao(A[i], B[i]) for i in range(1, n + 1)],
                      axis=0)
    fisher = grads @ tf.transpose(grads) / dsize
    ifisher = u.pseudo_inverse(fisher)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Exemple #19
0
def benchmark(batch_size, iters, seed=1, cuda=True, history=100, verbose=False):
    """Optimise a tied-weight sigmoid autoencoder with ``optim.LBFGS``.

    Args:
        batch_size: passed to ``u.get_mnist_images`` and used to slice the
            resulting tensor.
        iters: ``max_iter`` for the optimiser; also the closure call count
            at which the loss is captured.
        seed: seed for the torch (CPU and, with ``cuda``, GPU) and NumPy
            random generators.
        cuda: when true, move data and model to the GPU.
        history: ``history_size`` for the optimiser.
        verbose: print per-evaluation progress, collect per-evaluation
            times and print a timing summary.

    Returns:
        The loss seen on the ``iters``-th closure evaluation, or ``None`` if
        the closure was evaluated fewer than ``iters`` times. The value is
        also stored in the module-level ``final_loss``.
    """
    global step, final_loss

    step = 0
    final_loss = None

    torch.manual_seed(seed)
    np.random.seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    visible_size = 28 * 28
    hidden_size = 196

    images = torch.Tensor(u.get_mnist_images(batch_size).T)
    images = images[:batch_size]
    if cuda:
        images = images.cuda()
    data = Variable(images)

    class Net(nn.Module):
        """Autoencoder whose decoder is the transpose of its encoder."""

        def __init__(self):
            super(Net, self).__init__()
            self.encoder = nn.Parameter(torch.rand(visible_size, hidden_size))

        def forward(self, input):
            x = input.view(-1, visible_size)
            x = torch.sigmoid(torch.mm(x, self.encoder))
            x = torch.sigmoid(torch.mm(x, torch.transpose(self.encoder, 0, 1)))
            return x.view_as(input)

    # initialize model and weights
    model = Net()
    model.encoder.data = torch.Tensor(u.ng_init(visible_size,
                                                hidden_size))
    if cuda:
        model.cuda()

    model.train()
    optimizer = optim.LBFGS(model.parameters(),
                            max_iter=iters,
                            history_size=history,
                            lr=1.0)

    # Filled only when ``verbose`` is set.
    times = []

    def closure():
        global step, final_loss
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, data)
        if verbose:
            # ``loss`` is a 0-dim tensor; ``.item()`` yields the Python float.
            loss0 = loss.item()
            times.append(u.last_time())
            print("Step %3d loss %6.5f msec %6.3f"%(step, loss0, u.last_time()))
        step += 1
        if step == iters:
            final_loss = loss.item()
        loss.backward()
        u.record_time()
        return loss

    optimizer.step(closure)

    # The original re-evaluated the model here but never used the result;
    # that dead forward pass has been dropped.

    if verbose:
        u.summarize_time()

    # Print the collected times, skipping the first two entries.
    s = ','.join(["%f"%(t,) for t in times[2:]])
    print('{', s,'}')

    return final_loss
Exemple #20
0
def main():
    """Train the model with Adam applied to KFAC-corrected gradients.

    Sets up a TF session with graph optimizations turned down, builds the
    main model and a separate `Kfac` instance, then for `args.num_steps`
    steps: evaluates the loss (and validation loss every
    `args.validate_every_n` steps), logs it, optionally refreshes the KFAC
    statistics synchronously, and applies one Adam update to the corrected
    gradient.

    Afterwards the graph definition is written to `timelines/graphdef.txt`.
    In `record` mode the losses are dumped via `u.dump_with_prompt`; in
    `test` mode they are compared against the stored targets and the last
    step time is asserted to be under 800.

    All configuration comes from the module-level `args`.
    """
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    logger = u.TensorboardLogger(args.run)

    with u.timeit("init/session"):

        # The rewriter options are best-effort: if this TF build lacks the
        # module or one of the fields, fall back to rewrite_options=None.
        # Catch Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        rewrite_options = None
        try:
            from tensorflow.core.protobuf import rewriter_config_pb2
            rewrite_options = rewriter_config_pb2.RewriterConfig(
                disable_model_pruning=True,
                constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
                memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
        except Exception:
            pass

        optimizer_options = tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0)
        graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                        rewrite_options=rewrite_options)
        gpu_options = tf.GPUOptions(allow_growth=False)
        config = tf.ConfigProto(graph_options=graph_options,
                                gpu_options=gpu_options,
                                log_device_placement=False)

        sess = tf.InteractiveSession(config=config)
        u.register_default_session(
            sess)  # since default session is Thread-local

    with u.timeit("init/model_init"):
        model = model_creator(args.batch_size, name="main")
        model.initialize_global_vars(verbose=True)
        model.initialize_local_vars()

    kfac_lib.numeric_inverse = args.numeric_inverse
    with u.timeit("init/kfac_init"):
        kfac = Kfac(model_creator, args.kfac_batch_size)
        kfac.model.initialize_global_vars(verbose=False)
        kfac.model.initialize_local_vars()
        kfac.Lambda.set(args.Lambda)
        kfac.reset()  # resets optimization variables (not model variables)

    # Outside of 'run' mode the learning rate is pinned to 0.001 so that
    # recorded/tested losses do not depend on args.lr.
    if args.mode != 'run':
        opt = tf.train.AdamOptimizer(0.001)
    else:
        opt = tf.train.AdamOptimizer(args.lr)
    grads_and_vars = opt.compute_gradients(model.loss,
                                           var_list=model.trainable_vars)

    grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
    grad_new = kfac.correct(grad)
    # Capture the variables Adam creates so that only they get initialized.
    with u.capture_vars() as adam_vars:
        train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
    with u.timeit("init/adam"):
        sessrun([v.initializer for v in adam_vars])

    losses = []
    u.record_time()

    # Last known validation loss; refreshed only on validation steps.
    vloss0 = 0

    # todo, unify the two data outputs

    start_time = time.time()
    if args.extra_kfac_batch_advance:
        kfac.model.advance_batch()  # advance kfac batch

    if args.kfac_async:
        kfac.start_stats_runners()

    for step in range(args.num_steps):

        if args.validate_every_n and step % args.validate_every_n == 0:
            loss0, vloss0 = sessrun([model.loss, model.vloss])
        else:
            loss0, = sessrun([model.loss])
        losses.append(loss0)  # TODO: remove this

        logger('loss/loss', loss0, 'loss/vloss', vloss0)

        elapsed = time.time() - start_time
        start_time = time.time()
        print("%4d ms, step %4d, loss %5.2f, vloss %5.2f" %
              (elapsed * 1e3, step, loss0, vloss0))

        # In async mode the stats runners started above are expected to keep
        # the statistics fresh; otherwise refresh them here.
        if args.method == 'kfac' and not args.kfac_async:
            kfac.model.advance_batch()
            kfac.update_stats()

        with u.timeit("train"):
            model.advance_batch()
            with u.timeit("grad.update"):
                grad.update()
            with kfac.read_lock():
                grad_new.update()
            u.run(train_op)
            u.record_time()

        logger.next_step()

    # TODO: use u.global_runs_dir
    # TODO: get rid of u.timeit?

    with open('timelines/graphdef.txt', 'w') as f:
        f.write(str(u.get_default_graph().as_graph_def()))

    u.summarize_time()

    if args.mode == 'record':
        u.dump_with_prompt(losses, release_test_fn)

    elif args.mode == 'test':
        targets = np.loadtxt('data/' + release_test_fn, delimiter=",")
        u.check_equal(losses, targets, rtol=1e-2)
        u.summarize_difference(losses, targets)
        assert u.last_time() < 800, "Expected 648 on GTX 1080"
def main():
    """Run 10 SGD steps through `kfac_matmul` on an autoencoder and check the loss.

    Each step runs three phases, selected through the module-level `mode`
    flag (presumably read by `kfac_matmul` -- confirm against its
    definition):

    * 'standard': plain forward pass to obtain the current output.
    * 'capture': forward/backward against a noise-perturbed copy of the
      output; afterwards the module-level `forward` and `backward` lists
      are expected to hold one entry per layer.
    * 'kfac': backward pass of the real reconstruction loss followed by an
      SGD step, after `forward_inv` / `backward_inv` have been filled with
      regularized inverse covariances.

    The final loss is asserted to match a hard-coded target (one for CUDA,
    one for CPU) to within 1e-9.
    """
    global mode

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # feature sizes
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # number of layers
    n = len(fs) - 2

    matmul = kfac_matmul

    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # W1 = (np.array([[0., 1], [2, 3]])).astype(dtype)/10
            # W2 = (np.array([[4., 5], [6, 7]])).astype(dtype)/10
            # self.W1 = nn.Parameter(torch.from_numpy(W1))
            # self.W2 = nn.Parameter(torch.from_numpy(W2))
            # Registers parameters W1..Wn; Wi is created from
            # u.ng_init(fs[i + 1], fs[i]).
            for i in range(1, n + 1):
                W0 = u.ng_init(fs[i + 1], fs[i])
                setattr(self, 'W' + str(i), nn.Parameter(torch.from_numpy(W0)))

        def forward(self, input):
            # Activations are kept as (features, examples); weights multiply
            # from the left.
            x = input.view(fs[1], -1)
            for i in range(1, n + 1):
                W = getattr(self, 'W' + str(i))
                x = nonlin(matmul(W, x))
            return x.view_as(input)

    model = Net()

    if args.cuda:
        model.cuda()

    data0 = u.get_mnist_images()
    data0 = data0[:, :dsize].astype(dtype)  # keep the first dsize columns
    data = Variable(torch.from_numpy(data0))
    if args.cuda:
        data = data.cuda()

    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Scratch buffer for the sampled noise; refilled in place every step.
    noise = torch.Tensor(*data.data.shape).type(torch_dtype)
    # Cache of activation-covariance inverses; only entry 0 is ever reused.
    covA_inv_saved = [None] * n

    for step in range(10):
        mode = 'standard'
        output = model(data)

        mode = 'capture'
        optimizer.zero_grad()
        # Clear the module-level capture lists in place so that other
        # holders of these list objects see the reset.
        del forward[:]
        del backward[:]
        del forward_inv[:]
        del backward_inv[:]
        noise.normal_()
        # Synthetic target: current output plus fresh Gaussian noise.
        output_hat = Variable(output.data + noise)
        output = model(data)
        err_hat = output_hat - output
        loss_hat = torch.sum(err_hat * err_hat) / 2 / dsize
        # Graph is retained because the same `output` is backpropagated
        # again for the real loss below.
        loss_hat.backward(retain_graph=True)

        backward.reverse()
        forward.reverse()
        assert len(backward) == n
        assert len(forward) == n
        A = forward[:]
        B = backward[:]

        # compute inverses
        for i in range(n):
            # first layer doesn't change so only compute once
            if i == 0 and covA_inv_saved[i] is not None:
                covA_inv = covA_inv_saved[i]
            else:
                covA_inv = regularized_inverse(A[i] @ t(A[i]) / dsize)
                covA_inv_saved[i] = covA_inv
            forward_inv.append(covA_inv)

            covB_inv = regularized_inverse(B[i] @ t(B[i]) / dsize)
            backward_inv.append(covB_inv)

        # NOTE(review): presumably kfac_matmul's backward consumes
        # forward_inv / backward_inv while mode == 'kfac' -- confirm.
        mode = 'kfac'
        optimizer.zero_grad()
        err = output - data
        loss = torch.sum(err * err) / 2 / dsize
        loss.backward()
        optimizer.step()

        loss0 = loss.data.cpu().numpy()
        print("Step %3d loss %10.9f" % (step, loss0))
        u.record_time()

    # Expected loss after the last step differs between CUDA and CPU runs.
    if args.cuda:
        target = 2.337120533
    else:
        target = 2.335612774

    u.summarize_time()
    assert abs(loss0 - target) < 1e-9, abs(loss0 - target)
def main():
  """Run 10 hand-rolled whitened-gradient steps on a fixed 2x2 two-layer net.

  Each step:
    1. forward/backward of the reconstruction loss; the module-level
       `forward_list` / `backward_list` are then read as the per-layer
       activations A and backprops B (presumably appended to by
       `my_matmul` -- confirm against its definition);
    2. backward of a second loss against a noise-perturbed copy of the
       output, giving the sampled backprops B2;
    3. per layer, the update
       (reg_inv(cov(B2)) @ B) @ (reg_inv(cov(A)) @ A)^T / dsize,
       subtracted from the parameters scaled by `lr`.

  The SGD optimizer is only used for `zero_grad`; parameters are updated
  manually. The loss measured at the start of the last step must match the
  hard-coded target to within 1e-9.
  """
  global forward_list, backward_list, DO_PRINT

  torch.manual_seed(args.seed)
  np.random.seed(args.seed)
  if args.cuda:
    torch.cuda.manual_seed(args.seed)
  data0 = np.array([[0., 1], [2, 3]]).astype(dtype)
  data = Variable(torch.from_numpy(data0))

  class Net(nn.Module):
    def __init__(self):
      super(Net, self).__init__()
      W0 = (np.array([[0., 1], [2, 3]])).astype(dtype)/10
      W1 = (np.array([[4., 5], [6, 7]])).astype(dtype)/10
      self.W0 = nn.Parameter(torch.from_numpy(W0))
      self.W1 = nn.Parameter(torch.from_numpy(W1))

    def forward(self, input):
      x = input.view(-1, 2)
      x = nonlin(my_matmul(self.W0, x))
      x = nonlin(my_matmul(self.W1, x))
      return x.view_as(input)

  model = Net()
  if args.cuda:
    model.cuda()

  model.train()
  optimizer = optim.SGD(model.parameters(), lr=lr)
  for step in range(10):
    optimizer.zero_grad()
    forward_list = []
    backward_list = []
    output = model(data)
    err = output-data
    loss = torch.sum(err*err)/2/dsize
    # graph is retained: loss2 below backpropagates through the same output
    loss.backward(retain_graph=True)
    loss0 = loss.data[0]

    # backprops were recorded last-layer-first, so reverse them
    A = forward_list[:]
    B = backward_list[::-1]
    forward_list = []
    backward_list = []

    # second backward pass against sampled targets to obtain B2
    noise = torch.from_numpy(np.random.randn(*data.data.shape).astype(dtype))
    synthetic_data = Variable(output.data+noise)
    err2 = output - synthetic_data
    loss2 = torch.sum(err2*err2)/2/dsize
    optimizer.zero_grad()
    loss2.backward()
    B2 = backward_list[::-1]

    # compute whitened gradient
    n = len(A)
    assert len(B) == n
    assert len(B2) == n
    pre_dW = []
    for i in range(n):
      covA = A[i] @ t(A[i])/dsize
      covB2 = B2[i]@t(B2[i])/dsize
      # invert each covariance exactly once
      covA_inv = regularized_inverse(covA)
      covB2_inv = regularized_inverse(covB2.data)
      whitened_A = covA_inv@A[i]
      whitened_B = covB2_inv@B[i].data
      pre_dW.append(whitened_B @ t(whitened_A)/dsize)

    params = list(model.parameters())
    assert len(params) == len(pre_dW)
    for param, update in zip(params, pre_dW):
      param.data -= lr*update

    print("Step %3d loss %10.9f"%(step, loss0))
    u.record_time()

  # report timings before the check so they are visible even if it fails
  u.summarize_time()
  target = 1.251557469
  assert abs(loss0-target)<1e-9, abs(loss0-target)
Exemple #23
0
def main():
    """Train a sigmoid autoencoder with an SVD-whitened (KFAC-style) update.

    Builds a TF1 graph in which activations A[i], backprops B[i] and
    sampled-label backprops B2[i] are written out by hand, keeps regularized
    covariances of A and B2 in variables, wraps them in `u.SvdWrapper` to
    obtain inverses, and applies `Wf -= lr * pre_grad` for 40 steps.

    The run is a regression test: the final recorded loss must lie in
    (0.57, 0.59) and the fastest recorded step must take between 20 and
    50 ms.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    dtype = np.float32

    train_images = u.get_mnist_images()

    dsize = 10000
    patches = train_images[:, :dsize].astype(dtype)
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    # NOTE(review): rho, beta (here), kl, d_kl and rho_hat are defined but do
    # not feed into `loss` or the gradients below.
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0_0 = u.ng_init(fs[2], fs[3])
    W1_0 = u.ng_init(fs[3], fs[2])
    W0f = u.flatten([W0_0.flatten(), W1_0.flatten()])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = f(-1)  # f(-1) is fs[0], i.e. the same value as above
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: u.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        # Tensor initial value: the variable's initializer recomputes the
        # tensor each time it is run. With noinit=True the variable is kept
        # out of all collections, so global_variables_initializer skips it.
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            # Numpy/scalar initial value: fed through a placeholder that is
            # registered in init_dict for use as feed_dict at init time.
            val = np.array(val)
            # NOTE(review): this asserts the function object itself, which is
            # always truthy; probably meant u.is_numeric(val) -- confirm.
            assert u.is_numeric, "Unknown type"
            holder = tf.placeholder(dtype,
                                    shape=val.shape,
                                    name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        # Every variable also gets a placeholder-driven setter op, recorded
        # in vard.
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")

    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)  # W[0] is the data matrix, W[1..n] the layer weights

    def sigmoid(x):
        return tf.sigmoid(x)

    def d_sigmoid(y):
        # derivative of the sigmoid expressed through its output y
        return y * (1 - y)

    def kl(x, y):
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)

    # A[0] is tied to a Print node via a control dependency, so evaluating it
    # would emit the "must never run" message; nothing below reads A[0].
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    # sampled_labels is a noinit variable: re-running its initializer draws
    # fresh noise (see advance_batch below).
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live,
                              "sampled_labels",
                              noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        # Covariances carry a lambda_ * I regularizer and are stored in
        # variables, so they only change when their initializer is re-run.
        cov_op = A[i] @ t(A[i]) / dsize + lambda_ * u.Identity(A[i].shape[0])
        cov_A[i] = init_var(cov_op, "cov_A%d" % (i, ))
        cov_op = B2[i] @ t(B2[i]) / dsize + lambda_ * u.Identity(
            B2[i].shape[0])
        cov_B2[i] = init_var(cov_op, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i],
                                     "svd_A_%d" % (i, ),
                                     do_inverses=True)
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i],
                                      "svd_B2_%d" % (i, ),
                                      do_inverses=True)
        # Whitening uses the B2 (sampled) covariance but is applied to the
        # true backprop B[i].
        whitened_A = vars_svd_A[i].inv @ A[i]
        whitened_B = vars_svd_B2[i].inv @ B[i]
        pre_dW[i] = (whitened_B @ t(whitened_A)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)

    loss = reconstruction

    # grad / pre_grad are snapshots: they hold the value computed the last
    # time their initializer was run.
    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")

    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            u.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            u.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            u.dump(s0, "As_%d_%d" % (i, step))

        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            u.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            u.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            u.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            u.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        # Re-running the initializers recomputes every covariance from the
        # current weights and sampled labels.
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        # vars_svd_A[1] is deliberately absent: it is updated once before
        # the training loop.
        vars_svd_A[2].update()
        vars_svd_B2[2].update()
        vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()

    from tensorflow.core.protobuf import rewriter_config_pb2

    # Turn graph rewriting/optimization down (no pruning, no constant
    # folding, manual memory optimization, optimizer level L0).
    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)

    # Initialization order matters: the covariance initializers read Wf, X
    # and sampled_labels, so those are initialized first.
    sess = tf.InteractiveSession(config=config)
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()

    step_lengths = []  # keep track of learning rates
    losses = []

    # adaptive line search parameters
    # NOTE(review): none of these three values is read below; `beta` also
    # shadows the earlier beta = 3.
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    vars_svd_A[1].update()

    for step in range(40):
        update_covariances()
        update_svds()

        # snapshot the gradient and its preconditioned version
        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)

        # loss is read before the parameter update of this step
        lr0, loss0 = sess.run([lr, loss])
        update_params_op.run()
        advance_batch()

        losses.append(loss0)
        step_lengths.append(lr0)

        print("Step %d loss %.2f" % (step, loss0))
        u.record_time()

    # Regression checks on the final loss and on the fastest step time.
    assert losses[-1] < 0.59
    assert losses[-1] > 0.57
    assert 20e-3 < min(
        u.global_time_list) < 50e-3, "Time should be 40ms on 1080"
    u.summarize_time()
    print("Test passed")
def rotations2_newton_bd():
    """Run 20 Newton-style steps on a purely linear network loaded from CSV.

    Loads data (X0, Y0), initial flattened weights and layer sizes from
    `data/large_rotations2_*.csv`, builds a linear network
    A[i+1] = W[i] @ A[i] with squared-error loss, assembles per-layer-pair
    Hessian blocks from Kronecker products, inverts them via
    `u.block_diagonal_inverse` (presumably keeping only the diagonal blocks
    -- confirm against its definition) and applies
    `Wf <- Wf - lr * ihess @ dWf` for 20 iterations, printing the loss
    before each update.
    """
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    # Wf and Wf_copy are both initialized from the same placeholder, which
    # is fed with W0f at initialization time.
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)  # W[0] is the data matrix
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        # no nonlinearity: the network is a product of matrices
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's
    # U[bottom][top] is the product t(W[bottom]) @ ... @ t(W[top]), or an
    # identity of size f(top) when bottom > top. The integer entries from
    # list(range(...)) are placeholders that are all overwritten.
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                # the second term is zero on the diagonal
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)),
                                 dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))

            # `@` binds tighter than `+`: only term2 is multiplied by Kmat
            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # Two-stage update: stage the new weights in Wf_copy, then copy them
    # back into Wf in a separate run call.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]  # loss before this iteration's update
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Exemple #25
0
def main():
    np.random.seed(0)
    tf.set_random_seed(0)

    dtype = np.float32
    # 64-bit doesn't help much, search for 64-bit in
    # https://www.wolframcloud.com/objects/5f297f41-30f7-4b1b-972c-cac8d1f8d8e4
    u.default_dtype = dtype
    machine_epsilon = np.finfo(dtype).eps  # 1e-7 or 1e-16
    train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
    dsize = 10000
    patches = train_images[:, :dsize]
    fs = [dsize, 28 * 28, 196, 28 * 28]

    # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
    X0 = patches
    lambda_ = 3e-3
    rho = tf.constant(0.1, dtype=dtype)
    beta = 3
    W0f = W_uniform(fs[2], fs[3])

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = f(-1)
    n = len(fs) - 2

    # helper to create variables with numpy or TF initial value
    init_dict = {}  # {var_placeholder: init_value}
    vard = {}  # {var: util.VarInfo}

    def init_var(val, name, trainable=False, noinit=False):
        if isinstance(val, tf.Tensor):
            collections = [] if noinit else None
            var = tf.Variable(val, name=name, collections=collections)
        else:
            val = np.array(val)
            assert u.is_numeric, "Unknown type"
            holder = tf.placeholder(dtype,
                                    shape=val.shape,
                                    name=name + "_holder")
            var = tf.Variable(holder, name=name, trainable=trainable)
            init_dict[holder] = val
        var_p = tf.placeholder(var.dtype, var.shape)
        var_setter = var.assign(var_p)
        vard[var] = u.VarInfo(var_setter, var_p)
        return var

    lr = init_var(0.2, "lr")
    if purely_linear:  # need lower LR without sigmoids
        lr = init_var(.02, "lr")

    Wf = init_var(W0f, "Wf", True)
    Wf_copy = init_var(W0f, "Wf_copy", True)
    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    X = init_var(X0, "X")
    W.insert(0, X)

    def sigmoid(x):
        if not purely_linear:
            return tf.sigmoid(x)
        else:
            return tf.identity(x)

    def d_sigmoid(y):
        if not purely_linear:
            return y * (1 - y)
        else:
            return 1

    def kl(x, y):
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)

    # A[0] is just for shape checks, assert fail on run
    # tf.assert always fails because of static assert
    # fail_node = tf.assert_equal(1, 0, message="too huge")
    fail_node = tf.Print(0, [0], "fail, this must never run")
    with tf.control_dependencies([fail_node]):
        A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = backprops from sampled labels needed for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    sampled_labels = init_var(sampled_labels_live,
                              "sampled_labels",
                              noinit=True)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        if i == 1 and not drop_sparsity:
            backprop += beta * d_kl(rho, rho_hat)
            backprop2 += beta * d_kl(rho, rho_hat)
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    for i in range(1, n + 1):
        cov_A[i] = init_var(A[i] @ t(A[i]) / dsize, "cov_A%d" % (i, ))
        cov_B2[i] = init_var(B2[i] @ t(B2[i]) / dsize, "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ))
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ))
        if use_tikhonov:
            whitened_A = u.regularized_inverse2(vars_svd_A[i], L=Lambda) @ A[i]
        else:
            whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
        if use_tikhonov:
            whitened_B2 = u.regularized_inverse2(vars_svd_B2[i],
                                                 L=Lambda) @ B[i]
        else:
            whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
        whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[i]) @ A[i]
        whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[i]) @ B[i]
        pre_dW[i] = (whitened_B2 @ t(whitened_A)) / dsize
        pre_dW_stable[i] = (whitened_B2_stable @ t(whitened_A_stable)) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Loss function
    reconstruction = u.L2(err) / (2 * dsize)
    sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
    L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[1]))

    loss = reconstruction
    if not drop_l2:
        loss = loss + L2
    if not drop_sparsity:
        loss = loss + sparsity

    grad_live = u.flatten(dW[1:])
    pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
    pre_grad_stable_live = u.flatten(
        pre_dW_stable[1:])  # sqrt fisher preconditioned grad
    grad = init_var(grad_live, "grad")
    pre_grad = init_var(pre_grad_live, "pre_grad")
    pre_grad_stable = init_var(pre_grad_stable_live, "pre_grad_stable")

    update_params_op = Wf.assign(Wf - lr * pre_grad).op
    update_params_stable_op = Wf.assign(Wf - lr * pre_grad_stable).op
    save_params_op = Wf_copy.assign(Wf).op
    pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
    pre_grad_stable_dot_grad = tf.reduce_sum(pre_grad * grad)
    grad_norm = tf.reduce_sum(grad * grad)
    pre_grad_norm = u.L2(pre_grad)
    pre_grad_stable_norm = u.L2(pre_grad_stable)

    def dump_svd_info(step):
        """Dump singular values and gradient values in those coordinates."""
        for i in range(1, n + 1):
            svd = vars_svd_A[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            util.dump(s0, "A_%d_%d" % (i, step))
            A0 = A[i].eval()
            At0 = v0.T @ A0
            util.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
            util.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
            util.dump(s0, "As_%d_%d" % (i, step))

        for i in range(1, n + 1):
            svd = vars_svd_B2[i]
            s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
            util.dump(s0, "B2_%d_%d" % (i, step))
            B0 = B[i].eval()
            Bt0 = v0.T @ B0
            util.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
            util.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
            util.dump(s0, "Bs_%d_%d" % (i, step))

    def advance_batch():
        sess.run(sampled_labels.initializer)  # new labels for next call

    def update_covariances():
        ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
        ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
        sess.run(ops_A + ops_B2)

    def update_svds():
        if whitening_mode > 1:
            vars_svd_A[2].update()
        if whitening_mode > 2:
            vars_svd_B2[2].update()
        if whitening_mode > 3:
            vars_svd_B2[1].update()

    def init_svds():
        """Initialize our SVD to identity matrices."""
        ops = []
        for i in range(1, n + 1):
            ops.extend(vars_svd_A[i].init_ops)
            ops.extend(vars_svd_B2[i].init_ops)
        sess = tf.get_default_session()
        sess.run(ops)

    init_op = tf.global_variables_initializer()
    #  tf.get_default_graph().finalize()

    from tensorflow.core.protobuf import rewriter_config_pb2

    rewrite_options = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
    optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
    graph_options = tf.GraphOptions(optimizer_options=optimizer_options,
                                    rewrite_options=rewrite_options)
    config = tf.ConfigProto(graph_options=graph_options)
    #sess = tf.Session(config=config)
    sess = tf.InteractiveSession(config=config)
    sess.run(Wf.initializer, feed_dict=init_dict)
    sess.run(X.initializer, feed_dict=init_dict)
    advance_batch()
    update_covariances()
    init_svds()
    sess.run(init_op, feed_dict=init_dict)  # initialize everything else

    print("Running training.")
    u.reset_time()

    step_lengths = []  # keep track of learning rates
    losses = []
    ratios = []  # actual loss decrease / expected decrease
    grad_norms = []
    pre_grad_norms = []  # preconditioned grad norm squared
    pre_grad_stable_norms = []  # sqrt preconditioned grad norms squared
    target_delta_list = []  # predicted decrease linear approximation
    target_delta2_list = []  # predicted decrease quadratic appromation
    actual_delta_list = []  # actual decrease

    # adaptive line search parameters
    alpha = 0.3  # acceptable fraction of predicted decrease
    beta = 0.8  # how much to shrink when violation
    growth_rate = 1.05  # how much to grow when too conservative

    def update_cov_A(i):
        sess.run(cov_A[i].initializer)

    def update_cov_B2(i):
        sess.run(cov_B2[i].initializer)

    # only update whitening matrix of input activations in the beginning
    if whitening_mode > 0:
        vars_svd_A[1].update()

    # compute t(delta).H.delta/2
    def hessian_quadratic(delta):
        """Evaluate ``sum_k trace(t(D_k) @ cov_B2[k] @ D_k @ cov_A[k]) / 2``.

        ``delta`` is split into per-layer pieces ``D_k`` with
        ``u.unflatten(delta, fs[1:])``; layers ``1..n`` contribute to the sum.
        The resulting tensor is evaluated with ``.eval()`` and that value is
        returned.
        """
        #    update_covariances()
        layers = u.unflatten(delta, fs[1:])
        # pad slot 0 so that layer numbers index the list directly
        layers.insert(0, None)
        quadratic = sum(
            tf.trace(t(layers[k]) @ cov_B2[k] @ layers[k] @ cov_A[k])
            for k in range(1, n + 1))
        return (quadratic / 2).eval()

    # compute t(delta).H^-1.delta/2
    def hessian_quadratic_inv(delta):
        """Evaluate ``sum_k trace(t(D_k) @ invB2_k @ D_k @ invA_k) / 2``.

        ``delta`` is split into per-layer pieces ``D_k`` with
        ``u.unflatten(delta, fs[1:])``. ``invB2_k`` and ``invA_k`` come from
        ``u.pseudo_inverse2`` applied to ``vars_svd_B2[k]`` and
        ``vars_svd_A[k]``. The resulting tensor is evaluated with ``.eval()``
        and that value is returned.
        """
        #    update_covariances()
        layers = u.unflatten(delta, fs[1:])
        # pad slot 0 so that layer numbers index the list directly
        layers.insert(0, None)

        def layer_term(k):
            # contribution of layer k to the quadratic form
            inv_b2 = u.pseudo_inverse2(vars_svd_B2[k])
            inv_a = u.pseudo_inverse2(vars_svd_A[k])
            return tf.trace(t(layers[k]) @ inv_b2 @ layers[k] @ inv_a)

        quadratic = sum(layer_term(k) for k in range(1, n + 1))
        return (quadratic / 2).eval()

    # do line search, dump values as csv
    def line_search(initial_value, direction, step, num_steps):
        """Sample the loss along a ray and return the sampled values.

        For ``k = 0 .. num_steps - 1`` the parameter vector ``Wf`` is set to
        ``initial_value - direction * step * k`` and ``loss`` is evaluated.
        ``Wf`` is snapshotted into a fresh variable beforehand and assigned
        back from that snapshot once all samples are taken.

        Returns:
            List of ``num_steps`` loss values, in order of increasing ``k``.
        """
        backup = tf.Variable(Wf)
        sess.run(backup.initializer)
        multiplier = tf.placeholder(dtype, shape=(), name="linesearch_p")
        move_op = Wf.assign(initial_value - direction * step * multiplier)

        def probe(k):
            # move Wf to the k-th point on the ray, then read the loss there
            sess.run(move_op, feed_dict={multiplier: k})
            return loss.eval()

        vals = [probe(k) for k in range(num_steps)]
        # put the pre-search parameters back
        sess.run(Wf.assign(backup))
        return vals

    for step in range(num_steps):
        update_covariances()
        if step % whiten_every_n_steps == 0:
            update_svds()

        sess.run(grad.initializer)
        sess.run(pre_grad.initializer)

        lr0, loss0 = sess.run([lr, loss])
        save_params_op.run()

        # regular inverse becomes unstable when grad norm exceeds 1
        stabilized_mode = grad_norm.eval() < 1

        if stabilized_mode and not use_tikhonov:
            update_params_stable_op.run()
        else:
            update_params_op.run()

        loss1 = loss.eval()
        advance_batch()

        # line search stuff
        target_slope = (-pre_grad_dot_grad.eval() if stabilized_mode else
                        -pre_grad_stable_dot_grad.eval())
        target_delta = lr0 * target_slope
        target_delta_list.append(target_delta)

        # second order prediction of target delta
        # TODO: the sign is wrong, debug this
        # https://www.wolframcloud.com/objects/8f287f2f-ceb7-42f7-a599-1c03fda18f28
        if local_quadratics:
            x0 = Wf_copy.eval()
            x_opt = x0 - pre_grad.eval()
            # computes t(x)@H^-1 @(x)/2
            y_opt = loss0 - hessian_quadratic_inv(grad)
            # computes t(x)@H @(x)/2
            y_expected = hessian_quadratic(Wf - x_opt) + y_opt
            target_delta2 = y_expected - loss0
            target_delta2_list.append(target_delta2)

        actual_delta = loss1 - loss0
        actual_slope = actual_delta / lr0
        slope_ratio = actual_slope / target_slope  # between 0 and 1.01
        actual_delta_list.append(actual_delta)

        if do_line_search:
            vals1 = line_search(Wf_copy, pre_grad, lr / 100, 40)
            vals2 = line_search(Wf_copy, grad, lr / 100, 40)
            u.dump(vals1, "line1-%d" % (i, ))
            u.dump(vals2, "line2-%d" % (i, ))

        losses.append(loss0)
        step_lengths.append(lr0)
        ratios.append(slope_ratio)
        grad_norms.append(grad_norm.eval())
        pre_grad_norms.append(pre_grad_norm.eval())
        pre_grad_stable_norms.append(pre_grad_stable_norm.eval())

        if step % report_frequency == 0:
            print(
                "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"
                % (step, loss0, target_delta, actual_delta, slope_ratio,
                   grad_norm.eval(), pre_grad_norm.eval()))

        if adaptive_step_frequency and adaptive_step and step > adaptive_step_burn_in:
            # shrink if wrong prediction, don't shrink if prediction is tiny
            if slope_ratio < alpha and abs(
                    target_delta) > 1e-6 and adaptive_step:
                print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
                print(
                    "Slope optimality %.2f, shrinking learning rate to %.2f" %
                    (
                        slope_ratio,
                        lr0 * beta,
                    ))
                sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})

            # grow learning rate, slope_ratio .99 worked best for gradient
            elif step > 0 and i % 50 == 0 and slope_ratio > 0.90 and adaptive_step:
                print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
                print("Growing learning rate to %.2f" % (lr0 * growth_rate))
                sess.run(vard[lr].setter,
                         feed_dict={vard[lr].p: lr0 * growth_rate})

        u.record_time()

    # check against expected loss
    if 'Apple' in sys.version:
        pass
        #    u.dump(losses, "kfac_small_final_mac.csv")
        targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
    else:
        pass
        #    u.dump(losses, "kfac_small_final_linux.csv")
        targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")

    u.check_equal(targets, losses[:len(targets)], rtol=1e-1)
    u.summarize_time()
    print("Test passed")
def rotations2_newton_kfac():
    """Run 10 preconditioned steps on the "large_rotations2" problem.

    Loads inputs ``X0``, targets ``Y0``, flattened initial weights ``W0f`` and
    layer sizes ``fs`` from CSV files under ``data/``. Builds the chain of
    matrix products ``A[i+1] = W[i] @ A[i]`` (with ``W[0] = X``) and the loss
    ``sum((Y - A[n+1])**2) / (2 * dsize)``.

    The flattened gradient is preconditioned with a block matrix ``ihess``
    whose off-diagonal blocks are zero and whose i-th diagonal block is
    ``kr(pinv(A[i] A[i]^T), pinv(Bn[i] Bn[i]^T / dsize))``.

    Prints the loss before each of the 10 updates, then the timing and graph
    summaries from ``u``. Returns ``None``.
    """
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    # The update is staged through Wf_copy: train_op1 stores the new value in
    # the copy and train_op2 moves it into Wf. Presumably this keeps Wf from
    # being overwritten while Wf_new (which reads Wf) is still being computed.
    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    # The session is created only once the graph is complete and is always
    # closed, so its resources are released even if a step raises.
    sess = tf.Session()
    try:
        sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

        observed_losses = []
        u.reset_time()
        for _ in range(10):
            loss0 = sess.run(loss)
            print(loss0)
            observed_losses.append(loss0)
            sess.run(train_op1)
            sess.run(train_op2)
            u.record_time()

        u.summarize_time()
        u.summarize_graph()
    finally:
        sess.close()
Exemple #27
0
def main():
  """Train the model with Adam applied to KFAC-corrected gradients.

  Reads its configuration from the module-level ``args`` and builds the model
  through the module-level ``model_creator``. Each step logs the training loss
  (and, every ``args.validate_every_n`` steps, the validation loss), writes a
  CSV row to ``data/<run>_<lr>_<Lambda>.csv`` and runs one training update.

  After the loop the graph definition is written to
  ``timelines/graphdef.txt``. In ``'record'`` mode the collected losses are
  dumped via ``u.dump_with_prompt``; in ``'test'`` mode they are compared with
  the reference file ``data/<release_test_fn>``.
  """
  np.random.seed(args.seed)
  tf.set_random_seed(args.seed)

  logger = u.TensorboardLogger(args.run)
  
  with u.timeit("init/session"):
    gpu_options = tf.GPUOptions(allow_growth=False)
    sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))
    u.register_default_session(sess)   # since default session is Thread-local

  with u.timeit("init/model_init"):
    model = model_creator(args.batch_size, name="main")
    model.initialize_global_vars(verbose=True)
    model.initialize_local_vars()
  
  with u.timeit("init/kfac_init"):
    # Kfac builds its own model instance (kfac.model) with its own batch size
    kfac = Kfac(model_creator, args.kfac_batch_size) 
    kfac.model.initialize_global_vars(verbose=False)
    kfac.model.initialize_local_vars()
    kfac.Lambda.set(args.Lambda)
    kfac.reset()    # resets optimization variables (not model variables)

  # every mode other than 'run' uses a fixed 0.001 learning rate;
  # 'run' takes it from args.lr
  if args.mode != 'run':
    opt = tf.train.AdamOptimizer(0.001)
  else:
    opt = tf.train.AdamOptimizer(args.lr)
  grads_and_vars = opt.compute_gradients(model.loss,
                                         var_list=model.trainable_vars)
      
  grad = IndexedGrad.from_grads_and_vars(grads_and_vars)
  grad_new = kfac.correct(grad)
  # capture the variables created by apply_gradients so that only those
  # are initialized below
  with u.capture_vars() as adam_vars:
    train_op = opt.apply_gradients(grad_new.to_grads_and_vars())
  with u.timeit("init/adam"):
    sessrun([v.initializer for v in adam_vars])
  
  losses = []
  u.record_time()

  # NOTE(review): start_time is assigned again a few lines below, so this
  # first assignment is overwritten before it is read.
  start_time = time.time()
  # last validation loss; stays 0 until the first validation step and is
  # carried over between validation steps
  vloss0 = 0

  # todo, unify the two data outputs
  outfn = 'data/%s_%f_%f.csv'%(args.run, args.lr, args.Lambda)
  writer = u.BufferedWriter(outfn, 60)   # get rid?

  start_time = time.time()
  if args.extra_kfac_batch_advance:
    kfac.model.advance_batch()  # advance kfac batch

  if args.kfac_async:
    kfac.start_stats_runners()
    
  for step in range(args.num_steps):
    
    # the loss is read before this step's update is applied
    if args.validate_every_n and step%args.validate_every_n == 0:
      loss0, vloss0 = sessrun([model.loss, model.vloss])
    else:
      loss0, = sessrun([model.loss])
    losses.append(loss0)  # TODO: remove this

    logger('loss/loss', loss0, 'loss/vloss', vloss0)
    
    elapsed = time.time()-start_time
    print("%d sec, step %d, loss %.2f, vloss %.2f" %(elapsed, step, loss0,
                                                     vloss0))
    writer.write('%d, %f, %f, %f\n'%(step, elapsed, loss0, vloss0))

    # synchronous KFAC: advance the kfac batch and refresh statistics inline;
    # with args.kfac_async, start_stats_runners() was called above instead
    if args.method=='kfac' and not args.kfac_async:
      kfac.model.advance_batch()
      kfac.update_stats()

    with u.timeit("train"):
      model.advance_batch()
      grad.update()
      # the corrected gradient is refreshed while holding kfac's read lock
      with kfac.read_lock():
        grad_new.update()
      train_op.run()
      u.record_time()

    logger.next_step()

  # TODO: use u.global_runs_dir
  # TODO: get rid of u.timeit?
  
  # NOTE(review): this calls u.get_default_graph, not tf.get_default_graph;
  # confirm the helper exists in u.
  with open('timelines/graphdef.txt', 'w') as f:
    f.write(str(u.get_default_graph().as_graph_def()))

  u.summarize_time()
  
  if args.mode == 'record':
    u.dump_with_prompt(losses, release_test_fn)

  elif args.mode == 'test':
    targets = np.loadtxt('data/'+release_test_fn, delimiter=",")
    u.check_equal(losses, targets, rtol=1e-2)
    u.summarize_difference(losses, targets)