Example #1
    def testUpdateClipCoeff(self):
        with ops.Graph().as_default(), self.cached_session() as sess:
            grads_and_vars = [(array_ops.constant([[1., 2.], [3., 4.]]), None),
                              (array_ops.constant([[2., 3.], [4., 5.]]), None)]
            pgrads_and_vars = [(array_ops.constant([[3., 4.], [5., 6.]]), None),
                               (array_ops.constant([[7., 8.], [9., 10.]]), None)]
            lrate = 0.1

            # Note: without rescaling, the squared Fisher norm of the update
            # is 1.74

            # If the update already satisfies the norm constraint, there should
            # be no rescaling.
            opt = optimizer.KfacOptimizer(lrate,
                                          0.2,
                                          0.3,
                                          dummy_layer_collection(),
                                          norm_constraint=10.)
            coeff = opt._update_clip_coeff(grads_and_vars, pgrads_and_vars)
            self.assertAlmostEqual(1., sess.run(coeff), places=5)

            # If the update violates the constraint, it should be rescaled to
            # be on the constraint boundary.
            opt = optimizer.KfacOptimizer(lrate,
                                          0.2,
                                          0.3,
                                          dummy_layer_collection(),
                                          norm_constraint=0.5)
            coeff = opt._update_clip_coeff(grads_and_vars, pgrads_and_vars)
            sq_norm_pgrad = opt._squared_fisher_norm(grads_and_vars,
                                                     pgrads_and_vars)
            sq_norm_update = lrate**2 * coeff**2 * sq_norm_pgrad
            self.assertAlmostEqual(0.5, sess.run(sq_norm_update), places=5)
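The constants in this test follow directly from the tensors above. A minimal NumPy check of the arithmetic, under the assumption (implied by the assertions) that the clip coefficient is min(1, sqrt(norm_constraint / squared update norm)):

import numpy as np

grads = [np.array([[1., 2.], [3., 4.]]), np.array([[2., 3.], [4., 5.]])]
pgrads = [np.array([[3., 4.], [5., 6.]]), np.array([[7., 8.], [9., 10.]])]
lrate = 0.1

# Squared Fisher norm of the preconditioned gradient: sum_i <grad_i, pgrad_i>.
sq_norm_pgrad = sum(np.sum(g * pg) for g, pg in zip(grads, pgrads))   # 174.0

# Squared Fisher norm of the unclipped update (learning rate applied): 1.74,
# the number quoted in the comment above.
sq_norm_update = lrate ** 2 * sq_norm_pgrad

# norm_constraint=10.0: already satisfied, so the coefficient stays at 1.
# norm_constraint=0.5: rescale onto the boundary, coeff = sqrt(0.5 / 1.74).
coeff = min(1.0, np.sqrt(0.5 / sq_norm_update))
assert np.isclose(lrate ** 2 * coeff ** 2 * sq_norm_pgrad, 0.5)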
Example #2
    def testOptimizerInit(self):
        with ops.Graph().as_default():
            layer_collection = lc.LayerCollection()

            inputs = array_ops.ones((2, 1)) * 2
            weights_val = np.ones((1, 1), dtype=np.float32) * 3.
            weights = variable_scope.get_variable(
                'w', initializer=array_ops.constant(weights_val))
            bias = variable_scope.get_variable(
                'b', initializer=init_ops.zeros_initializer(), shape=(1, 1))
            output = math_ops.matmul(inputs, weights) + bias

            layer_collection.register_fully_connected((weights, bias), inputs,
                                                      output)

            logits = math_ops.tanh(output)
            targets = array_ops.constant([[0.], [1.]])
            output = math_ops.reduce_mean(
                nn.softmax_cross_entropy_with_logits(logits=logits,
                                                     labels=targets))

            layer_collection.register_categorical_predictive_distribution(
                logits)

            optimizer.KfacOptimizer(0.1,
                                    0.2,
                                    0.3,
                                    layer_collection,
                                    momentum=0.5,
                                    momentum_type='regular')
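The three leading positional arguments are the learning rate, covariance EMA decay, and damping; the keyword-style calls in the later examples make the names explicit. The same constructor call with keywords, purely for readability (values unchanged):

optimizer.KfacOptimizer(learning_rate=0.1,
                        cov_ema_decay=0.2,
                        damping=0.3,
                        layer_collection=layer_collection,
                        momentum=0.5,
                        momentum_type='regular')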
Example #3
    def testOptimizerInitInvalidMomentumRegistration(self):
        with self.assertRaises(ValueError):
            optimizer.KfacOptimizer(0.1,
                                    0.2,
                                    0.3,
                                    lc.LayerCollection(),
                                    momentum_type='foo')
Example #4
def minimize_loss_single_machine(loss,
                                 accuracy,
                                 layer_collection,
                                 device="/gpu:0",
                                 session_config=None):
    # Train with K-FAC.
    g_step = tf.train.get_or_create_global_step()
    optimizer = opt.KfacOptimizer(learning_rate=0.0001,
                                  cov_ema_decay=0.95,
                                  damping=0.001,
                                  layer_collection=layer_collection,
                                  placement_strategy="round_robin",
                                  cov_devices=[device],
                                  inv_devices=[device],
                                  momentum=0.9)

    mnist = input_data.read_data_sets('MNIST_data',
                                      reshape=False,
                                      one_hot=False)
    sample = mnist.validation.images
    la = mnist.validation.labels

    (cov_update_thunks,
     inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
    sess = tf.InteractiveSession()

    def make_update_op(update_thunks):
        update_ops = [thunk() for thunk in update_thunks]
        return update_ops
        # return tf.group(*update_ops)

    sess.run(tf.global_variables_initializer())
    cov_update_op = make_update_op(cov_update_thunks)
    # Note: `example` and `labels` below are assumed to be placeholders defined
    # elsewhere in the module; they are not arguments of this function, and the
    # values fetched here are the covariance update results, not an accuracy.
    for i in range(200):
        accuracy_ = sess.run(cov_update_op,
                             feed_dict={
                                 example: sample[i:i + 1],
                                 labels: la[i:i + 1]
                             })

    # with tf.control_dependencies([cov_update_op]):
    #   inverse_op = tf.cond(
    #       tf.equal(tf.mod(g_step, _INVERT_EVERY), 0),
    #       lambda: make_update_op(inv_update_thunks), tf.no_op)
    #   with tf.control_dependencies([inverse_op]):
    #     with tf.device(device):
    #       train_op = optimizer.minimize(loss, global_step=g_step)

    # tf.logging.info("Starting training.")
    # with tf.train.MonitoredTrainingSession(config=session_config) as sess:
    #   while not sess.should_stop():
    #     # global_step_, loss_, accuracy_, _ = sess.run(
    #     #     [g_step, loss, accuracy, train_op])
    #     accuracy_ = sess.run(cov_update_op)

    # if global_step_ % _INVERT_EVERY == 0:
    #   tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
    #                   global_step_, loss_, accuracy_)

    return accuracy_
Example #5
def minimize_loss_single_machine(loss,
                                 accuracy,
                                 layer_collection,
                                 device="/gpu:0",
                                 session_config=None):
  """Minimize loss with K-FAC on a single machine.

  A single Session is responsible for running all of K-FAC's ops. The covariance
  and inverse update ops are placed on `device`. All model variables are on CPU.

  Args:
    loss: 0-D Tensor. Loss to be minimized.
    accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
    layer_collection: LayerCollection instance describing model architecture.
      Used by K-FAC to construct preconditioner.
    device: string. Either '/cpu:0' or '/gpu:0'. The covariance and inverse
      update ops are run on this device.
    session_config: None or tf.ConfigProto. Configuration for tf.Session().

  Returns:
    final value for 'accuracy'.
  """
  # Train with K-FAC.
  g_step = tf.train.get_or_create_global_step()
  optimizer = opt.KfacOptimizer(
      learning_rate=0.0001,
      cov_ema_decay=0.95,
      damping=0.001,
      layer_collection=layer_collection,
      placement_strategy="round_robin",
      cov_devices=[device],
      inv_devices=[device],
      momentum=0.9)
  (cov_update_thunks,
   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()

  def make_update_op(update_thunks):
    update_ops = [thunk() for thunk in update_thunks]
    return tf.group(*update_ops)

  cov_update_op = make_update_op(cov_update_thunks)
  with tf.control_dependencies([cov_update_op]):
    inverse_op = tf.cond(
        tf.equal(tf.mod(g_step, _INVERT_EVERY), 0),
        lambda: make_update_op(inv_update_thunks), tf.no_op)
    with tf.control_dependencies([inverse_op]):
      with tf.device(device):
        train_op = optimizer.minimize(loss, global_step=g_step)

  tf.logging.info("Starting training.")
  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
    while not sess.should_stop():
      global_step_, loss_, accuracy_, _ = sess.run(
          [g_step, loss, accuracy, train_op])

      if global_step_ % _INVERT_EVERY == 0:
        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                        global_step_, loss_, accuracy_)

  return accuracy_
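make_vars_and_create_op_thunks returns zero-argument callables rather than finished ops; each thunk builds its op only when invoked, which lets the caller decide the control-dependency and device context in which the ops get built (as the tf.cond branch above does for the inverse updates). A toy illustration of the same deferred-construction pattern (nothing K-FAC specific; the variable and thunks are made up for the illustration):

import tensorflow as tf

v = tf.Variable(0.0)

# Zero-argument thunks: no op exists until a thunk is called.
thunks = [lambda: v.assign_add(1.0), lambda: v.assign_add(2.0)]

def make_update_op(update_thunks):
    update_ops = [thunk() for thunk in update_thunks]
    return tf.group(*update_ops)

with tf.device("/cpu:0"):   # ops built inside this block inherit the placement
    update_op = make_update_op(thunks)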
Example #6
def _make_distributed_train_op(task_id, num_worker_tasks, num_ps_tasks,
                               layer_collection):
    """Creates optimizer and distributed training op.

  Constructs KFAC optimizer and wraps it in `sync_replicas` optimizer. Makes
  the train op.

  Args:
    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
    num_worker_tasks: int. Number of workers in this distributed training setup.
    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
      parameter servers are not used.
    layer_collection: LayerCollection instance describing model architecture.
      Used by K-FAC to construct preconditioner.

  Returns:
    sync_optimizer: `tf.train.SyncReplicasOptimizer` instance which wraps KFAC
      optimizer.
    optimizer: Instance of `opt.KfacOptimizer`.
    global_step: `tensor`, Global step.
  """
    tf.logging.info("Task id : %d", task_id)
    with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
        global_step = tf.train.get_or_create_global_step()
        optimizer = opt.KfacOptimizer(learning_rate=0.0001,
                                      cov_ema_decay=0.95,
                                      damping=0.001,
                                      layer_collection=layer_collection,
                                      momentum=0.9)
        sync_optimizer = tf.train.SyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks),
            total_num_replicas=num_worker_tasks)
        return sync_optimizer, optimizer, global_step
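The returned `sync_optimizer` is what actually builds the train op; the K-FAC optimizer underneath still owns the covariance and inverse update thunks. A hedged sketch of how the three return values would typically be consumed (`loss`, `master`, and `session_config` are assumed to be provided by the caller; this continuation is standard SyncReplicasOptimizer usage rather than part of the example):

sync_optimizer, optimizer, global_step = _make_distributed_train_op(
    task_id, num_worker_tasks, num_ps_tasks, layer_collection)

train_op = sync_optimizer.minimize(loss, global_step=global_step)
sync_hook = sync_optimizer.make_session_run_hook(is_chief=(task_id == 0))

with tf.train.MonitoredTrainingSession(master=master,
                                       is_chief=(task_id == 0),
                                       hooks=[sync_hook],
                                       config=session_config) as sess:
    while not sess.should_stop():
        sess.run(train_op)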
Example #7
    def testSquaredFisherNorm(self):
        with ops.Graph().as_default(), self.cached_session() as sess:
            grads_and_vars = [(array_ops.constant([[1., 2.], [3., 4.]]), None),
                              (array_ops.constant([[2., 3.], [4., 5.]]), None)]
            pgrads_and_vars = [(array_ops.constant([[3., 4.], [5., 6.]]), None),
                               (array_ops.constant([[7., 8.], [9., 10.]]), None)]
            opt = optimizer.KfacOptimizer(0.1, 0.2, 0.3,
                                          dummy_layer_collection())
            sq_norm = opt._squared_fisher_norm(grads_and_vars, pgrads_and_vars)
            self.assertAlmostEqual(174., sess.run(sq_norm), places=5)
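The expected constant is just the sum, over both variables, of the elementwise products of each gradient with its preconditioned counterpart, which is what the assertion implies _squared_fisher_norm computes:

sq_norm = (1*3 + 2*4 + 3*5 + 4*6) + (2*7 + 3*8 + 4*9 + 5*10)
assert sq_norm == 50 + 124 == 174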
Example #8
    def testApplyGradients(self):
        with ops.Graph().as_default(), self.cached_session() as sess:
            layer_collection = lc.LayerCollection()

            inputs = array_ops.ones((2, 1)) * 2
            weights_val = np.ones((1, 1), dtype=np.float32) * 3.
            weights = variable_scope.get_variable(
                'w', initializer=array_ops.constant(weights_val))
            bias = variable_scope.get_variable(
                'b', initializer=init_ops.zeros_initializer(), shape=(1, 1))
            output = math_ops.matmul(inputs, weights) + bias

            layer_collection.register_fully_connected((weights, bias), inputs,
                                                      output)

            logits = math_ops.tanh(output)
            targets = array_ops.constant([[0.], [1.]])
            output = math_ops.reduce_mean(
                nn.softmax_cross_entropy_with_logits(logits=logits,
                                                     labels=targets))

            layer_collection.register_categorical_predictive_distribution(
                logits)

            opt = optimizer.KfacOptimizer(0.1,
                                          0.2,
                                          0.3,
                                          layer_collection,
                                          momentum=0.5,
                                          momentum_type='regular')
            (cov_update_thunks,
             inv_update_thunks) = opt.make_vars_and_create_op_thunks()
            cov_update_ops = tuple(thunk() for thunk in cov_update_thunks)
            inv_update_ops = tuple(thunk() for thunk in inv_update_thunks)

            grads_and_vars = opt.compute_gradients(output, [weights, bias])
            all_vars = [grad_and_var[1] for grad_and_var in grads_and_vars]

            op = opt.apply_gradients(grads_and_vars)

            sess.run(tf_variables.global_variables_initializer())
            old_vars = sess.run(all_vars)
            sess.run(cov_update_ops)
            sess.run(inv_update_ops)
            sess.run(op)
            new_vars = sess.run(all_vars)

            for old_var, new_var in zip(old_vars, new_vars):
                self.assertNotEqual(old_var, new_var)
Example #9
    def testUpdateVelocities(self):
        with ops.Graph().as_default(), self.test_session() as sess:
            layers = lc.LayerCollection()
            layers.losses = [
                lf.CategoricalLogitsNegativeLogProbLoss(
                    array_ops.constant([1.0]))
            ]
            opt = optimizer.KfacOptimizer(0.1,
                                          0.2,
                                          0.3,
                                          layers,
                                          momentum=0.5,
                                          momentum_type='regular')
            x = variable_scope.get_variable('x',
                                            initializer=array_ops.ones((2, 2)))
            y = variable_scope.get_variable('y',
                                            initializer=array_ops.ones(
                                                (2, 2)) * 2)
            vec1 = array_ops.ones((2, 2)) * 3
            vec2 = array_ops.ones((2, 2)) * 4

            model_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
            update_op = opt._update_velocities([(vec1, x), (vec2, y)], 0.5)
            opt_vars = [
                v for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
                if v not in model_vars
            ]

            sess.run(tf_variables.global_variables_initializer())
            old_opt_vars = sess.run(opt_vars)

            # Optimizer vars start out at 0.
            for opt_var in old_opt_vars:
                self.assertAllEqual(sess.run(array_ops.zeros_like(opt_var)),
                                    opt_var)

            sess.run(update_op)
            new_opt_vars = sess.run(opt_vars)
            # After one update, the velocities are equal to the vectors.
            for vec, opt_var in zip([vec1, vec2], new_opt_vars):
                self.assertAllEqual(sess.run(vec), opt_var)

            sess.run(update_op)
            final_opt_vars = sess.run(opt_vars)
            for first, second in zip(new_opt_vars, final_opt_vars):
                self.assertFalse(np.equal(first, second).all())
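The assertions pin down the velocity recursion: starting from zero, one update with decay 0.5 leaves the velocities equal to the vectors, and a second update moves them again. That is consistent with a standard heavy-ball accumulation; a NumPy sketch under that assumption (the exact form used by _update_velocities is not shown here):

import numpy as np

def update_velocity(velocity, vec, decay):
    # Assumed recursion: v <- decay * v + vec, consistent with the assertions above.
    return decay * velocity + vec

vec = 3.0 * np.ones((2, 2))
v = np.zeros((2, 2))              # the optimizer's velocity slots start at zero
v = update_velocity(v, vec, 0.5)  # == vec, matching the "after one update" check
v = update_velocity(v, vec, 0.5)  # == 1.5 * vec, so the second run changes the slots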
Example #10
def build_model(examples, labels, num_labels, layer_collection, device):
    """Builds a ConvNet classification model.

  Args:
    examples: Tensor of shape [num_examples, num_features]. Represents inputs of
      model.
    labels: Tensor of shape [num_examples]. Contains integer IDs to be predicted
      by softmax for each example.
    num_labels: int. Number of distinct values 'labels' can take on.
    layer_collection: LayerCollection instance. Layers will be registered here.
    device: string. Device on which the covariance update ops are placed.

  Returns:
    None. The per-sample covariance statistics are accumulated, averaged, and
    saved to disk with joblib instead of returning a loss/accuracy pair.
  """
    # Build a ConvNet. For each layer with parameters, we'll keep track of the
    # preactivations, activations, weights, and bias.
    tf.logging.info("Building model.")
    pre0, act0, params0 = conv_layer(layer_id=0,
                                     inputs=examples,
                                     kernel_size=3,
                                     out_channels=32)

    # act1 = max_pool_layer(layer_id=1, inputs=act0, kernel_size=2, stride=2)

    pre2, act2, params2 = conv_layer(layer_id=2,
                                     inputs=act0,
                                     kernel_size=3,
                                     out_channels=64)

    # act3 = max_pool_layer(layer_id=6, inputs=act2, kernel_size=2, stride=2)

    pre4, act4, params4 = conv_layer(layer_id=3,
                                     inputs=act2,
                                     kernel_size=3,
                                     out_channels=64)

    flat_act5 = tf.reshape(act4, shape=[-1, int(np.prod(act4.shape[1:4]))])

    pre5, act5, params5 = linear_layer(layer_id=4,
                                       inputs=flat_act5,
                                       output_size=512)

    logits, _, params6 = linear_layer(layer_id=5,
                                      inputs=act5,
                                      output_size=num_labels)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                       logits=logits))
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(labels, tf.argmax(logits, axis=1)), dtype=tf.float32))

    val_list = list(params0 + params2 + params4 + params5 + params6)

    with tf.device("/cpu:0"):
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("accuracy", accuracy)

    # Register parameters. K-FAC needs to know about the inputs, outputs, and
    # parameters of each conv/fully connected layer and the logits powering the
    # posterior probability over classes.
    tf.logging.info("Building LayerCollection.")
    layer_collection.register_conv2d(params0, (1, 1, 1, 1), "VALID", examples,
                                     pre0)
    layer_collection.register_conv2d(params2, (1, 1, 1, 1), "VALID", act0,
                                     pre2)
    layer_collection.register_conv2d(params4, (1, 1, 1, 1), "VALID", act2,
                                     pre4)
    layer_collection.register_fully_connected(params5, flat_act5, pre5)
    layer_collection.register_fully_connected(params6, act5, logits)
    layer_collection.register_categorical_predictive_distribution(
        logits, name="logits")

    g_step = tf.train.get_or_create_global_step()
    optimizer = opt.KfacOptimizer(learning_rate=0.0001,
                                  cov_ema_decay=0.95,
                                  damping=0.001,
                                  layer_collection=layer_collection,
                                  placement_strategy="round_robin",
                                  cov_devices=[device],
                                  inv_devices=[device],
                                  momentum=0.9)

    data = np.load('distillation_data/simple_random_3.npz')
    observation = data['observation'][:6000]

    (cov_update_thunks,
     inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
    sess = tf.InteractiveSession()

    def make_update_op(update_thunks):
        update_ops = [thunk() for thunk in update_thunks]
        # Return the individual ops (instead of tf.group-ing them) so the
        # per-factor covariance values can be fetched and accumulated below.
        return update_ops
        # return tf.group(*update_ops)

    cov_update_op = make_update_op(cov_update_thunks)
    # train_op = optimizer.minimize(loss, global_step=g_step)

    sess.run(tf.global_variables_initializer())
    param = joblib.load('initial_parameter/420000')
    for i in range(10):
        sess.run(val_list[i].assign(param[i]))

    F_accum = []
    num_sample = observation.shape[0]
    print(num_sample)
    for i in range(num_sample):
        F = sess.run(cov_update_op, feed_dict={examples: observation[i:i + 1]})
        for index in range(len(F)):
            if i == 0:
                F_accum.append(F[index])
            else:
                F_accum[index] += F[index]
    for i in range(len(F_accum)):
        F_accum[i] /= num_sample
    joblib.dump(F_accum, 'fisher_matrix_tf/simple_agent_3_random_6000')
    return
Example #11
def train_mnist_multitower(data_dir, num_epochs, num_towers,
                           use_fake_data=True, devices=None):
  """Train a ConvNet on MNIST.

  Training data is split equally among the towers. Each tower computes loss on
  its own batch of data and the loss is aggregated on the CPU. The model
  variables are placed on the first tower. The covariance and inverse update
  ops and variables are placed on GPUs in a round-robin manner.

  Args:
    data_dir: string. Directory to read MNIST examples from.
    num_epochs: int. Number of passes to make over the training set.
    num_towers: int. Number of CPUs to split inference across.
    use_fake_data: bool. If True, generate a synthetic dataset.
    devices: list of device strings or None. The covariance and inverse
      update ops are run on these devices; defaults to one CPU device per tower.

  Returns:
    accuracy of model on the final minibatch of training data.
  """
  if devices:
    device_count = {"GPU": num_towers}
  else:
    device_count = {"CPU": num_towers}

  devices = devices or [
      "/cpu:{}".format(tower_id) for tower_id in range(num_towers)
  ]
  # Load a dataset.
  tf.logging.info("Loading MNIST into memory.")
  tower_batch_size = 128
  batch_size = tower_batch_size * num_towers
  tf.logging.info(
      ("Loading MNIST into memory. Using batch_size = %d = %d towers * %d "
       "tower batch size.") % (batch_size, num_towers, tower_batch_size))
  examples, labels = mnist.load_mnist(
      data_dir,
      num_epochs=num_epochs,
      batch_size=batch_size,
      use_fake_data=use_fake_data,
      flatten_images=False)

  # Split minibatch across towers.
  examples = tf.split(examples, num_towers)
  labels = tf.split(labels, num_towers)

  # Build an MLP. Each tower's layers will be added to the LayerCollection.
  layer_collection = lc.LayerCollection()
  tower_results = []
  for tower_id in range(num_towers):
    with tf.device(devices[tower_id]):
      with tf.name_scope("tower%d" % tower_id):
        with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
          tf.logging.info("Building tower %d." % tower_id)
          tower_results.append(
              build_model(examples[tower_id], labels[tower_id], 10,
                          layer_collection))
  losses, accuracies = zip(*tower_results)

  # Average across towers.
  loss = tf.reduce_mean(losses)
  accuracy = tf.reduce_mean(accuracies)

  # Fit model.

  session_config = tf.ConfigProto(
      allow_soft_placement=False,
      device_count=device_count,
  )

  g_step = tf.train.get_or_create_global_step()
  optimizer = opt.KfacOptimizer(
      learning_rate=0.0001,
      cov_ema_decay=0.95,
      damping=0.001,
      layer_collection=layer_collection,
      placement_strategy="round_robin",
      cov_devices=devices,
      inv_devices=devices,
      momentum=0.9)
  (cov_update_thunks,
   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()

  def make_update_op(update_thunks):
    update_ops = [thunk() for thunk in update_thunks]
    return tf.group(*update_ops)

  cov_update_op = make_update_op(cov_update_thunks)
  with tf.control_dependencies([cov_update_op]):
    inverse_op = tf.cond(
        tf.equal(tf.mod(g_step, _INVERT_EVERY), 0),
        lambda: make_update_op(inv_update_thunks), tf.no_op)
    with tf.control_dependencies([inverse_op]):
      train_op = optimizer.minimize(loss, global_step=g_step)

  tf.logging.info("Starting training.")
  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
    while not sess.should_stop():
      global_step_, loss_, accuracy_, _ = sess.run(
          [g_step, loss, accuracy, train_op])

      if global_step_ % _INVERT_EVERY == 0:
        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                        global_step_, loss_, accuracy_)
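A hedged driver for the function above (the argument values are illustrative only; the function relies on _INVERT_EVERY and mnist.load_mnist from its own module):

train_mnist_multitower(data_dir="/tmp/mnist_data",
                       num_epochs=1,
                       num_towers=2,
                       use_fake_data=True,   # synthetic data, no download needed
                       devices=None)         # one CPU device per tower by default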