Example #1
 def finalize(self):
     if context.executing_eagerly():
         # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
         raise NotImplementedError(
             'Eager mode not supported in TPUStrategy.')
     else:
         return [tpu.shutdown_system()]
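The shutdown op returned by `finalize` is meant to be run once, after training has finished. Below is a minimal standalone sketch of the same initialize/shutdown pairing, assuming the TF 1.x `tf.contrib.tpu` API and a hypothetical TPU worker address:

# Standalone sketch (not part of the strategy above). The TPU address is
# hypothetical; tf.contrib.tpu is the TF 1.x API used throughout this page.
import tensorflow as tf
from tensorflow.contrib import tpu

with tf.Session('grpc://10.240.1.2:8470') as sess:
    sess.run(tpu.initialize_system())   # bring the TPU system up once
    # ... run training or evaluation steps here ...
    sess.run(tpu.shutdown_system())     # release the TPU cleanly before exit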
Example #2
 def _run_tpu_initialization(self):
     """Test TPU system initialization."""
     with tf.Session('grpc://{0}:8470'.format(self.tpu_ip)) as sess:
         sess.run(tpu.initialize_system())
         sess.run(tpu.shutdown_system())
         logging.info('Successfully initialized and shut down the TPU')
     self.tpu_initialization = 'Passed'
Example #3
  def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
                                    renorm, is_tpu,
                                    update_ops_in_cross_tower_mode):
    """Verifies that moving mean updates are reduced across towers."""
    with distribution.scope():
      num_towers = len(distribution.worker_devices)
      model_fn, dataset_fn, batchnorm = batchnorm_example(
          optimizer_fn,
          batch_per_epoch=num_towers,
          momentum=momentum,
          renorm=renorm,
          update_ops_in_tower_mode=not update_ops_in_cross_tower_mode)

      # Make sure prefetching is disabled, since it makes the specific input
      # on each device non-deterministic, and this test relies on specific
      # inputs being on each device.
      if isinstance(distribution, mirrored_strategy.MirroredStrategy):
        self.assertFalse(distribution._prefetch_on_device)
      iterator = distribution.distribute_dataset(
          dataset_fn).make_one_shot_iterator()

      def run_step():
        fetches = distribution.unwrap(
            distribution.call_for_each_tower(
                model_fn, iterator.get_next(),
                run_concurrently=batchnorm.built))
        if update_ops_in_cross_tower_mode:
          fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
        return control_flow_ops.group(fetches)

      if not context.executing_eagerly():
        with self.test_session() as sess:
          if is_tpu:
            sess.run(tpu.initialize_system())
          run_step = sess.make_callable(run_step())
        self.evaluate(variables_lib.global_variables_initializer())

      expected_moving_means = [0.] * 8

      def averaged_batch_mean(i):
        # Each batch has shape [16, 8] where the ith element in the jth row is
        # (8 * j + i + tower_id * 100). So the batch mean in each tower is
        # (60 + i + tower_id * 100), and the batch mean averaged over all
        # towers is:
        return 60. + i + (num_towers - 1.) / 2. * 100.

      for _ in range(10):
        run_step()
        moving_means = self.evaluate(distribution.fetch(batchnorm.moving_mean))

        # We make sure that the moving_mean is updated as if the sample mean is
        # calculated over all towers.
        for i, expected_moving_mean in enumerate(expected_moving_means):
          expected_moving_means[i] -= ((
              expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
          self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)

      if is_tpu:
        with self.test_session() as sess:
          sess.run(tpu.shutdown_system())
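The derivation in `averaged_batch_mean` can be sanity-checked with plain NumPy; the sketch below assumes two towers and is not part of the test:

import numpy as np

num_towers = 2
# Rebuild the batches described in the comment above: element (j, i) of the
# batch on tower `tower_id` is 8 * j + i + tower_id * 100, shape [16, 8].
batches = [np.array([[8 * j + i + tower_id * 100 for i in range(8)]
                     for j in range(16)], dtype=np.float64)
           for tower_id in range(num_towers)]
per_tower_means = [b.mean(axis=0) for b in batches]   # 60 + i + 100 * tower_id
print(np.mean(per_tower_means, axis=0))               # 110 + i, matching averaged_batch_mean(i)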
Example #4
    def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                         is_tpu):
        with distribution.scope():
            model_fn, dataset, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=use_callable_loss)

            # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
            # `DistributionStrategy.create_monitor` so that each DistributionStrategy
            # could influence its training loop. That method would return an instance
            # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
            # tpu.shutdown_system().
            if is_tpu:
                dataset = dataset.batch(2)

            iterator = distribution.distribute_dataset(dataset)

            def run_step():
                # TODO(isaprykin): Make iterator get_next() return a list of sub-
                # batches for each iteration. Pass iterator.get_next() and not iterator
                # to call_for_each_tower.
                return distribution.group(
                    distribution.call_for_each_tower(
                        model_fn,
                        iterator.get_next() if not is_tpu else iterator,
                        run_concurrently=layer.built))

            if not context.executing_eagerly():
                with self.test_session() as sess:
                    if is_tpu:
                        sess.run(tpu.initialize_system())
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            weights, biases = [], []
            for _ in range(10):
                run_step()

                weights.append(self.evaluate(distribution.fetch(layer.kernel)))
                biases.append(self.evaluate(distribution.fetch(layer.bias)))

            if is_tpu:
                with self.test_session() as sess:
                    sess.run(tpu.shutdown_system())

            error = abs(
                numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
            is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
            self.assertTrue(is_not_increasing)
Example #5
def shutdown_tpu_session(session=None):
    """Shut down the TPU attached to the session.

    This should be called to cleanly shut down the TPU system before the
    client exits.

    Args:
      session: Session to shut down, or None to use the default session.
    """
    if session is None:
        session = K.get_session()

    session.run(tpu.shutdown_system())
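A hypothetical call site for `shutdown_tpu_session`, assuming a Keras-on-TPU run where `K.get_session()` returns the session attached to the TPU (the model and data names below are assumptions):

# Hypothetical usage sketch: shut the TPU down exactly once, right before the
# client process exits, even if training raises.
try:
    model.fit(x_train, y_train, epochs=1)   # assumed Keras TPU model and data
finally:
    shutdown_tpu_session()                  # falls back to K.get_session()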
Example #6
def shutdown_tpu_session(session=None):
  """Shutdown the TPU attached to session.

  This should be called to cleanly shut down the TPU system before the client
  exits.

  Args:
    session: Session to shutdown, or None to use the default session.

  Returns:

  """
  if session is None:
    session = K.get_session()

  session.run(tpu.shutdown_system())
Example #7
  def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss,
                       is_tpu):
    # TODO(priyag): Remove this once the step TPU Strategy is stable.
    if is_tpu:
      self.skipTest("TPU tests are WIP.")

    with distribution.scope():
      model_fn, dataset_fn, layer = minimize_loss_example(
          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)

      # TODO(isaprykin):  Eliminate `is_tpu`. Probably add a
      # `DistributionStrategy.create_monitor` so that each DistributionStrategy
      # could influence its training loop. That method would return an instance
      # of Monitor.  TPUMonitor would execute tpu.initialize_system() and
      # tpu.shutdown_system().
      iterator = distribution.distribute_dataset(
          dataset_fn).make_one_shot_iterator()

      def run_step():
        return distribution.group(
            distribution.call_for_each_tower(
                model_fn, iterator.get_next(), run_concurrently=layer.built))

      if not context.executing_eagerly():
        with self.test_session() as sess:
          if is_tpu:
            sess.run(tpu.initialize_system())
          run_step = sess.make_callable(run_step())
        self.evaluate(variables_lib.global_variables_initializer())

      weights, biases = [], []
      for _ in range(10):
        run_step()

        weights.append(self.evaluate(layer.kernel))
        biases.append(self.evaluate(layer.bias))

      if is_tpu:
        with self.test_session() as sess:
          sess.run(tpu.shutdown_system())

      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
      self.assertTrue(is_not_increasing)
Example #8
  def run_on_device(self, model_fn, model_inputs, device):
    """Runs `model_fn` on the given device.

    Raises an exception if no such device is available.  `model_fn` should
    return one or more tensors as a list or tuple.

    Args:
      model_fn: Function returning one or more tensors.
      model_inputs: An iterable of Numpy arrays or scalars.
                    These will be passed as arguments to `model_fn`.
      device: Device to run on.  One of ("tpu", "gpu", "cpu").

    Returns:
      Output from the model function.
    """
    def _make_placeholders():
      return dict(
          [(gen_array_ops.placeholder_with_default(v, v.shape), v)
           for v in model_inputs])

    if device == "tpu":
      with self.test_session(graph=ops.Graph()) as sess:
        placeholders = _make_placeholders()
        tpu_computation = tpu.rewrite(model_fn, placeholders.keys())
        sess.run(tpu.initialize_system())
        sess.run(variables.global_variables_initializer())
        result = sess.run(tpu_computation, placeholders)
        sess.run(tpu.shutdown_system())
        # TODO(b/36891278): support non-flat return lists in tpu.rewrite().
        if len(result) == 1:
          return result[0]
        return result
    elif device == "gpu":
      with self.test_session(graph=ops.Graph(), use_gpu=True) as sess:
        placeholders = _make_placeholders()
        sess.run(variables.global_variables_initializer())
        return sess.run(model_fn(placeholders.keys()), placeholders)
    elif device == "cpu":
      # TODO(power) -- will this interact poorly with cached GPU sessions?
      with self.test_session(graph=ops.Graph(), use_gpu=False) as sess:
        placeholders = _make_placeholders()
        sess.run(variables.global_variables_initializer())
        return sess.run(model_fn(placeholders.keys()), placeholders)
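A hypothetical call for the `device="cpu"` path, where `model_fn` receives the list of generated placeholders as a single argument. The function name, shapes, and imports below are assumptions, and the snippet is meant to run inside the same test class:

import numpy as np
from tensorflow.python.ops import math_ops

def sum_fn(tensors):
    # Receives the list of placeholders built by _make_placeholders().
    return [math_ops.add_n(list(tensors))]

inputs = [np.ones((2, 2), np.float32), np.ones((2, 2), np.float32)]
result = self.run_on_device(sum_fn, inputs, device="cpu")
# result[0] is a 2x2 matrix of 2s under these assumptions.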
Example #9
    def _run_tpu_computation(self):
        """Attempt to run computation graph directly on TPU."""
        def _computation_fn(alpha, x, y):
            return alpha * x + y

        alpha = tf.Variable(3.0, name='alpha')
        x = tf.Variable(tf.ones([3, 3], tf.float32), name='x')
        y = tf.Variable(tf.ones([3, 3], tf.float32), name='y')

        result = contrib_tpu.rewrite(_computation_fn, [alpha, x, y])

        with tf.Session('grpc://{0}:8470'.format(self.tpu_ip)) as sess:
            sess.run(contrib_tpu.initialize_system())
            sess.run(tf.global_variables_initializer())
            logging.info(sess.run(result))
            sess.run(contrib_tpu.shutdown_system())
            logging.info('Output should be a 3x3 matrix with all 4s.')
        self.tpu_computation = 'Passed'
        logging.info('Successfully ran a computation on the TPU')
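The expected output mentioned in the log message can be checked with NumPy alone:

import numpy as np
# alpha * x + y with alpha = 3 and 3x3 matrices of ones is a 3x3 matrix of 4s.
print(3.0 * np.ones((3, 3)) + np.ones((3, 3)))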
Example #10
  def run_on_device(self, model_fn, model_inputs, device):
    """Runs `model_fn` on the given device.

    Raises an exception if no such device is available.  `model_fn` should
    return one or more tensors as a list or tuple.

    Args:
      model_fn: Function returning one or more tensors.
      model_inputs: An iterable of Numpy arrays or scalars.
                    These will be passed as arguments to `model_fn`.
      device: Device to run on.  One of ("tpu", "gpu", "cpu").

    Returns:
      Output from the model function.
    """

    def _make_placeholders():
      return dict([(gen_array_ops.placeholder_with_default(v, v.shape), v)
                   for v in model_inputs])

    if device == "tpu":
      with self.test_session(graph=ops.Graph()) as sess:
        placeholders = _make_placeholders()
        tpu_computation = tpu.rewrite(model_fn, placeholders.keys())
        sess.run(tpu.initialize_system())
        sess.run(variables.global_variables_initializer())
        result = sess.run(tpu_computation, placeholders)
        sess.run(tpu.shutdown_system())
        # TODO(b/36891278): support non-flat return lists in tpu.rewrite().
        if len(result) == 1:
          return result[0]
        return result
    elif device == "gpu":
      with self.test_session(graph=ops.Graph(), use_gpu=True) as sess:
        placeholders = _make_placeholders()
        sess.run(variables.global_variables_initializer())
        return sess.run(model_fn(placeholders.keys()), placeholders)
    elif device == "cpu":
      # TODO(power) -- will this interact poorly with cached GPU sessions?
      with self.test_session(graph=ops.Graph(), use_gpu=False) as sess:
        placeholders = _make_placeholders()
        sess.run(variables.global_variables_initializer())
        return sess.run(model_fn(placeholders.keys()), placeholders)
Example #11
  def shutdown(self):
    logging.info('Shutting down TPU session.')
    with self.tpu_session() as session:
      session.run(tpu.shutdown_system())

    self._session.close()
Example #12
  def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                    use_callable_loss, is_tpu):
    with distribution.scope():
      all_vars = []

      def model_fn(x, y):

        def loss_fn():
          # Use fixed initialization to make the steps deterministic.
          w = variable_scope.get_variable("w", initializer=[[2.]])
          all_vars.append(w)
          predict = math_ops.matmul(x, w)
          return losses_impl.mean_squared_error(
              y, predict, reduction=loss_reduction)

        optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate

        if use_callable_loss:
          return optimizer.minimize(loss_fn)
        else:
          return optimizer.minimize(loss_fn())

      def dataset_fn():
        features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
        labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
        return dataset_ops.Dataset.zip((features, labels)).repeat()

      iterator = distribution.distribute_dataset(
          dataset_fn).make_one_shot_iterator()

      def run_step():
        return distribution.group(
            distribution.call_for_each_tower(
                model_fn, *iterator.get_next(), run_concurrently=False))

      if not context.executing_eagerly():
        with self.test_session() as sess:
          if is_tpu:
            sess.run(tpu.initialize_system())
          run_step = sess.make_callable(run_step())
        self.evaluate(variables_lib.global_variables_initializer())

      run_step()

      v = all_vars[0]
      self.assertTrue(all([v is vi for vi in all_vars[1:]]))
      weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
      # Our model is:
      #   predict = x * w
      #   loss = (predict - y)^2
      #   dloss/dpredict = 2*(predict - y)
      #   dloss/dw = 2 * x^T @ (predict - y)
      # For our batch size of 2, assuming sum loss reduction:
      #   x = [2, 7]
      #   y = [6, 21]
      #   w_initial = 2
      #   predict = [4, 14]
      #   predict - y = [-2, -7]
      #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
      # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
      # with sum loss reduction, or 10.6 with mean.
      if loss_reduction == losses_impl.Reduction.SUM:
        # Note that the "distribution.num_towers" factor will go away once
        # we split the input across towers, instead of pulling a complete
        # batch of input per tower.
        self.assertNear(weight, 2 + 21.2 * distribution.num_towers, 0.0001)
      else:
        # One of the mean loss reductions.
        self.assertNear(weight, 2 + 10.6, 0.0001)

      if is_tpu:
        with self.test_session() as sess:
          sess.run(tpu.shutdown_system())
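The gradient arithmetic in the comment above can be verified with a few lines of NumPy (standalone, not part of the test):

import numpy as np

x = np.array([[2.0], [7.0]])
y = np.array([[6.0], [21.0]])
w = np.array([[2.0]])
predict = x @ w                       # [[4.], [14.]]
grad_sum = 2.0 * x.T @ (predict - y)  # dloss/dw under SUM reduction: [[-106.]]
print(-0.2 * grad_sum)                # [[21.2]]  update with lr = 0.2 and SUM
print(-0.2 * grad_sum / 2.0)          # [[10.6]]  update with MEAN over the batch of 2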
Example #13
 def _check():
   with session.Session() as sess:
     sess.run(tpu.initialize_system())
     sess.run(tpu.shutdown_system())
Example #14
 def finalize(self):
   if context.executing_eagerly():
     # TODO(priyag): Add appropriate call here when eager is supported for TPUs.
     raise NotImplementedError('Eager mode not supported in TPUStrategy.')
   else:
     return [tpu.shutdown_system()]
Example #15
  def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
    created_variables = []
    trainable_variables = []

    def appending_creator(next_creator, *args, **kwargs):
      v = next_creator(*args, **kwargs)
      created_variables.append(v.name)
      if "trainable" in kwargs and kwargs["trainable"]:
        trainable_variables.append(v.name)
      return v

    # Creator scope needs to be set before it's used inside
    # `distribution.scope`.
    with variable_scope.variable_creator_scope(
        appending_creator), distribution.scope():
      model_fn, dataset_fn, layer = minimize_loss_example(
          optimizer_fn,
          use_bias=True,
          use_callable_loss=True,
          create_optimizer_inside_model_fn=True)

      iterator = distribution.distribute_dataset(
          dataset_fn).make_one_shot_iterator()

      def run_step():
        return distribution.group(
            distribution.call_for_each_tower(
                model_fn, iterator.get_next(), run_concurrently=layer.built))

      if not context.executing_eagerly():
        with self.test_session() as sess:
          if is_tpu:
            sess.run(tpu.initialize_system())
          run_step = sess.make_callable(run_step())
        self.evaluate(variables_lib.global_variables_initializer())

      run_step()

      if is_tpu:
        with self.test_session() as sess:
          sess.run(tpu.shutdown_system())

      def get_expected_variables(optimizer_fn, num_parameter_devices):
        variables_map = {
            "GradientDescent": ["dense/kernel", "dense/bias"],
            "Adam": [
                "dense/kernel", "dense/bias", "beta1_power", "beta2_power",
                "dense/kernel/Adam", "dense/kernel/Adam_1", "dense/bias/Adam",
                "dense/bias/Adam_1"
            ]
        }
        variables = variables_map[optimizer_fn().get_name()]
        variables.extend([
            v + "/replica_{}".format(replica)
            for v in variables
            for replica in range(1, num_parameter_devices)
        ])
        return set([v + ":0" for v in variables])

      self.assertEqual(
          get_expected_variables(optimizer_fn,
                                 len(distribution.parameter_devices)),
          set(created_variables))
Example #16
 def _check():
   with tf_session.Session() as sess:
     sess.run(tpu.initialize_system())
     sess.run(tpu.shutdown_system())
Example #17
    def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
                      use_callable_loss, is_tpu):
        with distribution.scope():
            all_vars = []

            def model_fn(x, y):
                def loss_fn():
                    # Use fixed initialization to make the steps deterministic.
                    w = variable_scope.get_variable("w", initializer=[[2.]])
                    all_vars.append(w)
                    predict = math_ops.matmul(x, w)
                    return losses_impl.mean_squared_error(
                        y, predict, reduction=loss_reduction)

                optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate

                if use_callable_loss:
                    return optimizer.minimize(loss_fn)
                else:
                    return optimizer.minimize(loss_fn())

            def dataset_fn():
                features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
                labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
                return dataset_ops.Dataset.zip((features, labels)).repeat()

            iterator = distribution.distribute_dataset(
                dataset_fn).make_one_shot_iterator()

            def run_step():
                return distribution.group(
                    distribution.call_for_each_tower(model_fn,
                                                     *iterator.get_next(),
                                                     run_concurrently=False))

            if not context.executing_eagerly():
                with self.test_session() as sess:
                    if is_tpu:
                        sess.run(tpu.initialize_system())
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            run_step()

            v = all_vars[0]
            self.assertTrue(all([v is vi for vi in all_vars[1:]]))
            weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
            # Our model is:
            #   predict = x * w
            #   loss = (predict - y)^2
            #   dloss/dpredict = 2*(predict - y)
            #   dloss/dw = 2 * x^T @ (predict - y)
            # For our batch size of 2, assuming sum loss reduction:
            #   x = [2, 7]
            #   y = [6, 21]
            #   w_initial = 2
            #   predict = [4, 14]
            #   predict - y = [-2, -7]
            #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
            # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
            # with sum loss reduction, or 10.6 with mean.
            if loss_reduction == losses_impl.Reduction.SUM:
                # Note that the "distribution.num_towers" factor will go away once
                # we split the input across towers, instead of pulling a complete
                # batch of input per tower.
                self.assertNear(weight, 2 + 21.2 * distribution.num_towers,
                                0.0001)
            else:
                # One of the mean loss reductions.
                self.assertNear(weight, 2 + 10.6, 0.0001)

            if is_tpu:
                with self.test_session() as sess:
                    sess.run(tpu.shutdown_system())
Example #18
    def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu):
        created_variables = []
        trainable_variables = []

        def appending_creator(next_creator, *args, **kwargs):
            v = next_creator(*args, **kwargs)
            created_variables.append(v.name)
            if "trainable" in kwargs and kwargs["trainable"]:
                trainable_variables.append(v.name)
            return v

        # Creator scope needs to be set before it's used inside
        # `distribution.scope`.
        with variable_scope.variable_creator_scope(
                appending_creator), distribution.scope():
            model_fn, dataset_fn, layer = minimize_loss_example(
                optimizer_fn,
                use_bias=True,
                use_callable_loss=True,
                create_optimizer_inside_model_fn=True)

            iterator = distribution.distribute_dataset(
                dataset_fn).make_one_shot_iterator()

            def run_step():
                return distribution.group(
                    distribution.call_for_each_tower(
                        model_fn,
                        iterator.get_next(),
                        run_concurrently=layer.built))

            if not context.executing_eagerly():
                with self.test_session() as sess:
                    if is_tpu:
                        sess.run(tpu.initialize_system())
                    run_step = sess.make_callable(run_step())
                self.evaluate(variables_lib.global_variables_initializer())

            run_step()

            if is_tpu:
                with self.test_session() as sess:
                    sess.run(tpu.shutdown_system())

            def get_expected_variables(optimizer_fn, num_parameter_devices):
                variables_map = {
                    "GradientDescent": ["dense/kernel", "dense/bias"],
                    "Adam": [
                        "dense/kernel", "dense/bias", "beta1_power",
                        "beta2_power", "dense/kernel/Adam",
                        "dense/kernel/Adam_1", "dense/bias/Adam",
                        "dense/bias/Adam_1"
                    ]
                }
                variables = variables_map[optimizer_fn().get_name()]
                variables.extend([
                    v + "/replica_{}".format(replica) for v in variables
                    for replica in range(1, num_parameter_devices)
                ])
                return set([v + ":0" for v in variables])

            self.assertEqual(
                get_expected_variables(optimizer_fn,
                                       len(distribution.parameter_devices)),
                set(created_variables))
Example #19
 def get_finalize_ops(self):
     return [tpu.shutdown_system()]
Example #20
    def shutdown(self):
        logging.info('Shutting down TPU session.')
        with self.tpu_session() as session:
            session.run(tpu.shutdown_system())

        self._session.close()
Example #21
 def begin(self):
   self._enqueue_ops = self._enqueue_fn()
   logging.info('TPU job name %s', self._tpu_job)
   self._init_op = [tpu.initialize_system(job=self._tpu_job)]
   self._finalize_op = [tpu.shutdown_system(job=self._tpu_job)]
Example #22
 def begin(self):
     self._enqueue_ops = self._enqueue_fn()
     logging.info('TPU job name %s', self._tpu_job)
     self._init_op = [tpu.initialize_system(job=self._tpu_job)]
     self._finalize_op = [tpu.shutdown_system(job=self._tpu_job)]
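Assuming `begin` here belongs to a `tf.train.SessionRunHook`, the ops it builds would typically be consumed in the hook's other callbacks; the method bodies below are a hedged sketch, not the source:

# Hypothetical companion methods for the hook above (sketch only).
def after_create_session(self, session, coord):
    session.run(self._init_op)        # runs tpu.initialize_system(job=self._tpu_job)

def end(self, session):
    session.run(self._finalize_op)    # runs tpu.shutdown_system(job=self._tpu_job)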