def test_times_variable_arguments(self):
   c1 = combinations.combine(mode=["graph", "eager"])
   c2 = combinations.combine(optimizer=["adam", "gd"])
   c3 = combinations.combine(distribution=["d1", "d2"])
   c4 = combinations.times(c3, c1, c2)
   self.assertEqual([
       OrderedDict([("distribution", "d1"), ("mode", "graph"),
                    ("optimizer", "adam")]),
       OrderedDict([("distribution", "d1"), ("mode", "graph"),
                    ("optimizer", "gd")]),
       OrderedDict([("distribution", "d1"), ("mode", "eager"),
                    ("optimizer", "adam")]),
       OrderedDict([("distribution", "d1"), ("mode", "eager"),
                    ("optimizer", "gd")]),
       OrderedDict([("distribution", "d2"), ("mode", "graph"),
                    ("optimizer", "adam")]),
       OrderedDict([("distribution", "d2"), ("mode", "graph"),
                    ("optimizer", "gd")]),
       OrderedDict([("distribution", "d2"), ("mode", "eager"),
                    ("optimizer", "adam")]),
       OrderedDict([("distribution", "d2"), ("mode", "eager"),
                    ("optimizer", "gd")])
   ], c4)
   self.assertEqual(
       combinations.combine(
           mode=["graph", "eager"],
           optimizer=["adam", "gd"],
           distribution=["d1", "d2"]), c4)
 def test_add(self):
   self.assertEqual(
       [{
           "a": 1
       }, {
           "a": 2
       }, {
           "b": 2
       }, {
           "b": 3
       }],
       combinations.combine(a=[1, 2]) + combinations.combine(b=[2, 3]))
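The `combine` tests in this collection pin down its semantics: keyword arguments are sorted by name, a scalar value is treated as a single-element list, and the result is the full cross product as a list of OrderedDicts, so `+` on two results is plain list concatenation. A minimal sketch consistent with these assertions (illustrative only, not the actual TensorFlow implementation):

import itertools
from collections import OrderedDict

def combine_sketch(**kwargs):
  """Cross product of keyword arguments, with keys sorted by name."""
  keys = sorted(kwargs)
  # Promote scalars (e.g. b=2) to single-element lists.
  value_lists = [kwargs[k] if isinstance(kwargs[k], list) else [kwargs[k]]
                 for k in keys]
  return [OrderedDict(zip(keys, values))
          for values in itertools.product(*value_lists)]

# combine_sketch(a=[1, 2], b=2)
# == [OrderedDict([('a', 1), ('b', 2)]), OrderedDict([('a', 2), ('b', 2)])]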
def strategy_and_input_combinations():
  return (
      combinations.times(
          combinations.combine(distribution=strategies_minus_tpu),
          combinations.combine(mode=['graph'],
                               use_numpy=[True, False],
                               use_validation_data=[True, False])
          + combinations.combine(mode=['eager'],
                                 use_numpy=[False],
                                 use_validation_data=[False])) +
      combinations.times(
          combinations.combine(distribution=tpu_strategies),
          combinations.combine(mode=['graph'],
                               use_numpy=[True, False],
                               use_validation_data=[True, False])))
Example 4
def tpu_combinations():
  return combinations.combine(
      distribution=[
          strategy_combinations.tpu_strategy_one_step,
          strategy_combinations.tpu_strategy
      ],
      mode=["graph"])
 def test_arguments_sorted(self):
   self.assertEqual([
       OrderedDict([("aa", 1), ("ab", 2)]),
       OrderedDict([("aa", 1), ("ab", 3)]),
       OrderedDict([("aa", 2), ("ab", 2)]),
       OrderedDict([("aa", 2), ("ab", 3)])
   ], combinations.combine(ab=[2, 3], aa=[1, 2]))
def test_combinations_for_embedding_model():
  return (
      combinations.times(
          combinations.combine(
              distribution=strategies_for_embedding_models()),
          (graph_mode_test_configuration() +
           eager_mode_test_configuration())))
def all_strategy_minus_default_and_tpu_combinations():
  return combinations.combine(
      distribution=[
          one_device_strategy, one_device_strategy_gpu,
          mirrored_strategy_with_gpu_and_cpu, mirrored_strategy_with_two_gpus
      ],
      mode=["graph", "eager"])
def test_combinations_for_stateful_embedding_model():
  return (
      combinations.combine(
          distribution=strategies_for_stateful_embedding_model(),
          mode='graph',
          use_numpy=False,
          use_validation_data=False
      ))
 def test_combine_single_parameter(self):
   self.assertEqual([{
       "a": 1,
       "b": 2
   }, {
       "a": 2,
       "b": 2
   }], combinations.combine(a=[1, 2], b=2))
Example 10
def distributions_and_v1_optimizers():
  """A common set of combination with DistributionStrategies and Optimizers."""
  return combinations.combine(
      distribution=[
          one_device_strategy,
          mirrored_strategy_with_gpu_and_cpu,
          mirrored_strategy_with_two_gpus,
      ],
      optimizer_fn=optimizers_v1)
Example 11
def all_combinations():
  return combinations.combine(
      distribution=[
          strategy_combinations.default_strategy,
          strategy_combinations.one_device_strategy,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      mode=["graph"])
Example 12
def distributions_and_v2_optimizers():
  """DistributionStrategies and V2 Optimizers."""
  return combinations.combine(
      distribution=[
          strategy_combinations.one_device_strategy,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      optimizer_fn=optimizers_v2)
def test_combinations_with_tpu_strategies():
  tpu_strategies = [
      strategy_combinations.tpu_strategy,
      strategy_combinations.tpu_strategy_one_step
  ]

  return (
      combinations.times(
          combinations.combine(distribution=tpu_strategies),
          graph_mode_test_configuration()))
Example 14
 def test_times(self):
   c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
   c2 = combinations.combine(mode=["eager"], loss=["callable"])
   c3 = combinations.combine(distribution=["d1", "d2"])
   c4 = combinations.times(c3, c1 + c2)
   self.assertEqual([
       OrderedDict([("distribution", "d1"), ("loss", "callable"),
                    ("mode", "graph")]),
       OrderedDict([("distribution", "d1"), ("loss", "tensor"),
                    ("mode", "graph")]),
       OrderedDict([("distribution", "d1"), ("loss", "callable"),
                    ("mode", "eager")]),
       OrderedDict([("distribution", "d2"), ("loss", "callable"),
                    ("mode", "graph")]),
       OrderedDict([("distribution", "d2"), ("loss", "tensor"),
                    ("mode", "graph")]),
       OrderedDict([("distribution", "d2"), ("loss", "callable"),
                    ("mode", "eager")])
   ], c4)
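Together with `test_times_variable_arguments` at the top of this collection, `test_times` fixes the behavior of `combinations.times`: each operand is a list of parameter dictionaries, and the result is their pairwise cross product with the dictionaries merged left to right. A sketch under the assumption that operands never share parameter names (illustrative, not the actual implementation):

def times_sketch(*operands):
  """Cross product of lists of parameter dicts, merged left to right."""
  result = list(operands[0])
  for operand in operands[1:]:
    # Assumes no parameter name appears in both `a` and `b`.
    result = [OrderedDict(list(a.items()) + list(b.items()))
              for a in result for b in operand]
  return result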
def generate_callback_test_function(custom_callable):
  """Generic template for callback tests using mnist synthetic dataset."""

  @combinations.generate(
      combinations.combine(
          mode=['graph'],
          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
          required_gpus=[0, 1]))
  def test_template(self, strategy_cls):
    num_workers = 2
    num_epoch = 2

    cluster_spec = test_base.create_cluster_spec(num_workers=num_workers)
    self._barrier = dc._Barrier(2)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside of a thread."""
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):
        strategy = get_strategy_object(strategy_cls)
        batch_size = 64
        steps = 2
        train_ds, _ = _mnist_synthetic_dataset(batch_size, steps)
        with strategy.scope():
          model = _get_model((28, 28, 1))

        custom_callable(
            model,
            self,
            train_ds,
            num_epoch,
            steps,
            strategy,
            saving_filepath=kwargs['saving_filepath'])

    # Pass saving_filepath from the parent thread to ensure every worker has
    # the same filepath to save.
    saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint.h5')
    threads = self.run_multiple_tasks_in_threads(
        _independent_worker_fn, cluster_spec, saving_filepath=saving_filepath)
    if os.path.exists(saving_filepath):
      os.remove(saving_filepath)

    threads_to_join = []
    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)

  return test_template
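For illustration, a caller of the template above supplies a callable matching the signature used at the call site; the callable and test name below are hypothetical:

def _fit_with_checkpoint_callable(model, test_obj, train_ds, num_epoch, steps,
                                  strategy, saving_filepath):
  # Hypothetical body: fit the model, checkpointing to the shared filepath.
  model.compile(loss='mse', optimizer='sgd')
  model.fit(
      x=train_ds,
      epochs=num_epoch,
      steps_per_epoch=steps,
      callbacks=[keras.callbacks.ModelCheckpoint(filepath=saving_filepath)])

test_fit_with_checkpoint = generate_callback_test_function(
    _fit_with_checkpoint_callable)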
def strategy_and_optimizer_combinations():
  return combinations.times(
      all_strategy_combinations(),
      combinations.combine(optimizer=[
          strategy_combinations.adagrad_optimizer_v1_fn,
          strategy_combinations.adagrad_optimizer_keras_v2_fn,
          strategy_combinations.adam_optimizer_v1_fn,
          strategy_combinations.adam_optimizer_keras_v2_fn,
          strategy_combinations.gradient_descent_optimizer_v1_fn,
          strategy_combinations.gradient_descent_optimizer_keras_v2_fn,
          strategy_combinations.rmsprop_optimizer_v1_fn,
          strategy_combinations.rmsprop_optimizer_keras_v2_fn
      ]))
Example 17
 def test_combine(self):
   self.assertEqual([{
       "a": 1,
       "b": 2
   }, {
       "a": 1,
       "b": 3
   }, {
       "a": 2,
       "b": 2
   }, {
       "a": 2,
       "b": 3
   }], combinations.combine(a=[1, 2], b=[2, 3]))
def all_strategy_and_eager_plus_graph():
  return combinations.times(
      combinations.combine(distribution=contrib_mirrored_strategies),
      combinations.combine(mode=["eager", "graph"]))
    with self.name_scope:
      self._layers = [
          keras.layers.Dense(4, name="dense"),
      ]

  @module.Module.with_name_scope
  def __call__(self, x):
    for layer in self._layers:
      x = layer(x)
    return x


@combinations.generate(
    combinations.combine(
        distribution=(strategy_combinations.all_strategies +
                      strategy_combinations.multiworker_strategies),
        mode=["eager"]
        )
    )
class KerasModelsTest(test.TestCase, parameterized.TestCase):

  def test_single_keras_layer_run(self, distribution):
    dataset = _get_dataset()
    input_iterator = iter(distribution.experimental_distribute_dataset(dataset))

    with distribution.scope():
      model = keras.layers.Dense(4, name="dense")

    @def_function.function
    def train_step(iterator):
      def step_fn(inputs):
def all_strategy_combinations_with_graph_mode():
  return combinations.combine(
      distribution=keras_correctness_test_base.all_strategies, mode=['graph'])
Example 21
def graph_mode_test_configuration():
  return combinations.combine(mode='graph',
                              use_numpy=[True, False],
                              use_validation_data=[True, False])
Example 23
class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase):
    def _assign_replica_local(self, v, new):
        for var, n in zip(v, new):
            with ops.device(var.device):
                self.evaluate(var.assign(n))

    def _save_return_saver(self, sess, var):
        saver = saver_lib.Saver(var_list=[var])
        test_dir = self.get_temp_dir()
        prefix = os.path.join(test_dir, "ckpt")
        return saver.save(sess, prefix), saver

    def _save(self, sess, var):
        save_path, _ = self._save_return_saver(sess, var)
        return save_path

    config = config_pb2.ConfigProto()
    config.allow_soft_placement = True

    @test_util.run_in_graph_and_eager_modes(config=config)
    def testProperties(self):
        if context.num_gpus() < 1 and context.executing_eagerly():
            self.skipTest(
                "A GPU is not available for this test in eager mode.")
        v, replica_local = _make_replica_local(
            variable_scope.VariableAggregation.SUM)

        self.assertEqual(v[0].constraint, replica_local.constraint)
        self.assertEqual(v[0].name, replica_local.name)
        self.assertEqual(v[0].dtype, replica_local.dtype)
        self.assertEqual(v[0].shape, replica_local.shape)
        self.assertEqual(variable_scope.VariableAggregation.SUM,
                         replica_local.aggregation)

    @combinations.generate(
        combinations.combine(distribution=[
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu
        ],
                             mode=["eager"]))
    def testCanPassToDefFun(self, distribution):
        @def_function.function
        def add1(x):
            return x + 1.

        with distribution.scope():
            v = variables_lib.Variable(
                1.,
                aggregation=variables_lib.VariableAggregation.MEAN,
                synchronization=variables_lib.VariableSynchronization.ON_READ)

        self.assertEqual(2., self.evaluate(add1(v)))

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testTensorConversion(self, distribution):
        with context.graph_mode():
            _, replica_local = _make_replica_local(
                variable_scope.VariableAggregation.SUM, distribution)
            converted = ops.convert_to_tensor(replica_local, as_ref=False)
            self.assertIsInstance(converted, ops.Tensor)
            self.assertEqual(converted.dtype, replica_local.dtype)

            converted = ops.convert_to_tensor(replica_local, as_ref=True)
            # Resource variables are converted to tensors as well when
            # as_ref is True.
            self.assertIsInstance(converted, ops.Tensor)
            self.assertEqual(converted.dtype, replica_local.dtype)

    @combinations.generate(
        combinations.combine(distribution=[
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
            strategy_combinations.mirrored_strategy_with_two_gpus_no_merge_call,
            strategy_combinations.tpu_strategy,
            strategy_combinations.tpu_strategy_packed_var,
        ],
                             mode=["eager"]))
    def testValueInCrossReplicaContext(self, distribution):
        value_list, replica_local = _make_replica_local(
            variable_scope.VariableAggregation.ONLY_FIRST_REPLICA,
            distribution)

        self.assertIsInstance(replica_local.value(), ops.Tensor)
        self.assertEqual(self.evaluate(replica_local.value()),
                         self.evaluate(value_list[0].value()))

    @combinations.generate(
        combinations.combine(distribution=[
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
            strategy_combinations.tpu_strategy_packed_var,
        ],
                             mode=["eager"]))
    def testValueInDefaultReplicaContext(self, distribution):
        with distribution.scope():
            v1 = variables_lib.Variable(
                0.0,
                aggregation=variables_lib.VariableAggregation.SUM,
                synchronization=variables_lib.VariableSynchronization.ON_READ)
            v2 = variables_lib.Variable(
                0.0,
                aggregation=variables_lib.VariableAggregation.SUM,
                synchronization=variables_lib.VariableSynchronization.ON_READ)

        @def_function.function
        def replica_fn():
            v1.assign_add(1.0)
            v2.assign_add(2.0)

        distribution.run(replica_fn)
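        # Reading the ON_READ variables outside `distribution.run` aggregates
        # across replicas with SUM; with the two replicas exercised here,
        # v1 == 2.0 and v2 == 4.0, so the sum is 6.0.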
        sum_v = v1 + v2
        self.assertEqual(sum_v, 6.0)

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution):
        with self.cached_session() as sess:
            v, replica_local = _make_replica_local(
                variable_scope.VariableAggregation.SUM, distribution)

            # Overwrite the initial values.
            self._assign_replica_local(v, [3., 4.])

            with distribution.scope():
                # Saves the current value of v[0] + v[1], 7.
                save_path, saver = self._save_return_saver(sess, replica_local)

                # Change the values between save and restore.
                self._assign_replica_local(v, [5., 6.])

                # Restores the saved value of 7, which gets divided equally
                # between the variables.
                saver.restore(sess, save_path)
                self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testSaveAndRestoreReplicaLocalMeanOneGraph(self, distribution):
        if context.num_gpus() < 1 and context.executing_eagerly():
            self.skipTest(
                "A GPU is not available for this test in eager mode.")

        with self.cached_session() as sess:
            v, replica_local = _make_replica_local(
                variable_scope.VariableAggregation.MEAN, distribution)

            # Overwrite the initial values.
            self._assign_replica_local(v, [3., 4.])

            with distribution.scope():
                # Saves the current value of (v[0] + v[1])/2, 3.5.
                save_path, saver = self._save_return_saver(sess, replica_local)

                # Change the values between save and restore.
                self._assign_replica_local(v, [5., 6.])

                # Restores the saved value of 3.5 to both variables.
                saver.restore(sess, save_path)
                self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))

    def _save_replica_local_mean(self, distribution):
        """Save variables with mirroring, returns save_path."""
        with self.session(graph=ops.Graph()) as sess:
            v, replica_local = _make_replica_local(
                variable_scope.VariableAggregation.MEAN, distribution)

            # Overwrite the initial values.
            self._assign_replica_local(v, [3., 4.])

            with distribution.scope():
                # Saves the current value of (v[0] + v[1])/2, 3.5
                save_path = self._save(sess, replica_local)

                # Change the values between save and restore.
                self._assign_replica_local(v, [5., 6.])
        return save_path

    def _save_replica_local_sum(self, distribution):
        """Save variables with mirroring, returns save_path."""
        with self.session(graph=ops.Graph()) as sess:
            v, replica_local = _make_replica_local(
                variable_scope.VariableAggregation.SUM, distribution)

            # Overwrite the initial values.
            self._assign_replica_local(v, [1.5, 2.])

            with distribution.scope():
                # Saves the current value of v[0] + v[1], 3.5
                save_path = self._save(sess, replica_local)

                # Change the values between save and restore.
                self._assign_replica_local(v, [5., 6.])
        return save_path

    def _save_normal(self):
        """Save variables without mirroring, returns save_path."""
        with self.session(graph=ops.Graph()) as sess:
            var = variable_scope.get_variable(name="v",
                                              initializer=1.,
                                              use_resource=True)

            # Overwrite the initial value.
            self.evaluate(var.assign(3.5))

            # Saves the current value of var, 3.5.
            save_path = self._save(sess, var)

            # Change the values between save and restore.
            self.evaluate(var.assign(5.))
        return save_path

    def _restore_normal(self, save_path):
        """Restore to variables without mirroring in a fresh graph."""
        with self.session(graph=ops.Graph()) as sess:
            var = variable_scope.get_variable(name="v",
                                              initializer=7.,
                                              use_resource=True)

            # Overwrite the initial value.
            self.evaluate(var.assign(8.))

            # Restores the saved value of 3.5 to `var`.
            saver = saver_lib.Saver(var_list=[var])
            saver.restore(sess, save_path)
            self.assertEqual(3.5, self.evaluate(var))

    def _restore_replica_local_mean(self, save_path, distribution):
        """Restore to variables with mirroring in a fresh graph."""
        with self.session(graph=ops.Graph()) as sess:
            v, replica_local = _make_replica_local(
                variable_scope.VariableAggregation.MEAN, distribution)

            # Overwrite the initial values.
            self._assign_replica_local(v, [7., 8.])

            with distribution.scope():
                # Restores the saved value of 3.5 to both variables.
                saver = saver_lib.Saver(var_list=[replica_local])
                saver.restore(sess, save_path)
                self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))

    def _restore_replica_local_sum(self, save_path, distribution):
        """Restore to variables with mirroring in a fresh graph."""
        with self.session(graph=ops.Graph()) as sess:
            v, replica_local = _make_replica_local(
                variable_scope.VariableAggregation.SUM, distribution)

            # Overwrite the initial values.
            self._assign_replica_local(v, [7., 8.])

            with distribution.scope():
                # Restores the saved value of 3.5 to both variables.
                saver = saver_lib.Saver(var_list=[replica_local])
                saver.restore(sess, save_path)
                self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]]))

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testSaveReplicaLocalRestoreReplicaLocalMean(self, distribution):
        save_path = self._save_replica_local_mean(distribution)
        self._restore_replica_local_mean(save_path, distribution)

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testSaveReplicaLocalRestoreReplicaLocalSum(self, distribution):
        save_path = self._save_replica_local_sum(distribution)
        self._restore_replica_local_sum(save_path, distribution)

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testSaveReplicaLocalMeanRestoreNormal(self, distribution):
        save_path = self._save_replica_local_mean(distribution)
        self._restore_normal(save_path)

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testSaveReplicaLocalSumRestoreNormal(self, distribution):
        save_path = self._save_replica_local_sum(distribution)
        self._restore_normal(save_path)

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testSaveNormalRestoreReplicaLocalMean(self, distribution):
        save_path = self._save_normal()
        self._restore_replica_local_mean(save_path, distribution)

    @combinations.generate(mirrored_and_tpu_strategy_combinations())
    def testSaveNormalRestoreReplicaLocalSum(self, distribution):
        save_path = self._save_normal()
        self._restore_replica_local_sum(save_path, distribution)
Example 24
class RetinaNetTest(parameterized.TestCase, tf.test.TestCase):

  @parameterized.parameters(
      {
          'use_separable_conv': True,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': False,
          'is_training': False,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': False,
          'is_training': True,
          'has_att_heads': False
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': True,
          'has_att_heads': True
      },
      {
          'use_separable_conv': False,
          'build_anchor_boxes': True,
          'is_training': False,
          'has_att_heads': True
      },
  )
  def test_build_model(self, use_separable_conv, build_anchor_boxes,
                       is_training, has_att_heads):
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    anchor_size = 3
    fpn_num_filters = 256
    head_num_convs = 4
    head_num_filters = 256
    num_anchors_per_location = num_scales * len(aspect_ratios)
    image_size = 384
    images = np.random.rand(2, image_size, image_size, 3)
    image_shape = np.array([[image_size, image_size], [image_size, image_size]])

    if build_anchor_boxes:
      anchor_boxes = anchor.Anchor(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=anchor_size,
          image_size=(image_size, image_size)).multilevel_boxes
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])
    else:
      anchor_boxes = None

    if has_att_heads:
      attribute_heads = [dict(name='depth', type='regression', size=1)]
    else:
      attribute_heads = None

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level,
        num_filters=fpn_num_filters,
        use_separable_conv=use_separable_conv)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        attribute_heads=attribute_heads,
        num_anchors_per_location=num_anchors_per_location,
        use_separable_conv=use_separable_conv,
        num_convs=head_num_convs,
        num_filters=head_num_filters)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=anchor_size)

    _ = model(images, image_shape, anchor_boxes, training=is_training)

  @combinations.generate(
      combinations.combine(
          strategy=[
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          image_size=[
              (128, 128),
          ],
          training=[True, False],
          has_att_heads=[True, False],
          output_intermediate_features=[True, False],
      ))
  def test_forward(self, strategy, image_size, training, has_att_heads,
                   output_intermediate_features):
    """Test for creation of a R50-FPN RetinaNet."""
    tf.keras.backend.set_image_data_format('channels_last')
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)

    images = np.random.rand(2, image_size[0], image_size[1], 3)
    image_shape = np.array(
        [[image_size[0], image_size[1]], [image_size[0], image_size[1]]])

    with strategy.scope():
      anchor_gen = anchor.build_anchor_generator(
          min_level=min_level,
          max_level=max_level,
          num_scales=num_scales,
          aspect_ratios=aspect_ratios,
          anchor_size=3)
      anchor_boxes = anchor_gen(image_size)
      for l in anchor_boxes:
        anchor_boxes[l] = tf.tile(
            tf.expand_dims(anchor_boxes[l], axis=0), [2, 1, 1, 1])

      backbone = resnet.ResNet(model_id=50)
      decoder = fpn.FPN(
          input_specs=backbone.output_specs,
          min_level=min_level,
          max_level=max_level)

      if has_att_heads:
        attribute_heads = [dict(name='depth', type='regression', size=1)]
      else:
        attribute_heads = None
      head = dense_prediction_heads.RetinaNetHead(
          min_level=min_level,
          max_level=max_level,
          num_classes=num_classes,
          attribute_heads=attribute_heads,
          num_anchors_per_location=num_anchors_per_location)
      generator = detection_generator.MultilevelDetectionGenerator(
          max_num_detections=10, nms_version='v1')
      model = retinanet_model.RetinaNetModel(
          backbone=backbone,
          decoder=decoder,
          head=head,
          detection_generator=generator)

      model_outputs = model(
          images,
          image_shape,
          anchor_boxes,
          output_intermediate_features=output_intermediate_features,
          training=training)

    if training:
      cls_outputs = model_outputs['cls_outputs']
      box_outputs = model_outputs['box_outputs']
      for level in range(min_level, max_level + 1):
        self.assertIn(str(level), cls_outputs)
        self.assertIn(str(level), box_outputs)
        self.assertAllEqual([
            2,
            image_size[0] // 2**level,
            image_size[1] // 2**level,
            num_classes * num_anchors_per_location
        ], cls_outputs[str(level)].numpy().shape)
        self.assertAllEqual([
            2,
            image_size[0] // 2**level,
            image_size[1] // 2**level,
            4 * num_anchors_per_location
        ], box_outputs[str(level)].numpy().shape)
        if has_att_heads:
          att_outputs = model_outputs['attribute_outputs']
          for att in att_outputs.values():
            self.assertAllEqual([
                2, image_size[0] // 2**level, image_size[1] // 2**level,
                1 * num_anchors_per_location
            ], att[str(level)].numpy().shape)
    else:
      self.assertIn('detection_boxes', model_outputs)
      self.assertIn('detection_scores', model_outputs)
      self.assertIn('detection_classes', model_outputs)
      self.assertIn('num_detections', model_outputs)
      self.assertAllEqual(
          [2, 10, 4], model_outputs['detection_boxes'].numpy().shape)
      self.assertAllEqual(
          [2, 10], model_outputs['detection_scores'].numpy().shape)
      self.assertAllEqual(
          [2, 10], model_outputs['detection_classes'].numpy().shape)
      self.assertAllEqual(
          [2,], model_outputs['num_detections'].numpy().shape)
      if has_att_heads:
        self.assertIn('detection_attributes', model_outputs)
        self.assertAllEqual(
            [2, 10, 1],
            model_outputs['detection_attributes']['depth'].numpy().shape)
    if output_intermediate_features:
      for l in range(2, 6):
        self.assertIn('backbone_{}'.format(l), model_outputs)
        self.assertAllEqual([
            2, image_size[0] // 2**l, image_size[1] // 2**l,
            backbone.output_specs[str(l)].as_list()[-1]
        ], model_outputs['backbone_{}'.format(l)].numpy().shape)
      for l in range(min_level, max_level + 1):
        self.assertIn('decoder_{}'.format(l), model_outputs)
        self.assertAllEqual([
            2, image_size[0] // 2**l, image_size[1] // 2**l,
            decoder.output_specs[str(l)].as_list()[-1]
        ], model_outputs['decoder_{}'.format(l)].numpy().shape)

  def test_serialize_deserialize(self):
    """Validate the network can be serialized and deserialized."""
    num_classes = 3
    min_level = 3
    max_level = 7
    num_scales = 3
    aspect_ratios = [1.0]
    num_anchors_per_location = num_scales * len(aspect_ratios)

    backbone = resnet.ResNet(model_id=50)
    decoder = fpn.FPN(
        input_specs=backbone.output_specs,
        min_level=min_level,
        max_level=max_level)
    head = dense_prediction_heads.RetinaNetHead(
        min_level=min_level,
        max_level=max_level,
        num_classes=num_classes,
        num_anchors_per_location=num_anchors_per_location)
    generator = detection_generator.MultilevelDetectionGenerator(
        max_num_detections=10)
    model = retinanet_model.RetinaNetModel(
        backbone=backbone,
        decoder=decoder,
        head=head,
        detection_generator=generator,
        min_level=min_level,
        max_level=max_level,
        num_scales=num_scales,
        aspect_ratios=aspect_ratios,
        anchor_size=3)

    config = model.get_config()
    new_model = retinanet_model.RetinaNetModel.from_config(config)

    # Validate that the config can be forced to JSON.
    _ = new_model.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(model.get_config(), new_model.get_config())
Example 25
def all_strategy_combinations():
    return combinations.combine(distribution=[
        strategy_combinations.default_strategy,
        strategy_combinations.cloud_tpu_strategy,
        strategy_combinations.one_device_strategy_gpu,
    ])
Example 26
class OpCancellationTest(test.TestCase, parameterized.TestCase):
    def setUp(self):
        _setup_context()
        super().setUp()

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce',
                                         CollectiveOpsV1.all_reduce),
                combinations.NamedObject('all_reduce_v2',
                                         CollectiveOpsV2.all_reduce),
                combinations.NamedObject('all_gather',
                                         CollectiveOpsV1.all_gather),
                combinations.NamedObject('all_gather_v2',
                                         CollectiveOpsV2.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorNotAbortIfNoCollective(self, collective_op, device,
                                          communication):
        # Do not abort if there are no active collective ops. There could be
        # exceptions like EOF which we expect users to catch; aborting
        # collective ops on all op errors would interfere with this workflow.
        dev0 = '/device:%s:0' % device
        dev1 = '/device:%s:1' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        dataset = dataset_ops.Dataset.from_tensors([1.])

        @def_function.function
        def collective_fn(in_tensor):
            for device in [dev0, dev1]:
                with ops.device(device):
                    collective_op(in_tensor,
                                  group_size,
                                  group_key,
                                  instance_key,
                                  communication_hint=communication)

        @def_function.function
        def f():
            iterator = iter(dataset)
            collective_fn(next(iterator))
            # This next(iterator) should raise EOF.
            collective_fn(next(iterator))

        with self.assertRaises(errors.OutOfRangeError):
            f()
        collective_fn(constant_op.constant([1.]))

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce',
                                         CollectiveOpsV1.all_reduce),
                combinations.NamedObject('all_gather',
                                         CollectiveOpsV1.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorAbortWithCollective(self, collective_op, device,
                                       communication):
        # Abort v1 collective ops if there are active collective ops at the
        # time of an op error. V1 collective ops cannot be cancelled, so an op
        # error may leave running collective ops hanging.
        dev0 = '/device:%s:0' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])
        # Make the dataset sleep a while so that the collective is being executed
        # when the EOF happens.
        dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
            dataset_testing.sleep(sleep_microseconds=200))

        @def_function.function
        def f():
            # Launch a collective op that won't be able to finish to test abortion
            # when other ops error.
            with ops.device(dev0):
                ret = collective_op(in_tensor,
                                    group_size,
                                    group_key,
                                    instance_key,
                                    communication_hint=communication)
            iterator = iter(dataset)
            next(iterator)
            # This should raise EOF.
            next(iterator)
            return ret

        with self.assertRaises(errors.OutOfRangeError):
            f()
        # Now collective ops is aborted, subsequent collective ops should fail with
        # the previous error.
        with self.assertRaises(errors.CancelledError):
            with ops.device(dev0):
                collective_op(in_tensor,
                              group_size,
                              group_key,
                              instance_key,
                              communication_hint=communication)

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce_v2',
                                         CollectiveOpsV2.all_reduce),
                combinations.NamedObject('all_gather_v2',
                                         CollectiveOpsV2.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorNotAbortWithCollective(self, collective_op, device,
                                          communication):
        # Do not abort v2 collective ops even if there are active collective
        # ops at the time of an op error. We rely on cancellation to terminate
        # active collective ops.
        dev0 = '/device:%s:0' % device
        dev1 = '/device:%s:1' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])

        @def_function.function
        def collective_fn():
            for device in [dev0, dev1]:
                with ops.device(device):
                    collective_op(in_tensor,
                                  group_size,
                                  group_key,
                                  instance_key,
                                  communication_hint=communication)

        # Local params resolution cannot be cancelled yet, so we perform a normal
        # collective so that the group is resolved.
        collective_fn()

        # Make the dataset sleep a while so that the collective is being executed
        # when the EOF happens.
        dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
            dataset_testing.sleep(sleep_microseconds=200))

        @def_function.function
        def f():
            # Launch a collective op that won't be able to finish to test cancellation
            # when other ops error.
            with ops.device(dev0):
                ret = collective_op(in_tensor,
                                    group_size,
                                    group_key,
                                    instance_key,
                                    communication_hint=communication)
            iterator = iter(dataset)
            next(iterator)
            # This should raise EOF.
            next(iterator)
            return ret

        with self.assertRaises(errors.OutOfRangeError):
            f()
        # Collective ops shouldn't be aborted and new collectives should be able to
        # proceed.
        collective_fn()

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce_v2',
                                         CollectiveOpsV2.all_reduce),
                combinations.NamedObject('all_gather_v2',
                                         CollectiveOpsV2.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testCancelDuringParamResolution(self, collective_op, device,
                                        communication):
        dev0 = '/device:%s:0' % device
        dev1 = '/device:%s:1' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])
        t1_cancellation_manager = cancellation.CancellationManager()
        t2_cancellation_manager = cancellation.CancellationManager()

        @def_function.function
        def _collective_fn(x):
            # Run an assertion to crash one of the two function executions running
            # collectives. We explicitly cancel the other in response.
            assert_op = check_ops.assert_equal(x, in_tensor)
            with ops.control_dependencies([assert_op]):
                return collective_op(in_tensor,
                                     group_size,
                                     group_key,
                                     instance_key,
                                     communication_hint=communication)

        collective_concrete = _collective_fn.get_concrete_function(in_tensor)

        finish_mu = threading.Lock()
        finishes = 0

        def _placement_wrapper(device, x, my_cancellation, other_cancellation):
            try:
                with ops.device(device):
                    cancelable_collective = my_cancellation.get_cancelable_function(
                        collective_concrete)
                    return cancelable_collective(x)
            except errors.InvalidArgumentError:
                # `assert_equal` failed for this execution of the function. The other
                # function would deadlock without cancellation.
                other_cancellation.start_cancel()
            except errors.CancelledError:
                pass
            nonlocal finishes
            with finish_mu:
                finishes += 1

        t1 = threading.Thread(target=_placement_wrapper,
                              args=(dev0, constant_op.constant([1.]),
                                    t1_cancellation_manager,
                                    t2_cancellation_manager))
        t2 = threading.Thread(
            target=_placement_wrapper,
            # Will cause the assertion to fail
            args=(dev1, constant_op.constant([2.]), t2_cancellation_manager,
                  t1_cancellation_manager))
        t1.start()
        t2.start()
        t1.join()
        t2.join()
        self.assertEqual(finishes, 2)
Example 27
class DefaultDistributionStrategyTest(test.TestCase, parameterized.TestCase):

  def testMergeCall(self):
    _assert_in_default_state(self)

    def merge_fn(dist, s):
      self.assertIs(ds_context._get_default_strategy(), dist)
      self.assertIs(None, ds_context.get_replica_context())
      self.assertIs(dist, ds_context.get_cross_replica_context())
      self.assertTrue(ds_context.in_cross_replica_context())
      self.assertIs(dist, ds_context.get_strategy())
      self.assertFalse(ds_context.has_strategy())
      return "foo_" + s

    replica_ctx = ds_context.get_replica_context()
    self.assertIs(ds_context._get_default_replica_context(), replica_ctx)
    self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, args=("bar",)))
    _assert_in_default_state(self)

  def testScopeMostlyNoOp(self):
    _assert_in_default_state(self)

    test_strategy = _TestStrategy2()
    with test_strategy.scope():
      variable_scope.variable(1.0, name="before")

    default_strategy = ds_context._get_default_strategy()
    scope = default_strategy.scope()
    with scope:
      _assert_in_default_state(self)

      with test_strategy.scope():
        with self.assertRaisesRegexp(
            RuntimeError, "Mixing different tf.distribute.Strategy objects"):
          variable_scope.variable(1.0, name="error")

      with scope:
        _assert_in_default_state(self)

        with test_strategy.scope():
          with self.assertRaisesRegexp(
              RuntimeError, "Mixing different tf.distribute.Strategy objects"):
            variable_scope.variable(1.0, name="also_error")

      _assert_in_default_state(self)

    _assert_in_default_state(self)
    with test_strategy.scope():
      variable_scope.variable(1.0, name="after")

  def testExperimentalRunV2(self):
    default_strategy = ds_context._get_default_strategy()
    dataset = dataset_ops.Dataset.range(10).batch(2)
    iterator = default_strategy.extended._make_dataset_iterator(dataset)
    next_val = iterator.get_next()

    def train_step(input_data):
      return input_data

    for _ in range(2):
      default_strategy.experimental_run_v2(train_step, args=(next_val,))

  @combinations.generate(combinations.combine(mode=["graph", "eager"]))
  def testDistributedDatasets(self):
    default_strategy = ds_context._get_default_strategy()
    if context.executing_eagerly():
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(10).batch(2)
      dist_dataset = default_strategy.experimental_distribute_dataset(
          dataset_fn(distribute_lib.InputContext()))
      next_val = next(iter(dist_dataset))
    else:
      dataset_fn = lambda _: dataset_ops.DatasetV1.range(10).batch(2)
      dist_dataset = default_strategy.experimental_distribute_dataset(
          dataset_fn(distribute_lib.InputContext()))
      iterator = dist_dataset.make_initializable_iterator()
      self.evaluate(iterator.initializer)
      next_val = iterator.get_next()
    self.assertAllEqual([0, 1], self.evaluate(next_val))

  @combinations.generate(combinations.combine(mode=["graph", "eager"]))
  def testDistributedDatasetsFromFunction(self):
    default_strategy = ds_context._get_default_strategy()
    if context.executing_eagerly():
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(10).batch(2)
      dist_dataset_from_func = \
          default_strategy.experimental_distribute_datasets_from_function(
              dataset_fn)
      next_val = next(iter(dist_dataset_from_func))
      self.assertAllEqual([0, 1], self.evaluate(next_val))
    else:
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(10).batch(2)
      with self.assertRaisesRegexp(RuntimeError,
                                   "only supported when eager execution is "
                                   "enabled"):
        dist_dataset_from_func = \
          default_strategy.experimental_distribute_datasets_from_function(
              dataset_fn)
Example 28
                                                 instance_key, *args, **kwargs)

    @staticmethod
    def broadcast_recv(shape, dtype, group_size, group_key, instance_key,
                       *args, **kwargs):
        group_size = array_ops.identity(group_size)
        group_key = array_ops.identity(group_key)
        instance_key = array_ops.identity(instance_key)
        shape = array_ops.identity(shape)
        return _collective_ops.broadcast_recv_v2(shape, dtype, group_size,
                                                 group_key, instance_key,
                                                 *args, **kwargs)


device_combination = (
    combinations.combine(device='CPU', communication='RING', required_gpus=0) +
    combinations.combine(
        device='GPU', communication=['RING', 'NCCL'], required_gpus=2))

collective_op_combinations = combinations.combine(collective_op=[
    combinations.NamedObject('all_reduce', CollectiveOpsV1.all_reduce),
    combinations.NamedObject('all_reduce_v2', CollectiveOpsV2.all_reduce),
    combinations.NamedObject('all_gather', CollectiveOpsV1.all_gather),
    combinations.NamedObject('all_gather_v2', CollectiveOpsV2.all_gather)
])
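For reference, given the `combine` semantics pinned down by the unit tests earlier in this collection, flattening `device_combination` above would be expected to yield three parameterizations (illustrative):

# OrderedDict([('communication', 'RING'), ('device', 'CPU'), ('required_gpus', 0)])
# OrderedDict([('communication', 'RING'), ('device', 'GPU'), ('required_gpus', 2)])
# OrderedDict([('communication', 'NCCL'), ('device', 'GPU'), ('required_gpus', 2)])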


@combinations.generate(
    combinations.times(
        combinations.combine(collective_ops=[
            combinations.NamedObject('v1', CollectiveOpsV1),
Example 29
class ResNetTest(parameterized.TestCase, tf.test.TestCase):
    @parameterized.parameters(
        (128, 50, 4, 8),
        (128, 101, 4, 8),
        (128, 50, 4, 16),
        (128, 101, 4, 16),
    )
    def test_network_creation(self, input_size, model_id,
                              endpoint_filter_scale, output_stride):
        """Test creation of ResNet models."""
        tf.keras.backend.set_image_data_format('channels_last')

        network = resnet_deeplab.DilatedResNet(model_id=model_id,
                                               output_stride=output_stride)
        inputs = tf.keras.Input(shape=(input_size, input_size, 3),
                                batch_size=1)
        endpoints = network(inputs)
        print(endpoints)
        self.assertAllEqual([
            1, input_size / output_stride, input_size / output_stride,
            512 * endpoint_filter_scale
        ], endpoints[str(int(np.log2(output_stride)))].shape.as_list())

    @parameterized.parameters(
        ('v0', None, 0.0),
        ('v1', None, 0.0),
        ('v1', 0.25, 0.0),
        ('v1', 0.25, 0.2),
    )
    def test_network_features(self, stem_type, se_ratio,
                              init_stochastic_depth_rate):
        """Test additional features of ResNet models."""
        input_size = 128
        model_id = 50
        endpoint_filter_scale = 4
        output_stride = 8

        tf.keras.backend.set_image_data_format('channels_last')

        network = resnet_deeplab.DilatedResNet(
            model_id=model_id,
            output_stride=output_stride,
            stem_type=stem_type,
            se_ratio=se_ratio,
            init_stochastic_depth_rate=init_stochastic_depth_rate)
        inputs = tf.keras.Input(shape=(input_size, input_size, 3),
                                batch_size=1)
        endpoints = network(inputs)
        print(endpoints)
        self.assertAllEqual([
            1, input_size / output_stride, input_size / output_stride,
            512 * endpoint_filter_scale
        ], endpoints[str(int(np.log2(output_stride)))].shape.as_list())

    @combinations.generate(
        combinations.combine(
            strategy=[
                strategy_combinations.cloud_tpu_strategy,
                strategy_combinations.one_device_strategy_gpu,
            ],
            use_sync_bn=[False, True],
        ))
    def test_sync_bn_multiple_devices(self, strategy, use_sync_bn):
        """Test for sync bn on TPU and GPU devices."""
        inputs = np.random.rand(64, 128, 128, 3)

        tf.keras.backend.set_image_data_format('channels_last')

        with strategy.scope():
            network = resnet_deeplab.DilatedResNet(model_id=50,
                                                   output_stride=8,
                                                   use_sync_bn=use_sync_bn)
            _ = network(inputs)

    @parameterized.parameters(1, 3, 4)
    def test_input_specs(self, input_dim):
        """Test different input feature dimensions."""
        tf.keras.backend.set_image_data_format('channels_last')

        input_specs = tf.keras.layers.InputSpec(
            shape=[None, None, None, input_dim])
        network = resnet_deeplab.DilatedResNet(model_id=50,
                                               output_stride=8,
                                               input_specs=input_specs)

        inputs = tf.keras.Input(shape=(128, 128, input_dim), batch_size=1)
        _ = network(inputs)

    def test_serialize_deserialize(self):
        # Create a network object that sets all of its config options.
        kwargs = dict(
            model_id=50,
            output_stride=8,
            stem_type='v0',
            se_ratio=0.25,
            init_stochastic_depth_rate=0.2,
            use_sync_bn=False,
            activation='relu',
            norm_momentum=0.99,
            norm_epsilon=0.001,
            kernel_initializer='VarianceScaling',
            kernel_regularizer=None,
            bias_regularizer=None,
        )
        network = resnet_deeplab.DilatedResNet(**kwargs)

        expected_config = dict(kwargs)
        self.assertEqual(network.get_config(), expected_config)

        # Create another network object from the first object's config.
        new_network = resnet_deeplab.DilatedResNet.from_config(
            network.get_config())

        # Validate that the config can be forced to JSON.
        _ = new_network.to_json()

        # If the serialization was successful, the new config should match the old.
        self.assertAllEqual(network.get_config(), new_network.get_config())
Example 30
def strategy_minus_tpu_and_input_config_combinations_eager():
    return (combinations.times(
        combinations.combine(
            distribution=strategy_combinations.strategies_minus_tpu),
        eager_mode_test_configuration()))
def strategy_minus_tpu_combinations():
  return combinations.combine(
      distribution=strategies_minus_tpu,
      mode=['graph', 'eager'])
Example 32
class KerasMultiWorkerOptimizerTest(test_base.IndependentWorkerTestBase,
                                    parameterized.TestCase):

  def run_optimizer_comparison_with_simple_bias_model(
      self, strategy_cls, optimizer_class_1, optimizer_class_2):

    def get_input_datasets():
      # Simple training input.
      train_input = [[1]] * 16
      train_label = [[0]] * 16
      ds = dataset_ops.Dataset.from_tensor_slices((train_input, train_label))
      ds = maybe_shard_dataset(ds)
      # TODO(rchao): Investigate the reason for having 8 workers instead of 2
      # as expected.
      return ds.batch(8, drop_remainder=True)

    def get_simple_bias_model():

      class Bias(base_layer.Layer):

        def build(self, input_shape):
          self.bias = self.add_variable('bias', (1,), initializer='zeros')

        def call(self, inputs):
          return inputs + self.bias

      model = sequential.Sequential()
      model.add(Bias(input_shape=(1,)))

      return model

    self._lock = threading.Lock()
    cluster_spec = test_base.create_cluster_spec(num_workers=2)
    self._barrier = dc._Barrier(2)

    def _independent_worker_fn(*args, **kwargs):  # pylint: disable=unused-argument
      """Simulates an Independent Worker inside a thread."""
      # TODO(rchao): Refactor to abstract the common boilerplate out.
      with test.mock.patch.object(dc, '_run_std_server',
                                  self._make_mock_run_std_server()):

        model = get_simple_bias_model()

        initial_weights = model.get_weights()

        def _get_model_results(optimizer, initial_weights):

          # Clear Keras session to reset device assignment
          keras.backend._SESSION.session = None
          strategy = get_strategy_object(strategy_cls)

          with strategy.scope():
            train_ds = get_input_datasets()
            model = get_simple_bias_model()
            model.set_weights(initial_weights)
            model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

          return {
              'trained_loss_and_accuracy':
                  model.fit(x=train_ds, epochs=20).history,
              'trained_weights':
                  model.get_weights(),
          }

        results1 = _get_model_results(optimizer_class_1(0.01), initial_weights)
        results2 = _get_model_results(optimizer_class_2(0.01), initial_weights)

        for key in results1:
          self.assertAllClose(
              results1[key],
              results2[key],
              atol=1e-5,
              rtol=1e-5,
              msg='Fail to assert {}'.format(key))

    threads = self.run_multiple_tasks_in_threads(_independent_worker_fn,
                                                 cluster_spec)

    threads_to_join = []
    strategy = get_strategy_object(strategy_cls)
    if strategy.extended.experimental_between_graph:
      for ts in threads.values():
        threads_to_join.extend(ts)
    else:
      threads_to_join = [threads['worker'][0]]
    self.join_independent_workers(threads_to_join)

  @combinations.generate(
      combinations.combine(
          mode=['graph'],
          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
          required_gpus=[0, 1]))
  def test_sgd_optimizer_v1_v2_comparison(self, strategy_cls):
    self.run_optimizer_comparison_with_simple_bias_model(
        strategy_cls, gradient_descent.SGD,
        gradient_descent_v1.GradientDescentOptimizer)

  @combinations.generate(
      combinations.combine(
          mode=['graph'],
          strategy_cls=[collective_strategy.CollectiveAllReduceStrategy],
          required_gpus=[0, 1]))
  def test_rmsprop_optimizer_v1_v2_comparison(self, strategy_cls):
    self.skipTest('There is an issue in collective ops (b/127700538) that '
                  'prevents us from running this test with rmsprop optimizers.')
    self.run_optimizer_comparison_with_simple_bias_model(
        strategy_cls, rmsprop.RMSprop, rmsprop_v1.RMSPropOptimizer)


class MultiWorkerTutorialTest(parameterized.TestCase, test.TestCase):
  """Test multi-worker training flow demo'ed in go/multi-worker-with-keras."""

  @contextlib.contextmanager
  def skip_fetch_failure_exception(self):
    try:
      yield
    except zipfile.BadZipfile:
      self.skipTest('Data loading error: Bad magic number for file header.')
    except Exception as e:  # pylint: disable=broad-except
      if 'URL fetch failure' in str(e):
        self.skipTest('URL fetch error not considered failure of the test.')
      else:
        raise

  @combinations.generate(
      combinations.combine(
          mode=['eager'],
          shard_policy=[None] + list(distribute_options.AutoShardPolicy)))
  def testMultiWorkerTutorial(self, mode, shard_policy):
    """Test multi-worker training flow demo'ed in go/multi-worker-with-keras.

    This test should be kept in sync with the code samples in
    go/multi-worker-with-keras.

    Args:
      mode: Runtime mode.
      shard_policy: None or any of tf.data.experimental.AutoShardPolicy for
        testing.
    """
    if shard_policy is distribute_options.AutoShardPolicy.FILE:
      self.skipTest('TensorSliceDataset is not shardable with FILE policy.')

    def mnist_dataset(batch_size):
      with self.skip_fetch_failure_exception():
        (x_train, y_train), _ = mnist.load_data()
      # The `x` arrays are in uint8 and have values in the range [0, 255].
      # We need to convert them to float32 with values in the range [0, 1].
      x_train = x_train / np.float32(255)
      y_train = y_train.astype(np.int64)
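      # Shuffle with a buffer of 60000 (the size of the full MNIST training
      # set) so the example order is fully randomized before batching.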
      train_dataset = dataset_ops.DatasetV2.from_tensor_slices(
          (x_train, y_train)).shuffle(60000).repeat().batch(batch_size)
      return train_dataset

    def build_and_compile_cnn_model():
      model = keras.Sequential([
          keras.layers.Input(shape=(28, 28)),
          keras.layers.Reshape(target_shape=(28, 28, 1)),
          keras.layers.Conv2D(32, 3, activation='relu'),
          keras.layers.Flatten(),
          keras.layers.Dense(128, activation='relu'),
          keras.layers.Dense(10)
      ])
      model.compile(
          loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
          optimizer=gradient_descent.SGD(learning_rate=0.001),
          metrics=['accuracy'])
      return model

    per_worker_batch_size = 64

    single_worker_dataset = mnist_dataset(per_worker_batch_size)
    single_worker_model = build_and_compile_cnn_model()
    single_worker_model.fit(single_worker_dataset, epochs=3, steps_per_epoch=70)

    num_workers = 4

    def proc_func(model_path, checkpoint_dir):
      global_batch_size = per_worker_batch_size * num_workers
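      # With 4 workers at 64 examples each, the global batch size is 256;
      # every worker contributes one per-worker batch to each global step.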
      strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy()
      with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

      callbacks = [
          keras.callbacks.ModelCheckpoint(
              filepath=os.path.join(self.get_temp_dir(), 'checkpoint'))
      ]

      multi_worker_dataset = mnist_dataset(global_batch_size)
      if shard_policy:
        options = dataset_ops.Options()
        options.experimental_distribute.auto_shard_policy = shard_policy
        multi_worker_dataset = multi_worker_dataset.with_options(options)

      multi_worker_model.fit(
          multi_worker_dataset,
          epochs=2,
          steps_per_epoch=20,
          callbacks=callbacks)

      def _is_chief(task_type, task_id):
        return task_type is None or task_type == 'chief' or (
            task_type == 'worker' and task_id == 0)

      def _get_temp_dir(dirpath, task_id):
        base_dirpath = 'workertemp_' + str(task_id)
        temp_dir = os.path.join(dirpath, base_dirpath)
        file_io.recursive_create_dir_v2(temp_dir)
        return temp_dir

      def write_filepath(filepath, task_type, task_id):
        dirpath = os.path.dirname(filepath)
        base = os.path.basename(filepath)
        if not _is_chief(task_type, task_id):
          dirpath = _get_temp_dir(dirpath, task_id)
        return os.path.join(dirpath, base)
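      # For example (hypothetical paths): a non-chief worker with task_id=1
      # saving to '/tmp/model.tf' actually writes to
      # '/tmp/workertemp_1/model.tf', while the chief writes to
      # '/tmp/model.tf' directly; the temporary copies are cleaned up below.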

      task_type, task_id = (strategy.cluster_resolver.task_type,
                            strategy.cluster_resolver.task_id)
      write_model_path = write_filepath(model_path, task_type, task_id)

      multi_worker_model.save(write_model_path)
      if not _is_chief(task_type, task_id):
        file_io.delete_recursively_v2(os.path.dirname(write_model_path))

      # Make sure chief finishes saving before non-chief's assertions.
      multi_process_runner.barrier().wait()

      if not file_io.file_exists(model_path):
        raise RuntimeError(
            'Model was not saved at `model_path` by the chief.')
      if file_io.file_exists(write_model_path) != _is_chief(task_type, task_id):
        raise RuntimeError('`write_model_path` should exist if and only if '
                           'this worker is the chief.')

      loaded_model = keras.saving.save.load_model(model_path)
      loaded_model.fit(multi_worker_dataset, epochs=2, steps_per_epoch=20)

      checkpoint = tracking_util.Checkpoint(model=multi_worker_model)
      write_checkpoint_dir = write_filepath(checkpoint_dir, task_type, task_id)
      checkpoint_manager = checkpoint_management.CheckpointManager(
          checkpoint, directory=write_checkpoint_dir, max_to_keep=1)

      checkpoint_manager.save()
      if not _is_chief(task_type, task_id):
        file_io.delete_recursively_v2(write_checkpoint_dir)

      # Make sure chief finishes saving before non-chief's assertions.
      multi_process_runner.barrier().wait()

      if not file_io.file_exists(checkpoint_dir):
        raise RuntimeError(
            'Checkpoint was not saved at `checkpoint_dir` by the chief.')
      if file_io.file_exists(write_checkpoint_dir) != _is_chief(
          task_type, task_id):
        raise RuntimeError('`write_checkpoint_dir` should exist if and only '
                           'if this worker is the chief.')

      latest_checkpoint = checkpoint_management.latest_checkpoint(
          checkpoint_dir)
      checkpoint.restore(latest_checkpoint)
      multi_worker_model.fit(multi_worker_dataset, epochs=2, steps_per_epoch=20)

      logging.info('testMultiWorkerTutorial successfully ends')

    model_path = os.path.join(self.get_temp_dir(), 'model.tf')
    checkpoint_dir = os.path.join(self.get_temp_dir(), 'ckpt')
    with test_util.skip_if_error(self, errors_impl.UnavailableError):
      mpr_result = multi_process_runner.run(
          proc_func,
          multi_worker_test_base.create_cluster_spec(num_workers=num_workers),
          args=(model_path, checkpoint_dir),
          list_stdout=True)

    self.assertTrue(
        any([
            'testMultiWorkerTutorial successfully ends' in msg
            for msg in mpr_result.stdout
        ]))

    def extract_accuracy(worker_id, input_string):
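      # A matching stdout line looks like (hypothetical):
      #   '[worker-0] ... accuracy: 0.9821 ...'
      # from which 0.9821 would be extracted for worker_id=0.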
      match = re.match(
          r'\[worker\-{}\].*accuracy: (\d+\.\d+).*'.format(worker_id),
          input_string)
      return None if match is None else float(match.group(1))

    for worker_id in range(num_workers):
      accu_result = nest.map_structure(
          lambda x: extract_accuracy(worker_id, x),  # pylint: disable=cell-var-from-loop
          mpr_result.stdout)
      self.assertTrue(
          any(accu_result), 'Every worker is supposed to have accuracy result.')
Example 34
class TestEstimatorDistributionStrategy(tf.test.TestCase,
                                        parameterized.TestCase):
    def setUp(self):
        super(TestEstimatorDistributionStrategy, self).setUp()
        strategy_combinations.set_virtual_cpus_to_at_least(3)
        self._base_dir = os.path.join(self.get_temp_dir(),
                                      'keras_to_estimator_strategy_test')
        tf.compat.v1.gfile.MakeDirs(self._base_dir)
        self._config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                                model_dir=self._base_dir)

    def tearDown(self):
        super(TestEstimatorDistributionStrategy, self).tearDown()
        tf.compat.v1.summary.FileWriterCache.clear()
        if os.path.isdir(self._base_dir):
            tf.compat.v1.gfile.DeleteRecursively(self._base_dir)

    @combinations.generate(
        combinations.combine(distribution=[
            strategy_combinations.mirrored_strategy_with_cpu_1_and_2,
        ],
                             mode=['graph'],
                             cloning=[True, False]))
    def test_train_functional_with_distribution_strategy(
            self, distribution, cloning):
        keras_model = simple_functional_model()
        keras_model.compile(
            loss='categorical_crossentropy',
            metrics=[keras.metrics.CategoricalAccuracy()],
            optimizer=rmsprop_keras.RMSprop(learning_rate=0.01),
            cloning=cloning)
        config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                          model_dir=self._base_dir,
                                          train_distribute=distribution,
                                          eval_distribute=distribution)
        with self.cached_session():
            est_keras = keras_lib.model_to_estimator(keras_model=keras_model,
                                                     config=config)
            before_eval_results = est_keras.evaluate(
                input_fn=get_ds_test_input_fn, steps=1)
            est_keras.train(input_fn=get_ds_train_input_fn,
                            steps=_TRAIN_SIZE // 16)
            after_eval_results = est_keras.evaluate(
                input_fn=get_ds_test_input_fn, steps=1)
            self.assertLess(after_eval_results['loss'],
                            before_eval_results['loss'])

        tf.compat.v1.summary.FileWriterCache.clear()
        tf.compat.v1.gfile.DeleteRecursively(self._config.model_dir)

    @combinations.generate(
        combinations.combine(distribution=[
            strategy_combinations.mirrored_strategy_with_cpu_1_and_2,
        ],
                             mode=['graph'],
                             cloning=[True, False]))
    def test_train_sequential_with_distribution_strategy(
            self, distribution, cloning):
        keras_model = simple_sequential_model()
        keras_model.compile(
            loss='categorical_crossentropy',
            metrics=[keras.metrics.CategoricalAccuracy()],
            optimizer=rmsprop_keras.RMSprop(learning_rate=0.01),
            cloning=cloning)
        config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                          model_dir=self._base_dir,
                                          train_distribute=distribution)
        with self.cached_session():
            est_keras = keras_lib.model_to_estimator(keras_model=keras_model,
                                                     config=config)
            before_eval_results = est_keras.evaluate(
                input_fn=get_ds_test_input_fn, steps=1)
            est_keras.train(input_fn=get_ds_train_input_fn,
                            steps=_TRAIN_SIZE // 16)
            after_eval_results = est_keras.evaluate(
                input_fn=get_ds_test_input_fn, steps=1)
            self.assertLess(after_eval_results['loss'],
                            before_eval_results['loss'])

        tf.compat.v1.summary.FileWriterCache.clear()
        tf.compat.v1.gfile.DeleteRecursively(self._config.model_dir)

    @combinations.generate(
        combinations.combine(distribution=[
            strategy_combinations.mirrored_strategy_with_cpu_1_and_2,
        ],
                             mode=['graph']))
    def test_multi_inputs_multi_outputs_with_input_fn_as_dict(
            self, distribution):
        train_data, test_data = get_multi_inputs_multi_outputs_data()

        def train_input_fn():
            input_dict = {
                'input_a': train_data['input_a'],
                'input_b': train_data['input_b'],
                'input_m': train_data['input_m'].astype(np.str_)
            }
            output_dict = {
                'dense_2': train_data['output_c'],
                'dense_3': train_data['output_d']
            }
            return tf.compat.v1.data.Dataset.from_tensor_slices(
                (input_dict, output_dict)).batch(16)

        def eval_input_fn():
            input_dict = {
                'input_a': test_data['input_a'],
                'input_b': test_data['input_b'],
                'input_m': test_data['input_m'].astype(np.str_)
            }
            output_dict = {
                'dense_2': test_data['output_c'],
                'dense_3': test_data['output_d']
            }
            return tf.compat.v1.data.Dataset.from_tensor_slices(
                (input_dict, output_dict)).batch(16)

        self.do_test_multi_inputs_multi_outputs_with_input_fn(
            distribution, train_input_fn, eval_input_fn)

    def do_test_multi_inputs_multi_outputs_with_input_fn(
            self, distribution, train_input_fn, eval_input_fn):
        config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                          model_dir=self._base_dir,
                                          train_distribute=distribution)
        with self.cached_session():
            model = multi_inputs_multi_outputs_model()
            est_keras = keras_lib.model_to_estimator(keras_model=model,
                                                     config=config)
            baseline_eval_results = est_keras.evaluate(input_fn=eval_input_fn,
                                                       steps=1)
            est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE // 16)
            eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
            self.assertLess(eval_results['loss'],
                            baseline_eval_results['loss'])
Example 35
def tpu_strategy_combinations():
  return combinations.combine(distribution=tpu_strategies, mode=["graph"])
Example 36
class LossUtilitiesTest(test_lib.TestCase, parameterized.TestCase):

  def setUp(self):
    strategy_combinations.set_virtual_cpus_to_at_least(3)
    super(LossUtilitiesTest, self).setUp()

  def testComputeAverageLossGlobalBatchSize(self):
    per_example_loss = [1, 2, 3, 4, 5]
    loss = nn_impl.compute_average_loss(per_example_loss, global_batch_size=10)
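    # average = sum(per_example_loss) / global_batch_size = 15 / 10 = 1.5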
    self.assertEqual(self.evaluate(loss), 1.5)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testComputeAverageLossDefaultGlobalBatchSize(self, distribution):
    # Without strategy - num replicas = 1
    per_example_loss = constant_op.constant([2.5, 6.2, 5.])
    loss = nn_impl.compute_average_loss(per_example_loss)
    self.assertAllClose(self.evaluate(loss), (2.5 + 6.2 + 5.) / 3)

    # With strategy - num replicas = 2
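    # `experimental_run_v2` passes the same 3 examples to both replicas, so
    # each replica divides its sum by the inferred global batch size
    # (3 * 2 = 6); summing the two per-replica results yields sum / 3.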
    with distribution.scope():
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss, args=(per_example_loss,))
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(self.evaluate(loss), (2.5 + 6.2 + 5.) / 3)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testComputeAverageLossSampleWeights(self, distribution):
    with distribution.scope():
      # Scalar sample weight
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss,
          args=([2., 4., 6.],),
          kwargs={"sample_weight": 2})
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(self.evaluate(loss), (2. + 4. + 6.) * 2. / 3)

      # Per example sample weight
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss,
          args=([2., 4., 6.],),
          kwargs={"sample_weight": [0.3, 0.5, 0.2]})
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(
          self.evaluate(loss), (2. * 0.3 + 4. * 0.5 + 6. * 0.2) / 3)

      # Time-step sample weight
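      # Each of the 2 examples has 2 time steps; the weighted sum is averaged
      # over the global batch of examples (hence the division by 2 after the
      # SUM reduce), not over individual time-step elements.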
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss,
          args=([[2., 0.5], [4., 1.]],),
          kwargs={"sample_weight": [[0.3, 0.7], [0.2, 0.8]]})
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(
          self.evaluate(loss), (2. * 0.3 + 0.5 * 0.7 + 4. * 0.2 + 1. * 0.8) / 2)

  def testComputeAverageLossInvalidSampleWeights(self):
    with self.assertRaisesRegex((ValueError, errors_impl.InvalidArgumentError),
                                 (r"Incompatible shapes: \[3\] vs. \[2\]|"
                                  "Dimensions must be equal")):
      nn_impl.compute_average_loss([2.5, 6.2, 5.],
                                   sample_weight=[0.2, 0.8],
                                   global_batch_size=10)

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testComputeAverageLossDtype(self, distribution):
    with distribution.scope():
      per_example_loss = constant_op.constant([2., 4., 6.],
                                              dtype=dtypes.float64)
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.compute_average_loss,
          args=(per_example_loss,),
          kwargs={"sample_weight": 2})
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertEqual(loss.dtype, dtypes.float64)

  def testComputeAverageLossInvalidRank(self):
    per_example_loss = constant_op.constant(2)

    # Static rank
    with self.assertRaisesRegex(
        ValueError, "Invalid value passed for `per_example_loss`. "
        "Expected a tensor with at least rank 1,"):
      nn_impl.compute_average_loss(per_example_loss)

    with context.graph_mode():
      # Dynamic rank
      per_example_loss = array_ops.placeholder(dtype=dtypes.float32)
      loss = nn_impl.compute_average_loss(per_example_loss)

      with self.cached_session() as sess:
        with self.assertRaisesRegex(
            errors.InvalidArgumentError,
            "Invalid value passed for `per_example_loss`. "
            "Expected a tensor with at least rank 1."):
          sess.run(loss, {per_example_loss: 2})

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testComputeAverageLossInCrossReplicaContext(self, distribution):
    with distribution.scope():
      with self.assertRaisesRegex(
          RuntimeError,
          "You are calling `compute_average_loss` in cross replica context"):
        nn_impl.compute_average_loss([2, 3])

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testScaleRegularizationLoss(self, distribution):
    # Without strategy - num replicas = 1
    reg_losses = constant_op.constant([2.5, 6.2, 5.])
    loss = nn_impl.scale_regularization_loss(reg_losses)
    self.assertAllClose(self.evaluate(loss), (2.5 + 6.2 + 5.))

    # With strategy - num replicas = 2
    with distribution.scope():
      per_replica_losses = distribution.experimental_run_v2(
          nn_impl.scale_regularization_loss, args=(reg_losses,))
      loss = distribution.reduce("SUM", per_replica_losses, axis=None)
      self.assertAllClose(self.evaluate(loss), (2.5 + 6.2 + 5.))

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.mirrored_strategy_with_cpu_1_and_2
          ],
          mode=["graph", "eager"]))
  def testScaleRegularizationLossInCrossReplicaContext(self, distribution):
    with distribution.scope():
      with self.assertRaisesRegex(
          RuntimeError, "You are calling `scale_regularization_loss` in "
          "cross replica context"):
        nn_impl.scale_regularization_loss([2, 3])
Example 37
  @staticmethod
  def all_reduce(t, group_size, group_key, instance_key, *args, **kwargs):
    group_size = array_ops.identity(group_size)
    group_key = array_ops.identity(group_key)
    instance_key = array_ops.identity(instance_key)
    return _collective_ops.all_reduce_v2(t, group_size, group_key, instance_key,
                                         *args, **kwargs)

  @staticmethod
  def all_gather(t, group_size, group_key, instance_key, *args, **kwargs):
    group_size = array_ops.identity(group_size)
    group_key = array_ops.identity(group_key)
    instance_key = array_ops.identity(instance_key)
    return _collective_ops.all_gather_v2(t, group_size, group_key, instance_key,
                                         *args, **kwargs)


device_combination = (
    combinations.combine(device='CPU', communication='RING', required_gpus=0) +
    combinations.combine(
        device='GPU', communication=['RING', 'NCCL'], required_gpus=2))
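# The sum above yields three device configurations: (CPU, RING, 0 required
# GPUs), (GPU, RING, 2 required GPUs) and (GPU, NCCL, 2 required GPUs).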


@combinations.generate(
    combinations.times(
        combinations.combine(
            collective_ops=[
                combinations.NamedObject('v1', CollectiveOpsV1),
                combinations.NamedObject('v2', CollectiveOpsV2)
            ],
            mode='eager'), device_combination))
class CollectiveOpsTest(test.TestCase, parameterized.TestCase):

  def setUp(self):
Example 38
from tensorflow.python.eager import def_function
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
from tensorflow.python.util import nest


@combinations.generate(
    combinations.combine(strategy=[
        strategy_combinations.multi_worker_mirrored_2x1_cpu,
        strategy_combinations.multi_worker_mirrored_2x1_gpu,
    ] + strategy_combinations.all_strategies,
                         mode=['eager']))
class StrategyTest(test.TestCase, parameterized.TestCase):
    def testCaptureReplicaId(self, strategy):
        m = {}

        @def_function.function
        def f():
            return ds_context.get_replica_context().replica_id_in_sync_group

        @def_function.function
        def g():
            # Make g() a stateful function so it's traced twice.
            if m.get('v', None) is None:
                m['v'] = variables.Variable(0.)
Example 39
from absl.testing import parameterized
import numpy as np

from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.layers import normalization
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import variables
from tensorflow.python.ops.losses import losses
from tensorflow.python.platform import test
from tensorflow.python.training import gradient_descent

all_combinations = combinations.combine(
    distribution=[
        strategy_combinations.one_device_strategy,
    ], mode=["graph"])


class NormalizationTest(test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.times(all_combinations,
                         combinations.combine(fused=[True, False])))
  def testBNWithZeroBatchInput(self, distribution, fused):
    with distribution.scope(), self.cached_session() as sess:
      bn_list = []
      inputs = np.random.random((0, 4, 4, 3)) + 100
      targets = np.random.random((0, 4, 4, 3))
      inputs_placeholder = array_ops.placeholder(
          dtype=dtypes.float32, shape=[None, 4, 4, 3])


class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase):
    @combinations.generate(
        combinations.combine(mode=['eager'],
                             file_format=['h5', 'tf'],
                             save_weights_only=[True, False]))
    def test_model_checkpoint_saves_on_chief_but_not_otherwise(
            self, file_format, mode, save_weights_only):
        def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
                test_obj, file_format):

            model, saving_filepath, train_ds, steps = _model_setup(
                test_obj, file_format)
            num_epoch = 2
            extension = os.path.splitext(saving_filepath)[1]

            # Incorporate type/index information and thread id in saving_filepath to
            # ensure every worker has a unique path. Note that in normal use case the
            # saving_filepath will be the same for all workers, but we use different
            # ones here just to test out chief saves checkpoint but non-chief doesn't.
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(),
                'checkpoint_%s_%d%s' % (test_base.get_task_type(),
                                        test_base.get_task_index(), extension))
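            # e.g. (hypothetical) 'checkpoint_worker_0.h5' for worker 0 when
            # file_format is 'h5'.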

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(checkpoint_exists(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      validation_data=train_ds,
                      validation_steps=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(
                              filepath=saving_filepath,
                              save_weights_only=save_weights_only)
                      ])

            # If it's chief, the model should be saved; if not, the model shouldn't.
            test_obj.assertEqual(checkpoint_exists(saving_filepath),
                                 test_base.is_chief())

            # If it's chief, the model should be saved (`write_filepath` should
            # simply return `saving_filepath`); if not, i.e. for non-chief workers,
            # the temporary path generated by `write_filepath` should no longer
            # contain the checkpoint that has been deleted.
            test_obj.assertEqual(
                checkpoint_exists(
                    distributed_file_utils.write_filepath(
                        saving_filepath, model._distribution_strategy)),
                test_base.is_chief())

        multi_process_runner.run(
            proc_model_checkpoint_saves_on_chief_but_not_otherwise,
            cluster_spec=test_base.create_cluster_spec(num_workers=2),
            args=(self, file_format))

    @combinations.generate(combinations.combine(mode=['eager']))
    def test_model_checkpoint_works_with_same_file_path(self, mode):
        def proc_model_checkpoint_works_with_same_file_path(
                test_obj, saving_filepath):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 2

            # The saving_filepath shouldn't exist at the beginning.
            test_obj.assertFalse(file_io.file_exists(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(filepath=saving_filepath)
                      ])

            test_obj.assertTrue(file_io.file_exists(saving_filepath))

        saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint')

        multi_process_runner.run(
            proc_model_checkpoint_works_with_same_file_path,
            cluster_spec=test_base.create_cluster_spec(num_workers=2),
            args=(self, saving_filepath))

    @combinations.generate(combinations.combine(mode=['eager']))
    def test_backupandrestore_checkpoint_works_with_interruption(self, mode):
        class InterruptingCallback(callbacks.Callback):
            def on_epoch_begin(self, epoch, logs=None):
                if epoch == 2:
                    raise RuntimeError('Interrupting!')

        class AssertCallback(callbacks.Callback):
            def on_epoch_begin(self, epoch, logs=None):
                # The interruption happened on epoch 2 as specified in
                # InterruptingCallback, so the initial epoch after restart
                # will begin at 2.
                assert epoch > 1

        def proc_model_checkpoint_works_with_same_file_path(
                test_obj, saving_filepath):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 4

            # The saving_filepath shouldn't exist at the beginning.
            test_obj.assertFalse(file_io.file_exists(saving_filepath))
            bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup')

            try:
                model.fit(
                    x=train_ds,
                    epochs=num_epoch,
                    steps_per_epoch=steps,
                    callbacks=[
                        callbacks.ModelCheckpoint(filepath=saving_filepath),
                        callbacks.BackupAndRestore(backup_dir=bar_dir),
                        InterruptingCallback()
                    ])
            except RuntimeError as e:
                if 'Interrupting!' not in str(e):
                    raise

            multi_process_runner.barrier().wait()
            backup_filepath = os.path.join(bar_dir, 'checkpoint')
            test_obj.assertTrue(file_io.file_exists(backup_filepath))
            test_obj.assertTrue(file_io.file_exists(saving_filepath))

            model.fit(x=train_ds,
                      epochs=num_epoch,
                      steps_per_epoch=steps,
                      callbacks=[
                          callbacks.ModelCheckpoint(filepath=saving_filepath),
                          callbacks.BackupAndRestore(backup_dir=bar_dir),
                          AssertCallback()
                      ])
            multi_process_runner.barrier().wait()
            test_obj.assertFalse(file_io.file_exists(backup_filepath))
            test_obj.assertTrue(file_io.file_exists(saving_filepath))

        saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint')

        multi_process_runner.run(
            proc_model_checkpoint_works_with_same_file_path,
            cluster_spec=test_base.create_cluster_spec(num_workers=2),
            args=(self, saving_filepath))

    @combinations.generate(combinations.combine(mode=['eager']))
    def test_tensorboard_saves_on_chief_but_not_otherwise(self, mode):
        def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 2

            # Incorporate type/index information and thread id in saving_filepath to
            # ensure every worker has a unique path. Note that in normal use case the
            # saving_filepath will be the same for all workers, but we use different
            # ones here just to test out chief saves summaries but non-chief doesn't.
            saving_filepath = os.path.join(
                test_obj.get_temp_dir(), 'logfile_%s_%d' %
                (test_base.get_task_type(), test_base.get_task_index()))

            # The saving_filepath shouldn't exist at the beginning (as it's unique).
            test_obj.assertFalse(file_io.file_exists(saving_filepath))

            model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

            # If it's chief, the summaries should be saved in the filepath; if not,
            # the directory should be empty (although created). Using
            # `file_io.list_directory()` since the directory may be created at this
            # point.
            test_obj.assertEqual(bool(file_io.list_directory(saving_filepath)),
                                 test_base.is_chief())

        multi_process_runner.run(
            proc_tensorboard_saves_on_chief_but_not_otherwise,
            cluster_spec=test_base.create_cluster_spec(num_workers=2),
            args=(self, ))

    @combinations.generate(combinations.combine(mode=['eager']))
    def test_tensorboard_can_still_save_to_temp_even_if_it_exists(self, mode):
        def proc_tensorboard_can_still_save_to_temp_even_if_it_exists(
                test_obj):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 2

            saving_filepath = os.path.join(
                test_obj.get_temp_dir(),
                'logfile_%s' % (test_base.get_task_type()))

            saving_filepath_for_temp = os.path.join(saving_filepath,
                                                    'workertemp_1')
            os.mkdir(saving_filepath)
            os.mkdir(saving_filepath_for_temp)

            # Verifies that even if `saving_filepath_for_temp` exists, tensorboard
            # can still save to temporary directory.
            test_obj.assertTrue(file_io.file_exists(saving_filepath_for_temp))

            model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

        multi_process_runner.run(
            proc_tensorboard_can_still_save_to_temp_even_if_it_exists,
            cluster_spec=test_base.create_cluster_spec(num_workers=2),
            args=(self, ))

    @combinations.generate(combinations.combine(mode=['eager']))
    def test_tensorboard_works_with_same_file_path(self, mode):
        def proc_tensorboard_works_with_same_file_path(test_obj,
                                                       saving_filepath):
            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            num_epoch = 2

            # The saving_filepath shouldn't exist at the beginning.
            test_obj.assertFalse(file_io.file_exists(saving_filepath))

            multi_process_runner.barrier().wait()

            model.fit(
                x=train_ds,
                epochs=num_epoch,
                steps_per_epoch=steps,
                callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

            multi_process_runner.barrier().wait()

            test_obj.assertTrue(file_io.list_directory(saving_filepath))

        saving_filepath = os.path.join(self.get_temp_dir(), 'logfile')

        multi_process_runner.run(
            proc_tensorboard_works_with_same_file_path,
            cluster_spec=test_base.create_cluster_spec(num_workers=2),
            args=(self, saving_filepath))

    @combinations.generate(combinations.combine(mode=['eager']))
    def test_early_stopping(self, mode):
        def proc_early_stopping(test_obj):
            class EpochCounterCallback(callbacks.Callback):
                def on_epoch_begin(self, epoch, logs):
                    self.last_epoch = epoch

            model, _, train_ds, steps = _model_setup(test_obj, file_format='')
            epoch_counter_cbk = EpochCounterCallback()
            cbks = [
                callbacks.EarlyStopping(monitor='loss',
                                        min_delta=0.05,
                                        patience=1,
                                        verbose=1), epoch_counter_cbk
            ]

            # Empirically, it is expected that `model.fit()` terminates around
            # the 22nd epoch. Asserting that it should have been stopped before
            # the 50th epoch to avoid flakiness and be more predictable.
            model.fit(x=train_ds,
                      epochs=100,
                      steps_per_epoch=steps,
                      callbacks=cbks)
            test_obj.assertLess(epoch_counter_cbk.last_epoch, 50)

        multi_process_runner.run(
            proc_early_stopping,
            cluster_spec=test_base.create_cluster_spec(num_workers=2),
            args=(self, ))


def eager_mode_test_configuration():
  return combinations.combine(mode='eager',
                              use_numpy=False,
                              use_validation_data=False)
Example 42
def all_strategy_and_input_config_combinations():
    return (combinations.times(
        combinations.combine(distribution=all_strategies,
                             experimental_run_tf_function=[True, False]),
        eager_mode_test_configuration() + graph_mode_test_configuration()))


def all_strategy_combinations_with_graph_mode():
  return (combinations.combine(
      distribution=keras_correctness_test_base.all_strategies,
      mode=['graph'],
      cloning=[True, False]))


def all_strategy_combinations_with_graph_mode():
    return (combinations.combine(
        distribution=keras_correctness_test_base.all_strategies,
        mode=['graph'],
        run_distributed=[True, False]))


class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase):

  @combinations.generate(
      combinations.combine(
          mode=['eager'],
          file_format=['h5', 'tf'],
          save_weights_only=[True, False]))
  def test_model_checkpoint_saves_on_chief_but_not_otherwise(
      self, file_format, mode, save_weights_only):

    def proc_model_checkpoint_saves_on_chief_but_not_otherwise(
        test_obj, file_format):
      model, saving_filepath, train_ds, steps = _model_setup(
          test_obj, file_format)
      num_epoch = 2
      extension = os.path.splitext(saving_filepath)[1]

      # Incorporate type/index information and thread id in saving_filepath to
      # ensure every worker has a unique path. Note that in normal use case the
      # saving_filepath will be the same for all workers, but we use different
      # ones here just to test out chief saves checkpoint but non-chief doesn't.
      saving_filepath = os.path.join(
          test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' %
          (test_base.get_task_type(), test_base.get_task_index(), extension))

      # The saving_filepath shouldn't exist at the beginning (as it's unique).
      test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath))

      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=[
              callbacks.ModelCheckpoint(
                  filepath=saving_filepath, save_weights_only=save_weights_only)
          ])

      # If it's chief, the model should be saved; if not, the model shouldn't.
      test_obj.assertEqual(
          training_state.checkpoint_exists(saving_filepath),
          test_base.is_chief())

    # TODO(b/141948186): Remove this `with` block once b/141948186 is resolved.
    with multi_process_runner_util.try_run_and_except_connection_error(self):
      multi_process_runner.run(
          proc_model_checkpoint_saves_on_chief_but_not_otherwise,
          cluster_spec=test_base.create_cluster_spec(num_workers=2),
          args=(self, file_format))

  @combinations.generate(combinations.combine(mode=['eager']))
  def test_tensorboard_saves_on_chief_but_not_otherwise(self, mode):

    def proc_tensorboard_saves_on_chief_but_not_otherwise(test_obj):
      model, _, train_ds, steps = _model_setup(test_obj, file_format='')
      num_epoch = 2

      # Incorporate type/index information and thread id in saving_filepath to
      # ensure every worker has a unique path. Note that in normal use case the
      # saving_filepath will be the same for all workers, but we use different
      # ones here just to test out chief saves summaries but non-chief doesn't.
      saving_filepath = os.path.join(
          test_obj.get_temp_dir(), 'logfile_%s_%d' %
          (test_base.get_task_type(), test_base.get_task_index()))

      # The saving_filepath shouldn't exist at the beginning (as it's unique).
      test_obj.assertFalse(file_io.file_exists(saving_filepath))

      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

      # If it's chief, the summaries should be saved in the filepath; if not,
      # the directory should be empty (although created). Using
      # `file_io.list_directory()` since the directory may be created at this
      # point.
      test_obj.assertEqual(
          bool(file_io.list_directory(saving_filepath)), test_base.is_chief())

    # TODO(b/141948186): Remove this `with` block once b/141948186 is resolved.
    with multi_process_runner_util.try_run_and_except_connection_error(self):
      multi_process_runner.run(
          proc_tensorboard_saves_on_chief_but_not_otherwise,
          cluster_spec=test_base.create_cluster_spec(num_workers=2),
          args=(self,))

  @combinations.generate(combinations.combine(mode=['eager']))
  def test_tensorboard_can_still_save_to_temp_even_if_it_exists(self, mode):

    def proc_tensorboard_can_still_save_to_temp_even_if_it_exists(test_obj):
      model, _, train_ds, steps = _model_setup(test_obj, file_format='')
      num_epoch = 2

      saving_filepath = os.path.join(test_obj.get_temp_dir(),
                                     'logfile_%s' % (test_base.get_task_type()))

      saving_filepath_for_temp = os.path.join(saving_filepath, 'workertemp_1')
      os.mkdir(saving_filepath)
      os.mkdir(saving_filepath_for_temp)

      # Verifies that even if `saving_filepath_for_temp` exists, tensorboard
      # can still save to temporary directory.
      test_obj.assertTrue(file_io.file_exists(saving_filepath_for_temp))

      model.fit(
          x=train_ds,
          epochs=num_epoch,
          steps_per_epoch=steps,
          callbacks=[callbacks.TensorBoard(log_dir=saving_filepath)])

    # TODO(b/141948186): Remove this `with` block once b/141948186 is resolved.
    with multi_process_runner_util.try_run_and_except_connection_error(self):
      multi_process_runner.run(
          proc_tensorboard_can_still_save_to_temp_even_if_it_exists,
          cluster_spec=test_base.create_cluster_spec(num_workers=2),
          args=(self,))
Example 46
        context.LogicalDeviceConfiguration(64),
        context.LogicalDeviceConfiguration(64),
    ])
    collective_all_reduce_strategy.CollectiveAllReduceStrategy(
        cluster_resolver=resolver)
    # Since we create two logical GPUs out of the last GPU, there should be
    # one more logical GPU than physical GPUs.
    self.assertLen(tf_config.list_logical_devices('GPU'), len(gpus) + 1)
    context._reset_context()  # pylint: disable=protected-access


@combinations.generate(
    combinations.combine(
        strategy=[
            strategy_combinations.multi_worker_mirrored_2x1_cpu,
            strategy_combinations.multi_worker_mirrored_2x1_gpu,
            strategy_combinations.multi_worker_mirrored_2x2_gpu,
        ],
        mode=['eager']))
class CollectiveAllReduceStrategyV2Test(test.TestCase, parameterized.TestCase):

  def test_replica_id_in_sync_group(self, strategy):

    def replica_fn():
      replica_ctx = distribution_strategy_context.get_replica_context()
      return replica_ctx.replica_id_in_sync_group, replica_ctx._replica_id

    results = test_util.gather(strategy, strategy.run(replica_fn))
    self.assertAllEqual(list(range(strategy.extended._num_replicas_in_sync)),
                        results[0].numpy())
    self.assertAllEqual(


from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from tensorflow.python.distribute import strategy_test_lib
from tensorflow.python.eager import context
from tensorflow.python.eager import test


@combinations.generate(
    combinations.combine(
        distribution=[
            strategy_combinations.one_device_strategy,
            strategy_combinations.one_device_strategy_gpu
        ],
        mode=["eager", "graph"]))
class OneDeviceStrategyTest(
    strategy_test_lib.DistributionTestBase,
    strategy_test_lib.OneDeviceDistributionTestBase):

  def testMinimizeLoss(self, distribution):
    if context.executing_eagerly():
      self._test_minimize_loss_eager(distribution)
    else:
      self._test_minimize_loss_graph(distribution)

  def testReplicaId(self, distribution):
    self._test_replica_id(distribution)
Example 48
class DistributedCollectiveAllReduceStrategyTest(
    CollectiveAllReduceStrategyTestBase,
    strategy_test_lib.DistributionTestBase,
    parameterized.TestCase):

  @classmethod
  def setUpClass(cls):
    """Create a local cluster with 3 workers."""
    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
        num_workers=3, num_ps=0)

  @combinations.generate(combinations.combine(mode=['graph']))
  def test_num_replicas_in_sync(self):
    distribution, _, _ = create_test_objects(
        cluster_spec=self._cluster_spec,
        task_type='worker',
        task_id=0,
        num_gpus=2)
    num_workers = len(self._cluster_spec.get('chief', []) +
                      self._cluster_spec.get('worker', []))
    self.assertEqual(2 * num_workers,
                     distribution.num_replicas_in_sync)

  @combinations.generate(combinations.combine(
      mode=['graph'],
      prefetch_to_device=[None, True]))
  def test_prefetch_to_device_dataset(self, prefetch_to_device):
    distribution, _, _ = self._get_test_object(
        task_type='worker',
        task_id=0,
        num_gpus=2)
    if prefetch_to_device is None:
      input_options = None
    else:
      input_options = distribute_lib.InputOptions(
          experimental_prefetch_to_device=prefetch_to_device)
    dataset = dataset_ops.Dataset.range(100)
    dataset = dataset.batch(distribution.num_replicas_in_sync)
    dataset = distribution.experimental_distribute_dataset(
        dataset, options=input_options)
    if isinstance(dataset, input_lib.DistributedDatasetV1):
      item = dataset.make_initializable_iterator().get_next()
    else:
      self.skipTest('unsupported test combination')
    device_types = {
        tf_device.DeviceSpec.from_string(tensor.device).device_type for
        tensor in item.values}
    self.assertAllEqual(list(device_types), ['GPU'])

  @combinations.generate(combinations.combine(mode=['graph']))
  def test_prefetch_to_host_dataset(self):
    distribution, _, _ = self._get_test_object(
        task_type='worker',
        task_id=0,
        num_gpus=2)
    input_options = distribute_lib.InputOptions(
        experimental_prefetch_to_device=False)
    dataset = dataset_ops.Dataset.range(100)
    dataset = dataset.batch(distribution.num_replicas_in_sync)
    dataset = distribution.experimental_distribute_dataset(
        dataset, options=input_options)
    if isinstance(dataset, input_lib.DistributedDatasetV1):
      item = dataset.make_initializable_iterator().get_next()
    else:
      self.skipTest('unsupported test combination')
    device_types = {
        tf_device.DeviceSpec.from_string(tensor.device).device_type for
        tensor in item.values}
    self.assertAllEqual(list(device_types), ['CPU'])

  @combinations.generate(
      combinations.combine(mode=['graph'], required_gpus=[0, 1, 2]))
  def testMinimizeLossGraph(self, required_gpus):
    self._run_between_graph_clients(self._test_minimize_loss_graph,
                                    self._cluster_spec, required_gpus)

  @combinations.generate(
      combinations.combine(mode=['graph'], required_gpus=[0, 1, 2]))
  def testVariableInitialization(self, required_gpus):
    self._run_between_graph_clients(
        self._test_variable_initialization,
        self._cluster_spec,
        num_gpus=required_gpus)

  @combinations.generate(
      combinations.combine(
          mode=['graph'], required_gpus=[0, 1, 2], use_dataset=[True, False]))
  def testMakeInputFnIterator(self, required_gpus, use_dataset):
    def _worker_fn(task_type, task_id, required_gpus):
      if use_dataset:
        fn = lambda: dataset_ops.Dataset.range(20)
      else:
        def fn():
          dataset = dataset_ops.Dataset.range(20)
          it = dataset_ops.make_one_shot_iterator(dataset)
          return it.get_next
      # We use CPU as the device when required_gpus = 0
      devices_per_worker = max(1, required_gpus)
      expected_values = [[i+j for j in range(devices_per_worker)]
                         for i in range(0, 20, devices_per_worker)]
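      # e.g. with 2 devices per worker: [[0, 1], [2, 3], ..., [18, 19]].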

      input_fn = self._input_fn_to_test_input_context(
          fn,
          expected_num_replicas_in_sync=3*devices_per_worker,
          expected_num_input_pipelines=3,
          expected_input_pipeline_id=task_id)
      self._test_input_fn_iterator(
          task_type,
          task_id,
          required_gpus,
          input_fn,
          expected_values,
          test_reinitialize=use_dataset,
          ignore_order=not use_dataset)

    self._run_between_graph_clients(_worker_fn, self._cluster_spec,
                                    required_gpus)

  @combinations.generate(combinations.combine(mode=['graph']))
  def testUpdateConfigProto(self):
    strategy, _, _ = self._get_test_object(
        task_type='worker', task_id=1, num_gpus=2)

    config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
    rewrite_options = config_proto.graph_options.rewrite_options
    rewrite_options.scoped_allocator_opts.enable_op.append('to_be_removed')

    new_config = strategy.update_config_proto(config_proto)

    # Verify group leader
    self.assertEqual('/job:worker/replica:0/task:0',
                     new_config.experimental.collective_group_leader)

    # Verify device filters.
    self.assertEqual(['/job:worker/task:1'], new_config.device_filters)

    # Verify rewrite options.
    new_rewrite_options = new_config.graph_options.rewrite_options
    self.assertEqual(rewriter_config_pb2.RewriterConfig.ON,
                     new_rewrite_options.scoped_allocator_optimization)
    self.assertEqual(['CollectiveReduce'],
                     new_rewrite_options.scoped_allocator_opts.enable_op)
Example 49
class TrainLibTest(tf.test.TestCase, parameterized.TestCase):

  def setUp(self):
    super().setUp()
    self._test_config = {
        'trainer': {
            'checkpoint_interval': 10,
            'steps_per_loop': 10,
            'summary_interval': 10,
            'train_steps': 10,
            'validation_steps': 5,
            'validation_interval': 10,
            'continuous_eval_timeout': 1,
            'optimizer_config': {
                'optimizer': {
                    'type': 'sgd',
                },
                'learning_rate': {
                    'type': 'constant'
                }
            }
        },
    }

  @combinations.generate(
      combinations.combine(
          distribution_strategy=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          mode='eager',
          flag_mode=['train', 'eval', 'train_and_eval']))
  def test_end_to_end(self, distribution_strategy, flag_mode):
    model_dir = self.get_temp_dir()
    experiment_config = configs.MultiTaskExperimentConfig(
        task=configs.MultiTaskConfig(
            task_routines=(
                configs.TaskRoutine(
                    task_name='foo', task_config=test_utils.FooConfig()),
                configs.TaskRoutine(
                    task_name='bar', task_config=test_utils.BarConfig()))))
    experiment_config = params_dict.override_params_dict(
        experiment_config, self._test_config, is_strict=False)
    with distribution_strategy.scope():
      test_multitask = multitask.MultiTask.from_config(experiment_config.task)
      model = test_utils.MockMultiTaskModel()
    train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=test_multitask,
        model=model,
        mode=flag_mode,
        params=experiment_config,
        model_dir=model_dir)

  @combinations.generate(
      combinations.combine(
          distribution_strategy=[
              strategy_combinations.default_strategy,
              strategy_combinations.cloud_tpu_strategy,
              strategy_combinations.one_device_strategy_gpu,
          ],
          mode='eager',
          flag_mode=['train', 'eval', 'train_and_eval']))
  def test_end_to_end_multi_eval(self, distribution_strategy, flag_mode):
    model_dir = self.get_temp_dir()
    experiment_config = configs.MultiEvalExperimentConfig(
        task=test_utils.FooConfig(),
        eval_tasks=(configs.TaskRoutine(
            task_name='foo', task_config=test_utils.FooConfig(), eval_steps=2),
                    configs.TaskRoutine(
                        task_name='bar',
                        task_config=test_utils.BarConfig(),
                        eval_steps=3)))
    experiment_config = params_dict.override_params_dict(
        experiment_config, self._test_config, is_strict=False)
    with distribution_strategy.scope():
      train_task = task_factory.get_task(experiment_config.task)
      eval_tasks = [
          task_factory.get_task(config.task_config, name=config.task_name)
          for config in experiment_config.eval_tasks
      ]
    train_lib.run_experiment_with_multitask_eval(
        distribution_strategy=distribution_strategy,
        train_task=train_task,
        eval_tasks=eval_tasks,
        mode=flag_mode,
        params=experiment_config,
        model_dir=model_dir)
Example 50
class LocalCollectiveAllReduceStrategy(
    CollectiveAllReduceStrategyTestBase,
    strategy_test_lib.DistributionTestBase,
    strategy_test_lib.TwoDeviceDistributionTestBase,
    parameterized.TestCase):

  @combinations.generate(
      combinations.combine(mode=['graph', 'eager'], required_gpus=[2, 4]))
  def testMinimizeLoss(self, required_gpus):
    # Collective ops doesn't support strategy with one device.
    if context.executing_eagerly():
      strategy, _, _ = self._get_test_object(None, None, required_gpus)
      self._test_minimize_loss_eager(strategy)
    else:
      self._test_minimize_loss_graph(None, None, required_gpus)

  @combinations.generate(
      combinations.combine(
          mode=['graph'], required_gpus=2, use_dataset=[True, False]))
  def testMakeInputFnIterator(self, required_gpus, use_dataset):
    if use_dataset:
      fn = lambda: dataset_ops.Dataset.range(5 * required_gpus)
    else:
      def fn():
        dataset = dataset_ops.Dataset.range(5 * required_gpus)
        it = dataset_ops.make_one_shot_iterator(dataset)
        return it.get_next

    expected_values = [
        range(i, i + required_gpus) for i in range(0, 10, required_gpus)
    ]

    input_fn = self._input_fn_to_test_input_context(
        fn,
        expected_num_replicas_in_sync=required_gpus,
        expected_num_input_pipelines=1,
        expected_input_pipeline_id=0)
    self._test_input_fn_iterator(
        None,
        None,
        required_gpus,
        input_fn,
        expected_values,
        test_reinitialize=use_dataset,
        ignore_order=not use_dataset)

  @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2))
  def testAllReduceSum(self, required_gpus):
    distribution, target, config = self._get_test_object(
        None, None, num_gpus=required_gpus)
    with self.cached_session(config=config, target=target):
      self._test_all_reduce_sum(distribution)

  @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2))
  def testAllReduceSumGradients(self, required_gpus):
    distribution, target, config = self._get_test_object(
        None, None, num_gpus=required_gpus)
    with self.cached_session(config=config, target=target):
      self._test_all_reduce_sum_gradients(distribution)

  @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2))
  def testAllReduceSumGradientTape(self, required_gpus):
    distribution, target, config = self._get_test_object(
        None, None, num_gpus=required_gpus)
    with self.cached_session(config=config, target=target):
      self._test_all_reduce_sum_gradient_tape(distribution)

  @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2))
  def testAllReduceMean(self, required_gpus):
    distribution, target, config = self._get_test_object(
        None, None, num_gpus=required_gpus)
    with self.cached_session(config=config, target=target):
      self._test_all_reduce_mean(distribution)

  @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2))
  def testAllReduceMeanGradients(self, required_gpus):
    distribution, target, config = self._get_test_object(
        None, None, num_gpus=required_gpus)
    with self.cached_session(config=config, target=target):
      self._test_all_reduce_mean_gradients(distribution)

  @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2))
  def testAllReduceMeanGradientTape(self, required_gpus):
    distribution, target, config = self._get_test_object(
        None, None, num_gpus=required_gpus)
    with self.cached_session(config=config, target=target):
      self._test_all_reduce_mean_gradient_tape(distribution)

  @combinations.generate(combinations.combine(mode=['graph'], required_gpus=2))
  def testNumpyDataset(self, required_gpus):
    strategy, target, config = self._get_test_object(
        None, None, num_gpus=required_gpus)
    self._test_numpy_dataset(
        strategy, session=self.cached_session(config=config, target=target))
Example n. 51
def strategy_for_numpy_input_combinations():
  return combinations.combine(
      distribution=strategies_minus_tpu + tpu_strategies,
      mode=['graph'])
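
A minimal usage sketch (not part of the original snippet; the test class and
its body are hypothetical): helpers like this are passed to
`combinations.generate`, which expands each generated OrderedDict into one
parameterized test case.

class NumpyInputUsageSketch(test.TestCase, parameterized.TestCase):

  @combinations.generate(strategy_for_numpy_input_combinations())
  def testNumpyInputs(self, distribution):
    # `distribution` is filled in from the generated combinations; the 'graph'
    # mode key is consumed by the test framework itself.
    with distribution.scope():
      pass  # hypothetical test body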
Example n. 52
class ParameterServerStrategyTest(
        ParameterServerStrategyTestBase,
        strategy_test_lib.DistributionTestBase,
        strategy_test_lib.TwoDeviceDistributionTestBase,
        parameterized.TestCase):
    @classmethod
    def setUpClass(cls):
        cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
            num_workers=3, num_ps=2)
        cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def test_num_replicas_in_sync(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            num_gpus=2, use_core_strategy=use_core_strategy)
        # All the devices on a given worker are in sync, so num_replicas_in_sync
        # equals the number of GPUs on each worker (two here).
        self.assertEqual(2, strategy.num_replicas_in_sync)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testDeviceAssignmentLocalCPU(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            num_gpus=0, use_core_strategy=use_core_strategy)
        self._test_device_assignment_local(strategy,
                                           compute_device='CPU',
                                           variable_device='CPU',
                                           num_gpus=0)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testDeviceAssignmentLocalOneGPU(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            num_gpus=1, use_core_strategy=use_core_strategy)
        self._test_device_assignment_local(strategy,
                                           compute_device='GPU',
                                           variable_device='GPU',
                                           num_gpus=1)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testDeviceAssignmentLocalTwoGPUs(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            num_gpus=2, use_core_strategy=use_core_strategy)
        self._test_device_assignment_local(strategy,
                                           compute_device='GPU',
                                           variable_device='CPU',
                                           num_gpus=2)

    @combinations.generate(
        combinations.combine(mode=['graph'],
                             num_gpus=[0, 1, 2],
                             use_core_strategy=[True, False]))
    def testDeviceAssignmentDistributed(self, num_gpus, use_core_strategy):
        self._test_device_assignment_distributed(
            'worker', 1, num_gpus, use_core_strategy=use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'],
                             num_gpus=[0, 1, 2],
                             use_core_strategy=[True, False]))
    def testDeviceAssignmentDistributedEnablePartitioner(
            self, num_gpus, use_core_strategy):
        self._test_device_assignment_distributed_enable_partitioner(
            'worker', 1, num_gpus, use_core_strategy=use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testSimpleBetweenGraph(self, use_core_strategy):
        self._run_between_graph_clients(self._test_simple_increment,
                                        self._cluster_spec,
                                        context.num_gpus(),
                                        use_core_strategy=use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'],
                             num_gpus=[0, 1, 2],
                             use_core_strategy=[True, False]))
    def testLocalSimpleIncrement(self, num_gpus, use_core_strategy):
        self._test_simple_increment(None, 0, num_gpus, use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'],
                             num_gpus=[0, 1, 2],
                             use_core_strategy=[True, False]))
    def testMinimizeLossGraphDistributed(self, num_gpus, use_core_strategy):
        self._run_between_graph_clients(self._test_minimize_loss_graph,
                                        self._cluster_spec,
                                        num_gpus,
                                        use_core_strategy=use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'],
                             num_gpus=[0, 1, 2],
                             use_core_strategy=[True, False]))
    def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy):
        self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy)

    # TODO(priyag): Refactor this and other multi worker tests.
    @combinations.generate(
        combinations.combine(mode=['graph'],
                             num_gpus=[1, 2],
                             required_gpus=1,
                             use_core_strategy=[True, False],
                             use_dataset=[True, False]))
    def testMakeInputFnIteratorDistributed(self, num_gpus, use_core_strategy,
                                           use_dataset):
        if context.num_gpus() < num_gpus:
            self.skipTest('Not enough GPUs')
        if use_dataset:
            fn = lambda: dataset_ops.Dataset.range(100)
        else:

            def fn():
                dataset = dataset_ops.Dataset.range(100)
                it = dataset.make_one_shot_iterator()
                return it.get_next

        expected_values = [[i + j for j in range(num_gpus)]
                           for i in range(0, 100, num_gpus)]

        input_fn = self._input_fn_to_test_input_context(
            fn,
            expected_num_replicas_in_sync=num_gpus,
            expected_num_input_pipelines=3,
            expected_input_pipeline_id=1)  # because task_id = 1
        self._test_input_fn_iterator('worker',
                                     1,
                                     num_gpus,
                                     input_fn,
                                     expected_values,
                                     test_reinitialize=use_dataset,
                                     ignore_order=not use_dataset,
                                     use_core_strategy=use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'],
                             num_gpus=[1, 2],
                             required_gpus=1,
                             use_core_strategy=[True, False],
                             use_dataset=[True, False]))
    def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy,
                                     use_dataset):
        if context.num_gpus() < num_gpus:
            self.skipTest('Not enough GPUs')
        if use_dataset:
            fn = lambda: dataset_ops.Dataset.range(100)
        else:

            def fn():
                dataset = dataset_ops.Dataset.range(100)
                it = dataset.make_one_shot_iterator()
                return it.get_next

        expected_values = [[i + j for j in range(num_gpus)]
                           for i in range(0, 100, num_gpus)]

        input_fn = self._input_fn_to_test_input_context(
            fn,
            expected_num_replicas_in_sync=num_gpus,
            expected_num_input_pipelines=1,
            expected_input_pipeline_id=0
        )  # only one worker and pipeline for local.
        self._test_input_fn_iterator(None,
                                     None,
                                     num_gpus,
                                     input_fn,
                                     expected_values,
                                     test_reinitialize=use_dataset,
                                     ignore_order=not use_dataset,
                                     use_core_strategy=use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testGlobalStepUpdate(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            use_core_strategy=use_core_strategy)
        self._test_global_step_update(strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testUpdateConfigProtoMultiWorker(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            cluster_spec=self._cluster_spec,
            task_type='worker',
            task_id=1,
            num_gpus=2,
            use_core_strategy=use_core_strategy)

        config_proto = config_pb2.ConfigProto(
            device_filters=['to_be_overridden'])

        new_config = strategy.update_config_proto(config_proto)

        # Verify device filters.
        self.assertEqual(['/job:worker/task:1', '/job:ps'],
                         new_config.device_filters)

        # Verify isolate_session_state
        self.assertFalse(new_config.isolate_session_state)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testUpdateConfigProtoLocal(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            num_gpus=2, use_core_strategy=use_core_strategy)

        config_proto = config_pb2.ConfigProto()
        new_config = strategy.update_config_proto(config_proto)

        # Verify isolate_session_state
        self.assertTrue(new_config.isolate_session_state)

    @combinations.generate(combinations.combine(required_gpus=[2]))
    def testAllReduceSum(self):
        distribution = parameter_server_strategy.ParameterServerStrategy(
            num_gpus_per_worker=2)
        self._test_all_reduce_sum(distribution)

    @combinations.generate(combinations.combine(required_gpus=[2]))
    def testAllReduceSumGradients(self):
        distribution = parameter_server_strategy.ParameterServerStrategy(
            num_gpus_per_worker=2)
        self._test_all_reduce_sum_gradients(distribution)

    @combinations.generate(combinations.combine(required_gpus=[2]))
    def testAllReduceSumGradientTape(self):
        distribution = parameter_server_strategy.ParameterServerStrategy(
            num_gpus_per_worker=2)
        self._test_all_reduce_sum_gradient_tape(distribution)

    @combinations.generate(combinations.combine(required_gpus=[2]))
    def testAllReduceMean(self):
        distribution = parameter_server_strategy.ParameterServerStrategy(
            num_gpus_per_worker=2)
        self._test_all_reduce_mean(distribution)

    @combinations.generate(combinations.combine(required_gpus=[2]))
    def testAllReduceMeanGradients(self):
        distribution = parameter_server_strategy.ParameterServerStrategy(
            num_gpus_per_worker=2)
        self._test_all_reduce_mean_gradients(distribution)

    @combinations.generate(combinations.combine(required_gpus=[2]))
    def testAllReduceMeanGradientTape(self):
        distribution = parameter_server_strategy.ParameterServerStrategy(
            num_gpus_per_worker=2)
        self._test_all_reduce_mean_gradient_tape(distribution)

    def testTrainableVariables(self):
        distribution = parameter_server_strategy.ParameterServerStrategy()
        self._test_trainable_variable(distribution)
Example n. 53
def strategy_minus_tpu_combinations():
  return combinations.combine(
      distribution=strategies_minus_tpu, mode=["graph", "eager"])
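
A hedged sketch (hypothetical helper): combinations helpers compose, so the
strategy/mode axes above can be crossed with additional test axes via
`combinations.times`, mirroring the pattern used elsewhere in these examples.

def strategy_minus_tpu_with_numpy_combinations():
  # Crosses every (distribution, mode) pair from the base helper with a
  # use_numpy axis, doubling the number of generated combinations.
  return combinations.times(
      strategy_minus_tpu_combinations(),
      combinations.combine(use_numpy=[True, False]))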
Example n. 54
class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                                           parameterized.TestCase):
    @classmethod
    def setUpClass(cls):
        cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
            num_workers=3, num_ps=2, has_chief=True)
        cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0]

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testSimpleBetweenGraph(self, use_core_strategy):
        self._run_between_graph_clients(self._test_simple_increment,
                                        self._cluster_spec,
                                        context.num_gpus(),
                                        use_core_strategy=use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'],
                             num_gpus=[0, 1, 2],
                             use_core_strategy=[True, False]))
    def testMinimizeLossGraph(self, num_gpus, use_core_strategy):
        self._run_between_graph_clients(self._test_minimize_loss_graph,
                                        self._cluster_spec,
                                        num_gpus,
                                        use_core_strategy=use_core_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testGlobalStepIsWrappedOnTwoGPUs(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            num_gpus=2, use_core_strategy=use_core_strategy)
        with ops.Graph().as_default(), strategy.scope():
            created_step = training_util.create_global_step()
            get_step = training_util.get_global_step()
            self.assertEqual(
                created_step,
                get_step,
                msg=('created_step %s type %s vs. get_step %s type %s' %
                     (id(created_step), created_step.__class__.__name__,
                      id(get_step), get_step.__class__.__name__)))
            self.assertIs(values.AggregatingVariable, type(created_step))
            self.assertIs(values.AggregatingVariable, type(get_step))
            self.assertIs(strategy, created_step.distribute_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testGlobalStepIsNotWrappedOnOneGPU(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            num_gpus=1, use_core_strategy=use_core_strategy)
        with ops.Graph().as_default(), strategy.scope():
            created_step = training_util.create_global_step()
            get_step = training_util.get_global_step()
            self.assertEqual(
                created_step,
                get_step,
                msg=('created_step %s type %s vs. get_step %s type %s' %
                     (id(created_step), created_step.__class__.__name__,
                      id(get_step), get_step.__class__.__name__)))
            self.assertIs(resource_variable_ops.ResourceVariable,
                          type(created_step))
            self.assertIs(resource_variable_ops.ResourceVariable,
                          type(get_step))
            # All variables have a _distribute_strategy attribute. Only variable
            # subclasses used with a distribution strategy expose it publicly.
            self.assertFalse(hasattr(strategy, 'distribute_strategy'))
            self.assertIs(strategy, created_step._distribute_strategy)

    @combinations.generate(
        combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
    def testValueContainer(self, use_core_strategy):
        strategy, _, _ = create_test_objects(
            num_gpus=2, use_core_strategy=use_core_strategy)
        with ops.Graph().as_default(), strategy.scope():

            def f():
                with backprop.GradientTape() as tape:
                    v = variable_scope.get_variable('v', initializer=10.0)
                    _ = v * v
                v, = tape.watched_variables()
                w = strategy.extended.value_container(v)
                self.assertIs(values.AggregatingVariable, type(w))

            strategy.extended.call_for_each_replica(f)
Example n. 55
class FactoryTest(tf.test.TestCase, parameterized.TestCase):
    @combinations.generate(
        combinations.combine(model_id=[18, 34, 50, 101, 152], ))
    def test_resnet_creation(self, model_id):
        """Test creation of ResNet models."""

        network = backbones.ResNet(model_id=model_id,
                                   se_ratio=0.0,
                                   norm_momentum=0.99,
                                   norm_epsilon=1e-5)

        backbone_config = backbones_cfg.Backbone(type='resnet',
                                                 resnet=backbones_cfg.ResNet(
                                                     model_id=model_id,
                                                     se_ratio=0.0))
        norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99,
                                                           norm_epsilon=1e-5,
                                                           use_sync_bn=False)

        factory_network = factory.build_backbone(
            input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
            backbone_config=backbone_config,
            norm_activation_config=norm_activation_config)

        network_config = network.get_config()
        factory_network_config = factory_network.get_config()

        self.assertEqual(network_config, factory_network_config)

    @combinations.generate(
        combinations.combine(
            model_id=['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
            se_ratio=[0.0, 0.25],
        ))
    def test_efficientnet_creation(self, model_id, se_ratio):
        """Test creation of EfficientNet models."""

        network = backbones.EfficientNet(model_id=model_id,
                                         se_ratio=se_ratio,
                                         norm_momentum=0.99,
                                         norm_epsilon=1e-5)

        backbone_config = backbones_cfg.Backbone(
            type='efficientnet',
            efficientnet=backbones_cfg.EfficientNet(model_id=model_id,
                                                    se_ratio=se_ratio))
        norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99,
                                                           norm_epsilon=1e-5,
                                                           use_sync_bn=False)

        factory_network = factory.build_backbone(
            input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
            backbone_config=backbone_config,
            norm_activation_config=norm_activation_config)

        network_config = network.get_config()
        factory_network_config = factory_network.get_config()

        self.assertEqual(network_config, factory_network_config)

    @combinations.generate(
        combinations.combine(
            model_id=[
                'MobileNetV1', 'MobileNetV2', 'MobileNetV3Large',
                'MobileNetV3Small', 'MobileNetV3EdgeTPU'
            ],
            filter_size_scale=[1.0, 0.75],
        ))
    def test_mobilenet_creation(self, model_id, filter_size_scale):
        """Test creation of Mobilenet models."""

        network = backbones.MobileNet(model_id=model_id,
                                      filter_size_scale=filter_size_scale,
                                      norm_momentum=0.99,
                                      norm_epsilon=1e-5)

        backbone_config = backbones_cfg.Backbone(
            type='mobilenet',
            mobilenet=backbones_cfg.MobileNet(
                model_id=model_id, filter_size_scale=filter_size_scale))
        norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99,
                                                           norm_epsilon=1e-5,
                                                           use_sync_bn=False)

        factory_network = factory.build_backbone(
            input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
            backbone_config=backbone_config,
            norm_activation_config=norm_activation_config)

        network_config = network.get_config()
        factory_network_config = factory_network.get_config()

        self.assertEqual(network_config, factory_network_config)

    @combinations.generate(combinations.combine(model_id=['49'], ))
    def test_spinenet_creation(self, model_id):
        """Test creation of SpineNet models."""
        input_size = 128
        min_level = 3
        max_level = 7

        input_specs = tf.keras.layers.InputSpec(
            shape=[None, input_size, input_size, 3])
        network = backbones.SpineNet(input_specs=input_specs,
                                     min_level=min_level,
                                     max_level=max_level,
                                     norm_momentum=0.99,
                                     norm_epsilon=1e-5)

        backbone_config = backbones_cfg.Backbone(
            type='spinenet',
            spinenet=backbones_cfg.SpineNet(model_id=model_id))
        norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99,
                                                           norm_epsilon=1e-5,
                                                           use_sync_bn=False)

        factory_network = factory.build_backbone(
            input_specs=tf.keras.layers.InputSpec(
                shape=[None, input_size, input_size, 3]),
            backbone_config=backbone_config,
            norm_activation_config=norm_activation_config)

        network_config = network.get_config()
        factory_network_config = factory_network.get_config()

        self.assertEqual(network_config, factory_network_config)

    @combinations.generate(combinations.combine(model_id=[38, 56, 104], ))
    def test_revnet_creation(self, model_id):
        """Test creation of RevNet models."""
        network = backbones.RevNet(model_id=model_id,
                                   norm_momentum=0.99,
                                   norm_epsilon=1e-5)

        backbone_config = backbones_cfg.Backbone(
            type='revnet', revnet=backbones_cfg.RevNet(model_id=model_id))
        norm_activation_config = common_cfg.NormActivation(norm_momentum=0.99,
                                                           norm_epsilon=1e-5,
                                                           use_sync_bn=False)

        factory_network = factory.build_backbone(
            input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]),
            backbone_config=backbone_config,
            norm_activation_config=norm_activation_config)

        network_config = network.get_config()
        factory_network_config = factory_network.get_config()

        self.assertEqual(network_config, factory_network_config)

    @combinations.generate(combinations.combine(model_type=['resnet_3d'], ))
    def test_resnet_3d_creation(self, model_type):
        """Test creation of ResNet 3D models."""
        backbone_cfg = backbones_3d_cfg.Backbone3D(type=model_type).get()
        temporal_strides = []
        temporal_kernel_sizes = []
        for block_spec in backbone_cfg.block_specs:
            temporal_strides.append(block_spec.temporal_strides)
            temporal_kernel_sizes.append(block_spec.temporal_kernel_sizes)

        _ = backbones.ResNet3D(model_id=backbone_cfg.model_id,
                               temporal_strides=temporal_strides,
                               temporal_kernel_sizes=temporal_kernel_sizes,
                               norm_momentum=0.99,
                               norm_epsilon=1e-5)
Example n. 56
class CollectiveOpsTest(test.TestCase, parameterized.TestCase):

  def setUp(self):
    super().setUp()
    # Enabling collectives can be done in "setUpClass", but requires using
    # different collective_keys in different tests as collectives are reused
    # across tests. Always resetting collective ops before each test offers
    # better test isolation.
    global_mpr_1p.runner.run(enable_collective_ops)
    global_mpr_2p.runner.run(enable_collective_ops)

  def make_collective(self, num_processes, gpu_per_process, communication):
    """Returns collectives and other info to be used in tests.

    Args:
      num_processes: an integer indicating the number of processes that
        participate in the collective.
      gpu_per_process: number of GPUs (0 if no GPUs) used by each process.
      communication: one of `CollectiveCommunication`.

    Returns:
      A tuple of (collective, devices, task_id) where collective is an instance
      of `CollectiveAllReduce`, devices is a list of local devices (str)
      attached to the current process, and task_id is the task id of the
      current process as reported by the cluster resolver.
    """

    cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
    devices = [
        "/job:worker/replica:0/task:%d/device:CPU:0" % cluster_resolver.task_id
    ]
    if gpu_per_process > 0:
      devices = [
          "/job:worker/replica:0/task:%d/device:GPU:%d" %
          (cluster_resolver.task_id, i) for i in range(gpu_per_process)
      ]
    group_size = num_processes * len(devices)
    collective = cross_device_ops_lib.CollectiveAllReduce(
        devices=devices, group_size=group_size, communication=communication)
    return collective, devices, cluster_resolver.task_id

  def as_list(self, value):
    """An utility to convert a `Mirrored`, `Tensor` or `IndexedSlices` to a list.

    The reason it exists is to provide a uniformed view of returned value of
    "reduce" calls, especially across tf.function boundaries. Returning
    `Mirrored` from a tf.function will only evaluate the primary value, which
    makes collective ops of non-primary device being pruned, and will eventually
    cause hanging.

    Args:
      value: the value to convert, can be one of `Mirrored`, `Tensor` and
        `IndexedSlices`.

    Returns:
      A list of `Tensor` or `IndexedSlices`.
    """
    if isinstance(value, ops.Tensor):
      return [value]
    elif isinstance(value, indexed_slices.IndexedSlices):
      return [value]
    elif isinstance(value, value_lib.Mirrored):
      return value.values
    else:
      raise ValueError("unwrap: unsupported input type: %s" % type(value))

  RunOptions = collections.namedtuple(  # pylint: disable=invalid-name
      "RunOptions",
      [
          "mode",  # A list of str from ["eager", "func_graph"]
          "num_processes",
          "gpus_per_process",
          "reduce_op",
          "communication",
      ])
  RunOptions.__new__.__defaults__ = (["eager", "func_graph"], 2, 0,
                                     ReduceOp.SUM, CollectiveCommunication.AUTO)
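
  # Usage sketch (not in the original test): only the fields under test need
  # to be overridden; the remaining fields fall back to the defaults declared
  # above, e.g.
  #   options = self.RunOptions(num_processes=2, reduce_op=ReduceOp.MEAN)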

  def reduce_and_verify(self, inputs, expect, options):
    """Reduce the given `inputs` and verify the output matches `expect`.

    Args:
      inputs: a list of `Tensor` or `IndexedSlices`, where i-th value will be
        fed to i-th replica.
      expect: a `Tensor` or `IndexedSlices`. This should be the expected value
        for one replica.
      options: a `RunOptions` instance.
    """

    def replica_fn():
      collective, devices, pid = self.make_collective(options.num_processes,
                                                      options.gpus_per_process,
                                                      options.communication)

      def reduce_fn():
        value_fn = lambda device_idx: inputs[pid * len(devices) + device_idx]
        per_replica_value = make_per_replica_value(value_fn, devices)
        reduced_values = collective.reduce(options.reduce_op, per_replica_value,
                                           per_replica_value)
        reduced_values = self.as_list(reduced_values)
        self.assertAllEqual(devices, [v.device for v in reduced_values])
        return [ops.convert_to_tensor(v) for v in reduced_values]

      per_replica_expect = [ops.convert_to_tensor(expect)] * len(devices)

      if "eager" in options.mode:
        got = reduce_fn()
        self.assertAllClose(got, per_replica_expect)

      if "func_graph" in options.mode:
        got = def_function.function(reduce_fn)()
        self.assertAllClose(got, per_replica_expect)

    get_global_mpr(options.num_processes).run(replica_fn)

  def batch_reduce_and_verify(self, inputs, expect, options):
    """Batch reduce the given `inputs` and verify the output matches `expect`.

    Args:
      inputs: a 2-level nested list of `Tensor` or `IndexedSlices`, where i-th
        value will be fed to i-th replica.
      expect: a list of `Tensor` or `IndexedSlices`. This should be the expected
        value for one replica.
      options: a `RunOptions` instance.
    """

    def replica_fn():
      collective, devices, pid = self.make_collective(options.num_processes,
                                                      options.gpus_per_process,
                                                      options.communication)

      def batch_reduce_fn():
        batch_size = len(inputs[0])
        value_dst_pairs = []
        for i in range(batch_size):

          def value_fn(device_idx, idx=i):
            return inputs[pid * len(devices) + device_idx][idx]

          per_replica_value = make_per_replica_value(value_fn, devices)
          value_dst_pairs.append((per_replica_value, per_replica_value))
        reduced_values = collective.batch_reduce(options.reduce_op,
                                                 value_dst_pairs)
        reduced_values = [self.as_list(v) for v in reduced_values]
        for v in reduced_values:
          self.assertAllEqual(devices, [t.device for t in v])
        return nest.map_structure(ops.convert_to_tensor, reduced_values)

      per_replica_expect = nest.map_structure(
          lambda x: [ops.convert_to_tensor(x)] * len(devices), expect)

      if "eager" in options.mode:
        got = batch_reduce_fn()
        self.assertAllClose(got, per_replica_expect)

      if "func_graph" in options.mode:
        got = def_function.function(batch_reduce_fn)()
        self.assertAllClose(got, per_replica_expect)

    get_global_mpr(options.num_processes).run(replica_fn)
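
  # Note: `make_per_replica_value` used in the helpers above is a module-level
  # helper that is not shown in this snippet; presumably it evaluates
  # value_fn(i) on devices[i] and wraps the results into a per-replica
  # distributed value.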

  @combinations.generate(
      combinations.combine(
          num_processes=[1, 2],
          required_gpus=[0, 1, 2],
          communication=[
              # NCCL is only used for batch reduce, so we are not including
              # NCCL combination here.
              CollectiveCommunication.AUTO,
              CollectiveCommunication.RING
          ],
          reduce_op=[ReduceOp.SUM, ReduceOp.MEAN]))
  def testAllReduceDense(self, num_processes, required_gpus, communication,
                         reduce_op):
    options = self.RunOptions(
        num_processes=num_processes,
        gpus_per_process=required_gpus,
        reduce_op=reduce_op,
        communication=communication)
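    # Total number of replicas: one per GPU per process, or one per process
    # when running on CPU (gpus_per_process == 0).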
    group_size = options.num_processes * (options.gpus_per_process or 1)

    inputs_data = [1.0, 2.0, 3.0, 4.0]
    inputs = inputs_data[0:group_size]

    if group_size == 1:
      expect = 1.0
    elif group_size == 2:
      expect = 3.0 if reduce_op == ReduceOp.SUM else 1.5
    elif group_size == 4:
      expect = 10.0 if reduce_op == ReduceOp.SUM else 2.5

    self.reduce_and_verify(inputs, expect, options)

  @combinations.generate(
      combinations.combine(
          num_processes=[1, 2],
          required_gpus=[0, 1, 2],
          communication=[
              # NCCL is only used for batch reduce, so we are not including
              # NCCL combination here.
              CollectiveCommunication.AUTO,
              CollectiveCommunication.RING
          ],
          # TODO(b/166682130): add MEAN reduce once the bug is fixed.
          reduce_op=ReduceOp.SUM))
  def testAllReduceSparse(self, num_processes, required_gpus, communication,
                          reduce_op):
    options = self.RunOptions(
        mode=["func_graph"],  # Sparse reduce is not supported in eager.
        num_processes=num_processes,
        gpus_per_process=required_gpus,
        reduce_op=reduce_op,
        communication=communication)
    group_size = options.num_processes * (options.gpus_per_process or 1)

    inputs_data = [
        indexed_slices.IndexedSlicesValue(
            values=[[1.], [2.]], indices=[0, 1], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[3.], [4.]], indices=[1, 2], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[5.], [6.]], indices=[7, 8], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[7.], [8.]], indices=[3, 2], dense_shape=[10, 1]),
    ]
    inputs = inputs_data[0:group_size]

    if group_size == 1:
      expect = indexed_slices.IndexedSlices(
          values=[[1.], [2.]], indices=[0, 1], dense_shape=[10, 1])
    elif group_size == 2:
      expect = indexed_slices.IndexedSlices(
          values=[[1.], [2.], [3.], [4.]],
          indices=[0, 1, 1, 2],
          dense_shape=[10, 1])
    elif group_size == 4:
      expect = indexed_slices.IndexedSlices(
          values=[[1.], [2.], [3.], [4.], [5.], [6.], [7.], [8.]],
          indices=[0, 1, 1, 2, 7, 8, 3, 2],
          dense_shape=[10, 1])

    self.reduce_and_verify(inputs, expect, options)

  def testAllReduceSparseVariableLength(self):
    # One device per process, 2 processes, 2 replicas in total.
    inputs = [
        indexed_slices.IndexedSlicesValue(
            values=[[1.]], indices=[0], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[2.], [3.], [4.]], indices=[0, 1, 2], dense_shape=[10, 1]),
    ]
    expect = indexed_slices.IndexedSlices(
        values=[[1.], [2.], [3.], [4.]],
        indices=[0, 0, 1, 2],
        dense_shape=[10, 1])
    self.reduce_and_verify(
        inputs,
        expect,
        self.RunOptions(
            mode=["func_graph"],  # Sparse reduce is not supported in eager.
            num_processes=2,
            reduce_op=ReduceOp.SUM))

  @combinations.generate(
      combinations.combine(
          num_processes=[1, 2],
          required_gpus=[0, 1, 2],
          communication=[
              CollectiveCommunication.AUTO, CollectiveCommunication.RING,
              CollectiveCommunication.NCCL
          ],
          reduce_op=[ReduceOp.SUM, ReduceOp.MEAN]))
  def testBatchAllReduceDense(self, num_processes, required_gpus, communication,
                              reduce_op):
    if required_gpus == 0 and communication == CollectiveCommunication.NCCL:
      self.skipTest("Skip CPU + NCCL combination")
    if num_processes == 2 and communication == CollectiveCommunication.NCCL:
      self.skipTest("Skip NCCL + 2 processes combination. NCCL requires "
                    "physical GPUs for every process.")

    options = self.RunOptions(
        num_processes=num_processes,
        gpus_per_process=required_gpus,
        reduce_op=reduce_op,
        communication=communication)
    group_size = options.num_processes * (options.gpus_per_process or 1)

    inputs_data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]
    inputs = inputs_data[0:group_size]

    if group_size == 1:
      expect = [1.0, 2.0]
    elif group_size == 2:
      expect = [4.0, 6.0] if reduce_op == ReduceOp.SUM else [2.0, 3.0]
    elif group_size == 4:
      expect = [16.0, 20.0] if reduce_op == ReduceOp.SUM else [4.0, 5.0]

    self.batch_reduce_and_verify(inputs, expect, options)

  @combinations.generate(
      combinations.combine(
          num_processes=[1, 2],
          required_gpus=[0, 1, 2],
          communication=[
              CollectiveCommunication.AUTO,
              CollectiveCommunication.RING,
              CollectiveCommunication.NCCL,
          ],
          # TODO(b/166682130): add MEAN reduce once the bug is fixed.
          reduce_op=ReduceOp.SUM))
  def testBatchAllReduceSparse(self, num_processes, required_gpus,
                               communication, reduce_op):
    if required_gpus == 0 and communication == CollectiveCommunication.NCCL:
      self.skipTest("Skip CPU + NCCL combination")
    if num_processes == 2 and communication == CollectiveCommunication.NCCL:
      self.skipTest("Skip NCCL + 2 processes combination. NCCL requires "
                    "physical GPUs for every process.")

    options = self.RunOptions(
        mode=["func_graph"],  # Sparse reduce is not supported in eager.
        num_processes=num_processes,
        gpus_per_process=required_gpus,
        reduce_op=reduce_op,
        communication=communication)
    group_size = options.num_processes * (options.gpus_per_process or 1)

    inputs_data = ([
        indexed_slices.IndexedSlicesValue(
            values=[[1.], [2.]], indices=[0, 1], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[3.], [4.]], indices=[1, 2], dense_shape=[5, 1])
    ], [
        indexed_slices.IndexedSlicesValue(
            values=[[5.], [6.]], indices=[1, 2], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[7.], [8.]], indices=[0, 1], dense_shape=[5, 1])
    ], [
        indexed_slices.IndexedSlicesValue(
            values=[[9.], [10.]], indices=[3, 4], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[11.], [12.]], indices=[3, 4], dense_shape=[5, 1])
    ], [
        indexed_slices.IndexedSlicesValue(
            values=[[13.], [14.]], indices=[8, 9], dense_shape=[10, 1]),
        indexed_slices.IndexedSlicesValue(
            values=[[15.], [16.]], indices=[3, 4], dense_shape=[5, 1])
    ])
    inputs = inputs_data[0:group_size]

    if group_size == 1:
      expect = [
          indexed_slices.IndexedSlices(
              values=[[1.], [2.]], indices=[0, 1], dense_shape=[10, 1]),
          indexed_slices.IndexedSlices(
              values=[[3.], [4.]], indices=[1, 2], dense_shape=[5, 1])
      ]
    elif group_size == 2:
      expect = [
          indexed_slices.IndexedSlices(
              values=[[1.], [2.], [5.], [6.]],
              indices=[0, 1, 1, 2],
              dense_shape=[10, 1]),
          indexed_slices.IndexedSlices(
              values=[[3.], [4.], [7.], [8.]],
              indices=[1, 2, 3, 4],
              dense_shape=[5, 1])
      ]
    elif group_size == 4:
      expect = [
          indexed_slices.IndexedSlices(
              values=[[1.], [2.], [5.], [6.], [9.], [10.], [13.], [14.]],
              indices=[0, 1, 1, 2, 3, 4, 8, 9],
              dense_shape=[10, 1]),
          indexed_slices.IndexedSlices(
              values=[[3.], [4.], [7.], [8.], [11.], [12.], [15.], [16.]],
              indices=[1, 2, 0, 1, 3, 4, 3, 4],
              dense_shape=[5, 1])
      ]

    self.batch_reduce_and_verify(inputs, expect, options)

  @combinations.generate(
      combinations.combine(
          num_processes=[1, 2],
          required_gpus=[0, 1, 2],
          axis=[0, 1, 2],
          func_mode=["eager", "func_graph"],
          communication=[
              CollectiveCommunication.NCCL,
              CollectiveCommunication.AUTO,
              CollectiveCommunication.RING
          ]))
  def testAllGatherSameShape(self, num_processes, required_gpus, communication,
                             func_mode, axis):

    def replica_fn():
      collective, devices, _ = self.make_collective(num_processes,
                                                    required_gpus,
                                                    communication)
      value = constant_op.constant([[[1, 2], [1, 2]]], dtype=dtypes.float32)

      def gather_fn():
        value_fn = lambda device_idx: value
        per_replica_value = make_per_replica_value(value_fn, devices)
        gathered_values = collective._gather(
            per_replica_value, per_replica_value, axis=axis)
        gathered_values = self.as_list(gathered_values)
        self.assertAllEqual(devices, [v.device for v in gathered_values])
        return [ops.convert_to_tensor(v) for v in gathered_values]

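      # All-gather of identical per-replica shapes is equivalent to
      # concatenating group_size copies of the input along `axis`.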
      group_size = num_processes * (required_gpus or 1)
      expect = array_ops.concat([value] * group_size, axis=axis)
      per_replica_expect = [ops.convert_to_tensor(expect)] * len(devices)

      if func_mode == "eager":
        result = gather_fn()
        self.assertAllClose(result, per_replica_expect)

      if func_mode == "func_graph":
        result = def_function.function(gather_fn)()
        self.assertAllClose(result, per_replica_expect)

    get_global_mpr(num_processes).run(replica_fn)
Example n. 57
def all_strategy_and_input_config_combinations():
  return (combinations.times(
      combinations.combine(
          distribution=all_strategies, cloning=[True, False]),
      eager_mode_test_configuration() + graph_mode_test_configuration()))
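
A hedged usage sketch (hypothetical test class): the crossed combinations
drive one parameterized test per generated entry; keys added by the input
configurations are absorbed via **kwargs.

class AllStrategyUsageSketch(test.TestCase, parameterized.TestCase):

  @combinations.generate(all_strategy_and_input_config_combinations())
  def testRunsEverywhere(self, distribution, cloning, **kwargs):
    # The mode key is consumed by the combinations framework; any remaining
    # keys (e.g. input-configuration flags) arrive in kwargs.
    with distribution.scope():
      pass  # hypothetical test body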
Example n. 58
      _, replica_local = _make_replica_local(
          variable_scope.VariableAggregation.SUM)
      converted = ops.internal_convert_to_tensor(replica_local, as_ref=False)
      self.assertIsInstance(converted, ops.Tensor)
      self.assertEqual(converted.dtype, replica_local.dtype)

      converted = ops.internal_convert_to_tensor(replica_local, as_ref=True)
      # Resource variables are converted to tensors as well when as_ref is True.
      self.assertIsInstance(converted, ops.Tensor)
      self.assertEqual(converted.dtype, replica_local.dtype)


@combinations.generate(
    combinations.combine(
        distribution=[
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
        ],
        mode=["graph", "eager"]))
class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase):

  def _assign_replica_local(self, devices, v, new):
    for d, var, n in zip(devices, v, new):
      with ops.device(d):
        self.evaluate(var.assign(n))

  def _save_return_saver(self, sess, var):
    saver = saver_lib.Saver(var_list=[var])
    test_dir = self.get_temp_dir()
    prefix = os.path.join(test_dir, "ckpt")
    return saver.save(sess, prefix), saver
Example n. 59
class KerasModelsTest(test.TestCase, parameterized.TestCase):
    @combinations.generate(
        combinations.combine(distribution=strategy_combinations.all_strategies,
                             mode=["eager"]))
    def test_single_keras_layer_experimental_run(self, distribution):
        dataset = self._get_dataset()
        input_iterator = iter(
            distribution.experimental_distribute_dataset(dataset))

        with distribution.scope():
            model = keras.layers.Dense(4, name="dense")

        @def_function.function
        def train_step(iterator):
            def step_fn(inputs):
                images, targets = inputs
                with backprop.GradientTape() as tape:
                    outputs = model(images)
                    loss = math_ops.reduce_sum(outputs - targets)
                grads = tape.gradient(loss, model.variables)
                return grads

            outputs = distribution.experimental_run_v2(step_fn,
                                                       args=(next(iterator), ))
            return nest.map_structure(distribution.experimental_local_results,
                                      outputs)

        train_step(input_iterator)

    @combinations.generate(
        combinations.combine(distribution=strategy_combinations.all_strategies,
                             mode=["eager"]))
    def test_keras_model_creation_experimental_run(self, distribution):
        dataset = self._get_dataset()
        input_iterator = iter(
            distribution.experimental_distribute_dataset(dataset))

        with distribution.scope():
            model = self._get_model()

        @def_function.function
        def train_step(iterator):
            def step_fn(inputs):
                images, targets = inputs
                with backprop.GradientTape() as tape:
                    outputs = model(images)
                    loss = math_ops.reduce_sum(outputs - targets)
                grads = tape.gradient(loss, model.variables)
                return grads

            outputs = distribution.experimental_run_v2(step_fn,
                                                       args=(next(iterator), ))
            return nest.map_structure(distribution.experimental_local_results,
                                      outputs)

        train_step(input_iterator)

    @combinations.generate(
        combinations.combine(distribution=strategy_combinations.all_strategies,
                             mode=["eager"]))
    def test_keras_model_optimizer_experimental_run(self, distribution):
        dataset = self._get_dataset()
        input_iterator = iter(
            distribution.experimental_distribute_dataset(dataset))

        with distribution.scope():
            model = self._get_model()
            optimizer = keras.optimizer_v2.rmsprop.RMSprop()

        @def_function.function
        def train_step(iterator):
            def step_fn(inputs):
                images, targets = inputs
                with backprop.GradientTape() as tape:
                    outputs = model(images)
                    loss = math_ops.reduce_sum(outputs - targets)
                grads = tape.gradient(loss, model.variables)
                optimizer.apply_gradients(zip(grads, model.variables))
                return loss

            outputs = distribution.experimental_run_v2(step_fn,
                                                       args=(next(iterator), ))
            return nest.map_structure(distribution.experimental_local_results,
                                      outputs)

        train_step(input_iterator)

    @combinations.generate(
        combinations.combine(distribution=strategy_combinations.all_strategies,
                             mode=["eager"]))
    def test_keras_subclass_model_optimizer_experimental_run(
            self, distribution):
        def get_subclass_model():
            class KerasSubclassModel(keras.Model):
                def __init__(self):
                    super(KerasSubclassModel, self).__init__()
                    self.l = keras.layers.Dense(4, name="dense")

                def call(self, x):
                    return self.l(x)

            return KerasSubclassModel()

        dataset = self._get_dataset()
        input_iterator = iter(
            distribution.experimental_distribute_dataset(dataset))

        with distribution.scope():
            model = get_subclass_model()
            optimizer = keras.optimizer_v2.rmsprop.RMSprop()

        @def_function.function
        def train_step(iterator):
            def step_fn(inputs):
                images, targets = inputs
                with backprop.GradientTape() as tape:
                    outputs = model(images)
                    loss = math_ops.reduce_sum(outputs - targets)
                grads = tape.gradient(loss, model.variables)
                optimizer.apply_gradients(zip(grads, model.variables))
                return loss

            outputs = distribution.experimental_run_v2(step_fn,
                                                       args=(next(iterator), ))
            return nest.map_structure(distribution.experimental_local_results,
                                      outputs)

        train_step(input_iterator)

    @combinations.generate(
        combinations.combine(distribution=strategy_combinations.all_strategies,
                             mode=["eager"]))
    def test_keras_model_optimizer_experimental_run_loop(self, distribution):
        dataset = self._get_dataset()
        input_iterator = iter(
            distribution.experimental_distribute_dataset(dataset))

        with distribution.scope():
            model = self._get_model()
            optimizer = keras.optimizer_v2.rmsprop.RMSprop()

        @def_function.function
        def train_step(iterator):
            def step_fn(inputs):
                images, targets = inputs
                with backprop.GradientTape() as tape:
                    outputs = model(images)
                    loss = math_ops.reduce_sum(outputs - targets)
                grads = tape.gradient(loss, model.variables)
                optimizer.apply_gradients(zip(grads, model.variables))
                return loss

            for _ in range(5):
                distribution.experimental_run_v2(step_fn,
                                                 args=(next(iterator), ))

        train_step(input_iterator)

    @combinations.generate(
        combinations.combine(distribution=strategy_combinations.all_strategies,
                             mode=["eager"]))
    def test_lstm(self, distribution):

        batch_size = 32

        def create_lstm_model():
            model = keras.models.Sequential()
            # The model only has LSTM variables, so missing-gradient issues are
            # easier to detect.
            model.add(
                keras.layers.LSTM(1,
                                  return_sequences=False,
                                  input_shape=(10, 1)))
            return model

        def create_lstm_data():
            seq_length = 10

            x_train = np.random.rand(batch_size, seq_length,
                                     1).astype("float32")
            y_train = np.random.rand(batch_size, 1).astype("float32")
            return x_train, y_train

        x, y = create_lstm_data()
        dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
        dataset = dataset.batch(batch_size, drop_remainder=True)
        input_iterator = iter(
            distribution.experimental_distribute_dataset(dataset))

        with distribution.scope():
            model = create_lstm_model()
            optimizer = keras.optimizer_v2.gradient_descent.SGD()

        @def_function.function
        def train_step(input_iterator):
            def step_fn(inputs):
                inps, targ = inputs
                with backprop.GradientTape() as tape:
                    output = model(inps)
                    loss = math_ops.reduce_mean(
                        keras.losses.binary_crossentropy(y_true=targ,
                                                         y_pred=output,
                                                         from_logits=False))
                grads = tape.gradient(loss, model.variables)
                optimizer.apply_gradients(zip(grads, model.variables))
                return loss

            outputs = distribution.experimental_run_v2(
                step_fn, args=(next(input_iterator), ))
            return distribution.experimental_local_results(outputs)

        train_step(input_iterator)

    @combinations.generate(
        combinations.combine(distribution=strategy_combinations.all_strategies,
                             mode=["eager"]))
    def test_nested_tf_functions(self, distribution):
        # The test builds two computations with keras layers: one with a nested
        # tf.function and one without. We run these computations independently
        # on models that start with the same weights, and make sure the
        # variables are still the same after one training step.

        inputs = np.random.random((10, 3)).astype(np.float32)
        targets = np.ones((10, 4), dtype=np.float32)
        dataset = dataset_ops.Dataset.from_tensor_slices(
            (inputs, targets)).repeat()
        dataset = dataset.batch(10, drop_remainder=True)
        input_iterator = iter(
            distribution.experimental_distribute_dataset(dataset))

        def get_model():
            x = keras.layers.Input(shape=(3, ), name="input")
            y = keras.layers.Dense(4, name="dense")(x)
            model = keras.Model(x, y)
            return model

        with distribution.scope():
            model = get_model()
            optimizer = keras.optimizer_v2.gradient_descent.SGD(0.1,
                                                                momentum=0.01)
            weights_file = os.path.join(self.get_temp_dir(), ".h5")
            model.save_weights(weights_file)
            model2 = get_model()
            model2.load_weights(weights_file)

        # Make sure model and model2 variables are in sync when initialized.
        for model_v, model2_v in zip(model.variables, model2.variables):
            self.assertAllClose(model_v.numpy(), model2_v.numpy())

        def compute_loss(images, targets):
            outputs = model(images)
            return math_ops.reduce_sum(outputs - targets)

        @def_function.function
        def train_step_without_nested_tf_function(inputs):
            def step_fn(inputs):
                images, targets = inputs
                with backprop.GradientTape() as tape:
                    loss = compute_loss(images, targets)
                grads = tape.gradient(loss, model.variables)
                optimizer.apply_gradients(zip(grads, model.variables))

            distribution.experimental_run_v2(step_fn, args=(inputs, ))

        @def_function.function
        def compute_loss2(images, targets):
            outputs = model2(images)
            return math_ops.reduce_sum(outputs - targets)

        @def_function.function
        def train_step_with_nested_tf_function(inputs):
            def step_fn(inputs):
                images, targets = inputs
                with backprop.GradientTape() as tape:
                    loss = compute_loss2(images, targets)
                grads = tape.gradient(loss, model2.variables)
                optimizer.apply_gradients(zip(grads, model2.variables))

            distribution.experimental_run_v2(step_fn, args=(inputs, ))

        inputs = next(input_iterator)

        train_step_without_nested_tf_function(inputs)
        train_step_with_nested_tf_function(inputs)

        # Make sure model and model2 variables are still in sync.
        for model_v, model2_v in zip(model.variables, model2.variables):
            self.assertAllClose(model_v.numpy(), model2_v.numpy())

    @combinations.generate(
        combinations.combine(distribution=strategy_combinations.all_strategies,
                             mode=["eager"]))
    def test_customized_tf_module_experimental_run(self, distribution):
        dataset = self._get_dataset()
        input_iterator = iter(
            distribution.experimental_distribute_dataset(dataset))

        with distribution.scope():
            model = CustomModel()

        @def_function.function
        def train_step(iterator):
            def step_fn(inputs):
                images, targets = inputs
                with backprop.GradientTape() as tape:
                    outputs = model(images)
                    loss = math_ops.reduce_sum(outputs - targets)
                grads = tape.gradient(loss, model.variables)
                return grads

            outputs = distribution.experimental_run_v2(step_fn,
                                                       args=(next(iterator), ))
            return nest.map_structure(distribution.experimental_local_results,
                                      outputs)

        train_step(input_iterator)

    def _get_dataset(self):
        inputs = np.zeros((10, 3), dtype=np.float32)
        targets = np.zeros((10, 4), dtype=np.float32)
        dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
        dataset = dataset.repeat(100)
        dataset = dataset.batch(10, drop_remainder=True)
        return dataset

    def _get_model(self):
        x = keras.layers.Input(shape=(3, ), name="input")
        y = keras.layers.Dense(4, name="dense")(x)
        model = keras.Model(x, y)
        return model
Example n. 60
class DistributedValuesTest(test.TestCase, parameterized.TestCase):
    @combinations.generate(
        combinations.combine(
            distribution=(strategy_combinations.all_strategies_minus_default +
                          strategy_combinations.multiworker_strategies),
            mode=["eager"]))
    def testMakeDistributedValueFromTensor(self, distribution):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")
        single_value = constant_op.constant(1)

        def value_fn(ctx):
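            # The ValueContext is unused: every replica receives the same
            # scalar tensor.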
            del ctx
            return single_value

        distributed_values = (
            distribution.experimental_distribute_values_from_function(value_fn)
        )
        self.assertAllEqual(
            ds_test_util.gather(distribution, distributed_values),
            constant_op.constant(1.,
                                 shape=(distribution.num_replicas_in_sync,)))

    @combinations.generate(
        combinations.combine(
            distribution=(strategy_combinations.all_strategies_minus_default +
                          strategy_combinations.multiworker_strategies),
            mode=["eager"]))
    def testMakeDistributedValueSingleNumpyArrayConstant(self, distribution):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")
        array_value = np.array([1., 2., 3.])

        def value_fn(ctx):
            del ctx
            return array_value

        distributed_values = (
            distribution.experimental_distribute_values_from_function(value_fn)
        )
        self.assertAllEqual(
            ds_test_util.gather(distribution, distributed_values).numpy(),
            [[1., 2., 3.]] * distribution.num_replicas_in_sync)

    @combinations.generate(
        combinations.combine(
            distribution=(strategy_combinations.all_strategies_minus_default +
                          strategy_combinations.multiworker_strategies),
            mode=["eager"]))
    def testMakeDistributedValueTupleConstant(self, distribution):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")
        tuple_value = (1., 2., 3.)

        def value_fn(ctx):
            del ctx
            return tuple_value

        distributed_values = (
            distribution.experimental_distribute_values_from_function(value_fn)
        )
        distributed_values = ds_test_util.gather(distribution,
                                                 distributed_values)

        # Expected output for 2 replicas:
        # ([1.0, 1.0], [2.0, 2.0], [3.0, 3.0])
        expected = tuple([v for i in range(distribution.num_replicas_in_sync)]
                         for v in tuple_value)
        self.assertAllEqual(distributed_values, expected)

    @combinations.generate(
        combinations.combine(
            distribution=(strategy_combinations.all_strategies_minus_default +
                          strategy_combinations.multiworker_strategies),
            mode=["eager"]))
    def testMakeDistributedValueNestedStructurePerReplica(self, distribution):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")
        tuple_value = (1., 2., 3.)

        def value_fn(ctx):
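            # Scale each tuple element by the replica id so each replica
            # receives a distinct tuple.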
            per_replica = []
            for val in tuple_value:
                per_replica.append(val * ctx.replica_id_in_sync_group)
            return tuple(per_replica)

        distributed_values = (
            distribution.experimental_distribute_values_from_function(value_fn)
        )
        distributed_values = ds_test_util.gather(distribution,
                                                 distributed_values)

        # Expected output for 2 replicas:
        # ([0.0, 1.0], [0.0, 2.0], [0.0, 3.0])
        expected = tuple(
            [v * i for i in range(distribution.num_replicas_in_sync)]
            for v in tuple_value)
        self.assertAllEqual(distributed_values, expected)

    # NOTE(priyag): Cannot test this with MultiWorkerMirroredStrategy because
    # collective ops do not support SparseTensors.
    @combinations.generate(
        combinations.combine(
            distribution=strategy_combinations.all_strategies_minus_default,
            mode=["eager"]))
    def testMakeDistributedValueSparseTensor(self, distribution):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")

        def value_fn(ctx):
            del ctx
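            # A 3x4 SparseTensor with nonzero entries at (0, 0) and (1, 2).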
            return sparse_tensor.SparseTensor(indices=[[0, 0], [1, 2]],
                                              values=[1, 2],
                                              dense_shape=[3, 4])

        distributed_values = (
            distribution.experimental_distribute_values_from_function(value_fn)
        )
        local_results = distribution.experimental_local_results(
            distributed_values)
        for i in range(distribution.num_replicas_in_sync):
            self.assertAllEqual(
                sparse_ops.sparse_tensor_to_dense(local_results[i]),
                [[1, 0, 0, 0], [0, 0, 2, 0], [0, 0, 0, 0]])

    @combinations.generate(
        combinations.combine(
            distribution=(strategy_combinations.all_strategies_minus_default +
                          strategy_combinations.multiworker_strategies),
            mode=["eager"]))
    def testMakeDistributedValueExtractFromArray(self, distribution):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")
        multiple_values = range(distribution.num_replicas_in_sync)

        def value_fn(ctx):
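            # Each replica picks out its own element by replica id.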
            return multiple_values[ctx.replica_id_in_sync_group]

        distributed_values = (
            distribution.experimental_distribute_values_from_function(value_fn)
        )
        distributed_values = ds_test_util.gather(distribution,
                                                 distributed_values)
        expected = range(distribution.num_replicas_in_sync)
        self.assertAllEqual(distributed_values, expected)

    @combinations.generate(
        combinations.combine(
            distribution=(strategy_combinations.all_strategies_minus_default +
                          strategy_combinations.multiworker_strategies),
            mode=["eager"]))
    def testMakeDistributedValueAndRun(self, distribution):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")

        @def_function.function
        def run():
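            # Both the per-replica values and the computation consuming them
            # are created inside a single tf.function.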
            multiple_values = range(distribution.num_replicas_in_sync)

            def value_fn(ctx):
                return multiple_values[ctx.replica_id_in_sync_group]

            distributed_values = (
                distribution.experimental_distribute_values_from_function(
                    value_fn))

            def computation(x):
                return math_ops.square(x)

            outputs = ds_test_util.gather(
                distribution,
                distribution.run(computation, args=(distributed_values,)))
            return outputs

        results = run()

        expected = [i**2 for i in range(distribution.num_replicas_in_sync)]
        self.assertAllEqual(results, expected)

    @combinations.generate(
        combinations.combine(distribution=[
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
            strategy_combinations.
            mirrored_strategy_with_two_gpus_no_merge_call,
            strategy_combinations.tpu_strategy,
            strategy_combinations.tpu_strategy_packed_var,
            strategy_combinations.central_storage_strategy_with_two_gpus,
        ] + strategy_combinations.multiworker_strategies,
                             mode=["eager"]))
    def testMakeDistributedValueDefaultDevicePlacement(self, distribution):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")

        def value_fn(ctx):
            del ctx
            return constant_op.constant(1.0)

        distributed_values = (
            distribution.experimental_distribute_values_from_function(value_fn)
        )
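        # The device a plain op lands on is the expected default placement for
        # values created without an explicit device.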
        default_device = array_ops.identity(constant_op.constant(1.0)).device
        for i in range(len(distribution.extended.worker_devices)):
            self.assertAllEqual(distributed_values._values[i].device,
                                default_device)

    @combinations.generate(
        combinations.combine(
            distribution=[
                strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
                strategy_combinations.
                mirrored_strategy_with_two_gpus_no_merge_call,
                strategy_combinations.tpu_strategy,
                strategy_combinations.tpu_strategy_packed_var,
                strategy_combinations.central_storage_strategy_with_two_gpus,
            ] + strategy_combinations.multiworker_strategies,
            mode=["eager"],
            op_type=[constant_op.constant, array_ops.identity]))
    def testMakeDistributedValueExplicitDevicePlacement(
            self, distribution, op_type):
        if not tf2.enabled():
            self.skipTest("Only V2 is supported.")
        worker_devices = distribution.extended.worker_devices

        def value_fn(ctx):
            # In a multi-client setup, worker_devices contains only the
            # devices local to this worker.
            worker_device_id = ctx.replica_id_in_sync_group % len(
                worker_devices)
            with ops.device(worker_devices[worker_device_id]):
                return op_type(1.0)

        distributed_values = (
            distribution.experimental_distribute_values_from_function(value_fn)
        )
        for i in range(len(distribution.extended.worker_devices)):
            self.assertAllEqual(distributed_values._values[i].device,
                                worker_devices[i])
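
# Usage sketch (not part of the original test file): a minimal, self-contained
# illustration of the API exercised by DistributedValuesTest, assuming a TF2
# installation where the public `tf` namespace is available.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

def value_fn(ctx):
    # Give each replica its own scalar, derived from its replica id.
    return tf.constant(float(ctx.replica_id_in_sync_group))

per_replica = strategy.experimental_distribute_values_from_function(value_fn)

# Run a per-replica computation over the values and inspect the local results.
squared = strategy.run(tf.square, args=(per_replica,))
print(strategy.experimental_local_results(squared))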