class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                    CrossDeviceOpsTestBase):

    worker_devices = [
        "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
    ]
    multi_worker_allreduce_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "MultiWorkerAllReduce",
                cross_device_ops_lib.MultiWorkerAllReduce(
                    worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
            combinations.NamedObject(
                "MultiWorkerAllReducePack",
                cross_device_ops_lib.MultiWorkerAllReduce(
                    worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
            combinations.NamedObject(
                "MultiWorkerAllReduceAggregation",
                cross_device_ops_lib.MultiWorkerAllReduce(
                    worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
            combinations.NamedObject(
                "MultiWorkerAllReduceMultipleSpecs",
                cross_device_ops_lib.MultiWorkerAllReduce(
                    worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                        ("xring", 2, -1)], 0, 0, 0)),
        ],
        distribution=[
            combinations.NamedDistribution(
                "MirroredCPU",
                lambda: mirrored_strategy.MirroredStrategy(["/device:CPU:0"]),
                required_gpus=0),
            combinations.NamedDistribution(
                "Mirrored1GPU",
                lambda: mirrored_strategy.MirroredStrategy(["/device:GPU:0"]),
                required_gpus=1),
            combinations.NamedDistribution(
                "Mirrored2GPUs",
                # pylint: disable=g-long-lambda
                lambda: mirrored_strategy.MirroredStrategy(
                    ["/device:GPU:0", "/device:GPU:1"]),
                required_gpus=2),
        ],
        mode=["graph"])

    @combinations.generate(multi_worker_allreduce_combinations)
    def testReductionAndBroadcast(self, cross_device_ops, distribution):
        distribution.configure(
            cluster_spec={
                "worker": [
                    "/job:worker/replica:0/task:0",
                    "/job:worker/replica:0/task:1"
                ]
            })
        with distribution.scope():
            self._testReductionAndBroadcast(cross_device_ops, distribution)


# Example #2

class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                    CrossDeviceOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_device_ops_lib.MultiWorkerAllReduce(worker_devices, 2,
                                                        ("pscpu/pscpu", 2, -1),
                                                        0, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_device_ops_lib.MultiWorkerAllReduce(worker_devices, 2,
                                                        ("pscpu/pscpu", 2, -1),
                                                        1, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_device_ops_lib.MultiWorkerAllReduce(worker_devices, 2,
                                                        ("pscpu/pscpu", 2, -1),
                                                        0, 100, 10)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                      ("xring", 2, -1)], 0, 0, 0)),
      ],
      devices=[
          [
              "/job:worker/replica:0/task:0/device:CPU:0",
              "/job:worker/replica:0/task:1/device:CPU:0"
          ],
          [
              "/job:worker/replica:0/task:0/device:GPU:0",
              "/job:worker/replica:0/task:1/device:GPU:0"
          ],
          [
              "/job:worker/replica:0/task:0/device:GPU:0",
              "/job:worker/replica:0/task:0/device:GPU:1",
              "/job:worker/replica:0/task:1/device:GPU:0",
              "/job:worker/replica:0/task:1/device:GPU:1"
          ],
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    # Mimic the default device of multi-worker strategies.
    with ops.device("/job:worker/replica:0/task:0"):
      self._testReductionAndBroadcast(cross_device_ops, devices)
# Tests that use this strategy should call set_virtual_cpus_to_at_least(3) in
# their setUp method (a usage sketch follows these strategy definitions).
mirrored_strategy_with_cpu_1_and_2 = combinations.NamedDistribution(
    "Mirrored2CPU",
    lambda: mirrored_lib.MirroredStrategy(["/cpu:1", "/cpu:2"]))
central_storage_strategy_with_two_gpus = combinations.NamedDistribution(
    "CentralStorage2GPUs",
    lambda: central_storage_strategy.CentralStorageStrategy._from_num_gpus(2),  # pylint: disable=protected-access
    required_gpus=2)
central_storage_strategy_with_gpu_and_cpu = combinations.NamedDistribution(
    "CentralStorageCPUAndGPU",
    lambda: central_storage_strategy.CentralStorageStrategy(
        ["/gpu:0", "/cpu:0"]),
    required_gpus=1)
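
# Hypothetical usage sketch, not part of the original module: a test that
# consumes the Mirrored2CPU distribution above. It assumes that
# `set_virtual_cpus_to_at_least` (named in the comment above) is available in
# this module; `test` and `parameterized` are imported here only for the sketch.
from absl.testing import parameterized
from tensorflow.python.eager import test


class _TwoVirtualCpuExampleTest(test.TestCase, parameterized.TestCase):

  def setUp(self):
    super(_TwoVirtualCpuExampleTest, self).setUp()
    # Expose /cpu:1 and /cpu:2 before combinations.generate builds the strategy.
    set_virtual_cpus_to_at_least(3)

  @combinations.generate(
      combinations.combine(
          distribution=[mirrored_strategy_with_cpu_1_and_2],
          mode=["graph", "eager"]))
  def testOneReplicaPerVirtualCpu(self, distribution):
    # MirroredStrategy(["/cpu:1", "/cpu:2"]) should report two replica devices.
    self.assertLen(distribution.extended.worker_devices, 2)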

gradient_descent_optimizer_v1_fn = combinations.NamedObject(
    "GradientDescentV1",
    lambda: gradient_descent.GradientDescentOptimizer(0.2))
adagrad_optimizer_v1_fn = combinations.NamedObject(
    "AdagradV1", lambda: adagrad.AdagradOptimizer(0.001))
adam_optimizer_v1_fn = combinations.NamedObject(
    "AdamV1", lambda: adam.AdamOptimizer(0.001, epsilon=1))
rmsprop_optimizer_v1_fn = combinations.NamedObject(
    "RmsPropV1", lambda: rmsprop.RMSPropOptimizer(0.001))

# TODO(shiningsun): consider adding the other v1 optimizers
optimizers_v1 = [gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn]

adadelta_optimizer_keras_v2_fn = combinations.NamedObject(
    "AdadeltaKerasV2", lambda: adadelta_keras_v2.Adadelta(0.001))
adagrad_optimizer_keras_v2_fn = combinations.NamedObject(
    "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001))


# Example #4

# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Strategy and optimizer combinations for combinations.combine()."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.distribute import combinations
from tensorflow.python.keras.distribute import simple_models

simple_functional_model = combinations.NamedObject(
    "SimpleFunctionalModel", simple_models.SimpleFunctionalModel())

simple_sequential_model = combinations.NamedObject(
    "SimpleSequentialModel", simple_models.SimpleSequentialModel())

simple_subclass_model = combinations.NamedObject(
    "SimpleSubclassModel", simple_models.SimpleSubclassModel())

simple_tfmodule_model = combinations.NamedObject(
    "SimpleTFModuleModel", simple_models.SimpleTFModuleModel())


# Example #5

class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject("DefaultReductionToOneDevice",
                                   cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    if isinstance(
        cross_device_ops._obj,  # pylint: disable=protected-access
        cross_device_ops_lib.AllReduceCrossDeviceOps
    ) and context.executing_eagerly():
      self.skipTest("b/149881884")
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use nccl if there is any CPU device.
    self.assertIsInstance(
        cross_device_ops_lib.select_cross_device_ops(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use nccl if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `select_cross_device_ops` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.select_cross_device_ops(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use nccl if nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use nccl if the nccl kernel is not found.
    with test.mock.patch.object(kernels,
                                "get_registered_kernels_for_op", lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"],
      required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps("ring"))
          ],
          batch_reduce=[True, False],
          mode=["graph", "eager"]))
  def testReduceDistributedVariable(self, distribution,
                                    cross_device_ops_instance, batch_reduce):
    with distribution.scope():
      v = variables.Variable(1.)
    if batch_reduce:
      result = cross_device_ops_instance.batch_reduce(reduce_util.ReduceOp.MEAN,
                                                      [(v, v)])[0]
    else:
      result = cross_device_ops_instance.reduce(reduce_util.ReduceOp.MEAN, v, v)
    for v in result.values:
      self.assertIsInstance(v, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertAllEqual(self.evaluate(result.values), [1.0, 1.0])
class OpCancellationTest(test.TestCase, parameterized.TestCase):
    def setUp(self):
        _setup_context()
        super().setUp()

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce',
                                         CollectiveOpsV1.all_reduce),
                combinations.NamedObject('all_reduce_v2',
                                         CollectiveOpsV2.all_reduce),
                combinations.NamedObject('all_gather',
                                         CollectiveOpsV1.all_gather),
                combinations.NamedObject('all_gather_v2',
                                         CollectiveOpsV2.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorNotAbortIfNoCollective(self, collective_op, device,
                                          communication):
        # Do not abort if there are no active collective ops. There could be
        # exceptions like EOF, which we expect users to catch; aborting collective
        # ops on all op errors would interfere with this workflow.
        dev0 = '/device:%s:0' % device
        dev1 = '/device:%s:1' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        dataset = dataset_ops.Dataset.from_tensors([1.])

        @def_function.function
        def collective_fn(in_tensor):
            for device in [dev0, dev1]:
                with ops.device(device):
                    collective_op(in_tensor,
                                  group_size,
                                  group_key,
                                  instance_key,
                                  communication_hint=communication)

        @def_function.function
        def f():
            iterator = iter(dataset)
            collective_fn(next(iterator))
            # This next(iterator) should raise EOF.
            collective_fn(next(iterator))

        with self.assertRaises(errors.OutOfRangeError):
            f()
        collective_fn(constant_op.constant([1.]))

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce',
                                         CollectiveOpsV1.all_reduce),
                combinations.NamedObject('all_gather',
                                         CollectiveOpsV1.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorAbortWithCollective(self, collective_op, device,
                                       communication):
        # Abort v1 collective ops if there are active collective ops at the time of
        # an op error. This is due to the inability to cancel collective ops; op
        # errors may cause running collective ops to hang.
        dev0 = '/device:%s:0' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])
        # Make the dataset sleep a while so that the collective is being executed
        # when the EOF happens.
        dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
            dataset_testing.sleep(sleep_microseconds=200))

        @def_function.function
        def f():
            # Launch a collective op that won't be able to finish to test abortion
            # when other ops error.
            with ops.device(dev0):
                ret = collective_op(in_tensor,
                                    group_size,
                                    group_key,
                                    instance_key,
                                    communication_hint=communication)
            iterator = iter(dataset)
            next(iterator)
            # This should raise EOF.
            next(iterator)
            return ret

        with self.assertRaises(errors.OutOfRangeError):
            f()
        # Now that collective ops are aborted, subsequent collective ops should fail
        # with the previous error.
        with self.assertRaises(errors.CancelledError):
            with ops.device(dev0):
                collective_op(in_tensor,
                              group_size,
                              group_key,
                              instance_key,
                              communication_hint=communication)

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce_v2',
                                         CollectiveOpsV2.all_reduce),
                combinations.NamedObject('all_gather_v2',
                                         CollectiveOpsV2.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorNotAbortWithCollective(self, collective_op, device,
                                          communication):
        # Do not abort v2 collective ops even if there are active collective ops at
        # the time of an op error. We rely on cancellation to terminate active
        # collective ops.
        dev0 = '/device:%s:0' % device
        dev1 = '/device:%s:1' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])

        @def_function.function
        def collective_fn():
            for device in [dev0, dev1]:
                with ops.device(device):
                    collective_op(in_tensor,
                                  group_size,
                                  group_key,
                                  instance_key,
                                  communication_hint=communication)

        # Local params resolution cannot be cancelled yet, so we perform a normal
        # collective so that the group is resolved.
        collective_fn()

        # Make the dataset sleep a while so that the collective is being executed
        # when the EOF happens.
        dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
            dataset_testing.sleep(sleep_microseconds=200))

        @def_function.function
        def f():
            # Launch a collective op that won't be able to finish to test cancellation
            # when other ops error.
            with ops.device(dev0):
                ret = collective_op(in_tensor,
                                    group_size,
                                    group_key,
                                    instance_key,
                                    communication_hint=communication)
            iterator = iter(dataset)
            next(iterator)
            # This should raise EOF.
            next(iterator)
            return ret

        with self.assertRaises(errors.OutOfRangeError):
            f()
        # Collective ops shouldn't be aborted and new collectives should be able to
        # proceed.
        collective_fn()

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce_v2',
                                         CollectiveOpsV2.all_reduce),
                combinations.NamedObject('all_gather_v2',
                                         CollectiveOpsV2.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testCancelDuringParamResolution(self, collective_op, device,
                                        communication):
        dev0 = '/device:%s:0' % device
        dev1 = '/device:%s:1' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])
        t1_cancellation_manager = cancellation.CancellationManager()
        t2_cancellation_manager = cancellation.CancellationManager()

        @def_function.function
        def _collective_fn(x):
            # Run an assertion to crash one of the two function executions running
            # collectives. We explicitly cancel the other in response.
            assert_op = check_ops.assert_equal(x, in_tensor)
            with ops.control_dependencies([assert_op]):
                return collective_op(in_tensor,
                                     group_size,
                                     group_key,
                                     instance_key,
                                     communication_hint=communication)

        collective_concrete = _collective_fn.get_concrete_function(in_tensor)

        finish_mu = threading.Lock()
        finishes = 0

        def _placement_wrapper(device, x, my_cancellation, other_cancellation):
            try:
                with ops.device(device):
                    cancelable_collective = my_cancellation.get_cancelable_function(
                        collective_concrete)
                    return cancelable_collective(x)
            except errors.InvalidArgumentError:
                # `assert_equal` failed for this execution of the function. The other
                # function would deadlock without cancellation.
                other_cancellation.start_cancel()
            except errors.CancelledError:
                pass
            nonlocal finishes
            with finish_mu:
                finishes += 1

        t1 = threading.Thread(target=_placement_wrapper,
                              args=(dev0, constant_op.constant([1.]),
                                    t1_cancellation_manager,
                                    t2_cancellation_manager))
        t2 = threading.Thread(
            target=_placement_wrapper,
            # Will cause the assertion to fail
            args=(dev1, constant_op.constant([2.]), t2_cancellation_manager,
                  t1_cancellation_manager))
        t1.start()
        t2.start()
        t1.join()
        t2.join()
        self.assertEqual(finishes, 2)


# Stray tail of a CollectiveOpsV2-style helper spliced into this listing;
# reconstructed here as a standalone sketch that promotes the scalar collective
# parameters to tensors before calling the v2 broadcast_recv op.
def _broadcast_recv_v2(shape, dtype, group_size, group_key, instance_key,
                       *args, **kwargs):
    group_size = array_ops.identity(group_size)
    group_key = array_ops.identity(group_key)
    instance_key = array_ops.identity(instance_key)
    shape = array_ops.identity(shape)
    return _collective_ops.broadcast_recv_v2(shape, dtype, group_size,
                                             group_key, instance_key,
                                             *args, **kwargs)


device_combination = (
    combinations.combine(device='CPU', communication='RING', required_gpus=0) +
    combinations.combine(
        device='GPU', communication=['RING', 'NCCL'], required_gpus=2))

collective_op_combinations = combinations.combine(collective_op=[
    combinations.NamedObject('all_reduce', CollectiveOpsV1.all_reduce),
    combinations.NamedObject('all_reduce_v2', CollectiveOpsV2.all_reduce),
    combinations.NamedObject('all_gather', CollectiveOpsV1.all_gather),
    combinations.NamedObject('all_gather_v2', CollectiveOpsV2.all_gather)
])
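
# Hypothetical illustration, not part of the original file: crossing
# collective_op_combinations with device_combination via combinations.times
# yields one case per (collective op wrapper, device type, communication hint),
# mirroring the per-method decorators used in OpCancellationTest above.
class _CollectiveOpCombinationsExampleTest(test.TestCase,
                                           parameterized.TestCase):

    @combinations.generate(
        combinations.times(collective_op_combinations, device_combination))
    def testInjectedParameters(self, collective_op, device, communication):
        self.assertIn(device, ('CPU', 'GPU'))
        self.assertIn(communication, ('RING', 'NCCL'))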


@combinations.generate(
    combinations.times(
        combinations.combine(collective_ops=[
            combinations.NamedObject('v1', CollectiveOpsV1),
            combinations.NamedObject('v2', CollectiveOpsV2)
        ],
                             mode='eager'), device_combination))
class CollectiveOpsTest(test.TestCase, parameterized.TestCase):
    def setUp(self):
        # The original listing is truncated here; this body mirrors
        # OpCancellationTest.setUp above.
        _setup_context()
        super().setUp()


# Example #8

class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject("DefaultReductionToOneDevice",
                                   cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10))
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use nccl if there is any CPU device.
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use nccl if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `choose_the_best` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use nccl if nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use nccl if the nccl kernel is not found.
    with test.mock.patch.object(kernels,
                                "get_registered_kernels_for_op", lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"],
      required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)
class OpCancellationTest(test.TestCase, parameterized.TestCase):
    def setUp(self):
        _setup_context()
        super().setUp()

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce',
                                         CollectiveOpsV1.all_reduce),
                combinations.NamedObject('all_reduce_v2',
                                         CollectiveOpsV2.all_reduce),
                combinations.NamedObject('all_gather',
                                         CollectiveOpsV1.all_gather),
                combinations.NamedObject('all_gather_v2',
                                         CollectiveOpsV2.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorNotAbortIfNoCollective(self, collective_op, device,
                                          communication):
        # Do not abort if there are no active collective ops. There could be
        # exceptions like EOF, which we expect users to catch; aborting collective
        # ops on all op errors would interfere with this workflow.
        dev0 = '/device:%s:0' % device
        dev1 = '/device:%s:1' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        dataset = dataset_ops.Dataset.from_tensors([1.])

        @def_function.function
        def collective_fn(in_tensor):
            for device in [dev0, dev1]:
                with ops.device(device):
                    collective_op(in_tensor,
                                  group_size,
                                  group_key,
                                  instance_key,
                                  communication_hint=communication)

        @def_function.function
        def f():
            iterator = iter(dataset)
            collective_fn(next(iterator))
            # This next(iterator) should raise EOF.
            collective_fn(next(iterator))

        with self.assertRaises(errors.OutOfRangeError):
            f()
        collective_fn(constant_op.constant([1.]))

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce',
                                         CollectiveOpsV1.all_reduce),
                combinations.NamedObject('all_gather',
                                         CollectiveOpsV1.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorAbortWithCollective(self, collective_op, device,
                                       communication):
        # Abort v1 collective ops if there are active collective ops at the time of
        # an op error. This is due to the inability to cancel collective ops; op
        # errors may cause running collective ops to hang.
        dev0 = '/device:%s:0' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])
        # Make the dataset sleep a while so that the collective is being executed
        # when the EOF happens.
        dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
            dataset_testing.sleep(sleep_microseconds=200))

        @def_function.function
        def f():
            # Launch a collective op that won't be able to finish to test abortion
            # when other ops error.
            with ops.device(dev0):
                ret = collective_op(in_tensor,
                                    group_size,
                                    group_key,
                                    instance_key,
                                    communication_hint=communication)
            iterator = iter(dataset)
            next(iterator)
            # This should raise EOF.
            next(iterator)
            return ret

        with self.assertRaises(errors.OutOfRangeError):
            f()
        # Now that collective ops are aborted, subsequent collective ops should fail
        # with the previous error.
        with self.assertRaises(errors.CancelledError):
            with ops.device(dev0):
                collective_op(in_tensor,
                              group_size,
                              group_key,
                              instance_key,
                              communication_hint=communication)

    @combinations.generate(
        combinations.times(
            combinations.combine(collective_op=[
                combinations.NamedObject('all_reduce_v2',
                                         CollectiveOpsV2.all_reduce),
                combinations.NamedObject('all_gather_v2',
                                         CollectiveOpsV2.all_gather),
            ],
                                 mode='eager'), device_combination))
    def testOpErrorNotAbortWithCollective(self, collective_op, device,
                                          communication):
        # Do not abort v2 collective ops even if there are active collective ops at
        # the time of an op error. We rely on cancellation to terminate active
        # collective ops.
        dev0 = '/device:%s:0' % device
        dev1 = '/device:%s:1' % device
        group_size = 2
        group_key = 100
        instance_key = 100
        in_tensor = constant_op.constant([1.])

        @def_function.function
        def collective_fn():
            for device in [dev0, dev1]:
                with ops.device(device):
                    collective_op(in_tensor,
                                  group_size,
                                  group_key,
                                  instance_key,
                                  communication_hint=communication)

        # Local params resolution cannot be cancelled yet, so we perform a normal
        # collective so that the group is resolved.
        collective_fn()

        # Make the dataset sleep a while so that the collective is being executed
        # when the EOF happens.
        dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
            dataset_testing.sleep(sleep_microseconds=200))

        @def_function.function
        def f():
            # Launch a collective op that won't be able to finish to test cancellation
            # when other ops error.
            with ops.device(dev0):
                ret = collective_op(in_tensor,
                                    group_size,
                                    group_key,
                                    instance_key,
                                    communication_hint=communication)
            iterator = iter(dataset)
            next(iterator)
            # This should raise EOF.
            next(iterator)
            return ret

        with self.assertRaises(errors.OutOfRangeError):
            f()
        # Collective ops shouldn't be aborted and new collectives should be able to
        # proceed.
        collective_fn()


# Example #10

class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
    # TODO(yuefengz): decouple the num_gpus check from distribution in
    # combinations module so that we can pass in devices instead of a distribution
    # strategy.
    reduction_to_one_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "DefaultReductionToOneDevice",
                cross_device_ops_lib.ReductionToOneDevice()),
            combinations.NamedObject(
                "ReductionToCPUDeviceCrossDeviceOps",
                cross_device_ops_lib.ReductionToOneDevice(
                    reduce_to_device=_cpu_device)),
            combinations.NamedObject(
                "AccumulateNCrossDeviceOp",
                cross_device_ops_lib.ReductionToOneDevice(
                    accumulation_fn=math_ops.accumulate_n)),
        ],
        distribution=[
            strategy_combinations.one_device_strategy,
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
            strategy_combinations.mirrored_strategy_with_two_gpus,
        ],
        mode=["graph", "eager"])
    allreduce_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "AllReduce",
                cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
            combinations.NamedObject(
                "AllReduceNoGradientRepacking",
                cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
            combinations.NamedObject("NcclAllReduce",
                                     cross_device_ops_lib.NcclAllReduce()),
            combinations.NamedObject(
                "HierarchicalCopy",
                cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
            combinations.NamedObject(
                "HierarchicalCopyAggregateSmallTensors",
                cross_device_ops_lib.AllReduceCrossDeviceOps(
                    "hierarchical_copy", 0, 100, 10))
        ],
        distribution=[
            strategy_combinations.mirrored_strategy_with_two_gpus,
        ],
        mode=["graph", "eager"])

    @combinations.generate(reduction_to_one_combinations +
                           allreduce_combinations)
    def testReductionAndBroadcast(self, cross_device_ops, distribution):
        with distribution.scope():
            self._testReductionAndBroadcast(cross_device_ops, distribution)

    def testChooseAlgorithm(self):
        device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                        [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
        self.assertEqual(result._num_packs, 8)

        # if there are only 4 devices
        device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "nccl")
        self.assertEqual(result._num_packs, 1)

        # if devices links contain each device itself
        device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
                        [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
                        [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
        self.assertEqual(result._num_packs, 8)

        # if not dgx1-like links
        device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
                        [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "nccl")
        self.assertEqual(result._num_packs, 1)

    @combinations.generate(
        combinations.combine(mode=["graph", "eager"], required_gpus=1))
    def testSimpleReduceWithIndexedSlices(self):
        devices = ["/cpu:0", "/gpu:0"]
        t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
        t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2],
                                  devices[1])
        per_replica = value_lib.PerReplica(value_lib.ReplicaDeviceMap(devices),
                                           (t0, t1))
        result = cross_device_ops_lib._simple_reduce(per_replica, devices[0],
                                                     math_ops.add_n,
                                                     reduce_util.ReduceOp.SUM)

        # Test that the result is semantically equal to both the concatenated
        # IndexedSlices with and without duplicate indices.
        total_with_dups = _make_indexed_slices([[1., 2.], [3., 4.], [5., 6.]],
                                               [1, 1, 3], [5, 2], devices[0])
        total_without_dups = _make_indexed_slices([[4., 6.], [5., 6.]], [1, 3],
                                                  [5, 2], devices[0])
        self._assert_indexed_slices_equal(total_with_dups, result)
        self._assert_indexed_slices_equal(total_without_dups, result)

    @combinations.generate(
        combinations.combine(
            cross_device_ops_instance=[
                combinations.NamedObject(
                    "ReductionToOneDevice",
                    cross_device_ops_lib.ReductionToOneDevice()),
                combinations.NamedObject(
                    "AllReduceCrossDeviceOps",
                    cross_device_ops_lib.AllReduceCrossDeviceOps())
            ],
            reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
            batch_reduce=[True, False],
            mode=["graph", "eager"],
            required_gpus=1))
    def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                   batch_reduce):
        devices = ["/cpu:0", "/gpu:0"]
        self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                         reduce_op, batch_reduce)


# Example #11

from tensorflow.python.eager import test
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import variables

mirrored_strategy_with_gpu_and_cpu = combinations.NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
    required_gpus=1)
mirrored_strategy_with_two_gpus = combinations.NamedDistribution(
    "Mirrored2GPUs",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]),
    required_gpus=2)

# pylint: disable=g-long-lambda
gradient_descent_optimizer_v2_fn = combinations.NamedObject(
    "GradientDescentV2",
    lambda: gradient_descent_v2.GradientDescentOptimizer(0.2))
adagrad_optimizer_v2_fn = combinations.NamedObject(
    "AdagradV2", lambda: adagrad_v2.AdagradOptimizer(0.001))

optimizers_v2 = [gradient_descent_optimizer_v2_fn, adagrad_optimizer_v2_fn]


def distributions_and_v2_optimizers():
    """DistributionStrategies and V2 Optimizers."""
    return combinations.combine(distribution=[
        strategy_combinations.one_device_strategy,
        strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
        strategy_combinations.mirrored_strategy_with_two_gpus,
    ],
                                optimizer_fn=optimizers_v2)
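

# Hypothetical usage, not part of the original snippet: the helper above is
# normally consumed through combinations.generate, which runs the decorated
# test once per (distribution, optimizer_fn) pair. `parameterized` is imported
# here only for the sketch.
from absl.testing import parameterized


class _DistributionOptimizerV2ExampleTest(test.TestCase,
                                          parameterized.TestCase):

    @combinations.generate(distributions_and_v2_optimizers())
    def testOptimizerCreationUnderScope(self, distribution, optimizer_fn):
        with distribution.scope():
            optimizer = optimizer_fn()
        self.assertIsNotNone(optimizer)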