Example #1
    def _configure(self,
                   session_config=None,
                   cluster_spec=None,
                   task_type=None,
                   task_id=None):
        del task_type, task_id

        if session_config:
            session_config.isolate_session_state = True

        if cluster_spec:
            self._initialize_multi_worker(self._num_gpus, cluster_spec)

        if self._cross_device_ops is None:
            if self._cluster_spec:
                # We currently cannot detect the topology of remote workers, so
                # we hard-code the multi-worker all-reduce algorithm for now.
                if len(self._workers) == 1:
                    # The default is "nccl".
                    self._cross_device_ops = (
                        cross_device_ops_lib.AllReduceCrossDeviceOps())
                else:
                    # The default is hierarchical reduce and broadcast.
                    self._cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
                        self._workers, self._num_gpus)
            else:
                self._cross_device_ops = cross_device_ops_lib.choose_the_best(
                    self._devices, session_config=session_config)
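
For context, the user-facing analogue of the selection logic in _configure is passing an explicit cross-device op to the strategy instead of relying on choose_the_best. A minimal sketch, assuming the public tf.distribute API and two visible GPUs (not part of the snippet above):

import tensorflow as tf

# Minimal sketch: choose the cross-device reduction explicitly instead of
# letting the strategy pick one automatically. Assumes two visible GPUs.
strategy = tf.distribute.MirroredStrategy(
    devices=["/gpu:0", "/gpu:1"],
    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce(num_packs=8))

with strategy.scope():
    v = tf.Variable(1.0)
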
Example #2
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject("DefaultReductionToOneDevice",
                                   cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    if isinstance(
        cross_device_ops._obj,  # pylint: disable=protected-access
        cross_device_ops_lib.AllReduceCrossDeviceOps
    ) and context.executing_eagerly():
      self.skipTest("b/149881884")
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use NCCL if there is any CPU device.
    self.assertIsInstance(
        cross_device_ops_lib.select_cross_device_ops(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use NCCL if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `select_cross_device_ops` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.select_cross_device_ops(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use NCCL if an NCCL kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use NCCL if no NCCL kernel is found.
    with test.mock.patch.object(kernels,
                                "get_registered_kernels_for_op", lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"],
      required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps("ring"))
          ],
          batch_reduce=[True, False],
          mode=["graph", "eager"]))
  def testReduceDistributedVariable(self, distribution,
                                    cross_device_ops_instance, batch_reduce):
    with distribution.scope():
      v = variables.Variable(1.)
    if batch_reduce:
      result = cross_device_ops_instance.batch_reduce(reduce_util.ReduceOp.MEAN,
                                                      [(v, v)])[0]
    else:
      result = cross_device_ops_instance.reduce(reduce_util.ReduceOp.MEAN, v, v)
    for v in result.values:
      self.assertIsInstance(v, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertAllEqual(self.evaluate(result.values), [1.0, 1.0])
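
As a standalone illustration of what testSimpleReduceWithIndexedSlices checks, the sketch below (public TensorFlow API only, not taken from the test itself) shows that rows sharing an index are summed when the concatenated IndexedSlices are densified:

import tensorflow as tf

# Concatenate the two IndexedSlices from the test and densify them;
# the duplicate index 1 is merged by summation.
t0 = tf.IndexedSlices(values=tf.constant([[1., 2.]]),
                      indices=tf.constant([1]),
                      dense_shape=tf.constant([5, 2]))
t1 = tf.IndexedSlices(values=tf.constant([[3., 4.], [5., 6.]]),
                      indices=tf.constant([1, 3]),
                      dense_shape=tf.constant([5, 2]))
values = tf.concat([t0.values, t1.values], axis=0)
indices = tf.concat([t0.indices, t1.indices], axis=0)
dense = tf.math.unsorted_segment_sum(values, indices, t0.dense_shape[0])
# dense[1] == [4., 6.] and dense[3] == [5., 6.], matching total_with_dups
# and total_without_dups in the test above.
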
Example #3
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject("DefaultReductionToOneDevice",
                                   cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10))
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use NCCL if there is any CPU device.
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use NCCL if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `choose_the_best` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use NCCL if an NCCL kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use NCCL if no NCCL kernel is found.
    with test.mock.patch.object(kernels,
                                "get_registered_kernels_for_op", lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"],
      required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)
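
The test.mock.patch.object calls in testChooseAlgorithm are plain unittest.mock underneath. A minimal sketch of the same pattern, reusing the kernels and cross_device_ops_lib names the test imports (the import paths shown in comments are assumptions, since the snippet omits its imports):

from unittest import mock

# Assumed imports, matching the names used in the test above:
# from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
# from tensorflow.python.framework import kernels

def fake_kernels(op):
  # Pretend an NCCL kernel is registered for NcclAllReduce only.
  return [object] if op == "NcclAllReduce" else []

# Assumes at least one visible GPU, like the guarded part of the test.
with mock.patch.object(kernels, "get_registered_kernels_for_op", fake_kernels):
  chosen = cross_device_ops_lib.choose_the_best(["/gpu:0"])
  assert isinstance(chosen, cross_device_ops_lib.NcclAllReduce)
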
Example #4
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
    # TODO(yuefengz): decouple the num_gpus check from distribution in
    # combinations module so that we can pass in devices instead of a distribution
    # strategy.
    reduction_to_one_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "DefaultReductionToOneDevice",
                cross_device_ops_lib.ReductionToOneDevice()),
            combinations.NamedObject(
                "ReductionToCPUDeviceCrossDeviceOps",
                cross_device_ops_lib.ReductionToOneDevice(
                    reduce_to_device=_cpu_device)),
            combinations.NamedObject(
                "AccumulateNCrossDeviceOp",
                cross_device_ops_lib.ReductionToOneDevice(
                    accumulation_fn=math_ops.accumulate_n)),
        ],
        distribution=[
            strategy_combinations.one_device_strategy,
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
            strategy_combinations.mirrored_strategy_with_two_gpus,
        ],
        mode=["graph", "eager"])
    allreduce_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "AllReduce",
                cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
            combinations.NamedObject(
                "AllReduceNoGradientRepacking",
                cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
            combinations.NamedObject("NcclAllReduce",
                                     cross_device_ops_lib.NcclAllReduce()),
            combinations.NamedObject(
                "HierarchicalCopy",
                cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
            combinations.NamedObject(
                "HierarchicalCopyAggregateSmallTensors",
                cross_device_ops_lib.AllReduceCrossDeviceOps(
                    "hierarchical_copy", 0, 100, 10))
        ],
        distribution=[
            strategy_combinations.mirrored_strategy_with_two_gpus,
        ],
        mode=["graph", "eager"])

    @combinations.generate(reduction_to_one_combinations +
                           allreduce_combinations)
    def testReductionAndBroadcast(self, cross_device_ops, distribution):
        with distribution.scope():
            self._testReductionAndBroadcast(cross_device_ops, distribution)

    def testChooseAlgorithm(self):
        device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                        [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
        self.assertEqual(result._num_packs, 8)

        # if there are only 4 devices
        device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "nccl")
        self.assertEqual(result._num_packs, 1)

        # if device links contain each device itself
        device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
                        [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
                        [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
        self.assertEqual(result._num_packs, 8)

        # if the links are not DGX-1-like
        device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
                        [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "nccl")
        self.assertEqual(result._num_packs, 1)

    @combinations.generate(
        combinations.combine(mode=["graph", "eager"], required_gpus=1))
    def testSimpleReduceWithIndexedSlices(self):
        devices = ["/cpu:0", "/gpu:0"]
        t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
        t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2],
                                  devices[1])
        per_replica = value_lib.PerReplica(value_lib.ReplicaDeviceMap(devices),
                                           (t0, t1))
        result = cross_device_ops_lib._simple_reduce(per_replica, devices[0],
                                                     math_ops.add_n,
                                                     reduce_util.ReduceOp.SUM)

        # Test that the result is semantically equal to both the concatenated
        # IndexedSlices with and without duplicate indices.
        total_with_dups = _make_indexed_slices([[1., 2.], [3., 4.], [5., 6.]],
                                               [1, 1, 3], [5, 2], devices[0])
        total_without_dups = _make_indexed_slices([[4., 6.], [5., 6.]], [1, 3],
                                                  [5, 2], devices[0])
        self._assert_indexed_slices_equal(total_with_dups, result)
        self._assert_indexed_slices_equal(total_without_dups, result)

    @combinations.generate(
        combinations.combine(
            cross_device_ops_instance=[
                combinations.NamedObject(
                    "ReductionToOneDevice",
                    cross_device_ops_lib.ReductionToOneDevice()),
                combinations.NamedObject(
                    "AllReduceCrossDeviceOps",
                    cross_device_ops_lib.AllReduceCrossDeviceOps())
            ],
            reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
            batch_reduce=[True, False],
            mode=["graph", "eager"],
            required_gpus=1))
    def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                   batch_reduce):
        devices = ["/cpu:0", "/gpu:0"]
        self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                         reduce_op, batch_reduce)
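
This older variant still uses math_ops.accumulate_n and value_lib.ReplicaDeviceMap, which the later examples replace with math_ops.add_n and a plain PerReplica tuple. For reference, a minimal sketch of constructing the same cross-device ops through the public tf.distribute API (an assumption about the public surface, not part of the snippet):

import tensorflow as tf

# Public-API counterparts of the ops exercised in these tests.
reduce_to_cpu = tf.distribute.ReductionToOneDevice(
    reduce_to_device="/cpu:0", accumulation_fn=tf.math.add_n)
nccl = tf.distribute.NcclAllReduce(num_packs=1)
hierarchical = tf.distribute.HierarchicalCopyAllReduce(num_packs=8)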