def _initialize_multi_worker(self, devices):
        """Initializes the object for multi-worker training.

        Resolves and validates `devices`, groups them by job and task, records
        the first worker as the default device and host input device, builds
        the replica device map and input workers, and selects the cross-device
        ops to use.

        Args:
          devices: non-empty collection of device strings. Each entry is passed
            through `device_util.resolve`; the resolved values must be unique.

        Raises:
          AssertionError: if `devices` is empty or contains duplicates after
            resolution.
          IndexError: if no device belongs to the "chief" or "worker" job (the
            worker list is then empty).
          ValueError: if more than one worker is found and the configured
            cross-device ops is not a `MultiWorkerAllReduce`.
        """
        self._local_mode = False

        assert devices, "Must specify at least one device."
        devices = tuple(device_util.resolve(d) for d in devices)
        # `devices` is a tuple, so it has to be wrapped in a 1-tuple for `%`.
        # Without the wrapping each element is consumed as a separate format
        # argument and building the message raises TypeError as soon as there
        # are two or more devices -- which is always the case when a duplicate
        # has been detected.
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument: %s" % (devices,))
        # TODO(josh11b): Require at least 2 devices?

        device_dict = _group_device_list(devices)
        workers = []
        worker_devices = []
        # Only the "chief" and "worker" jobs are considered, chief first.
        for job in ("chief", "worker"):
            for task, task_devices in enumerate(device_dict.get(job, [])):
                worker = "/job:%s/task:%d" % (job, task)
                workers.append(worker)
                worker_devices.append((worker, task_devices))

        # Setting `_default_device` will add a device scope in the
        # distribution.scope. We set the default device to the first worker. When
        # users specify device under distribution.scope by
        #   with tf.device("/cpu:0"):
        #     ...
        # their ops will end up on the cpu device of its first worker, e.g.
        # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
        self._default_device = workers[0]
        self._host_input_device = numpy_dataset.SingleDevice(workers[0])

        self._device_map = values.ReplicaDeviceMap(devices)
        self._input_workers = input_lib.InputWorkers(self._device_map,
                                                     worker_devices)

        if len(workers) > 1:
            # With several workers the caller must have supplied a
            # `MultiWorkerAllReduce`; it is then used as-is.
            if not isinstance(self._cross_device_ops,
                              cross_device_ops_lib.MultiWorkerAllReduce):
                raise ValueError(
                    "In-graph multi-worker training with `MirroredStrategy` is not "
                    "supported.")
            self._inferred_cross_device_ops = self._cross_device_ops
        else:
            # TODO(yuefengz): make `choose_the_best` work with device strings
            # containing job names.
            self._inferred_cross_device_ops = cross_device_ops_lib.NcclAllReduce()
Esempio n. 2
0
    def _initialize_multi_worker(self, devices):
        """Configures this strategy for devices that live on remote workers.

        Groups `devices` by job and task via `_group_device_list`, records the
        first worker as the default device and host input device, stores the
        device tuple and the per-worker device pairs, and selects the
        cross-device ops to use.

        Args:
          devices: device strings; passed to `_group_device_list` and stored as
            a tuple on the instance.

        Raises:
          ValueError: if more than one worker is found and the configured
            cross-device ops is not a `ReductionToOneDevice` with at most one
            between-graph worker.
        """
        devices_by_job = _group_device_list(devices)
        worker_names = []
        worker_device_pairs = []
        # Chief tasks are listed before worker tasks.
        for job_name in ("chief", "worker"):
            per_task_devices = devices_by_job.get(job_name, [])
            for task_id, task_devices in enumerate(per_task_devices):
                worker_name = "/job:%s/task:%d" % (job_name, task_id)
                worker_names.append(worker_name)
                worker_device_pairs.append((worker_name, task_devices))

        # The first worker becomes `_default_device`, which adds a device scope
        # in the distribution.scope. A user-level
        #   with tf.device("/cpu:0"):
        #     ...
        # under distribution.scope therefore lands on the CPU of that first
        # worker, e.g. "/job:worker/task:0/device:CPU:0". Note this is not used
        # in replica mode.
        first_worker = worker_names[0]
        self._default_device = first_worker
        self._host_input_device = numpy_dataset.SingleDevice(first_worker)

        self._devices = tuple(devices)
        self._input_workers_devices = worker_device_pairs
        self._is_multi_worker_training = True

        if len(worker_names) <= 1:
            # TODO(yuefengz): make `select_cross_device_ops` work with device
            # strings containing job names.
            self._inferred_cross_device_ops = cross_device_ops_lib.NcclAllReduce()
        else:
            configured_ops = self._cross_device_ops
            # Grandfather usage in the legacy tests if they're configured
            # properly.
            legacy_setup_ok = (
                isinstance(configured_ops,
                           cross_device_ops_lib.ReductionToOneDevice)
                and not configured_ops._num_between_graph_workers > 1)  # pylint: disable=protected-access
            if not legacy_setup_ok:
                raise ValueError(
                    "In-graph multi-worker training with `MirroredStrategy` is not "
                    "supported.")
            self._inferred_cross_device_ops = configured_ops

        logging.info("Using MirroredStrategy with remote devices %r", devices)
Esempio n. 3
0
    def _initialize_multi_worker(self, devices):
        """Configures this strategy for devices that live on remote workers.

        Groups `devices` by job and task via `_group_device_list`, records the
        first worker as the default device and host input device, builds the
        replica device map and input workers, and selects the cross-device ops
        to use.

        Args:
          devices: device strings; passed to `_group_device_list` and to
            `values.ReplicaDeviceMap`.

        Raises:
          ValueError: if more than one worker is found and the configured
            cross-device ops is not a `MultiWorkerAllReduce`.
        """
        devices_by_job = _group_device_list(devices)
        worker_names = []
        worker_device_pairs = []
        # Chief tasks are listed before worker tasks.
        for job_name in ("chief", "worker"):
            per_task_devices = devices_by_job.get(job_name, [])
            for task_id, task_devices in enumerate(per_task_devices):
                worker_name = "/job:%s/task:%d" % (job_name, task_id)
                worker_names.append(worker_name)
                worker_device_pairs.append((worker_name, task_devices))

        # The first worker becomes `_default_device`, which adds a device scope
        # in the distribution.scope. A user-level
        #   with tf.device("/cpu:0"):
        #     ...
        # under distribution.scope therefore lands on the CPU of that first
        # worker, e.g. "/job:worker/task:0/device:CPU:0". Note this is not used
        # in replica mode.
        first_worker = worker_names[0]
        self._default_device = first_worker
        self._host_input_device = numpy_dataset.SingleDevice(first_worker)

        device_map = values.ReplicaDeviceMap(devices)
        self._device_map = device_map
        self._input_workers = input_lib.InputWorkers(device_map,
                                                     worker_device_pairs)
        self._is_multi_worker_training = True

        if len(worker_names) <= 1:
            # TODO(yuefengz): make `choose_the_best` work with device strings
            # containing job names.
            self._inferred_cross_device_ops = cross_device_ops_lib.NcclAllReduce()
        else:
            configured_ops = self._cross_device_ops
            # Several workers require a caller-supplied `MultiWorkerAllReduce`.
            if not isinstance(configured_ops,
                              cross_device_ops_lib.MultiWorkerAllReduce):
                raise ValueError(
                    "In-graph multi-worker training with `MirroredStrategy` is not "
                    "supported.")
            self._inferred_cross_device_ops = configured_ops

        logging.info("Using MirroredStrategy with remote devices %r", devices)
Esempio n. 4
0
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
  """Exercises the cross-device ops implementations on a single worker."""

  # Every `ReductionToOneDevice` flavour, crossed with CPU-only, mixed and
  # GPU-only device lists, in both graph and eager mode.
  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  # The all-reduce implementations, run on two GPUs only.
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0)),
          combinations.NamedObject(
              "NcclAllReduce",
              cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    # `AllReduceCrossDeviceOps` is skipped under eager execution (b/149881884).
    wrapped_ops = cross_device_ops._obj  # pylint: disable=protected-access
    is_all_reduce = isinstance(wrapped_ops,
                               cross_device_ops_lib.AllReduceCrossDeviceOps)
    if is_all_reduce and context.executing_eagerly():
      self.skipTest("b/149881884")
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    select_ops = cross_device_ops_lib.select_cross_device_ops

    # Not use nccl if there is any cpu device.
    self.assertIsInstance(select_ops(["/cpu:0"]),
                          cross_device_ops_lib.ReductionToOneDevice)

    # Not use nccl if requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `select_cross_device_ops` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.select_cross_device_ops(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    # The remaining checks need at least one GPU.
    if context.num_gpus() < 1:
      return

    gpu_devices = ["/gpu:0"]

    def only_nccl_registered(op):
      return [object] if op == "NcclAllReduce" else []

    def nothing_registered(_):
      return []

    # (fake kernel lookup, expected selection), checked in this order:
    # nccl is used when its kernel is found, and not used otherwise.
    expectations = (
        (only_nccl_registered, cross_device_ops_lib.NcclAllReduce),
        (nothing_registered, cross_device_ops_lib.ReductionToOneDevice),
    )
    for fake_lookup, expected_type in expectations:
      with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                  fake_lookup):
        self.assertIsInstance(select_ops(gpu_devices), expected_type)

  @combinations.generate(
      combinations.combine(mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    cpu, gpu = "/cpu:0", "/gpu:0"
    cpu_slices = _make_indexed_slices([[1., 2.]], [1], [5, 2], cpu)
    gpu_slices = _make_indexed_slices(
        [[3., 4.], [5., 6.]], [1, 3], [5, 2], gpu)
    replicated = value_lib.PerReplica((cpu_slices, gpu_slices))
    reduced = cross_device_ops_lib._simple_reduce(
        replicated, cpu, math_ops.add_n, reduce_util.ReduceOp.SUM)

    # The result must be semantically equal to the concatenated IndexedSlices
    # both with and without duplicate indices.
    expected_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], cpu)
    expected_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], cpu)
    for expected in (expected_with_dups, expected_without_dups):
      self._assert_indexed_slices_equal(expected, reduced)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps()),
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    # Delegates to the shared helper with a fixed CPU + GPU device pair.
    self._testIndexedSlicesAllReduce(
        ["/cpu:0", "/gpu:0"], cross_device_ops_instance, reduce_op,
        batch_reduce)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps("ring")),
          ],
          batch_reduce=[True, False],
          mode=["graph", "eager"]))
  def testReduceDistributedVariable(self, distribution,
                                    cross_device_ops_instance, batch_reduce):
    with distribution.scope():
      variable = variables.Variable(1.)

    # Reduce the variable onto itself, through either the batched or the
    # single-value entry point.
    if batch_reduce:
      reduced = cross_device_ops_instance.batch_reduce(
          reduce_util.ReduceOp.MEAN, [(variable, variable)])[0]
    else:
      reduced = cross_device_ops_instance.reduce(
          reduce_util.ReduceOp.MEAN, variable, variable)

    # Every component of the result must be a tensor.
    for component in reduced.values:
      self.assertIsInstance(component, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertAllEqual(self.evaluate(reduced.values), [1.0, 1.0])
Esempio n. 5
0
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
  """Exercises the cross-device ops implementations on a single worker."""

  # Every `ReductionToOneDevice` flavour, crossed with CPU-only, mixed and
  # GPU-only device lists, in both graph and eager mode.
  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  # The all-reduce implementations, run on two GPUs only.
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject(
              "NcclAllReduce",
              cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    # Thin wrapper: the shared helper on the base class does the checking.
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    choose = cross_device_ops_lib.choose_the_best

    # Not use nccl if there is any cpu device.
    self.assertIsInstance(choose(["/cpu:0"]),
                          cross_device_ops_lib.ReductionToOneDevice)

    # Not use nccl if requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `choose_the_best` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    # The remaining checks need at least one GPU.
    if context.num_gpus() < 1:
      return

    gpu_devices = ["/gpu:0"]

    def only_nccl_registered(op):
      return [object] if op == "NcclAllReduce" else []

    def nothing_registered(_):
      return []

    # (fake kernel lookup, expected selection), checked in this order:
    # nccl is used when its kernel is found, and not used otherwise.
    expectations = (
        (only_nccl_registered, cross_device_ops_lib.NcclAllReduce),
        (nothing_registered, cross_device_ops_lib.ReductionToOneDevice),
    )
    for fake_lookup, expected_type in expectations:
      with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                  fake_lookup):
        self.assertIsInstance(choose(gpu_devices), expected_type)

  @combinations.generate(
      combinations.combine(mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    cpu, gpu = "/cpu:0", "/gpu:0"
    cpu_slices = _make_indexed_slices([[1., 2.]], [1], [5, 2], cpu)
    gpu_slices = _make_indexed_slices(
        [[3., 4.], [5., 6.]], [1, 3], [5, 2], gpu)
    replicated = value_lib.PerReplica((cpu_slices, gpu_slices))
    reduced = cross_device_ops_lib._simple_reduce(
        replicated, cpu, math_ops.add_n, reduce_util.ReduceOp.SUM)

    # The result must be semantically equal to the concatenated IndexedSlices
    # both with and without duplicate indices.
    expected_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], cpu)
    expected_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], cpu)
    for expected in (expected_with_dups, expected_without_dups):
      self._assert_indexed_slices_equal(expected, reduced)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps()),
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    # Delegates to the shared helper with a fixed CPU + GPU device pair.
    self._testIndexedSlicesAllReduce(
        ["/cpu:0", "/gpu:0"], cross_device_ops_instance, reduce_op,
        batch_reduce)
Esempio n. 6
0
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
    """Exercises the cross-device ops implementations on a single worker."""

    # TODO(yuefengz): decouple the num_gpus check from distribution in
    # combinations module so that we can pass in devices instead of a distribution
    # strategy.

    # Every `ReductionToOneDevice` flavour, crossed with one-device, GPU+CPU
    # and two-GPU strategies, in both graph and eager mode.
    reduction_to_one_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "DefaultReductionToOneDevice",
                cross_device_ops_lib.ReductionToOneDevice()),
            combinations.NamedObject(
                "ReductionToCPUDeviceCrossDeviceOps",
                cross_device_ops_lib.ReductionToOneDevice(
                    reduce_to_device=_cpu_device)),
            combinations.NamedObject(
                "AccumulateNCrossDeviceOp",
                cross_device_ops_lib.ReductionToOneDevice(
                    accumulation_fn=math_ops.accumulate_n)),
        ],
        distribution=[
            strategy_combinations.one_device_strategy,
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
            strategy_combinations.mirrored_strategy_with_two_gpus,
        ],
        mode=["graph", "eager"])

    # The all-reduce implementations, run with the two-GPU strategy only.
    allreduce_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "AllReduce",
                cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
            combinations.NamedObject(
                "AllReduceNoGradientRepacking",
                cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
            combinations.NamedObject(
                "NcclAllReduce",
                cross_device_ops_lib.NcclAllReduce()),
            combinations.NamedObject(
                "HierarchicalCopy",
                cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
            combinations.NamedObject(
                "HierarchicalCopyAggregateSmallTensors",
                cross_device_ops_lib.AllReduceCrossDeviceOps(
                    "hierarchical_copy", 0, 100, 10)),
        ],
        distribution=[
            strategy_combinations.mirrored_strategy_with_two_gpus,
        ],
        mode=["graph", "eager"])

    @combinations.generate(reduction_to_one_combinations +
                           allreduce_combinations)
    def testReductionAndBroadcast(self, cross_device_ops, distribution):
        # The shared helper runs inside the strategy's scope.
        with distribution.scope():
            self._testReductionAndBroadcast(cross_device_ops, distribution)

    def testChooseAlgorithm(self):
        # Each case is (device_links, expected algorithm, expected num_packs);
        # the cases are checked in the order listed.
        cases = (
            ([[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
              [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]],
             "hierarchical_copy", 8),
            # if there are only 4 devices
            ([[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]],
             "nccl", 1),
            # if devices links contain each device itself
            ([[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
              [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
              [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]],
             "hierarchical_copy", 8),
            # if not dgx1-like links
            ([[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
              [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]],
             "nccl", 1),
        )
        for device_links, expected_alg, expected_num_packs in cases:
            chosen = cross_device_ops_lib._choose_all_reduce_algorithm(
                device_links)
            self.assertIsInstance(
                chosen, cross_device_ops_lib.AllReduceCrossDeviceOps)
            self.assertEqual(chosen._all_reduce_alg, expected_alg)
            self.assertEqual(chosen._num_packs, expected_num_packs)

    @combinations.generate(
        combinations.combine(mode=["graph", "eager"], required_gpus=1))
    def testSimpleReduceWithIndexedSlices(self):
        device_names = ["/cpu:0", "/gpu:0"]
        cpu, gpu = device_names
        cpu_slices = _make_indexed_slices([[1., 2.]], [1], [5, 2], cpu)
        gpu_slices = _make_indexed_slices(
            [[3., 4.], [5., 6.]], [1, 3], [5, 2], gpu)
        device_map = value_lib.ReplicaDeviceMap(device_names)
        replicated = value_lib.PerReplica(device_map, (cpu_slices, gpu_slices))
        reduced = cross_device_ops_lib._simple_reduce(
            replicated, cpu, math_ops.add_n, reduce_util.ReduceOp.SUM)

        # The result must be semantically equal to the concatenated
        # IndexedSlices both with and without duplicate indices.
        expected_with_dups = _make_indexed_slices(
            [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], cpu)
        expected_without_dups = _make_indexed_slices(
            [[4., 6.], [5., 6.]], [1, 3], [5, 2], cpu)
        for expected in (expected_with_dups, expected_without_dups):
            self._assert_indexed_slices_equal(expected, reduced)

    @combinations.generate(
        combinations.combine(
            cross_device_ops_instance=[
                combinations.NamedObject(
                    "ReductionToOneDevice",
                    cross_device_ops_lib.ReductionToOneDevice()),
                combinations.NamedObject(
                    "AllReduceCrossDeviceOps",
                    cross_device_ops_lib.AllReduceCrossDeviceOps()),
            ],
            reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
            batch_reduce=[True, False],
            mode=["graph", "eager"],
            required_gpus=1))
    def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                   batch_reduce):
        # Delegates to the shared helper with a fixed CPU + GPU device pair.
        self._testIndexedSlicesAllReduce(
            ["/cpu:0", "/gpu:0"], cross_device_ops_instance, reduce_op,
            batch_reduce)