Example #1
    def __init__(self, container_strategy, cluster_resolver,
                 variable_partitioner):
        """Initialization of ParameterServerStrategyV2Extended."""
        super(ParameterServerStrategyV2Extended,
              self).__init__(container_strategy)
        self._num_ps = len(cluster_resolver.cluster_spec().as_dict().get(
            "ps", []))
        self._num_workers = len(cluster_resolver.cluster_spec().as_dict().get(
            "worker", []))
        self._variable_count = 0

        self._variable_partitioner = variable_partitioner
        # The following two attrs are to verify that `ParameterServerStrategy`
        # methods are properly used with a `ClusterCoordinator`.
        self._used_with_coordinator = False
        self._being_scheduled = False
        self._set_num_gpus()
        distribute_lib.distribution_strategy_replica_gauge.get_cell(
            "num_gpus_per_worker").set(self._num_gpus_per_worker)

        # Don't canonicalize the devices here since this code is executed on Chief,
        # but we want the reduce evaluation to be done on each worker. Placer will
        # automatically choose the right device based on current context.
        # TODO(ishark): Use select_cross_device_ops instead.
        self._cross_device_ops = cross_device_ops_lib.ReductionToOneDevice(
            reduce_to_device="/device:CPU:0")
        self._cross_device_ops._canonicalize_devices = False  # pylint: disable=protected-access
        self._allow_run_without_coordinator = False
        self._coordinator_creation_lock = threading.Lock()
Example #2
  def __init__(self,
               container_strategy,
               cluster_resolver=TFConfigClusterResolver()):
    super(ParameterServerStrategyExtended, self).__init__(container_strategy)
    self._initialize_strategy(cluster_resolver)

    # We typically don't need to do all-reduce in this strategy.
    self._cross_device_ops = (
        cross_device_ops_lib.ReductionToOneDevice(reduce_to_device=_LOCAL_CPU))
Example #3
  def _get_cross_device_ops(self, value):
    if self._collective_ops_in_use:
      if isinstance(value, values.DistributedValues):
        value_int32 = True in {
            dtypes.as_dtype(v.dtype) == dtypes.int32 for v in value.values
        }
      else:
        value_int32 = dtypes.as_dtype(value.dtype) == dtypes.int32
      if value_int32:
        return cross_device_ops_lib.ReductionToOneDevice()

    return self._cross_device_ops or self._inferred_cross_device_ops
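
A minimal companion sketch using only the public TF 2.x API (tf.distribute.MirroredStrategy together with tf.distribute.ReductionToOneDevice, the public counterpart of the internal cross_device_ops_lib.ReductionToOneDevice used above). The single-CPU device list is an assumption so the sketch runs without GPUs; it is not taken from any of the examples.

import tensorflow as tf

# Route every cross-replica reduction of this strategy through one device.
strategy = tf.distribute.MirroredStrategy(
    devices=["/cpu:0"],  # assumption: one CPU device keeps this runnable anywhere
    cross_device_ops=tf.distribute.ReductionToOneDevice(
        reduce_to_device="/cpu:0"))

def replica_fn():
  # Each replica contributes a constant; the reduce below sums the contributions.
  return tf.constant(2.0)

per_replica = strategy.run(replica_fn)
# This reduce is carried out by the ReductionToOneDevice instance configured above.
total = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None)
print(float(total))  # 2.0
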
Example #4
    def __init__(self,
                 container_strategy,
                 cluster_resolver=None,
                 compute_devices=None,
                 parameter_device=None):
        super(ParameterServerStrategyExtended,
              self).__init__(container_strategy)
        self._initialize_strategy(cluster_resolver=cluster_resolver,
                                  compute_devices=compute_devices,
                                  parameter_device=parameter_device)

        # We typically don't need to do all-reduce in this strategy.
        self._cross_device_ops = (cross_device_ops_lib.ReductionToOneDevice(
            reduce_to_device=_LOCAL_CPU))
Example #5
  def get_values(value):
    if not isinstance(value, values.DistributedValues):
      # This function handles reducing values that are not PerReplica or
      # Mirrored values. For example, the same value could be present on all
      # replicas in which case `value` would be a single value or value could
      # be 0.
      return cross_device_ops_lib.reduce_non_distributed_value(
          reduce_op, value, destinations, self._num_replicas_in_sync)
    if self._use_merge_call() and self._collective_ops_in_use and ((
        not cross_device_ops_lib._devices_match(value, destinations) or  # pylint: disable=protected-access
        any("cpu" in d.lower()
            for d in cross_device_ops_lib.get_devices_from(destinations)))):
      return cross_device_ops_lib.ReductionToOneDevice().reduce(
          reduce_op, value, destinations)
    return self._get_cross_device_ops(value).reduce(
        reduce_op,
        value,
        destinations=destinations,
        options=self._communication_options.merge(options))
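
A hypothetical standalone sketch of the two fallback conditions coded in the snippets above; the helper name falls_back_to_one_device is illustrative and not part of TensorFlow. int32 values and destinations that include a CPU device are reduced with ReductionToOneDevice instead of NCCL-backed collectives.

import tensorflow as tf

def falls_back_to_one_device(replica_values, destination_devices):
  # Mirrors the dtype check in `_get_cross_device_ops` and the CPU-destination
  # check in `get_values` above.
  has_int32 = any(v.dtype == tf.int32 for v in replica_values)
  has_cpu_destination = any("cpu" in d.lower() for d in destination_devices)
  return has_int32 or has_cpu_destination

print(falls_back_to_one_device([tf.constant(1), tf.constant(2)],
                               ["/gpu:0", "/gpu:1"]))            # True: int32 values
print(falls_back_to_one_device([tf.constant(1.0)], ["/cpu:0"]))  # True: CPU destination
print(falls_back_to_one_device([tf.constant(1.0)], ["/gpu:0"]))  # False
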
Example #6
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject("DefaultReductionToOneDevice",
                                   cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    if isinstance(
        cross_device_ops._obj,  # pylint: disable=protected-access
        cross_device_ops_lib.AllReduceCrossDeviceOps
    ) and context.executing_eagerly():
      self.skipTest("b/149881884")
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use nccl if there is any CPU device.
    self.assertIsInstance(
        cross_device_ops_lib.select_cross_device_ops(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use nccl if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `select_cross_device_ops` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.select_cross_device_ops(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use nccl if nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use nccl if no nccl kernel is found.
    with test.mock.patch.object(kernels,
                                "get_registered_kernels_for_op", lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"],
      required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps("ring"))
          ],
          batch_reduce=[True, False],
          mode=["graph", "eager"]))
  def testReduceDistributedVariable(self, distribution,
                                    cross_device_ops_instance, batch_reduce):
    with distribution.scope():
      v = variables.Variable(1.)
    if batch_reduce:
      result = cross_device_ops_instance.batch_reduce(reduce_util.ReduceOp.MEAN,
                                                      [(v, v)])[0]
    else:
      result = cross_device_ops_instance.reduce(reduce_util.ReduceOp.MEAN, v, v)
    for v in result.values:
      self.assertIsInstance(v, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertAllEqual(self.evaluate(result.values), [1.0, 1.0])
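
A minimal sketch, with plain TF ops and no distribution strategy, of the property that testSimpleReduceWithIndexedSlices checks above: an IndexedSlices result with duplicate indices and one with the duplicates pre-summed densify to the same tensor. The variable names are illustrative only.

import tensorflow as tf

with_dups = tf.IndexedSlices(
    values=tf.constant([[1., 2.], [3., 4.], [5., 6.]]),
    indices=tf.constant([1, 1, 3]),
    dense_shape=tf.constant([5, 2]))
without_dups = tf.IndexedSlices(
    values=tf.constant([[4., 6.], [5., 6.]]),
    indices=tf.constant([1, 3]),
    dense_shape=tf.constant([5, 2]))

# tf.scatter_nd accumulates rows that land on the same index, which is exactly
# the "duplicate indices" semantics the test relies on.
dense_a = tf.scatter_nd(tf.expand_dims(with_dups.indices, 1),
                        with_dups.values, with_dups.dense_shape)
dense_b = tf.scatter_nd(tf.expand_dims(without_dups.indices, 1),
                        without_dups.values, without_dups.dense_shape)
print(bool(tf.reduce_all(dense_a == dense_b)))  # True
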
Example #7
  def _make_cross_device_ops(self, num_gpus_per_worker):
    return cross_device_ops_lib.ReductionToOneDevice()
Example #8
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject("DefaultReductionToOneDevice",
                                   cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10))
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use nccl if there is any CPU device.
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use nccl if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `choose_the_best` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use nccl if nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use nccl if no nccl kernel is found.
    with test.mock.patch.object(kernels,
                                "get_registered_kernels_for_op", lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"],
      required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)
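
A hypothetical summary, in plain Python, of the selection rule that the two testChooseAlgorithm tests above (select_cross_device_ops in Example #6, choose_the_best in Example #8) exercise; the names pick_cross_device_ops and nccl_kernel_registered are illustrative, not TensorFlow API. NCCL all-reduce is preferred only when every requested device is a GPU and an NCCL kernel is registered; otherwise the choice falls back to ReductionToOneDevice.

def pick_cross_device_ops(devices, nccl_kernel_registered):
  all_gpu = all("gpu" in d.lower() for d in devices)
  if all_gpu and nccl_kernel_registered:
    return "NcclAllReduce"
  return "ReductionToOneDevice"

print(pick_cross_device_ops(["/cpu:0"], nccl_kernel_registered=True))   # ReductionToOneDevice
print(pick_cross_device_ops(["/gpu:0"], nccl_kernel_registered=True))   # NcclAllReduce
print(pick_cross_device_ops(["/gpu:0"], nccl_kernel_registered=False))  # ReductionToOneDevice
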
Example #9
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
    # TODO(yuefengz): decouple the num_gpus check from distribution in
    # combinations module so that we can pass in devices instead of a distribution
    # strategy.
    reduction_to_one_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "DefaultReductionToOneDevice",
                cross_device_ops_lib.ReductionToOneDevice()),
            combinations.NamedObject(
                "ReductionToCPUDeviceCrossDeviceOps",
                cross_device_ops_lib.ReductionToOneDevice(
                    reduce_to_device=_cpu_device)),
            combinations.NamedObject(
                "AccumulateNCrossDeviceOp",
                cross_device_ops_lib.ReductionToOneDevice(
                    accumulation_fn=math_ops.accumulate_n)),
        ],
        distribution=[
            strategy_combinations.one_device_strategy,
            strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
            strategy_combinations.mirrored_strategy_with_two_gpus,
        ],
        mode=["graph", "eager"])
    allreduce_combinations = combinations.combine(
        cross_device_ops=[
            combinations.NamedObject(
                "AllReduce",
                cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
            combinations.NamedObject(
                "AllReduceNoGradientRepacking",
                cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
            combinations.NamedObject("NcclAllReduce",
                                     cross_device_ops_lib.NcclAllReduce()),
            combinations.NamedObject(
                "HierarchicalCopy",
                cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
            combinations.NamedObject(
                "HierarchicalCopyAggregateSmallTensors",
                cross_device_ops_lib.AllReduceCrossDeviceOps(
                    "hierarchical_copy", 0, 100, 10))
        ],
        distribution=[
            strategy_combinations.mirrored_strategy_with_two_gpus,
        ],
        mode=["graph", "eager"])

    @combinations.generate(reduction_to_one_combinations +
                           allreduce_combinations)
    def testReductionAndBroadcast(self, cross_device_ops, distribution):
        with distribution.scope():
            self._testReductionAndBroadcast(cross_device_ops, distribution)

    def testChooseAlgorithm(self):
        device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                        [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
        self.assertEqual(result._num_packs, 8)

        # if there are only 4 devices
        device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "nccl")
        self.assertEqual(result._num_packs, 1)

        # if the device links contain each device itself
        device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
                        [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
                        [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
        self.assertEqual(result._num_packs, 8)

        # if not dgx1-like links
        device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
                        [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
        result = cross_device_ops_lib._choose_all_reduce_algorithm(
            device_links)
        self.assertIsInstance(result,
                              cross_device_ops_lib.AllReduceCrossDeviceOps)
        self.assertEqual(result._all_reduce_alg, "nccl")
        self.assertEqual(result._num_packs, 1)

    @combinations.generate(
        combinations.combine(mode=["graph", "eager"], required_gpus=1))
    def testSimpleReduceWithIndexedSlices(self):
        devices = ["/cpu:0", "/gpu:0"]
        t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
        t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2],
                                  devices[1])
        per_replica = value_lib.PerReplica(value_lib.ReplicaDeviceMap(devices),
                                           (t0, t1))
        result = cross_device_ops_lib._simple_reduce(per_replica, devices[0],
                                                     math_ops.add_n,
                                                     reduce_util.ReduceOp.SUM)

        # Test that the result is semantically equal to both the concatenated
        # IndexedSlices with and without duplicate indices.
        total_with_dups = _make_indexed_slices([[1., 2.], [3., 4.], [5., 6.]],
                                               [1, 1, 3], [5, 2], devices[0])
        total_without_dups = _make_indexed_slices([[4., 6.], [5., 6.]], [1, 3],
                                                  [5, 2], devices[0])
        self._assert_indexed_slices_equal(total_with_dups, result)
        self._assert_indexed_slices_equal(total_without_dups, result)

    @combinations.generate(
        combinations.combine(
            cross_device_ops_instance=[
                combinations.NamedObject(
                    "ReductionToOneDevice",
                    cross_device_ops_lib.ReductionToOneDevice()),
                combinations.NamedObject(
                    "AllReduceCrossDeviceOps",
                    cross_device_ops_lib.AllReduceCrossDeviceOps())
            ],
            reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
            batch_reduce=[True, False],
            mode=["graph", "eager"],
            required_gpus=1))
    def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                   batch_reduce):
        devices = ["/cpu:0", "/gpu:0"]
        self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                         reduce_op, batch_reduce)
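
A hypothetical sketch of the topology heuristic that the last testChooseAlgorithm above exercises; is_dgx1_like, pick_all_reduce_alg and the hard-coded reference links are assumptions reconstructed from the test data, not TensorFlow's actual implementation. Eight devices whose per-device link lists match the DGX-1 pattern get "hierarchical_copy" with 8 packs; anything else falls back to "nccl" with 1 pack.

# Reference DGX-1 link pattern, taken verbatim from the first test case above.
_DGX1_LINKS = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
               [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]

def is_dgx1_like(device_links):
  # Compare links per device; a device listing itself is ignored, which matches
  # the "device links contain each device itself" case in the test above.
  if len(device_links) != len(_DGX1_LINKS):
    return False
  return all(set(links) - {i} == set(_DGX1_LINKS[i])
             for i, links in enumerate(device_links))

def pick_all_reduce_alg(device_links):
  return ("hierarchical_copy", 8) if is_dgx1_like(device_links) else ("nccl", 1)

print(pick_all_reduce_alg(_DGX1_LINKS))      # ('hierarchical_copy', 8)
print(pick_all_reduce_alg(_DGX1_LINKS[:4]))  # ('nccl', 1)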