def __init__(self, container_strategy, cluster_resolver, variable_partitioner):
  """Initialization of ParameterServerStrategyV2Extended.

  Args:
    container_strategy: passed through to the base-class initializer.
    cluster_resolver: object whose `cluster_spec().as_dict()` result is
      queried for its "ps" and "worker" entries.
    variable_partitioner: stored unchanged on `self._variable_partitioner`.
  """
  super(ParameterServerStrategyV2Extended, self).__init__(container_strategy)
  # Resolve the cluster spec a single time so that both task counts are
  # derived from the same snapshot and the resolver is not queried twice.
  cluster_jobs = cluster_resolver.cluster_spec().as_dict()
  self._num_ps = len(cluster_jobs.get("ps", []))
  self._num_workers = len(cluster_jobs.get("worker", []))
  self._variable_count = 0
  self._variable_partitioner = variable_partitioner

  # The following two attrs are to verify that `ParameterServerStrategy`
  # methods are properly used with a `ClusterCoordinator`.
  self._used_with_coordinator = False
  self._being_scheduled = False
  self._set_num_gpus()
  distribute_lib.distribution_strategy_replica_gauge.get_cell(
      "num_gpus_per_worker").set(self._num_gpus_per_worker)

  # Don't canonicalize the devices here since this code is executed on Chief,
  # but we want the reduce evaluation to be done on each worker. Placer will
  # automatically choose the right device based on current context.
  # TODO(ishark): Use select_cross_device_ops instead.
  self._cross_device_ops = cross_device_ops_lib.ReductionToOneDevice(
      reduce_to_device="/device:CPU:0")
  self._cross_device_ops._canonicalize_devices = False  # pylint: disable=protected-access
  self._allow_run_without_coordinator = False
  self._coordinator_creation_lock = threading.Lock()
def __init__(self, container_strategy, cluster_resolver=None):
  """Initialization of ParameterServerStrategyExtended.

  Args:
    container_strategy: passed through to the base-class initializer.
    cluster_resolver: resolver handed to `_initialize_strategy`. When omitted
      (or `None`), a fresh `TFConfigClusterResolver()` is created for this
      instance.
  """
  super(ParameterServerStrategyExtended, self).__init__(container_strategy)
  # A default written as `cluster_resolver=TFConfigClusterResolver()` is
  # evaluated once, when the function is defined, so every strategy built
  # without an explicit resolver would share that single object. Build the
  # default per call instead.
  if cluster_resolver is None:
    cluster_resolver = TFConfigClusterResolver()
  self._initialize_strategy(cluster_resolver)

  # We typically don't need to do all-reduce in this strategy.
  self._cross_device_ops = (
      cross_device_ops_lib.ReductionToOneDevice(reduce_to_device=_LOCAL_CPU))
def _get_cross_device_ops(self, value):
  """Pick the cross-device ops object to use for reducing `value`.

  When `self._collective_ops_in_use` is truthy and `value` (or any of its
  per-replica components) has dtype int32, a new `ReductionToOneDevice` is
  returned. In every other case the result is `self._cross_device_ops`,
  falling back to `self._inferred_cross_device_ops` when the former is falsy.
  """
  if self._collective_ops_in_use:
    if not isinstance(value, values.DistributedValues):
      has_int32 = dtypes.as_dtype(value.dtype) == dtypes.int32
    else:
      # One flag per component; membership of `True` means at least one
      # component is int32.
      component_flags = {
          dtypes.as_dtype(component.dtype) == dtypes.int32
          for component in value.values
      }
      has_int32 = True in component_flags
    if has_int32:
      return cross_device_ops_lib.ReductionToOneDevice()

  return self._cross_device_ops or self._inferred_cross_device_ops
def __init__(self,
             container_strategy,
             cluster_resolver=None,
             compute_devices=None,
             parameter_device=None):
  """Initialization of ParameterServerStrategyExtended.

  Args:
    container_strategy: passed through to the base-class initializer.
    cluster_resolver: forwarded by keyword to `_initialize_strategy`.
    compute_devices: forwarded by keyword to `_initialize_strategy`.
    parameter_device: forwarded by keyword to `_initialize_strategy`.
  """
  super(ParameterServerStrategyExtended, self).__init__(container_strategy)
  strategy_kwargs = {
      "cluster_resolver": cluster_resolver,
      "compute_devices": compute_devices,
      "parameter_device": parameter_device,
  }
  self._initialize_strategy(**strategy_kwargs)

  # We typically don't need to do all-reduce in this strategy.
  local_cpu_reducer = cross_device_ops_lib.ReductionToOneDevice(
      reduce_to_device=_LOCAL_CPU)
  self._cross_device_ops = local_cpu_reducer
def get_values(value):
  """Reduce `value` to `destinations` using names from the enclosing scope.

  Relies on `self`, `reduce_op`, `destinations` and `options` being bound in
  the surrounding function.
  """
  if not isinstance(value, values.DistributedValues):
    # This function handles reducing values that are not PerReplica or
    # Mirrored values. For example, the same value could be present on all
    # replicas in which case `value` would be a single value or value could
    # be 0.
    return cross_device_ops_lib.reduce_non_distributed_value(
        reduce_op, value, destinations, self._num_replicas_in_sync)

  if self._use_merge_call() and self._collective_ops_in_use:
    # Only reached with merge-call and collectives both active; then a device
    # mismatch or any CPU destination routes through ReductionToOneDevice.
    devices_differ = not cross_device_ops_lib._devices_match(  # pylint: disable=protected-access
        value, destinations)
    if devices_differ or any(
        "cpu" in device.lower()
        for device in cross_device_ops_lib.get_devices_from(destinations)):
      return cross_device_ops_lib.ReductionToOneDevice().reduce(
          reduce_op, value, destinations)

  reducer = self._get_cross_device_ops(value)
  return reducer.reduce(
      reduce_op,
      value,
      destinations=destinations,
      options=self._communication_options.merge(options))
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
  """Cross-device ops tests parameterized over explicit device lists."""

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0)),
          combinations.NamedObject(
              "NcclAllReduce",
              cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    """Runs the shared reduction/broadcast check for each ops/devices pair."""
    wraps_all_reduce = isinstance(
        cross_device_ops._obj,  # pylint: disable=protected-access
        cross_device_ops_lib.AllReduceCrossDeviceOps)
    if wraps_all_reduce and context.executing_eagerly():
      self.skipTest("b/149881884")
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    """Checks which ops class `select_cross_device_ops` returns."""
    # Not use nccl if there is any cpu device.
    self.assertIsInstance(
        cross_device_ops_lib.select_cross_device_ops(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Not use nccl if requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `select_cross_device_ops` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.select_cross_device_ops(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    gpu_devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      return [object] if op == "NcclAllReduce" else []

    expectations = (
        # Use nccl if nccl kernel is found.
        (mock_get_registered_kernels_for_op,
         cross_device_ops_lib.NcclAllReduce),
        # Not use nccl if nccl kernel is not found.
        (lambda _: [], cross_device_ops_lib.ReductionToOneDevice),
    )
    for kernel_lookup, expected_class in expectations:
      with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                  kernel_lookup):
        self.assertIsInstance(
            cross_device_ops_lib.select_cross_device_ops(gpu_devices),
            expected_class)

  @combinations.generate(
      combinations.combine(mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    """Sums two IndexedSlices placed on different devices."""
    cpu, gpu = "/cpu:0", "/gpu:0"
    cpu_slices = _make_indexed_slices([[1., 2.]], [1], [5, 2], cpu)
    gpu_slices = _make_indexed_slices(
        [[3., 4.], [5., 6.]], [1, 3], [5, 2], gpu)
    replicated = value_lib.PerReplica((cpu_slices, gpu_slices))
    reduced = cross_device_ops_lib._simple_reduce(
        replicated, cpu, math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    expected_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], cpu)
    expected_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], cpu)
    self._assert_indexed_slices_equal(expected_with_dups, reduced)
    self._assert_indexed_slices_equal(expected_without_dups, reduced)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps()),
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    """Runs the shared IndexedSlices all-reduce check on CPU plus one GPU."""
    cpu_and_gpu = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(cpu_and_gpu, cross_device_ops_instance,
                                     reduce_op, batch_reduce)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps("ring")),
          ],
          batch_reduce=[True, False],
          mode=["graph", "eager"]))
  def testReduceDistributedVariable(self, distribution,
                                    cross_device_ops_instance, batch_reduce):
    """MEAN-reduces a variable created under the strategy scope onto itself."""
    # NOTE(review): the flattened source does not show which statements sit
    # inside `distribution.scope()`; only the variable creation is kept in
    # the scope here -- confirm against the upstream file.
    with distribution.scope():
      variable = variables.Variable(1.)
    mean = reduce_util.ReduceOp.MEAN
    if batch_reduce:
      result = cross_device_ops_instance.batch_reduce(
          mean, [(variable, variable)])[0]
    else:
      result = cross_device_ops_instance.reduce(mean, variable, variable)
    for component in result.values:
      self.assertIsInstance(component, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertAllEqual(self.evaluate(result.values), [1.0, 1.0])
def _make_cross_device_ops(self, num_gpus_per_worker):
  """Return a default-constructed `ReductionToOneDevice`.

  `num_gpus_per_worker` is accepted but not consulted by this implementation.
  """
  reducer = cross_device_ops_lib.ReductionToOneDevice()
  return reducer
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
  """Cross-device ops tests parameterized over explicit device lists."""

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject(
              "NcclAllReduce",
              cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    """Runs the shared reduction/broadcast check for each ops/devices pair."""
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    """Checks which ops class `choose_the_best` returns."""
    # Not use nccl if there is any cpu device.
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Not use nccl if requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `choose_the_best` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    gpu_devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      return [object] if op == "NcclAllReduce" else []

    expectations = (
        # Use nccl if nccl kernel is found.
        (mock_get_registered_kernels_for_op,
         cross_device_ops_lib.NcclAllReduce),
        # Not use nccl if nccl kernel is not found.
        (lambda _: [], cross_device_ops_lib.ReductionToOneDevice),
    )
    for kernel_lookup, expected_class in expectations:
      with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                  kernel_lookup):
        self.assertIsInstance(
            cross_device_ops_lib.choose_the_best(gpu_devices),
            expected_class)

  @combinations.generate(
      combinations.combine(mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    """Sums two IndexedSlices placed on different devices."""
    cpu, gpu = "/cpu:0", "/gpu:0"
    cpu_slices = _make_indexed_slices([[1., 2.]], [1], [5, 2], cpu)
    gpu_slices = _make_indexed_slices(
        [[3., 4.], [5., 6.]], [1, 3], [5, 2], gpu)
    replicated = value_lib.PerReplica((cpu_slices, gpu_slices))
    reduced = cross_device_ops_lib._simple_reduce(
        replicated, cpu, math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    expected_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], cpu)
    expected_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], cpu)
    self._assert_indexed_slices_equal(expected_with_dups, reduced)
    self._assert_indexed_slices_equal(expected_without_dups, reduced)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps()),
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    """Runs the shared IndexedSlices all-reduce check on CPU plus one GPU."""
    cpu_and_gpu = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(cpu_and_gpu, cross_device_ops_instance,
                                     reduce_op, batch_reduce)
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
  """Cross-device ops tests parameterized over distribution strategies."""

  # TODO(yuefengz): decouple the num_gpus check from distribution in
  # combinations module so that we can pass in devices instead of a distribution
  # strategy.
  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.accumulate_n)),
      ],
      distribution=[
          strategy_combinations.one_device_strategy,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      mode=["graph", "eager"])

  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject(
              "NcclAllReduce",
              cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10)),
      ],
      distribution=[
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, distribution):
    """Runs the shared reduction/broadcast check inside the strategy scope."""
    with distribution.scope():
      self._testReductionAndBroadcast(cross_device_ops, distribution)

  def testChooseAlgorithm(self):
    """Checks the algorithm and pack count chosen for several link tables."""
    # Each entry: (device_links, expected _all_reduce_alg, expected _num_packs).
    scenarios = (
        # the first 8-device link table
        ([[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
          [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]],
         "hierarchical_copy", 8),
        # if there are only 4 devices
        ([[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]],
         "nccl", 1),
        # if devices links contain each device itself
        ([[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6], [0, 1, 2, 3, 7],
          [0, 4, 5, 6, 7], [1, 4, 5, 6, 7], [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]],
         "hierarchical_copy", 8),
        # if not dgx1-like links
        ([[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
          [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]],
         "nccl", 1),
    )
    for device_links, expected_alg, expected_packs in scenarios:
      result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
      self.assertIsInstance(result,
                            cross_device_ops_lib.AllReduceCrossDeviceOps)
      self.assertEqual(result._all_reduce_alg, expected_alg)
      self.assertEqual(result._num_packs, expected_packs)

  @combinations.generate(
      combinations.combine(mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    """Sums two IndexedSlices placed on different devices."""
    devices = ["/cpu:0", "/gpu:0"]
    cpu, gpu = devices[0], devices[1]
    cpu_slices = _make_indexed_slices([[1., 2.]], [1], [5, 2], cpu)
    gpu_slices = _make_indexed_slices(
        [[3., 4.], [5., 6.]], [1, 3], [5, 2], gpu)
    device_map = value_lib.ReplicaDeviceMap(devices)
    replicated = value_lib.PerReplica(device_map, (cpu_slices, gpu_slices))
    reduced = cross_device_ops_lib._simple_reduce(
        replicated, cpu, math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    expected_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], cpu)
    expected_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], cpu)
    self._assert_indexed_slices_equal(expected_with_dups, reduced)
    self._assert_indexed_slices_equal(expected_without_dups, reduced)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps()),
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    """Runs the shared IndexedSlices all-reduce check on CPU plus one GPU."""
    cpu_and_gpu = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(cpu_and_gpu, cross_device_ops_instance,
                                     reduce_op, batch_reduce)