class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                    CrossDeviceOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                      ("xring", 2, -1)], 0, 0, 0)),
      ],
      distribution=[
          combinations.NamedDistribution(
              "MirroredCPU",
              lambda: mirrored_strategy.MirroredStrategy(["/device:CPU:0"]),
              required_gpus=0),
          combinations.NamedDistribution(
              "Mirrored1GPU",
              lambda: mirrored_strategy.MirroredStrategy(["/device:GPU:0"]),
              required_gpus=1),
          combinations.NamedDistribution(
              "Mirrored2GPUs",
              # pylint: disable=g-long-lambda
              lambda: mirrored_strategy.MirroredStrategy(
                  ["/device:GPU:0", "/device:GPU:1"]),
              required_gpus=2),
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, distribution):
    distribution.configure(
        cluster_spec={
            "worker": [
                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
            ]
        })
    with distribution.scope():
      self._testReductionAndBroadcast(cross_device_ops, distribution)
class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                    CrossDeviceOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                      ("xring", 2, -1)], 0, 0, 0)),
      ],
      devices=[
          [
              "/job:worker/replica:0/task:0/device:CPU:0",
              "/job:worker/replica:0/task:1/device:CPU:0"
          ],
          [
              "/job:worker/replica:0/task:0/device:GPU:0",
              "/job:worker/replica:0/task:1/device:GPU:0"
          ],
          [
              "/job:worker/replica:0/task:0/device:GPU:0",
              "/job:worker/replica:0/task:0/device:GPU:1",
              "/job:worker/replica:0/task:1/device:GPU:0",
              "/job:worker/replica:0/task:1/device:GPU:1"
          ],
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    # Mimic the default device of multi-worker strategies.
    with ops.device("/job:worker/replica:0/task:0"):
      self._testReductionAndBroadcast(cross_device_ops, devices)
# Call set_virtual_cpus_to_at_least(3) in your test's setUp method to make
# /cpu:1 and /cpu:2 available.
mirrored_strategy_with_cpu_1_and_2 = combinations.NamedDistribution(
    "Mirrored2CPU",
    lambda: mirrored_lib.MirroredStrategy(["/cpu:1", "/cpu:2"]))
central_storage_strategy_with_two_gpus = combinations.NamedDistribution(
    "CentralStorage2GPUs",
    lambda: central_storage_strategy.CentralStorageStrategy._from_num_gpus(2),  # pylint: disable=protected-access
    required_gpus=2)
central_storage_strategy_with_gpu_and_cpu = combinations.NamedDistribution(
    "CentralStorageCPUAndGPU",
    lambda: central_storage_strategy.CentralStorageStrategy(
        ["/gpu:0", "/cpu:0"]),
    required_gpus=1)

gradient_descent_optimizer_v1_fn = combinations.NamedObject(
    "GradientDescentV1",
    lambda: gradient_descent.GradientDescentOptimizer(0.2))
adagrad_optimizer_v1_fn = combinations.NamedObject(
    "AdagradV1", lambda: adagrad.AdagradOptimizer(0.001))
adam_optimizer_v1_fn = combinations.NamedObject(
    "AdamV1", lambda: adam.AdamOptimizer(0.001, epsilon=1))
rmsprop_optimizer_v1_fn = combinations.NamedObject(
    "RmsPropV1", lambda: rmsprop.RMSPropOptimizer(0.001))

# TODO(shiningsun): consider adding the other v1 optimizers
optimizers_v1 = [gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn]

adadelta_optimizer_keras_v2_fn = combinations.NamedObject(
    "AdadeltaKerasV2", lambda: adadelta_keras_v2.Adadelta(0.001))
adagrad_optimizer_keras_v2_fn = combinations.NamedObject(
    "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001))
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Strategy and optimizer combinations for combinations.combine()."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.distribute import combinations
from tensorflow.python.keras.distribute import simple_models

simple_functional_model = combinations.NamedObject(
    "SimpleFunctionalModel", simple_models.SimpleFunctionalModel())

simple_sequential_model = combinations.NamedObject(
    "SimpleSequentialModel", simple_models.SimpleSequentialModel())

simple_subclass_model = combinations.NamedObject(
    "SimpleSubclassModel", simple_models.SimpleSubclassModel())

simple_tfmodule_model = combinations.NamedObject(
    "SimpleTFModuleModel", simple_models.SimpleTFModuleModel())
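
# A minimal sketch, not part of the original module, showing how the named
# model wrappers above are typically consumed: @combinations.generate expands
# each parameter list built by combinations.combine() into separately named
# test cases. The test class and method below are hypothetical.
from absl.testing import parameterized

from tensorflow.python.platform import test


class SimpleModelsSmokeTest(test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.combine(
          model=[simple_functional_model, simple_sequential_model],
          mode=["eager"]))
  def testModelIsWrapped(self, model):
    # `model` arrives as the NamedObject; the wrapped model object is held in
    # `_obj`, mirroring how other tests in this codebase unwrap it.
    self.assertIsNotNone(model._obj)  # pylint: disable=protected-access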
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    if isinstance(
        cross_device_ops._obj,  # pylint: disable=protected-access
        cross_device_ops_lib.AllReduceCrossDeviceOps
    ) and context.executing_eagerly():
      self.skipTest("b/149881884")
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use nccl if there is any cpu device.
    self.assertIsInstance(
        cross_device_ops_lib.select_cross_device_ops(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use nccl if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `select_cross_device_ops` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.select_cross_device_ops(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use nccl if an nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use nccl if no nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.select_cross_device_ops(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)

  @combinations.generate(
      combinations.combine(
          distribution=strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps("ring"))
          ],
          batch_reduce=[True, False],
          mode=["graph", "eager"]))
  def testReduceDistributedVariable(self, distribution,
                                    cross_device_ops_instance, batch_reduce):
    with distribution.scope():
      v = variables.Variable(1.)
    if batch_reduce:
      result = cross_device_ops_instance.batch_reduce(
          reduce_util.ReduceOp.MEAN, [(v, v)])[0]
    else:
      result = cross_device_ops_instance.reduce(reduce_util.ReduceOp.MEAN, v,
                                                v)
    for v in result.values:
      self.assertIsInstance(v, ops.Tensor)
    self.evaluate(variables.global_variables_initializer())
    self.assertAllEqual(self.evaluate(result.values), [1.0, 1.0])
class OpCancellationTest(test.TestCase, parameterized.TestCase):

  def setUp(self):
    _setup_context()
    super().setUp()

  @combinations.generate(
      combinations.times(
          combinations.combine(
              collective_op=[
                  combinations.NamedObject('all_reduce',
                                           CollectiveOpsV1.all_reduce),
                  combinations.NamedObject('all_reduce_v2',
                                           CollectiveOpsV2.all_reduce),
                  combinations.NamedObject('all_gather',
                                           CollectiveOpsV1.all_gather),
                  combinations.NamedObject('all_gather_v2',
                                           CollectiveOpsV2.all_gather),
              ],
              mode='eager'), device_combination))
  def testOpErrorNotAbortIfNoCollective(self, collective_op, device,
                                        communication):
    # Do not abort if there are no active collective ops. There could be
    # exceptions like EOF which we expect users to catch; aborting collective
    # ops on all op errors would interfere with that workflow.
    dev0 = '/device:%s:0' % device
    dev1 = '/device:%s:1' % device
    group_size = 2
    group_key = 100
    instance_key = 100
    dataset = dataset_ops.Dataset.from_tensors([1.])

    @def_function.function
    def collective_fn(in_tensor):
      for device in [dev0, dev1]:
        with ops.device(device):
          collective_op(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              communication_hint=communication)

    @def_function.function
    def f():
      iterator = iter(dataset)
      collective_fn(next(iterator))
      # This next(iterator) should raise EOF.
      collective_fn(next(iterator))

    with self.assertRaises(errors.OutOfRangeError):
      f()
    collective_fn(constant_op.constant([1.]))

  @combinations.generate(
      combinations.times(
          combinations.combine(
              collective_op=[
                  combinations.NamedObject('all_reduce',
                                           CollectiveOpsV1.all_reduce),
                  combinations.NamedObject('all_gather',
                                           CollectiveOpsV1.all_gather),
              ],
              mode='eager'), device_combination))
  def testOpErrorAbortWithCollective(self, collective_op, device,
                                     communication):
    # Abort v1 collective ops if there are active collective ops at the time
    # of an op error. This is due to the inability to cancel collective ops;
    # op errors may cause running collective ops to hang.
    dev0 = '/device:%s:0' % device
    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant([1.])
    # Make the dataset sleep a while so that the collective is being executed
    # when the EOF happens.
    dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
        dataset_testing.sleep(sleep_microseconds=200))

    @def_function.function
    def f():
      # Launch a collective op that won't be able to finish to test abortion
      # when other ops error.
      with ops.device(dev0):
        ret = collective_op(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)
      iterator = iter(dataset)
      next(iterator)
      # This should raise EOF.
      next(iterator)
      return ret

    with self.assertRaises(errors.OutOfRangeError):
      f()
    # Now that collective ops are aborted, subsequent collective ops should
    # fail with the previous error.
    with self.assertRaises(errors.CancelledError):
      with ops.device(dev0):
        collective_op(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)

  @combinations.generate(
      combinations.times(
          combinations.combine(
              collective_op=[
                  combinations.NamedObject('all_reduce_v2',
                                           CollectiveOpsV2.all_reduce),
                  combinations.NamedObject('all_gather_v2',
                                           CollectiveOpsV2.all_gather),
              ],
              mode='eager'), device_combination))
  def testOpErrorNotAbortWithCollective(self, collective_op, device,
                                        communication):
    # Do not abort v2 collective ops even if there are active collective ops
    # at the time of an op error. We rely on cancellation to terminate active
    # collective ops.
    dev0 = '/device:%s:0' % device
    dev1 = '/device:%s:1' % device
    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant([1.])

    @def_function.function
    def collective_fn():
      for device in [dev0, dev1]:
        with ops.device(device):
          collective_op(
              in_tensor,
              group_size,
              group_key,
              instance_key,
              communication_hint=communication)

    # Local params resolution cannot be cancelled yet, so we perform a normal
    # collective so that the group is resolved.
    collective_fn()

    # Make the dataset sleep a while so that the collective is being executed
    # when the EOF happens.
    dataset = dataset_ops.Dataset.from_tensors([1.]).apply(
        dataset_testing.sleep(sleep_microseconds=200))

    @def_function.function
    def f():
      # Launch a collective op that won't be able to finish to test
      # cancellation when other ops error.
      with ops.device(dev0):
        ret = collective_op(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)
      iterator = iter(dataset)
      next(iterator)
      # This should raise EOF.
      next(iterator)
      return ret

    with self.assertRaises(errors.OutOfRangeError):
      f()
    # Collective ops shouldn't be aborted and new collectives should be able
    # to proceed.
    collective_fn()

  @combinations.generate(
      combinations.times(
          combinations.combine(
              collective_op=[
                  combinations.NamedObject('all_reduce_v2',
                                           CollectiveOpsV2.all_reduce),
                  combinations.NamedObject('all_gather_v2',
                                           CollectiveOpsV2.all_gather),
              ],
              mode='eager'), device_combination))
  def testCancelDuringParamResolution(self, collective_op, device,
                                      communication):
    dev0 = '/device:%s:0' % device
    dev1 = '/device:%s:1' % device
    group_size = 2
    group_key = 100
    instance_key = 100
    in_tensor = constant_op.constant([1.])
    t1_cancellation_manager = cancellation.CancellationManager()
    t2_cancellation_manager = cancellation.CancellationManager()

    @def_function.function
    def _collective_fn(x):
      # Run an assertion to crash one of the two function executions running
      # collectives. We explicitly cancel the other in response.
      assert_op = check_ops.assert_equal(x, in_tensor)
      with ops.control_dependencies([assert_op]):
        return collective_op(
            in_tensor,
            group_size,
            group_key,
            instance_key,
            communication_hint=communication)

    collective_concrete = _collective_fn.get_concrete_function(in_tensor)

    finish_mu = threading.Lock()
    finishes = 0

    def _placement_wrapper(device, x, my_cancellation, other_cancellation):
      try:
        with ops.device(device):
          cancelable_collective = my_cancellation.get_cancelable_function(
              collective_concrete)
          return cancelable_collective(x)
      except errors.InvalidArgumentError:
        # `assert_equal` failed for this execution of the function. The other
        # function would deadlock without cancellation.
        other_cancellation.start_cancel()
      except errors.CancelledError:
        pass
      nonlocal finishes
      with finish_mu:
        finishes += 1

    t1 = threading.Thread(
        target=_placement_wrapper,
        args=(dev0, constant_op.constant([1.]), t1_cancellation_manager,
              t2_cancellation_manager))
    t2 = threading.Thread(
        target=_placement_wrapper,
        # Will cause the assertion to fail.
        args=(dev1, constant_op.constant([2.]), t2_cancellation_manager,
              t1_cancellation_manager))
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    self.assertEqual(finishes, 2)
    group_size = array_ops.identity(group_size)
    group_key = array_ops.identity(group_key)
    instance_key = array_ops.identity(instance_key)
    shape = array_ops.identity(shape)
    return _collective_ops.broadcast_recv_v2(shape, dtype, group_size,
                                             group_key, instance_key, *args,
                                             **kwargs)


device_combination = (
    combinations.combine(device='CPU', communication='RING', required_gpus=0) +
    combinations.combine(
        device='GPU', communication=['RING', 'NCCL'], required_gpus=2))

collective_op_combinations = combinations.combine(collective_op=[
    combinations.NamedObject('all_reduce', CollectiveOpsV1.all_reduce),
    combinations.NamedObject('all_reduce_v2', CollectiveOpsV2.all_reduce),
    combinations.NamedObject('all_gather', CollectiveOpsV1.all_gather),
    combinations.NamedObject('all_gather_v2', CollectiveOpsV2.all_gather)
])


@combinations.generate(
    combinations.times(
        combinations.combine(
            collective_ops=[
                combinations.NamedObject('v1', CollectiveOpsV1),
                combinations.NamedObject('v2', CollectiveOpsV2)
            ],
            mode='eager'), device_combination))
class CollectiveOpsTest(test.TestCase, parameterized.TestCase):

  def setUp(self):
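# A hedged note, not from the original file but consistent with how
# tensorflow.python.distribute.combinations behaves: `+` concatenates lists of
# parameter combinations while combinations.times() takes their cartesian
# product. So device_combination above expands to three entries, roughly:
#
#   [{'device': 'CPU', 'communication': 'RING', 'required_gpus': 0},
#    {'device': 'GPU', 'communication': 'RING', 'required_gpus': 2},
#    {'device': 'GPU', 'communication': 'NCCL', 'required_gpus': 2}]
#
# and combinations.times(collective_op_combinations, device_combination)
# yields 4 * 3 = 12 test parameterizations.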
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):

  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.add_n)),
      ],
      devices=[
          ["/cpu:0"],
          ["/cpu:0", "/gpu:0"],
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10))
      ],
      devices=[
          ["/gpu:0", "/gpu:1"],
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    self._testReductionAndBroadcast(cross_device_ops, devices)

  def testChooseAlgorithm(self):
    # Do not use nccl if there is any cpu device.
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Do not use nccl if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `choose_the_best` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use nccl if an nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Do not use nccl if no nccl kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.ReductionToOneDevice)

  @combinations.generate(combinations.combine(
      mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_replica = value_lib.PerReplica((t0, t1))
    result = cross_device_ops_lib._simple_reduce(
        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)

  @combinations.generate(
      combinations.combine(
          cross_device_ops_instance=[
              combinations.NamedObject(
                  "ReductionToOneDevice",
                  cross_device_ops_lib.ReductionToOneDevice()),
              combinations.NamedObject(
                  "AllReduceCrossDeviceOps",
                  cross_device_ops_lib.AllReduceCrossDeviceOps())
          ],
          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
          batch_reduce=[True, False],
          mode=["graph", "eager"],
          required_gpus=1))
  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                 batch_reduce):
    devices = ["/cpu:0", "/gpu:0"]
    self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance,
                                     reduce_op, batch_reduce)
class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
  # TODO(yuefengz): decouple the num_gpus check from distribution in
  # combinations module so that we can pass in devices instead of a
  # distribution strategy.
  reduction_to_one_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "DefaultReductionToOneDevice",
              cross_device_ops_lib.ReductionToOneDevice()),
          combinations.NamedObject(
              "ReductionToCPUDeviceCrossDeviceOps",
              cross_device_ops_lib.ReductionToOneDevice(
                  reduce_to_device=_cpu_device)),
          combinations.NamedObject(
              "AccumulateNCrossDeviceOp",
              cross_device_ops_lib.ReductionToOneDevice(
                  accumulation_fn=math_ops.accumulate_n)),
      ],
      distribution=[
          strategy_combinations.one_device_strategy,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      mode=["graph", "eager"])
  allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "AllReduce",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
          combinations.NamedObject(
              "AllReduceNoGradientRepacking",
              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
          combinations.NamedObject("NcclAllReduce",
                                   cross_device_ops_lib.NcclAllReduce()),
          combinations.NamedObject(
              "HierarchicalCopy",
              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
          combinations.NamedObject(
              "HierarchicalCopyAggregateSmallTensors",
              cross_device_ops_lib.AllReduceCrossDeviceOps(
                  "hierarchical_copy", 0, 100, 10))
      ],
      distribution=[
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      mode=["graph", "eager"])

  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, distribution):
    with distribution.scope():
      self._testReductionAndBroadcast(cross_device_ops, distribution)

  def testChooseAlgorithm(self):
    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                    [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
    self.assertIsInstance(result,
                          cross_device_ops_lib.AllReduceCrossDeviceOps)
    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
    self.assertEqual(result._num_packs, 8)

    # If there are only 4 devices.
    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
    self.assertIsInstance(result,
                          cross_device_ops_lib.AllReduceCrossDeviceOps)
    self.assertEqual(result._all_reduce_alg, "nccl")
    self.assertEqual(result._num_packs, 1)

    # If device links contain each device itself.
    device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
                    [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
                    [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
    self.assertIsInstance(result,
                          cross_device_ops_lib.AllReduceCrossDeviceOps)
    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
    self.assertEqual(result._num_packs, 8)

    # If the links are not dgx1-like.
    device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
                    [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
    self.assertIsInstance(result,
                          cross_device_ops_lib.AllReduceCrossDeviceOps)
    self.assertEqual(result._all_reduce_alg, "nccl")
    self.assertEqual(result._num_packs, 1)

  @combinations.generate(
      combinations.combine(mode=["graph", "eager"], required_gpus=1))
  def testSimpleReduceWithIndexedSlices(self):
devices = ["/cpu:0", "/gpu:0"] t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0]) t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1]) per_replica = value_lib.PerReplica(value_lib.ReplicaDeviceMap(devices), (t0, t1)) result = cross_device_ops_lib._simple_reduce(per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM) # Test that the result is semantically equal to both the concatenated # IndexedSlices with and without duplicate indices. total_with_dups = _make_indexed_slices([[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0]) total_without_dups = _make_indexed_slices([[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0]) self._assert_indexed_slices_equal(total_with_dups, result) self._assert_indexed_slices_equal(total_without_dups, result) @combinations.generate( combinations.combine( cross_device_ops_instance=[ combinations.NamedObject( "ReductionToOneDevice", cross_device_ops_lib.ReductionToOneDevice()), combinations.NamedObject( "AllReduceCrossDeviceOps", cross_device_ops_lib.AllReduceCrossDeviceOps()) ], reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN], batch_reduce=[True, False], mode=["graph", "eager"], required_gpus=1)) def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op, batch_reduce): devices = ["/cpu:0", "/gpu:0"] self._testIndexedSlicesAllReduce(devices, cross_device_ops_instance, reduce_op, batch_reduce)
from tensorflow.python.eager import test
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import variables

mirrored_strategy_with_gpu_and_cpu = combinations.NamedDistribution(
    "MirroredCPUAndGPU",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
    required_gpus=1)
mirrored_strategy_with_two_gpus = combinations.NamedDistribution(
    "Mirrored2GPUs",
    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]),
    required_gpus=2)

# pylint: disable=g-long-lambda
gradient_descent_optimizer_v2_fn = combinations.NamedObject(
    "GradientDescentV2",
    lambda: gradient_descent_v2.GradientDescentOptimizer(0.2))
adagrad_optimizer_v2_fn = combinations.NamedObject(
    "AdagradV2", lambda: adagrad_v2.AdagradOptimizer(0.001))

optimizers_v2 = [gradient_descent_optimizer_v2_fn, adagrad_optimizer_v2_fn]


def distributions_and_v2_optimizers():
  """DistributionStrategies and V2 Optimizers."""
  return combinations.combine(
      distribution=[
          strategy_combinations.one_device_strategy,
          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
          strategy_combinations.mirrored_strategy_with_two_gpus,
      ],
      optimizer_fn=optimizers_v2)
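
# A hedged usage sketch, not in the original file: a test would typically
# cross distributions_and_v2_optimizers() with a `mode` axis and consume it as
# below. NamedObject delegates __call__, so invoking `optimizer_fn()` runs the
# wrapped constructor lambda. The test class and method names are hypothetical.
from absl.testing import parameterized


class OptimizerCombinationsSmokeTest(test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.times(distributions_and_v2_optimizers(),
                         combinations.combine(mode=["graph", "eager"])))
  def testCreateOptimizerInScope(self, distribution, optimizer_fn):
    with distribution.scope():
      optimizer = optimizer_fn()  # Calls the wrapped constructor lambda.
      self.assertIsNotNone(optimizer)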