  def testOneDevicePerWorker(self, input_type, api_type, iteration_type,
                             enable_get_next_as_optional):
    if tf2.enabled():
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(4)
    else:
      dataset_fn = lambda _: dataset_ops.Dataset.range(4)
    dataset_or_input_fn = self._create_dataset_or_input_fn(
        input_type, dataset_fn)
    strategy = mirrored_strategy.MirroredStrategy(
        devices=(self._cpu_devices()[0][1] + self._cpu_devices()[1][1]),
        cross_device_ops=cross_device_ops_lib.MultiWorkerAllReduce(
            ["/job:worker/task:0", "/job:worker/task:1"], 1))
    worker_devices = self._cpu_devices()
    with context.graph_mode(), strategy.scope(), self.cached_session() as sess:
      if input_type == "dataset":
        # Autosharded
        expected_values = [[0, 1], [2, 3]]
      else:
        expected_values = [[0, 0], [1, 1], [2, 2], [3, 3]]
      strategy.extended.experimental_enable_get_next_as_optional = (
          enable_get_next_as_optional)
      self._test_input_iteration(input_type, api_type, iteration_type,
                                 dataset_or_input_fn, worker_devices,
                                 expected_values, strategy, sess=sess)
  def testDifferentDatasets(self, input_type, api_type, iteration_type):

    def dataset_fn(ctx):
      if ctx.input_pipeline_id == 0:
        return dataset_ops.Dataset.range(8).batch(2)
      else:
        return dataset_ops.Dataset.range(9).batch(2)

    dataset_or_input_fn = self._create_dataset_or_input_fn(
        input_type, dataset_fn)
    strategy = mirrored_strategy.MirroredStrategy(
        devices=(self._cpu_and_one_gpu_devices()[0][1] +
                 self._cpu_and_one_gpu_devices()[1][1]),
        cross_device_ops=cross_device_ops_lib.MultiWorkerAllReduce(
            ["/job:worker/task:0", "/job:worker/task:1"], 2))
    worker_devices = self._cpu_and_one_gpu_devices()
    with context.graph_mode(), strategy.scope(), self.cached_session() as sess:
      expected_values = [[[0, 1], [2, 3], [0, 1], [2, 3]],
                         [[4, 5], [6, 7], [4, 5], [6, 7]],
                         [[], [], [8], []]]
      strategy.extended.experimental_enable_get_next_as_optional = True
      self._test_input_iteration(
          input_type,
          api_type,
          iteration_type,
          dataset_or_input_fn,
          worker_devices,
          expected_values,
          strategy,
          sess=sess)
  def testAutoshardingOption(self, input_type, api_type, iteration_type,
                             auto_shard_policy):
    ds_option = dataset_ops.Options()
    ds_option.experimental_distribute.auto_shard_policy = auto_shard_policy
    if tf2.enabled():
      dataset_fn = (
          lambda _: dataset_ops.DatasetV2.range(4).with_options(ds_option))
    else:
      dataset_fn = (
          lambda _: dataset_ops.Dataset.range(4).with_options(ds_option))
    dataset_or_input_fn = self._create_dataset_or_input_fn(
        input_type, dataset_fn)
    strategy = mirrored_strategy.MirroredStrategy(
        devices=(self._cpu_devices()[0][1] + self._cpu_devices()[1][1]),
        cross_device_ops=cross_device_ops_lib.MultiWorkerAllReduce(
            ["/job:worker/task:0", "/job:worker/task:1"], 1))
    worker_devices = self._cpu_devices()
    with context.graph_mode(), self.cached_session() as sess:
      if auto_shard_policy == AutoShardPolicy.AUTO:
        expected_values = [[0, 1], [2, 3]]
      else:
        expected_values = [[0, 0], [1, 1], [2, 2], [3, 3]]
      self._test_input_iteration(input_type, api_type, iteration_type,
                                 dataset_or_input_fn, worker_devices,
                                 expected_values, strategy, sess)
  def testUnevenDatasetBatches(self, input_type, api_type, iteration_type):
    strategy = mirrored_strategy.MirroredStrategy(
        devices=(self._cpu_and_one_gpu_devices()[0][1] +
                 self._cpu_and_one_gpu_devices()[1][1]),
        cross_device_ops=cross_device_ops_lib.MultiWorkerAllReduce(
            ["/job:worker/task:0", "/job:worker/task:1"], 2))
    if tf2.enabled():
      dataset_fn = lambda _: dataset_ops.DatasetV2.range(9).batch(2)
    else:
      dataset_fn = lambda _: dataset_ops.Dataset.range(9).batch(2)
    dataset_or_input_fn = self._create_dataset_or_input_fn(
        input_type, dataset_fn)
    worker_devices = self._cpu_and_one_gpu_devices()
    with context.graph_mode(), strategy.scope(), self.cached_session() as sess:
      if input_type == "dataset":
        # Autosharded
        expected_values = [[[0, 1], [4, 5], [2, 3], [6, 7]],
                           [[8], [], [], []]]
      else:
        expected_values = [[[0, 1], [2, 3], [0, 1], [2, 3]],
                           [[4, 5], [6, 7], [4, 5], [6, 7]],
                           [[8], [], [8], []]]
      strategy.extended.experimental_enable_get_next_as_optional = True
      self._test_input_iteration(
          input_type,
          api_type,
          iteration_type,
          dataset_or_input_fn,
          worker_devices,
          expected_values,
          strategy,
          sess=sess)
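# Reading the autosharded expectation above (derived from the expected values
# themselves, not from documentation): range(9).batch(2) yields batches
# [0, 1], [2, 3], [4, 5], [6, 7], [8]. With two workers of two replicas each,
# autosharding appears to deal batches to workers round-robin (worker 0:
# [0, 1], [4, 5], [8]; worker 1: [2, 3], [6, 7]), and each worker then feeds
# consecutive batches of its shard to its CPU and GPU replica per step, which
# is why the final step is [[8], [], [], []].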
  def _initialize_multi_worker(self, devices):
    """Initializes the object for multi-worker training."""
    self._local_mode = False

    assert devices, "Must specify at least one device."
    assert len(set(devices)) == len(devices), (
        "No duplicates allowed in `devices` argument.")
    # TODO(josh11b): Require at least 2 devices?
    self._devices = [device_util.resolve(d) for d in devices]
    self._canonical_device_set = set(self._devices)
    self._device_index = values.PerReplica(
        {d: i for i, d in enumerate(devices)})

    device_dict = _group_device_list(devices)
    self._workers = []
    self._worker_devices = []
    for job in ["chief", "worker"]:
      for task in range(len(device_dict.get(job, []))):
        worker = "/job:%s/task:%d" % (job, task)
        self._workers.append(worker)
        self._worker_devices.append((worker, device_dict[job][task]))

    # Setting `_default_device` will add a device scope in the
    # distribution.scope. We set the default device to the first worker. When
    # users specify a device under distribution.scope by
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the CPU device of the first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
    self._default_device = self._workers[0]

    self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
        self._workers, _infer_num_gpus_per_worker(self._devices))
  def _configure(self,
                 session_config=None,
                 cluster_spec=None,
                 task_type=None,
                 task_id=None):
    del task_type, task_id

    if session_config:
      session_config.CopyFrom(self._update_config_proto(session_config))

    if cluster_spec:
      self._initialize_multi_worker(self._num_gpus, cluster_spec)

    if self._cross_device_ops is None:
      if self._cluster_spec:
        # It currently cannot detect the topology of remote workers, so we
        # hard-code the multi-worker all-reduce algorithm for now.
        if len(self._workers) == 1:
          # The default is "nccl".
          self._cross_device_ops = (
              cross_device_ops_lib.AllReduceCrossDeviceOps())
        else:
          # The default is hierarchical reduce and broadcast.
          self._cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
              self._workers, self._num_gpus)
      else:
        self._cross_device_ops = cross_device_ops_lib.choose_the_best(
            self._devices, session_config=session_config)
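# Usage sketch (an assumption for illustration, mirroring how the
# MultiWorkerMirroredStrategyTest snippet later in this collection calls it):
# `_configure` is normally reached through the strategy's public `configure`
# call, e.g.
#
#   cluster_spec = server_lib.ClusterSpec(
#       {"worker": ["/job:worker/task:0", "/job:worker/task:1"]})
#   strategy.configure(cluster_spec=cluster_spec)
#
# With two workers in the cluster spec, the branch above selects
# MultiWorkerAllReduce; with no cluster spec, `choose_the_best` picks a
# cross-device-ops implementation for the local devices.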
  def testTwoDevicesPerWorker(self, input_type, api_type, iteration_type,
                              enable_get_next_as_optional):
    strategy = mirrored_strategy.MirroredStrategy(
        devices=(self._cpu_and_one_gpu_devices()[0][1] +
                 self._cpu_and_one_gpu_devices()[1][1]),
        cross_device_ops=cross_device_ops_lib.MultiWorkerAllReduce(
            ["/job:worker/task:0", "/job:worker/task:1"], 2))
    worker_devices = self._cpu_and_one_gpu_devices()
    with context.graph_mode(), strategy.scope(), self.cached_session() as sess:
      if tf2.enabled():
        dataset_fn = lambda _: dataset_ops.DatasetV2.range(4)
      else:
        dataset_fn = lambda _: dataset_ops.Dataset.range(4)
      if input_type == "dataset":
        # Autosharded
        expected_values = [[0, 2, 1, 3]]
      else:
        expected_values = [[0, 1, 0, 1], [2, 3, 2, 3]]
      self._test_input_iteration(
          input_type,
          api_type,
          iteration_type,
          dataset_fn,
          worker_devices,
          expected_values,
          strategy,
          sess=sess,
          enable_get_next_as_optional=enable_get_next_as_optional)
  def _initialize_multi_worker(self, devices):
    """Initializes the object for multi-worker training."""
    self._local_mode = False

    assert devices, "Must specify at least one device."
    devices = tuple(device_util.resolve(d) for d in devices)
    assert len(set(devices)) == len(devices), (
        "No duplicates allowed in `devices` argument: %s" % devices)
    # TODO(josh11b): Require at least 2 devices?

    device_dict = _group_device_list(devices)
    workers = []
    worker_devices = []
    for job in ("chief", "worker"):
      for task in range(len(device_dict.get(job, []))):
        worker = "/job:%s/task:%d" % (job, task)
        workers.append(worker)
        worker_devices.append((worker, device_dict[job][task]))

    # Setting `_default_device` will add a device scope in the
    # distribution.scope. We set the default device to the first worker. When
    # users specify a device under distribution.scope by
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the CPU device of the first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
    self._default_device = workers[0]
    self._host_input_device = numpy_dataset.SingleDevice(workers[0])

    self._device_map = values.ReplicaDeviceMap(devices)
    self._input_workers = input_lib.InputWorkers(self._device_map,
                                                 worker_devices)
    self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
        workers, _infer_num_gpus_per_worker(devices))
class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                    CrossDeviceOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                      ("xring", 2, -1)], 0)),
      ],
      devices=[
          [
              "/job:worker/replica:0/task:0/device:CPU:0",
              "/job:worker/replica:0/task:1/device:CPU:0"
          ],
          [
              "/job:worker/replica:0/task:0/device:GPU:0",
              "/job:worker/replica:0/task:1/device:GPU:0"
          ],
          [
              "/job:worker/replica:0/task:0/device:GPU:0",
              "/job:worker/replica:0/task:0/device:GPU:1",
              "/job:worker/replica:0/task:1/device:GPU:0",
              "/job:worker/replica:0/task:1/device:GPU:1"
          ],
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, devices):
    # Mimic the default device of multi-worker strategies.
    with ops.device("/job:worker/replica:0/task:0"):
      self._testReductionAndBroadcast(cross_device_ops, devices)
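# Reading the constructor arguments above (a hedged interpretation from the
# test names, not authoritative): each all-reduce spec tuple looks like
# (algorithm, number of shards, tensor-size limit), so
# [("pscpu/pscpu", 2, 100), ("xring", 2, -1)] routes tensors up to the first
# limit through "pscpu/pscpu" and everything larger through "xring" (-1
# meaning unbounded). The trailing positional argument (0 or 1) appears to be
# the number of gradient packs, matching the "Pack" variant above.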
def get_strategy_object(strategy_cls):
  if strategy_cls == mirrored_strategy.MirroredStrategy:
    return strategy_cls(
        mirrored_strategy.all_local_devices(),
        cross_device_ops=cross_device_ops_lib.MultiWorkerAllReduce(
            ['/job:worker/task:0', '/job:worker/task:1'],
            context.num_gpus()))
  else:
    # CollectiveAllReduceStrategy and ParameterServerStrategy.
    return strategy_cls()
  def testTupleDataset(self, input_type, api_type, iteration_type,
                       enable_get_next_as_optional):
    strategy = mirrored_strategy.MirroredStrategy(
        devices=(self._cpu_devices()[0][1] + self._cpu_devices()[1][1]),
        cross_device_ops=cross_device_ops_lib.MultiWorkerAllReduce(
            ["/job:worker/task:0", "/job:worker/task:1"], 1))
    worker_devices = self._cpu_devices()

    def dataset_fn(ctx):
      del ctx
      if tf2.enabled():
        dataset1 = dataset_ops.DatasetV2.range(4)
        dataset2 = dataset_ops.DatasetV2.range(4).map(lambda x: x**2)
        return dataset_ops.DatasetV2.zip((dataset1, dataset2))
      else:
        dataset1 = dataset_ops.Dataset.range(4)
        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
        return dataset_ops.Dataset.zip((dataset1, dataset2))

    dataset_or_input_fn = self._create_dataset_or_input_fn(
        input_type, dataset_fn)
    with context.graph_mode(), strategy.scope(), self.cached_session() as sess:
      if input_type == "dataset":
        # Autosharded
        expected_values = [[(0, 0), (1, 1)], [(2, 4), (3, 9)]]
      else:
        expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
      strategy.extended.experimental_enable_get_next_as_optional = (
          enable_get_next_as_optional)
      self._test_input_iteration(input_type, api_type, iteration_type,
                                 dataset_or_input_fn, worker_devices,
                                 expected_values, strategy, sess=sess)
  def _make_cross_device_ops(self, num_gpus_per_worker):
    return cross_device_ops_lib.MultiWorkerAllReduce(
        ["/job:worker/task:0", "/job:worker/task:1", "/job:worker/task:2"],
        num_gpus_per_worker)
  def _make_cross_device_ops(self):
    return cross_device_ops_lib.MultiWorkerAllReduce(
        ["/job:chief/task:0", "/job:worker/task:0", "/job:worker/task:1"],
        context.num_gpus())
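# A minimal wiring sketch (an assumption; the names match the other snippets
# in this collection rather than this file): helpers like
# `_make_cross_device_ops` feed the `cross_device_ops` argument of
# MirroredStrategy, so reductions span the chief and both workers:
#
#   strategy = mirrored_strategy.MirroredStrategy(
#       devices=mirrored_strategy.all_local_devices(),
#       cross_device_ops=self._make_cross_device_ops())
#   with strategy.scope():
#     ...  # variables created here are mirrored across the cluster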
    updated_var_values = self.evaluate(mock_model.variables)
    # All variables start at 1.0 and get two updates of 0.25.
    self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0])
    self.assertAllEqual([0.5], updated_var_values[1])


@combinations.generate(
    combinations.combine(
        distribution=[
            combinations.NamedDistribution(
                "Mirrored",
                # pylint: disable=g-long-lambda
                lambda: mirrored_strategy.MirroredStrategy(
                    devices=mirrored_strategy.all_local_devices(),
                    cross_device_ops=cross_device_ops_lib.MultiWorkerAllReduce(
                        ["/job:worker/task:0", "/job:worker/task:1"],
                        context.num_gpus())),
                required_gpus=1)
        ],
        mode=["graph"]))
class MultiWorkerMirroredStrategyTest(
    multi_worker_test_base.MultiWorkerTestBase,
    strategy_test_lib.DistributionTestBase):

  def _configure_distribution_strategy(self, distribution):
    cluster_spec = server_lib.ClusterSpec({
        "worker": ["/job:worker/task:0", "/job:worker/task:1"]
    })
    distribution.configure(cluster_spec=cluster_spec)

  def test_num_replicas_in_sync(self, distribution):
class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
                                    CrossDeviceOpsTestBase):

  worker_devices = [
      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
  ]
  multi_worker_allreduce_combinations = combinations.combine(
      cross_device_ops=[
          combinations.NamedObject(
              "MultiWorkerAllReduce",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReducePack",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
          combinations.NamedObject(
              "MultiWorkerAllReduceAggregation",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
          combinations.NamedObject(
              "MultiWorkerAllReduceMultipleSpecs",
              cross_device_ops_lib.MultiWorkerAllReduce(
                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                      ("xring", 2, -1)], 0, 0, 0)),
      ],
      distribution=[
          combinations.NamedDistribution(
              "MirroredCPU",
              lambda: mirrored_strategy.MirroredStrategy(
                  num_gpus_per_worker=0),
              required_gpus=0),
          combinations.NamedDistribution(
              "Mirrored1GPU",
              lambda: mirrored_strategy.MirroredStrategy(
                  num_gpus_per_worker=1),
              required_gpus=1),
          combinations.NamedDistribution(
              "Mirrored2GPUs",
              lambda: mirrored_strategy.MirroredStrategy(
                  num_gpus_per_worker=2),
              required_gpus=2),
          # pylint: disable=g-long-lambda
          combinations.NamedDistribution(
              "CoreMirroredCPU",
              lambda: mirrored_strategy.CoreMirroredStrategy(
                  ["/device:CPU:0"]),
              required_gpus=0),
          combinations.NamedDistribution(
              "CoreMirrored1GPU",
              lambda: mirrored_strategy.CoreMirroredStrategy(
                  ["/device:GPU:0"]),
              required_gpus=1),
          combinations.NamedDistribution(
              "CoreMirrored2GPUs",
              lambda: mirrored_strategy.CoreMirroredStrategy(
                  ["/device:GPU:0", "/device:GPU:1"]),
              required_gpus=2),
      ],
      mode=["graph"])

  @combinations.generate(multi_worker_allreduce_combinations)
  def testReductionAndBroadcast(self, cross_device_ops, distribution):
    distribution.configure(
        cluster_spec={
            "worker": [
                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
            ]
        })
    with distribution.scope():
      self._testReductionAndBroadcast(cross_device_ops, distribution)
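# Note on the extra positional arguments (a hedged reading based on the test
# names above, not on documentation): relative to the earlier
# MultiWorkerCrossDeviceOpsTest snippet, each constructor here passes three
# trailing integers rather than one. The first is still the number of gradient
# packs; the "Aggregation" case's (..., 0, 100, 10) suggests the last two
# control small-gradient aggregation, i.e. a byte threshold and a maximum
# group size for batching small tensors together before the all-reduce.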