def test_dataset_creator_usage_in_parameter_server_model_fit(self):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=2, num_ps=1, rpc_layer="grpc")
  cluster_def["chief"] = [
      "localhost:%d" % multi_worker_test_base.pick_unused_port()
  ]
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc"))
  with strategy.scope():
    model = sequential.Sequential([core_layers.Dense(10)])
    model.compile(gradient_descent.SGD(), loss="mse")

  def dataset_fn(input_context):
    global_batch_size = 64
    batch_size = input_context.get_per_replica_batch_size(global_batch_size)
    dataset = dataset_ops.DatasetV2.from_tensors(([1.], [1.])).repeat()
    dataset = dataset.shard(input_context.num_input_pipelines,
                            input_context.input_pipeline_id)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(2)
    return dataset

  history = model.fit(
      dataset_creator.DatasetCreator(dataset_fn),
      epochs=10,
      steps_per_epoch=10,
      verbose=0)
  self.assertLen(history.history["loss"], 10)
def make_parameter_server_cluster(num_workers, num_ps):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  cluster_def["chief"] = [
      "localhost:%d" % multi_worker_test_base.pick_unused_port()
  ]
  return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc")
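A minimal usage sketch for the helper above. The helper only returns a resolver; constructing `ParameterServerStrategyV2` from it mirrors the other snippets in this collection and is not part of the helper itself, and the variable is illustrative (assumes `import tensorflow as tf`):

# Hedged sketch: strategy construction and the variable are illustrative.
resolver = make_parameter_server_cluster(num_workers=2, num_ps=1)
strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(resolver)
with strategy.scope():
  v = tf.Variable(1.0)  # placed on one of the parameter server tasks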
def testEagerCustomTrainingUnimplementedError(self):
  cluster_spec = multi_worker_test_base.create_in_process_cluster(
      num_workers=3, num_ps=2)
  cluster_resolver = SimpleClusterResolver(
      cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
      task_type='worker',
      task_id=1,
      num_accelerators={'GPU': 0})
  strategy = parameter_server_strategy.ParameterServerStrategy(
      cluster_resolver)
  dataset = dataset_ops.DatasetV2.from_tensor_slices([5., 6., 7., 8.])

  def train_step(data):
    return math_ops.square(data)

  self.assertRaisesRegex(NotImplementedError, 'ParameterServerStrategy*',
                         strategy.experimental_distribute_dataset,
                         dataset.batch(2))
  self.assertRaisesRegex(
      NotImplementedError, 'ParameterServerStrategy*',
      strategy.experimental_distribute_datasets_from_function,
      lambda _: dataset)
  self.assertRaisesRegex(NotImplementedError, 'ParameterServerStrategy*',
                         strategy.scope)
  self.assertRaisesRegex(NotImplementedError, 'ParameterServerStrategy*',
                         strategy.run, train_step)
def setUpClass(cls):
  super(ParameterServerStrategyV2Test, cls).setUpClass()
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=2, num_ps=3)
  cls.cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def))
  remote.connect_to_cluster(
      cls.cluster_resolver.cluster_spec(), job_name="chief")
def make_coordinator(num_workers, num_ps):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  cluster_def["chief"] = [
      "localhost:%d" % multi_worker_test_base.pick_unused_port()
  ]
  cluster_resolver = SimpleClusterResolver(
      ClusterSpec(cluster_def), rpc_layer="grpc")
  return tf.distribute.experimental.coordinator.ClusterCoordinator(
      tf.distribute.experimental.ParameterServerStrategy(cluster_resolver))
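A short sketch of how the returned coordinator might be used, assuming `import tensorflow as tf` as in the snippet above; the worker function is illustrative only:

coordinator = make_coordinator(num_workers=2, num_ps=1)

@tf.function
def worker_fn():
  # Trivial computation dispatched to a worker.
  return tf.constant(1) + tf.constant(1)

result = coordinator.schedule(worker_fn)  # returns a RemoteValue
coordinator.join()  # block until all scheduled functions have finished
print(result.fetch())  # -> 2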
def _create_parameter_server():
  if framework_test_util.is_xla_enabled():
    # To address test failures caused by XLA with MultiProcessRunner, continue
    # to use the in-process cluster for XLA tests.
    cluster_def = multi_worker_test_base.create_in_process_cluster(
        num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
    resolver = cluster_resolver.SimpleClusterResolver(
        server_lib.ClusterSpec(cluster_def),
        num_accelerators={"GPU": required_gpus},
        rpc_layer="grpc")
    return _create_ps_strategy(resolver, variable_partitioner)
  else:
    tf_config = cluster_resolver.TFConfigClusterResolver()
    cluster_def = tf_config.cluster_spec().as_dict()
    if not cluster_def:
      # When a MultiProcessRunner cluster is used, the cluster does not exist
      # yet at the time the decorator is called: this method is first invoked
      # via the decorator before the MultiProcessRunner with worker and ps
      # tasks is set up in combinations.py. After setup, each subprocess
      # invokes this method again to get the strategy object. Returning None
      # when the main thread calls this method before the cluster is set up is
      # fine, since the main thread only proceeds to create the
      # MultiProcessRunner and run the decorated tests inside subprocesses.
      return None
    # MultiProcessRunner is already set up and this method is invoked from a
    # subprocess running the actual test.
    resolver = cluster_resolver.SimpleClusterResolver(
        server_lib.ClusterSpec(cluster_def),
        num_accelerators={"GPU": required_gpus},
        task_type=tf_config.task_type,
        task_id=tf_config.task_id,
        environment=tf_config.environment,
        rpc_layer=tf_config.rpc_layer or "grpc")
    if tf_config.task_type in ("worker", "ps"):
      worker_config = config_pb2.ConfigProto()
      worker_config.inter_op_parallelism_threads = 4  # max num_workers + 1
      try:
        server = server_lib.Server(
            cluster_def,
            job_name=tf_config.task_type,
            task_index=tf_config.task_id,
            protocol="grpc",
            config=worker_config)
      except errors.UnknownError as e:
        if "Could not start gRPC server" in e.message:
          raise unittest.SkipTest("Cannot start std servers.")
        else:
          raise
      # Block the process that starts a server from exiting.
      server.join()
    return _create_ps_strategy(resolver, variable_partitioner)
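For reference, a hedged sketch of the TF_CONFIG payload that `TFConfigClusterResolver` in the snippet above parses; the addresses and task assignment are illustrative:

import json
import os

# Illustrative TF_CONFIG for one worker subprocess in a 2-worker/1-ps cluster.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "worker": ["localhost:12345", "localhost:23456"],
        "ps": ["localhost:34567"],
    },
    "task": {"type": "worker", "index": 0},
})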
def make_client(num_workers, num_ps):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  cluster_def["chief"] = [
      "localhost:%d" % multi_worker_test_base.pick_unused_port()
  ]
  cluster_resolver = SimpleClusterResolver(
      ClusterSpec(cluster_def), rpc_layer="grpc")
  return client_lib.Client(
      parameter_server_strategy_v2.ParameterServerStrategyV2(cluster_resolver))
def make_coordinator(num_workers, num_ps):
  # TODO(rchao): Test the internal rpc_layer version.
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer='grpc')
  cluster_def['chief'] = [
      'localhost:%d' % multi_worker_test_base.pick_unused_port()
  ]
  cluster_resolver = SimpleClusterResolver(
      ClusterSpec(cluster_def), rpc_layer='grpc')
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      cluster_resolver)
  return coordinator_lib.ClusterCoordinator(strategy)
def _create_parameter_server():
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  resolver = cluster_resolver.SimpleClusterResolver(
      ClusterSpec(cluster_def),
      num_accelerators={"GPU": required_gpus},
      rpc_layer="grpc")
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      resolver,
      variable_partitioner=sharded_variable.FixedShardsPartitioner(2))
  return strategy
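A hedged sketch of what `FixedShardsPartitioner(2)` above implies for variable creation, assuming `import tensorflow as tf` and that the closure variables the helper references are bound; the shape is illustrative:

strategy = _create_parameter_server()
with strategy.scope():
  # With FixedShardsPartitioner(2), this (10, 4) variable is split along its
  # first axis into two (5, 4) shards, each placed on a parameter server.
  v = tf.Variable(tf.random.uniform(shape=(10, 4)))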
def get_cluster_def(num_workers, num_ps):
  if num_workers > MAX_NUM_WORKER or num_ps > MAX_NUM_PS:
    raise ValueError("Requesting more servers than the maximum, adjust "
                     "MAX_NUM_PS and MAX_NUM_WORKER")
  global _cluster
  if _cluster is None:
    _cluster = multi_worker_test_base.create_in_process_cluster(
        num_workers=MAX_NUM_WORKER, num_ps=MAX_NUM_PS)
  return {
      "worker": _cluster["worker"][:num_workers],
      "ps": _cluster["ps"][:num_ps],
  }
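A brief usage sketch: because the helper caches the in-process cluster in the module-level `_cluster`, repeated calls slice the same servers rather than spawning new ones:

small = get_cluster_def(num_workers=1, num_ps=1)
large = get_cluster_def(num_workers=2, num_ps=2)
# Both dicts point at the same underlying servers; only the slices differ.
assert small["worker"][0] == large["worker"][0]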
def get_cluster_def(cluster_params, num_workers, num_ps):
  if (num_workers > cluster_params.max_num_worker or
      num_ps > cluster_params.max_num_ps):
    raise ValueError("Requesting more servers than the maximum, adjust "
                     "cluster params' max_num_ps and max_num_worker")
  if cluster_params.cluster is None:
    cluster_params.cluster = multi_worker_test_base.create_in_process_cluster(
        num_workers=cluster_params.max_num_worker,
        num_ps=cluster_params.max_num_ps)
  return {
      "worker": cluster_params.cluster["worker"][:num_workers],
      "ps": cluster_params.cluster["ps"][:num_ps],
  }
def test_dataset_creator_usage_in_parameter_server_model_fit(self):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=2, num_ps=1, rpc_layer="grpc")
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc"))
  with strategy.scope():
    model = sequential.Sequential([core_layers.Dense(10)])
    model.compile(gradient_descent.SGD(), loss="mse")
  history = model.fit(
      dataset_creator.DatasetCreator(self._get_dataset_fn()),
      epochs=10,
      steps_per_epoch=10,
      verbose=0)
  self.assertLen(history.history["loss"], 10)
def testClientMetrics(self):
  if sys.version_info >= (3, 8) and platform.system() == 'Windows':
    # TODO(b/165013260): Fix this
    self.skipTest('Test is currently broken on Windows with Python 3.8')

  metric_utils.enable_metrics = True

  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=1, num_ps=1, rpc_layer=self.get_rpc_layer())
  cluster_def['chief'] = [
      'localhost:%d' % multi_worker_test_base.pick_unused_port()
  ]
  cluster_resolver = SimpleClusterResolver(
      ClusterSpec(cluster_def), rpc_layer=self.get_rpc_layer())
  strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
      cluster_resolver)
  cluster = client.Cluster(strategy)

  @def_function.function
  def func():
    time.sleep(0.5)
    return 3

  result = cluster.schedule(func, args=None, kwargs=None)
  result = cluster.schedule(func, args=None, kwargs=None)
  cluster.join()
  self.assertEqual(result._get_value().numpy(), 3)

  # Tracing happens exactly once across the two schedule calls, while closure
  # execution and remote_value fetching are recorded once per call.
  metric_tracing = metric_utils.get_metric_summary('function_tracing')
  self.assertEqual(metric_tracing['num'], 1)
  # Tracing time should be longer than the sleep time in the Python function.
  self.assertGreater(metric_tracing['sum'], 0.5)
  metric_closure = metric_utils.get_metric_summary('closure_execution')
  self.assertEqual(metric_closure['num'], 2)
  metric_remote_value = metric_utils.get_metric_summary('remote_value_fetch')
  self.assertEqual(metric_remote_value['num'], 2)
def make_parameter_server_cluster(num_workers, num_ps):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")
  return SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc")
def setUpClass(cls):
  super().setUpClass()
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=2, num_ps=2)
  cls.cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(
      tf.train.ClusterSpec(cluster_def))
def setUpClass(cls): """Create a local cluster with 3 workers.""" cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=NUM_WORKERS, num_ps=0)
def setUpClass(cls): """Create a local cluster with 3 workers and 1 chief.""" cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=0, has_chief=True)
def setUpClass(cls): """Create a local cluster with 2 workers.""" super(KerasMultiWorkerTestStandaloneClient, cls).setUpClass() cls._cluster_spec = test_base.create_in_process_cluster( num_workers=2, num_ps=1, has_eval=False)
def _get_parameter_server_strategy(self):
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=2, num_ps=1, rpc_layer="grpc")
  return tf.distribute.experimental.ParameterServerStrategy(
      SimpleClusterResolver(ClusterSpec(cluster_def), rpc_layer="grpc"))
def setUpClass(cls):
  cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
      num_workers=3, num_ps=2, has_chief=True)
  cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0]
def setUpClass(cls):
  cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
      num_workers=3, num_ps=2)
  cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]
def setUpClass(cls): """Create a local cluster with 2 workers.""" super(DistributeCoordinatorIntegrationTest, cls).setUpClass() cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=2, has_eval=True)
def setUpClass(cls):
  super(VariablePartitioningTest, cls).setUpClass()
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=2, num_ps=2)
  cls.cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def))
def setUpClass(cls): """Create a local cluster with 2 workers.""" cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=2, has_eval=True)
def setUpClass(cls): """Create a local cluster with 3 workers and 1 chief.""" cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=0, has_chief=True)
def setUpClass(cls): """Create a local cluster with 2 workers and 1 chief.""" cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=2, num_ps=0, has_chief=True) cls._default_target = "grpc://" + cls._cluster_spec["chief"][0]
def setUpClass(cls): """Create a local cluster with 2 workers.""" super(DistributeCoordinatorIntegrationTest, cls).setUpClass() cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=2, has_eval=True)
def setUpClass(cls):
  super(ParameterServerStrategyV2Test, cls).setUpClass()
  cluster_def = multi_worker_test_base.create_in_process_cluster(
      num_workers=2, num_ps=3)
  cls.cluster_resolver = SimpleClusterResolver(ClusterSpec(cluster_def))