def testRetainSparseJobWithNoMerging(self):
    """A single sparse `worker` job should come out of the union unchanged."""
    sparse_workers = {
        1: "worker0:2222",
        3: "worker1:2222",
        5: "worker2:2222"
    }
    wrapped_resolver = SimpleClusterResolver(
        server_lib.ClusterSpec({"worker": sparse_workers}))

    # Only one resolver is unioned, so the task keys 1, 3 and 5 are expected
    # to be reported exactly as they were supplied.
    resolved_spec = UnionClusterResolver(wrapped_resolver).cluster_spec()

    expected_proto = """ job { name: 'worker' tasks { key: 1 value: 'worker0:2222' } tasks { key: 3 value: 'worker1:2222' } tasks { key: 5 value: 'worker2:2222' } } """
    self._verifyClusterSpecEquality(resolved_spec, expected_proto)
def test_update_config_proto(self):
    """Checks the cluster def and session isolation set by the strategy."""
    tpu_resolver = get_tpu_cluster_resolver()
    remote.connect_to_cluster(tpu_resolver)
    tpu_strategy_util.initialize_tpu_system(tpu_resolver)
    strategy = tpu_lib.TPUStrategyV2(tpu_resolver)

    empty_config = config_pb2.ConfigProto()
    fake_spec = server_lib.ClusterSpec({"worker": ["fake1", "fake2"]})

    # While the config is being updated, the resolver reports the fake cluster.
    spec_patch = test.mock.patch.object(
        tpu_resolver, "cluster_spec", return_value=fake_spec)
    with spec_patch:
        updated_config = strategy.update_config_proto(empty_config)

    # The cluster definition must match the patched spec ...
    self.assertProtoEquals(fake_spec.as_cluster_def(),
                           updated_config.cluster_def)
    # ... and isolate_session_state must be switched on.
    self.assertTrue(updated_config.isolate_session_state)
def testClusterSpecAccessors(self):
    """Checks each `ClusterSpec` read accessor on dense and sparse jobs."""
    jobs = {
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"],
        "sparse": {
            0: "sparse0:2222",
            3: "sparse3:2222"
        }
    }
    spec = server_lib.ClusterSpec(jobs)

    # Round trip back to the plain dict form.
    self.assertEqual(jobs, spec.as_dict())

    # num_tasks(): the sparse job reports its two defined tasks.
    for job_name, expected_count in (("ps", 2), ("worker", 3), ("sparse", 2)):
        self.assertEqual(expected_count, spec.num_tasks(job_name))
    with self.assertRaises(ValueError):
        spec.num_tasks("unknown")

    # task_address(): an unknown job or an undefined sparse index is rejected.
    self.assertEqual("ps0:2222", spec.task_address("ps", 0))
    self.assertEqual("sparse0:2222", spec.task_address("sparse", 0))
    for bad_job, bad_index in (("unknown", 0), ("sparse", 2)):
        with self.assertRaises(ValueError):
            spec.task_address(bad_job, bad_index)

    # task_indices()
    expected_indices_by_job = (
        ("ps", [0, 1]),
        ("worker", [0, 1, 2]),
        ("sparse", [0, 3]),
    )
    for job_name, expected_indices in expected_indices_by_job:
        self.assertEqual(expected_indices, spec.task_indices(job_name))
    with self.assertRaises(ValueError):
        spec.task_indices("unknown")

    # job_tasks() is not recommended for sparse jobs: as asserted below, the
    # gaps between defined indices come back as None.
    expected_tasks_by_job = (
        ("ps", ["ps0:2222", "ps1:2222"]),
        ("worker", ["worker0:2222", "worker1:2222", "worker2:2222"]),
        ("sparse", ["sparse0:2222", None, None, "sparse3:2222"]),
    )
    for job_name, expected_tasks in expected_tasks_by_job:
        self.assertEqual(expected_tasks, spec.job_tasks(job_name))
    with self.assertRaises(ValueError):
        spec.job_tasks("unknown")
def testPS2TasksWithDevice(self):
    """Variables alternate over the two `moon` tasks; the add stays on `sun`."""
    jobs = {
        "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
        "moon": ["moon0:2222", "moon1:2222"]
    }
    cluster_def = server_lib.ClusterSpec(jobs).as_cluster_def()

    # "moon" plays the parameter-server role and "sun" the worker role.
    device_fn = device_setter.replica_device_setter(
        ps_device="/job:moon", worker_device="/job:sun", cluster=cluster_def)

    with ops.device(device_fn):
        v = variables.Variable([1, 2])
        w = variables.Variable([2, 1])
        a = v + w

        expected_placements = (
            ("/job:moon/task:0", v.device),
            ("/job:moon/task:0", v.initializer.device),
            ("/job:moon/task:1", w.device),
            ("/job:moon/task:1", w.initializer.device),
            ("/job:sun", a.device),
        )
        for expected_device, actual_device in expected_placements:
            self.assertDeviceEqual(expected_device, actual_device)
def cluster_spec(self):
    """Builds a ClusterSpec from the pods currently reported by Kubernetes.

    The Kubernetes master is queried on every call; nothing is cached here.

    Returns:
      A ClusterSpec mapping each TensorFlow job name to the
      `<host_ip>:<tf_server_port>` addresses of the pods selected for it.

    Raises:
      RuntimeError: If any selected pod is not in the `Running` phase.
    """
    client = self._override_client
    if not client:
        # No injected client: load the local kube config and use the real API.
        from kubernetes import config as k8sconfig  # pylint: disable=g-import-not-at-top
        from kubernetes import client as k8sclient  # pylint: disable=g-import-not-at-top

        k8sconfig.load_kube_config()
        client = k8sclient.CoreV1Api()

    job_to_addresses = {}
    for tf_job, selectors in self._job_to_label_mapping.items():
        addresses = []
        for selector in selectors:
            listing = client.list_pod_for_all_namespaces(
                label_selector=selector)
            # Ordering by pod name keeps task indices stable between calls.
            for pod in sorted(listing.items, key=lambda x: x.metadata.name):
                if pod.status.phase != 'Running':
                    raise RuntimeError('Pod "%s" is not running; phase: "%s"' %
                                       (pod.metadata.name, pod.status.phase))
                addresses.append(
                    '%s:%s' % (pod.status.host_ip, self._tf_server_port))
        job_to_addresses[tf_job] = addresses

    return server_lib.ClusterSpec(job_to_addresses)
def testOverrideSimpleClusterResolver(self):
    """Values assigned after construction replace the constructor arguments."""
    jobs = {
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
    }
    resolver = SimpleClusterResolver(
        server_lib.ClusterSpec(jobs),
        task_type="ps",
        task_index=1,
        environment="cloud",
        num_accelerators=8,
        rpc_layer="grpc")

    # Overwrite three of the properties that were set through the constructor.
    resolver.task_type = "worker"
    resolver.task_index = 2
    resolver.rpc_layer = "http"

    self.assertEqual(resolver.task_type, "worker")
    self.assertEqual(resolver.task_index, 2)
    self.assertEqual(resolver.rpc_layer, "http")
def testPS2TasksWithClusterSpecClass(self):
    """Local copies stay on the worker; their global copies alternate over ps."""
    jobs = {
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
    }
    cluster_spec = server_lib.ClusterSpec(jobs)
    custom_getter = ElasticAverageCustomGetter(
        worker_device="/job:worker/task:0")
    from tensorflow.python.training import device_setter

    device_fn = device_setter.replica_device_setter(
        cluster=cluster_spec,
        worker_device="/job:worker/task:0",
        ps_device="/job:ps")
    with ops.device(device_fn):
        with variable_scope.variable_scope("", custom_getter=custom_getter):
            v = variable_scope.get_variable(initializer=[1, 2], name="v")
            w = variable_scope.get_variable(initializer=[2, 1], name="w")
            # The getter records a global counterpart for every local variable.
            v_g = custom_getter._global_map[v]
            w_g = custom_getter._global_map[w]
            self.assertDeviceEqual("/job:worker/task:0", v.device)
            self.assertDeviceEqual("job:ps/task:0", v_g.device)
            self.assertDeviceEqual("/job:worker/task:0", w.device)
            self.assertDeviceEqual("job:ps/task:1", w_g.device)
def create_multi_process_cluster(num_workers,
                                 num_ps,
                                 has_chief=False,
                                 has_eval=False,
                                 rpc_layer='grpc',
                                 stream_output=False,
                                 collective_leader=None):
    """Creates a `MultiProcessCluster`, starts it and returns it.

    Args:
      num_workers: Forwarded to `create_cluster_spec`.
      num_ps: Forwarded to `create_cluster_spec`.
      has_chief: Forwarded to `create_cluster_spec`.
      has_eval: Forwarded to `create_cluster_spec`.
      rpc_layer: RPC layer given to the `SimpleClusterResolver`.
      stream_output: Forwarded to `MultiProcessCluster`.
      collective_leader: Forwarded to `MultiProcessCluster`.

    Returns:
      The `MultiProcessCluster`, after `start()` has been called on it.
    """
    spec_dict = create_cluster_spec(
        has_chief=has_chief,
        num_workers=num_workers,
        num_ps=num_ps,
        has_eval=has_eval)
    resolver = SimpleClusterResolver(
        server_lib.ClusterSpec(spec_dict), rpc_layer=rpc_layer)

    started_cluster = MultiProcessCluster(
        resolver,
        stream_output=stream_output,
        collective_leader=collective_leader)
    started_cluster.start()
    return started_cluster
def benchmarkPartitionedVariables(self):
    """Fits and evaluates a classifier whose variables are partitioned."""

    def _input_fn():
        language = sparse_tensor.SparseTensor(
            values=('en', 'fr', 'zh'),
            indices=((0, 0), (0, 1), (2, 0)),
            dense_shape=(3, 2))
        labels = constant_op.constant(((1,), (0,), (0,)))
        return {'language': language}, labels

    # This hash_bucket_size yields variables larger than the default
    # min_slice_size attribute, which is what causes them to be partitioned.
    sparse_feature = feature_column.sparse_column_with_hash_bucket(
        'language', hash_bucket_size=2e7)
    embedding_feature = feature_column.embedding_column(
        sparse_feature, dimension=1)

    tf_config = {
        'cluster': {
            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
        }
    }
    env_patch = test.mock.patch.dict('os.environ',
                                     {'TF_CONFIG': json.dumps(tf_config)})
    with env_patch:
        config = run_config.RunConfig()
        # No distributed cluster is actually running, so swap in an empty
        # ClusterSpec; otherwise the device_setter would look for distributed
        # jobs such as "/job:ps" that are not present.
        config._cluster_spec = server_lib.ClusterSpec({})

    classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
        linear_feature_columns=(sparse_feature,),
        dnn_feature_columns=(embedding_feature,),
        dnn_hidden_units=(3, 3),
        config=config)

    fitted = classifier.fit(input_fn=_input_fn, steps=_ITERS)
    metrics = fitted.evaluate(input_fn=_input_fn, steps=100)
    self._assertCommonMetrics(metrics)
def setUp(self, num_workers, num_ps):
    """Starts a multi-process cluster and a strategy acting as its chief.

    Args:
      num_workers: Number of worker tasks to start.
      num_ps: Number of parameter-server tasks to start.
    """
    super(BaseFaultToleranceTest, self).setUp()

    self._cluster = multi_worker_test_base.create_multi_process_cluster(
        num_workers=num_workers, num_ps=num_ps, rpc_layer="grpc")

    # Extend the started cluster's spec with a "chief" entry on a fresh port.
    cluster_def = self._cluster.cluster_resolver.cluster_spec().as_dict()
    self._cluster_def = cluster_def
    chief_address = "localhost:%d" % multi_worker_test_base.pick_unused_port()
    cluster_def["chief"] = [chief_address]

    resolver = SimpleClusterResolver(
        server_lib.ClusterSpec(cluster_def), rpc_layer="grpc")

    # Constructing the strategy is what connects to the cluster.
    self.strategy = parameter_server_strategy_v2.ParameterServerStrategyV2(
        resolver)
    self.cluster_coord = cluster_coordinator.ClusterCoordinator(self.strategy)

    self.thread_coord = thread_coordinator.Coordinator(
        clean_stop_exception_types=[])
    self.num_workers = num_workers
    self.num_ps = num_ps
def normalize_cluster_spec(cluster_spec):
    """Coerces `cluster_spec` to a `ClusterSpec` instance.

    Args:
      cluster_spec: a dict, a `ClusterDef` proto or a `ClusterSpec` describing
        the cluster configuration.

    Returns:
      The argument itself when it is already a `ClusterSpec`; otherwise a new
      `ClusterSpec` built from the dict or `ClusterDef`.

    Raises:
      ValueError: if `cluster_spec` is none of the three accepted types.
    """
    # Already the target type: hand it back untouched.
    if isinstance(cluster_spec, server_lib.ClusterSpec):
        return cluster_spec

    convertible_types = (dict, cluster_pb2.ClusterDef)
    if isinstance(cluster_spec, convertible_types):
        return server_lib.ClusterSpec(cluster_spec)

    raise ValueError(
        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
        "`tf.train.ClusterDef` object")
def create_local_cluster(num_workers, num_ps, protocol="grpc"):
    """Starts local worker and ps servers on freshly picked ports.

    Args:
      num_workers: Number of worker servers to start.
      num_ps: Number of ps servers to start.
      protocol: Protocol passed to every `Server`.

    Returns:
      A tuple `(cluster_dict, workers, ps_servers)`: the job-to-addresses
      dict, followed by the started worker and ps `Server` lists.
    """
    worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
    ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]

    cluster_dict = {
        "worker": ["localhost:%s" % port for port in worker_ports],
        "ps": ["localhost:%s" % port for port in ps_ports]
    }
    spec = server_lib.ClusterSpec(cluster_dict)

    def _start_all(job_name, count):
        # One started server per task index of `job_name`.
        started = []
        for task_index in range(count):
            started.append(
                server_lib.Server(
                    spec,
                    job_name=job_name,
                    protocol=protocol,
                    task_index=task_index,
                    start=True))
        return started

    workers = _start_all("worker", num_workers)
    ps_servers = _start_all("ps", num_ps)
    return cluster_dict, workers, ps_servers
def testSetConfiguration(self):
    """A supplied ConfigProto shows up in `server_def` for both constructors."""

    def _memory_fraction(server):
        session_config = server.server_def.default_session_config
        return session_config.gpu_options.per_process_gpu_memory_fraction

    gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.1)
    config = config_pb2.ConfigProto(gpu_options=gpu_options)

    # First, a server created with the default local server options.
    local_server = server_lib.Server.create_local_server(
        config=config, start=False)
    self.assertEqual(0.1, _memory_fraction(local_server))

    # Second, a server built from an explicit ServerDef, with the config
    # passed in as an override.
    cluster_def = server_lib.ClusterSpec({
        "localhost": ["localhost:0"]
    }).as_cluster_def()
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_def,
        job_name="localhost",
        task_index=0,
        protocol="grpc")
    explicit_server = server_lib.Server(server_def, config=config, start=False)
    self.assertEqual(0.1, _memory_fraction(explicit_server))
def _setupCluster(self):
    """Starts a one-worker, one-ps local cluster on OS-assigned ports.

    Returns:
      A tuple `(worker, ps)` of started `Server` objects.
    """

    def get_open_port():
        """Returns a port number the OS reported as free at call time."""
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        except IOError:
            # IPv4 sockets are unavailable; fall back to IPv6.
            s = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
        try:
            # Binding to port 0 lets the OS choose an unused port.
            s.bind(("", 0))
            return s.getsockname()[1]
        finally:
            # Always release the descriptor, even when bind() or
            # getsockname() raises, so a failure does not leak the socket.
            s.close()

    port1 = get_open_port()
    port2 = get_open_port()
    cs = server_lib.ClusterSpec({
        "worker": ["localhost:%s" % port1],
        "ps": ["localhost:%s" % port2]
    })

    worker = server_lib.Server(cs, job_name="worker", start=True)
    ps = server_lib.Server(cs, job_name="ps", start=True)

    return worker, ps
def testEq(self):
    """`ClusterSpec` objects built from equivalent inputs compare equal."""
    # `assertEquals` is a deprecated alias that Python 3.12 removed, so the
    # supported `assertEqual` is used throughout.
    self.assertEqual(server_lib.ClusterSpec({}), server_lib.ClusterSpec({}))
    self.assertEqual(
        server_lib.ClusterSpec({
            "job": ["host:2222"]
        }),
        server_lib.ClusterSpec({
            "job": ["host:2222"]
        }))
    # A dict keyed by task index 0 is expected to equal the one-element list.
    self.assertEqual(
        server_lib.ClusterSpec({
            "job": {
                0: "host:2222"
            }
        }),
        server_lib.ClusterSpec({
            "job": ["host:2222"]
        }))
def connect_to_remote_host(remote_host=None, job_name="worker"):
    """Connects to one or more remote hosts to enable remote execution.

    Devices on the remote host become available for use. Calling this more
    than once works, but invalidates any tensor handles on the old remote
    devices.

    With the default `job_name` of "worker", ops can be scheduled remotely
    like this:

    ```python
    # Enable eager execution, and connect to the remote host.
    tf.compat.v1.enable_eager_execution()
    tf.contrib.eager.connect_to_remote_host("exampleaddr.com:9876")

    with ops.device("job:worker/replica:0/task:1/device:CPU:0"):
      # The following tensors should be resident on the remote device, and the
      # op will also execute remotely.
      x1 = array_ops.ones([2, 2])
      x2 = array_ops.ones([2, 2])
      y = math_ops.matmul(x1, x2)
    ```

    Args:
      remote_host: a single remote server address, or a list of them, in
        host-port format.
      job_name: The job name under which the new server will be accessible.

    Raises:
      ValueError: if `remote_host` is None or otherwise empty.
    """
    if not remote_host:
        raise ValueError("Must provide at least one remote_host")

    # Accept a single address or a nested structure of addresses, and drop
    # the gRPC prefix from each before building the spec.
    stripped_hosts = []
    for host in nest.flatten(remote_host):
        stripped_hosts.append(_strip_prefix(host, _GRPC_PREFIX))

    connect_to_cluster(server_lib.ClusterSpec({job_name: stripped_hosts}))
def testSerialize(self):
    """Moves a tensor list worker -> ps -> worker and checks the popped item."""
    # pylint: disable=g-import-not-at-top
    try:
        import portpicker
    except ImportError:
        # Without portpicker no local cluster can be set up; return early.
        return

    with context.graph_mode():
        worker_port = portpicker.pick_unused_port()
        ps_port = portpicker.pick_unused_port()
        spec = server_lib.ClusterSpec({
            "worker": ["localhost:%s" % worker_port],
            "ps": ["localhost:%s" % ps_port]
        })

        server_kwargs = dict(protocol="grpc", task_index=0, start=True)
        worker = server_lib.Server(spec, job_name="worker", **server_kwargs)
        unused_ps = server_lib.Server(spec, job_name="ps", **server_kwargs)

        with ops.Graph().as_default():
            with session.Session(target=worker.target):
                # Build the list on the worker ...
                with ops.device("/job:worker"):
                    t = constant_op.constant([[1.0], [2.0]])
                    l = list_ops.tensor_list_from_tensor(t, element_shape=[1])
                # ... pop its last element on the ps ...
                with ops.device("/job:ps"):
                    l_ps = array_ops.identity(l)
                    l_ps, e = list_ops.tensor_list_pop_back(
                        l_ps, element_dtype=dtypes.float32)
                # ... and read the popped element back on the worker.
                with ops.device("/job:worker"):
                    worker_e = array_ops.identity(e)
                self.assertAllEqual(self.evaluate(worker_e), [2.0])
def test_explicitly_specified_values(self):
    """Constructor masters are kept while TF_CONFIG supplies the rest."""
    jobs = {
        run_config_lib.TaskType.PS: ["localhost:9990"],
        "my_job_name": ["localhost:9991", "localhost:9992", "localhost:0"]
    }
    task = {"type": run_config_lib.TaskType.WORKER, "index": 2}
    tf_config = {"cluster": jobs, "task": task}

    env_patch = patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)})
    with env_patch:
        config = run_config_lib.RunConfig(
            master="localhost:0", evaluation_master="localhost:9991")

    self.assertEqual(config.master, "localhost:0")
    self.assertEqual(config.task_id, 2)
    self.assertEqual(config.num_ps_replicas, 1)
    # The cluster defines no job under the worker task type.
    self.assertEqual(config.num_worker_replicas, 0)
    self.assertEqual(config.cluster_spec, server_lib.ClusterSpec(jobs))
    self.assertEqual(config.task_type, run_config_lib.TaskType.WORKER)
    self.assertFalse(config.is_chief)
    self.assertEqual(config.evaluation_master, "localhost:9991")
def cluster_spec(self):
    """Builds a ClusterSpec from the latest Cloud TPU information.

    The TPU metadata is fetched again on every call.

    Returns:
      A ClusterSpec containing the TPU hosts, plus the coordinator job when a
      coordinator address is set. Returns None when the TPU name is not
      resolved and no `rpc_layer` is set.

    Raises:
      RuntimeError: If the fetched TPU metadata reports a state other than
        READY.
    """
    ############################################################################
    # There are 5 potential cases this code must handle:
    #  1. [Normal case.] The TPU name is resolved to a set of tasks, and the
    #     resulting ClusterSpec is built
    #      a. with the coordinator job, or
    #      b. without the coordinator job.
    #  2. [GKE / No API Access.] The TPU name is not resolved to a set of
    #     tasks, and the resulting ClusterSpec is built
    #      a. with the coordinator, or
    #      b. without the coordinator.
    #  3. [Other (legacy non-gRPC).] Nothing is built; None is returned.
    ############################################################################
    if not self._should_resolve():
        if self.rpc_layer is None:
            # Case 3.
            return None

        # Case 2.
        # The GKE environment variable embeds the protocol string in each
        # endpoint; it is stripped here because the ClusterSpec wants bare
        # addresses.
        scheme = self.rpc_layer + '://'
        endpoints = compat.as_text(self._tpu).split(_ENDPOINTS_SEPARATOR)
        hosts = [
            endpoint[len(scheme):] if endpoint.startswith(scheme) else endpoint
            for endpoint in endpoints
        ]
        job_to_hosts = {self.task_type: hosts}
    else:
        # Case 1.
        response = self._fetch_cloud_tpu_metadata()  # pylint: disable=protected-access

        if 'state' in response and response['state'] != 'READY':
            raise RuntimeError(
                'TPU "%s" is not yet ready; state: "%s"' %
                (compat.as_text(self._tpu), response['state']))

        if 'networkEndpoints' not in response:
            # Fall back to the deprecated single-endpoint response format.
            hosts = ['%s:%s' % (response['ipAddress'], response['port'])]
        else:
            hosts = []
            for endpoint in response['networkEndpoints']:
                hosts.append(
                    '%s:%s' % (endpoint['ipAddress'], endpoint['port']))
        job_to_hosts = {self.task_type: hosts}

    if self._coordinator_address:
        # {1, 2}.a
        job_to_hosts[self._coordinator_name] = [self._coordinator_address]

    return server_lib.ClusterSpec(job_to_hosts)
def replica_device_setter(ps_tasks=0,
                          ps_device="/job:ps",
                          worker_device="/job:worker",
                          merge_devices=True,
                          cluster=None,
                          ps_ops=None,
                          ps_strategy=None):
    """Returns a device function for building a replicated Graph.

    Device functions are used in a `with tf.device(device_function):`
    statement to assign devices to `Operation` objects automatically as they
    are constructed. Device constraints are added from the inner-most context
    first, working outwards. Merging adds constraints to fields that a more
    inner context left unset. Currently the fields are (job, task, cpu/gpu).

    When `cluster` is `None` and `ps_tasks` is 0, `None` is returned. When
    `cluster` is given, `ps_tasks` is derived from it instead.

    By default only Variable ops are placed on ps tasks, and they are spread
    round-robin over all ps tasks. A custom `ps_strategy` may be supplied for
    smarter placement, such as
    `tf.contrib.training.GreedyLoadBalancingStrategy`.

    For example,

    ```python
    # To build a cluster with two ps jobs on hosts ps0 and ps1, and 3 worker
    # jobs on hosts worker0, worker1 and worker2.
    cluster_spec = {
        "ps": ["ps0:2222", "ps1:2222"],
        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
    with tf.device(tf.replica_device_setter(cluster=cluster_spec)):
      # Build your graph
      v1 = tf.Variable(...)  # assigned to /job:ps/task:0
      v2 = tf.Variable(...)  # assigned to /job:ps/task:1
      v3 = tf.Variable(...)  # assigned to /job:ps/task:0
    # Run compute
    ```

    Args:
      ps_tasks: Number of tasks in the `ps` job. Ignored if `cluster` is
        provided.
      ps_device: String. Device of the `ps` job. Its job name is the key
        looked up in `cluster`.
      worker_device: String. Device of the `worker` job.
      merge_devices: `Boolean`. If `True`, device specifications are merged
        rather than overridden. Passing a false value logs a deprecation
        warning.
      cluster: `ClusterDef` proto or `ClusterSpec`; anything that is not a
        `ClusterSpec` is passed to the `ClusterSpec` constructor.
      ps_ops: List of strings naming the `Operation` types to place on `ps`
        devices. If `None`, defaults to `["Variable"]`.
      ps_strategy: A callable invoked for every ps `Operation` (i.e. one
        matched by `ps_ops`) that takes the `Operation` and returns the ps
        task index to use. If `None`, a round-robin strategy over all `ps`
        tasks is used.

    Returns:
      A function to pass to `tf.device()`, or `None` when there are no ps
      tasks to place ops on.

    Raises:
      TypeError: if `ps_strategy` is provided but is not callable. The
        `ClusterSpec` constructor may presumably also reject an unsupported
        `cluster` value.
    """
    if cluster is not None:
        if not isinstance(cluster, server_lib.ClusterSpec):
            cluster = server_lib.ClusterSpec(cluster)
        job_to_tasks = cluster.as_dict()

        # The ps job name is the job component of the `ps_device` string.
        ps_job_name = pydev.DeviceSpec.from_string(ps_device).job
        ps_job_tasks = job_to_tasks.get(ps_job_name)
        if ps_job_tasks is None:
            # Job missing from the cluster, or present with a None value.
            return None
        ps_tasks = len(ps_job_tasks)

    if ps_tasks == 0:
        return None

    if ps_ops is None:
        # TODO(sherrym): Variables in the LOCAL_VARIABLES collection should not
        # be placed in the parameter server.
        ps_ops = ["Variable"]

    if not merge_devices:
        logging.warning(
            "DEPRECATION: It is recommended to set merge_devices=true in "
            "replica_device_setter")

    if ps_strategy is None:
        ps_strategy = _RoundRobinStrategy(ps_tasks)
    if not six.callable(ps_strategy):
        raise TypeError("ps_strategy must be callable")

    device_chooser = _ReplicaDeviceChooser(ps_tasks, ps_device, worker_device,
                                           merge_devices, ps_ops, ps_strategy)
    return device_chooser.device_function
def cluster_spec(self):
    """Builds a ClusterSpec from the latest Cloud TPU information.

    The node is looked up through the API service again on every call.

    Returns:
      A ClusterSpec containing the TPU hosts, plus the coordinator job when a
      coordinator address is set. Returns None when the TPU name is not
      resolved and does not start with `grpc://`.

    Raises:
      RuntimeError: If the node's state is not READY or its health is not
        HEALTHY.
    """
    ############################################################################
    # There are 5 potential cases this code must handle:
    #  1. [Normal case.] The TPU name is resolved to a set of tasks, and the
    #     resulting ClusterSpec is built
    #      a. with the coordinator job, or
    #      b. without the coordinator job.
    #  2. [GKE / No API Access.] The TPU name is not resolved to a set of
    #     tasks, and the resulting ClusterSpec is built
    #      a. with the coordinator, or
    #      b. without the coordinator.
    #  3. [Other (legacy non-gRPC).] Nothing is built; None is returned.
    ############################################################################
    if not self._shouldResolve():
        grpc_prefix = compat.as_bytes('grpc://')
        if not self._tpu.startswith(grpc_prefix):
            # Case 3.
            return None

        # Case 2.
        # Each separated endpoint has the leading prefix length sliced off.
        endpoints = self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))
        hosts = []
        for endpoint in endpoints:
            hosts.append(endpoint[len(grpc_prefix):])
        job_to_hosts = {self._job_name: hosts}
    else:
        # Case 1.
        tpu_name = compat.as_text(self._tpu)
        full_name = 'projects/%s/locations/%s/nodes/%s' % (
            self._project, self._zone, tpu_name)
        nodes = self._service.projects().locations().nodes()
        response = nodes.get(name=full_name).execute()

        if 'state' in response and response['state'] != 'READY':
            raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
                               (compat.as_text(self._tpu), response['state']))

        if 'health' in response and response['health'] != 'HEALTHY':
            raise RuntimeError('TPU "%s" is unhealthy: "%s"' %
                               (compat.as_text(self._tpu), response['health']))

        if 'networkEndpoints' not in response:
            # Fall back to the deprecated single-endpoint response format.
            hosts = ['%s:%s' % (response['ipAddress'], response['port'])]
        else:
            hosts = []
            for endpoint in response['networkEndpoints']:
                hosts.append(
                    '%s:%s' % (endpoint['ipAddress'], endpoint['port']))
        job_to_hosts = {self._job_name: hosts}

    if self._coordinator_address:
        # {1, 2}.a
        job_to_hosts[self._coordinator_name] = [self._coordinator_address]

    return server_lib.ClusterSpec(job_to_hosts)
def _init_distributed_setting_from_environment_var(self, tf_config):
    """Initialize distributed properties based on `tf_config`.

    Sets the service, cluster spec, task type/id, master addresses, replica
    counts, global id and chief flag on `self`, taking one of three paths:
    a cluster containing a MASTER job is delegated to
    `_init_distributed_setting_from_environment_var_with_master`; any other
    non-empty cluster is handled as distributed mode with CHIEF as the chief
    task type; an empty cluster is handled as local mode.

    Args:
      tf_config: Mapping read with `.get()` for the service, cluster, task
        and session-master entries.

    Returns:
      The result of the with-master initializer when the cluster has a MASTER
      job; otherwise nothing is returned explicitly.

    Raises:
      ValueError: In local mode, if the task type is not WORKER or the task
        index is not 0.
    """
    self._service = _validate_service(tf_config.get(_SERVICE_KEY))
    self._cluster_spec = server_lib.ClusterSpec(
        tf_config.get(_CLUSTER_KEY, {}))
    task_env = tf_config.get(_TASK_ENV_KEY, {})

    # A cluster that declares a MASTER job is initialized by a separate method.
    if self._cluster_spec and TaskType.MASTER in self._cluster_spec.jobs:
        return self._init_distributed_setting_from_environment_var_with_master(
            tf_config)

    if self._cluster_spec:
        # Distributed mode.
        self._task_type, self._task_id = _validate_task_type_and_task_id(
            self._cluster_spec, task_env, TaskType.CHIEF)

        self._evaluation_master = _get_eval_session_master(
            self._task_type, tf_config)

        if self._task_type != TaskType.EVALUATOR:
            self._master = _get_session_master(self._cluster_spec,
                                               self._task_type,
                                               self._task_id, tf_config)
            self._num_ps_replicas = _count_ps(self._cluster_spec)
            self._num_worker_replicas = _count_worker(
                self._cluster_spec, chief_task_type=TaskType.CHIEF)
            self._global_id_in_cluster = _get_global_id_in_cluster(
                self._cluster_spec,
                self._task_type,
                self._task_id,
                chief_task_type=TaskType.CHIEF)
        else:
            # Evaluator is not part of the training cluster.
            # Its cluster spec is therefore replaced with an empty one and the
            # replica counts are zeroed.
            self._cluster_spec = server_lib.ClusterSpec({})
            self._master = _LOCAL_MASTER
            self._num_ps_replicas = 0
            self._num_worker_replicas = 0
            self._global_id_in_cluster = None  # undefined

        self._is_chief = self._task_type == TaskType.CHIEF
    else:
        # Local mode.
        self._task_type = task_env.get(_TASK_TYPE_KEY, TaskType.WORKER)
        self._task_id = int(task_env.get(_TASK_ID_KEY, 0))
        self._global_id_in_cluster = 0

        # Without a cluster, only task WORKER with index 0 is accepted.
        if self._task_type != TaskType.WORKER:
            raise ValueError(
                'If "cluster" is not set in TF_CONFIG, task type must be WORKER.'
            )
        if self._task_id != 0:
            raise ValueError(
                'If "cluster" is not set in TF_CONFIG, task index must be 0.'
            )

        self._master = tf_config.get(_SESSION_MASTER_KEY, _LOCAL_MASTER)
        self._evaluation_master = tf_config.get(_EVAL_SESSION_MASTER_KEY,
                                                _LOCAL_MASTER)
        # The single local task is always the chief.
        self._is_chief = True
        self._num_ps_replicas = 0
        self._num_worker_replicas = 1
def _create_cluster(num_workers,
                    num_ps,
                    has_chief=False,
                    has_eval=False,
                    protocol='grpc',
                    worker_config=None,
                    ps_config=None,
                    eval_config=None,
                    worker_name='worker',
                    ps_name='ps',
                    chief_name='chief'):
    """Starts local servers for every requested job and returns the spec dict.

    Args:
      num_workers: Number of worker servers to start.
      num_ps: Number of ps servers to start.
      has_chief: Whether to add and start a chief task.
      has_eval: Whether to add and start an `evaluator` task.
      protocol: Protocol passed to every `Server`.
      worker_config: Config for the worker servers and for the chief.
      ps_config: Config for the ps servers.
      eval_config: Config for the evaluator server.
      worker_name: Job name used for the workers.
      ps_name: Job name used for the ps tasks.
      chief_name: Job name used for the chief.

    Returns:
      The dict mapping each job name to its `localhost:<port>` addresses.

    Raises:
      The stored portpicker import error, if there is one.
    """
    if _portpicker_import_error:
        raise _portpicker_import_error  # pylint: disable=raising-bad-type

    def _local_addresses(count):
        # Picks `count` unused ports and formats them as localhost addresses.
        return ['localhost:%s' % pick_unused_port() for _ in range(count)]

    worker_addresses = _local_addresses(num_workers)
    ps_addresses = _local_addresses(num_ps)

    cluster_dict = {}
    if num_workers > 0:
        cluster_dict[worker_name] = worker_addresses
    if num_ps > 0:
        cluster_dict[ps_name] = ps_addresses
    if has_eval:
        cluster_dict['evaluator'] = _local_addresses(1)
    if has_chief:
        cluster_dict[chief_name] = _local_addresses(1)

    spec = server_lib.ClusterSpec(cluster_dict)

    def _launch(job_name, task_index, config):
        # The started server is not kept; only the spec dict is returned.
        server_lib.Server(
            spec,
            job_name=job_name,
            protocol=protocol,
            task_index=task_index,
            config=config,
            start=True)

    for worker_index in range(num_workers):
        _launch(worker_name, worker_index, worker_config)
    for ps_index in range(num_ps):
        _launch(ps_name, ps_index, ps_config)
    if has_chief:
        # The chief is started with the worker configuration.
        _launch(chief_name, 0, worker_config)
    if has_eval:
        _launch('evaluator', 0, eval_config)

    return cluster_dict
def testNonEmptyClusterSpecIsTrue(self):
    """A `ClusterSpec` holding at least one job is truthy."""
    populated_spec = server_lib.ClusterSpec({"job": ["host:port"]})
    self.assertTrue(populated_spec)
def _get_distribution_strategy(self):
    """Returns a two-worker `MultiWorkerMirroredStrategy` using all GPUs."""
    worker_tasks = ['/job:worker/task:0', '/job:worker/task:1']
    cluster = server_lib.ClusterSpec({'worker': worker_tasks})
    return multi_worker_strategy.MultiWorkerMirroredStrategy(
        cluster=cluster, num_gpus_per_worker=context.num_gpus())
def _configure_distribution_strategy(self, distribution):
    """Configures `distribution` with a fixed two-worker cluster spec."""
    worker_tasks = ["/job:worker/task:0", "/job:worker/task:1"]
    two_worker_spec = server_lib.ClusterSpec({"worker": worker_tasks})
    distribution.configure(cluster_spec=two_worker_spec)
def __init__(self,
             num_gpus_per_worker=1,
             worker_job_name=None,
             num_workers=None,
             cluster=None,
             cross_tower_ops=None,
             prefetch_on_device=None):
    """Initialize the strategy object.

    The workers are described either by `worker_job_name` plus `num_workers`,
    or by `cluster`; when `cluster` is given the other two are not consulted.

    Args:
      num_gpus_per_worker: number of GPUs per worker. If it is zero, the CPU
        of each worker is used instead.
      worker_job_name: the job name for `worker`, typically just 'worker'.
      num_workers: the number of workers.
      cluster: a `tf.train.ClusterSpec` object, or a dict or
        `tf.train.ClusterDef` proto from which one can be constructed. Every
        task of every job in it becomes a worker.
      cross_tower_ops: the cross tower ops to use. Not referenced by this
        constructor.
      prefetch_on_device: a boolean to specify whether to prefetch input to
        each worker's devices; forwarded to the parent constructor.

    Raises:
      ValueError: if got an unexpected `cluster`.
    """
    if cluster is not None:
        if isinstance(cluster, server_lib.ClusterSpec):
            cluster_spec = cluster
        elif isinstance(cluster, (dict, cluster_pb2.ClusterDef)):
            cluster_spec = server_lib.ClusterSpec(cluster)
        else:
            raise ValueError(
                "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
                '`tf.train.ClusterDef` object')

        # Jobs in sorted order, tasks in index order within each job.
        self._workers = [
            '/job:%s/task:%d' % (job, task)
            for job in sorted(cluster_spec.jobs)
            for task in range(cluster_spec.num_tasks(job))
        ]
    else:
        self._workers = []
        for task_index in range(num_workers):
            self._workers.append(
                '/job:%s/task:%d' % (worker_job_name, task_index))

    self._num_gpus_per_worker = num_gpus_per_worker

    # Map every worker to the list of devices it contributes.
    worker_device_map = {}
    for worker in self._workers:
        if num_gpus_per_worker > 0:
            worker_device_map[worker] = [
                device_util.canonicalize(worker + '/device:GPU:%d' % gpu)
                for gpu in range(num_gpus_per_worker)
            ]
        else:
            worker_device_map[worker] = [
                device_util.canonicalize(worker, '/device:CPU:0')
            ]
    self._worker_device_map = worker_device_map
    self._devices = nest.flatten(self._worker_device_map)

    super(MultiWorkerMirroredStrategy, self).__init__(
        devices=self._devices, prefetch_on_device=prefetch_on_device)

    # Setting `_default_device` adds a device scope in the distribution.scope.
    # The default device is the first worker, so when users write
    #   with tf.device("/cpu:0"):
    #     ...
    # under distribution.scope, their ops end up on the cpu device of that
    # first worker, e.g. "/job:worker/task:0/device:CPU:0". Note this is not
    # used in tower mode.
    self._default_device = self._workers[0]
def __init__(self, master=None, evaluation_master=None):
    """Constructor.

    Sets the properties `cluster_spec`, `is_chief`, `master` (if `None` in the
    args), `num_ps_replicas`, `task_id`, and `task_type` based on the
    `TF_CONFIG` environment variable, if the pertinent information is
    present. The `TF_CONFIG` environment variable is a JSON object with
    attributes: `cluster`, `environment`, and `task`.

    `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from
    `server_lib.py`, mapping task types (usually one of the TaskType enums) to
    a list of task addresses.

    `environment` specifies the runtime environment for the job (usually one
    of the `Environment` enums). Defaults to `LOCAL`.

    `task` has two attributes: `type` and `index`, where `type` can be any of
    the task types in `cluster`. When `TF_CONFIG` contains said information,
    the following properties are set on this class:

    * `task_type` is set to `TF_CONFIG['task']['type']`. Defaults to `None`.
    * `task_id` is set from `get_task_id()`.
    * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}.
    * `master` is determined by looking up `task_type` and `task_id` in the
      `cluster_spec`. Defaults to ''.
    * `num_ps_replicas` is taken from `_count_ps` on the `cluster_spec`.
      Defaults to 0.
    * `num_worker_replicas` is taken from `_count_worker` on the
      `cluster_spec`. Defaults to 0.
    * `is_chief` is determined based on `task_type`, `task_id`, and
      `environment`.

    Example:

    ```
      cluster = {'ps': ['host1:2222', 'host2:2222'],
                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
      os.environ['TF_CONFIG'] = json.dumps(
          {'cluster': cluster,
           'task': {'type': 'worker', 'index': 1}})
      config = ClusterConfig()
      assert config.master == 'host4:2222'
      assert config.task_id == 1
      assert config.num_ps_replicas == 2
      assert config.num_worker_replicas == 3
      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
      assert config.task_type == 'worker'
      assert not config.is_chief
    ```

    Args:
      master: TensorFlow master. Defaults to empty string for local.
      evaluation_master: The master on which to perform evaluation.
    """
    # An unset or empty TF_CONFIG is treated as an empty JSON object.
    tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')

    # Task type comes straight from TF_CONFIG (None when absent); the task id
    # comes from get_task_id().
    task_info = tf_config.get('task', {})
    self._task_type = task_info.get('type', None)
    self._task_id = self.get_task_id()

    self._cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {}))

    # An explicit `master` argument wins; otherwise look it up in the cluster.
    if master is not None:
        self._master = master
    else:
        self._master = _get_master(self._cluster_spec, self._task_type,
                                   self._task_id) or ''

    self._num_ps_replicas = _count_ps(self._cluster_spec) or 0
    self._num_worker_replicas = _count_worker(self._cluster_spec) or 0

    # Set is_chief.
    self._environment = tf_config.get('environment', Environment.LOCAL)
    is_first_task = self._task_id == 0
    if self._task_type is None:
        # Without a task type, task 0 is the chief.
        is_chief = is_first_task
    else:
        # In the CLOUD environment the chief is task 0 of the master job;
        # otherwise (legacy behavior) it is task 0 of the worker job.
        if self._environment == Environment.CLOUD:
            chief_task_type = TaskType.MASTER
        else:
            chief_task_type = TaskType.WORKER
        is_chief = self._task_type == chief_task_type and is_first_task
    self._is_chief = is_chief

    self._evaluation_master = evaluation_master or ''
def create_local_cluster(num_workers,
                         num_ps,
                         protocol="grpc",
                         worker_config=None,
                         ps_config=None):
    """Start an in-process cluster of worker and PS servers on localhost.

    Every server is bound to a freshly picked unused local port and is
    started before this function returns.

    Example:
    ```python
    workers, _ = tf.test.create_local_cluster(num_workers=2, num_ps=2)

    worker_sessions = [tf.Session(w.target) for w in workers]

    with tf.device("/job:ps/task:0"):
      ...
    with tf.device("/job:ps/task:1"):
      ...
    with tf.device("/job:worker/task:0"):
      ...
    with tf.device("/job:worker/task:1"):
      ...

    worker_sessions[0].run(...)
    ```

    Args:
      num_workers: How many worker servers to launch.
      num_ps: How many parameter-server (PS) servers to launch.
      protocol: Communication protocol handed to every server. The allowed
        values are listed in the documentation of `tf.train.Server`.
      worker_config: (optional) ConfigProto used to initialize the workers,
        e.g. to instantiate multiple devices.
      ps_config: (optional) ConfigProto used to initialize the PS servers.

    Returns:
      A tuple `(worker_servers, ps_servers)`: a list of `num_workers` started
      `tf.train.Server` objects for job "worker" followed by a list of
      `num_ps` started servers for job "ps", all running locally.

    Raises:
      ImportError: if the portpicker module could not be imported at load
        time.
    """
    if _portpicker_import_error:
        raise _portpicker_import_error  # pylint: disable=raising-bad-type

    # Ports are reserved for all workers first, then for all PS tasks.
    worker_addresses = []
    for _ in range(num_workers):
        worker_addresses.append(
            "localhost:%s" % portpicker.pick_unused_port())
    ps_addresses = []
    for _ in range(num_ps):
        ps_addresses.append("localhost:%s" % portpicker.pick_unused_port())

    cluster_spec = server_lib.ClusterSpec({
        "worker": worker_addresses,
        "ps": ps_addresses,
    })

    def _launch_job(job_name, task_count, config):
        # One started server per task index of the given job.
        return [
            server_lib.Server(
                cluster_spec,
                job_name=job_name,
                protocol=protocol,
                task_index=task_index,
                config=config,
                start=True)
            for task_index in range(task_count)
        ]

    worker_servers = _launch_job("worker", num_workers, worker_config)
    ps_servers = _launch_job("ps", num_ps, ps_config)
    return worker_servers, ps_servers
def __init__(self,
             iterations_per_loop=1,
             profiling_config=None,
             model_dir=None,
             tf_random_seed=None,
             save_summary_steps=0,
             save_checkpoints_steps=None,
             save_checkpoints_secs=None,
             session_config=None,
             keep_checkpoint_max=5,
             keep_checkpoint_every_n_hours=10000,
             log_step_count_steps=100,
             distribute=None,
             enable_data_pre_proc=True,
             precision_mode=None,
             enable_reduce_precision=False,
             variable_format_optimize=True,
             mix_compile_mode=False,
             hcom_parallel=False,
             graph_memory_max_size=None,
             variable_memory_max_size=None,
             auto_tune_mode=None,
             dump_config=None,
             stream_max_parallel_num=None,
             is_tailing_optimization=False,
             horovod_mode=False,
             graph_run_mode=1,
             op_debug_level=0,
             enable_scope_fusion_passes=None,
             enable_exception_dump=0,
             op_select_implmode=None,
             optypelist_for_implmode=None,
             dynamic_input_config=None,
             mstune_mode=None,
             work_path=None,
             buffer_optimize="l2_optimize",
             enable_small_channel=0,
             fusion_switch_file=None,
             enable_compress_weight=False,
             compress_weight_conf=None,
             op_compiler_cache_mode=None,
             op_compiler_cache_dir=None,
             debug_dir=None,
             hcom_multi_mode=False,
             dynamic_input=False,
             dynamic_graph_execute_mode="dynamic_execute",
             dynamic_inputs_shape_range=None,
             train_distribute=None,
             eval_distribute=None,
             local_rank_id=None,
             local_device_list=None,
             session_device_id=None,
             distribute_config=None,
             op_tune_mode=None):
    """Constructs an NPURunConfig.

    Validates the NPU-specific options, records them on the instance and
    forwards the generic run-configuration options to the base class
    constructor.

    Args:
      iterations_per_loop: This is the number of train steps running in NPU
        system before returning to CPU host for each `Session.run`. This
        means global step is increased `iterations_per_loop` times in one
        `Session.run`. It is recommended to be set as number of global steps
        for next checkpoint. Must be a positive integer, and must be 1 when
        `mix_compile_mode` is True.
      profiling_config: The profiling configuration.
      model_dir: Directory where model parameters, graph, etc are saved. If
        `PathLike` object, the path will be resolved. If `None`, will use a
        default value set by the Estimator.
      tf_random_seed: Random seed for TensorFlow initializers. Setting this
        value allows consistency between reruns.
      save_summary_steps: Save summaries every this many steps.
      save_checkpoints_steps: Save checkpoints every this many steps. Can not
        be specified with `save_checkpoints_secs`. If both
        `save_checkpoints_steps` and `save_checkpoints_secs` are None, this
        constructor sets `save_checkpoints_steps` to 100.
      save_checkpoints_secs: Save checkpoints every this many seconds. Can
        not be specified with `save_checkpoints_steps`.
      session_config: A ConfigProto used to set session parameters, or None.
      keep_checkpoint_max: The maximum number of recent checkpoint files to
        keep. As new files are created, older files are deleted. If None or
        0, all checkpoint files are kept. Defaults to 5 (that is, the 5 most
        recent checkpoint files are kept.)
      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
        to be saved. The default value of 10,000 hours effectively disables
        the feature.
      log_step_count_steps: The frequency, in number of global steps, that
        the global step/sec and the loss will be logged during training.
      distribute: Distribution strategy. It must be a
        `ParameterServerStrategy` exactly when the TF_CONFIG environment
        variable defines a non-empty cluster; in that case it is passed to
        the base class as
        `experimental_distribute=DistributeConfig(distribute, distribute,
        None)`.
      enable_data_pre_proc: This is the switch of data preprocess.
      precision_mode: if train, default is: allow_fp32_to_fp16; if inference,
        default is: force_fp16.
      enable_reduce_precision: Stored on the config; not validated here.
      variable_format_optimize: enable or disable variable format optimize
        while graph engineer optimize process.
      mix_compile_mode: This is the switch of mix_compile_mode. When the
        value is False, all graphs run on device. Otherwise, some graphs run
        on host. Must be a bool, and must be True when TF_CONFIG defines a
        cluster.
      hcom_parallel: This is the switch of hcom parallel. When the value is
        True, hcom will execute with parallel mode. Otherwise, hcom will
        execute with serialize mode.
      graph_memory_max_size: The max size of ge graph memory size.
      variable_memory_max_size: The max size of ge variable memory size.
      auto_tune_mode: None, or `GA` ,or `RL` or `GA|RL`
      dump_config: The dump configuration. Must be None or a `DumpConfig`.
      stream_max_parallel_num: Specify the degree of parallelism of the
        AICPU / AICORE engine to achieve parallel execution between
        AICPU / AICORE operators.
      is_tailing_optimization: Stored on the config; not validated here.
      horovod_mode: Must be a bool. Stored on the config.
      graph_run_mode: Must be the integer 0 or 1. Stored on the config.
      op_debug_level: Stored on the config; not validated here.
      enable_scope_fusion_passes: Stored on the config; not validated here.
      enable_exception_dump: Must be a non-negative integer. Stored on the
        config.
      op_select_implmode: Selecting whether the operator is implemented with
        high precision or high performance.
      optypelist_for_implmode: Operator list.
      dynamic_input_config: Dynamic dims configuration. Must be None or a
        `DynamicInputConfig`.
      mstune_mode: Optimization Task Type. "1": model tune; "2": optune;
        "3": model tune & optune; "4": gradient split tune.
      work_path: Stores temporary files generated during optimization.
      buffer_optimize: Whether to enable buffer optimization.
      enable_small_channel: Whether to enable small channel optimization.
      fusion_switch_file: Fusion switch configuration file path.
      enable_compress_weight: Whether to enable global weight compression.
      compress_weight_conf: Path and file name of the node list configuration
        file to be compressed.
      op_compiler_cache_mode: Stored on the config; not validated here.
      op_compiler_cache_dir: Stored on the config; not validated here.
      debug_dir: Stored on the config; not validated here.
      hcom_multi_mode: Stored on the config; not validated here.
      dynamic_input: Whether Input is dynamic.
      dynamic_graph_execute_mode: Dynamic graph execute mode. lazy_recompile
        or dynamic_execute
      dynamic_inputs_shape_range: Inputs shape range.
      train_distribute: Forwarded unchanged to the base class constructor.
      eval_distribute: Forwarded unchanged to the base class constructor.
      local_rank_id: Local sequence number of the device in a group.
      local_device_list: Available devices.
      session_device_id: Stored on the config; not validated here.
      distribute_config: Specify the NCA configuration file path
      op_tune_mode: None, or `GA` ,or `RL` or `GA|RL`, use with mstune_mode.

    Raises:
      ValueError: If `mix_compile_mode` or `horovod_mode` is not a bool; if
        `mix_compile_mode` is True while `iterations_per_loop` is not 1; if
        exactly one of "TF_CONFIG defines a cluster" and "`distribute` is a
        `ParameterServerStrategy`" holds; if a cluster is defined while
        `mix_compile_mode` is False; if `dump_config` or
        `dynamic_input_config` has the wrong type; or if `graph_run_mode` is
        greater than 1. The `util.check_*` helpers may raise as well for
        non-integer or out-of-range values (exception type not visible
        here).
    """
    # Check iterations_per_loop.
    util.check_positive_integer(iterations_per_loop, "iterations_per_loop")
    if not isinstance(mix_compile_mode, bool):
        raise ValueError('"mix_compile_mode" type must be bool')
    if mix_compile_mode and iterations_per_loop != 1:
        raise ValueError(
            '"iterations_per_loop" must be 1 with "mix_compile_mode" is True')

    # The cluster definition, if any, comes from the TF_CONFIG environment
    # variable.
    tf_config = json.loads(
        os.environ.get(run_config_lib._TF_CONFIG_ENV, '{}'))
    tmp_cluster_spec = server_lib.ClusterSpec(
        tf_config.get(run_config_lib._CLUSTER_KEY, {}))
    has_cluster = bool(tmp_cluster_spec)
    uses_ps_strategy = isinstance(distribute, ParameterServerStrategy)

    # A cluster and a parameter-server strategy must be given together or
    # not at all.
    if has_cluster != uses_ps_strategy:
        raise ValueError(
            '"cluster" and "distribute" must all be set in ps mode')
    if has_cluster and not mix_compile_mode:
        raise ValueError(
            '"mix_compile_mode" can only be True with "cluster" is set')

    self.iterations_per_loop = iterations_per_loop
    self.mix_compile_mode = mix_compile_mode
    self.enable_data_pre_proc = enable_data_pre_proc
    self.is_tailing_optimization = is_tailing_optimization

    # With neither checkpoint trigger given, fall back to every 100 steps.
    if save_checkpoints_secs is None and save_checkpoints_steps is None:
        save_checkpoints_steps = 100

    self._profiling_config = profiling_config

    # mix precision configuration
    self._precision_mode = precision_mode
    self._enable_reduce_precision = enable_reduce_precision
    self._variable_format_optimize = variable_format_optimize
    self._hcom_parallel = hcom_parallel
    self._graph_memory_max_size = graph_memory_max_size
    self._variable_memory_max_size = variable_memory_max_size

    self._auto_tune_mode = auto_tune_mode

    if dump_config is not None and not isinstance(dump_config, DumpConfig):
        raise ValueError(
            '`dump_config` must be provided with type `DumpConfig`')
    self._dump_config = dump_config
    self._stream_max_parallel_num = stream_max_parallel_num

    if not isinstance(horovod_mode, bool):
        raise ValueError('"horovod_mode" type must be bool')
    self.horovod_mode = horovod_mode

    util.check_nonnegative_integer(graph_run_mode, "graph_run_mode")
    if graph_run_mode > 1:
        raise ValueError('"graph_run_mode" value must be 0 or 1')
    self.graph_run_mode = graph_run_mode

    self.op_debug_level = op_debug_level
    self.enable_scope_fusion_passes = enable_scope_fusion_passes

    # In parameter-server mode the same strategy is handed to the base
    # class through `experimental_distribute`.
    experimental_distribute = None
    if has_cluster and uses_ps_strategy:
        experimental_distribute = DistributeConfig(
            distribute, distribute, None)

    util.check_nonnegative_integer(enable_exception_dump,
                                   "enable_exception_dump")
    self.enable_exception_dump = enable_exception_dump
    self._op_select_implmode = op_select_implmode
    self._optypelist_for_implmode = optypelist_for_implmode

    if (dynamic_input_config is not None
            and not isinstance(dynamic_input_config, DynamicInputConfig)):
        raise ValueError(
            'dynamic_input_config must be provided with type DynamicInputConfig'
        )
    self._dynamic_input_config = dynamic_input_config

    self._mstune_mode = mstune_mode
    self._work_path = work_path
    self._buffer_optimize = buffer_optimize
    self._enable_small_channel = enable_small_channel
    self._fusion_switch_file = fusion_switch_file
    self._enable_compress_weight = enable_compress_weight
    self._compress_weight_conf = compress_weight_conf
    self._op_compiler_cache_mode = op_compiler_cache_mode
    self._op_compiler_cache_dir = op_compiler_cache_dir
    self._debug_dir = debug_dir
    self._hcom_multi_mode = hcom_multi_mode
    self._dynamic_input = dynamic_input
    self._dynamic_graph_execute_mode = dynamic_graph_execute_mode
    self._dynamic_inputs_shape_range = dynamic_inputs_shape_range
    self._local_rank_id = local_rank_id
    self._local_device_list = local_device_list
    self._session_device_id = session_device_id
    self._distribute_config = distribute_config
    self._op_tune_mode = op_tune_mode

    super(NPURunConfig, self).__init__(
        model_dir=model_dir,
        tf_random_seed=tf_random_seed,
        save_summary_steps=save_summary_steps,
        save_checkpoints_steps=save_checkpoints_steps,
        save_checkpoints_secs=save_checkpoints_secs,
        session_config=session_config,
        keep_checkpoint_max=keep_checkpoint_max,
        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
        log_step_count_steps=log_step_count_steps,
        experimental_distribute=experimental_distribute,
        train_distribute=train_distribute,
        eval_distribute=eval_distribute)