def _mapfn(iter):
    """Spark mapPartitions function: reserve a slot in the TF cluster and launch the TF node.

    Runs once per executor.  Steps:
      1. consume the input iterator to learn this executor's id,
      2. pre-check GPU availability (pre-Spark-3 only),
      3. map the executor onto a job_name/task_index from the cluster template,
      4. start a TFManager (remote-accessible for ps/evaluator) and optionally TensorBoard,
      5. register with the reservation server and build the TF cluster spec,
      6. configure CUDA_VISIBLE_DEVICES and TF_CONFIG,
      7. run the user function ``fn`` in the foreground, or in a background
         process for ps/evaluator/background mode.

    Relies on closure variables from the enclosing scope: ``fn``, ``tf_args``,
    ``cluster_meta``, ``queues``, ``background``, ``tensorboard``, ``log_dir``
    and ``TF_VERSION``.

    Raises:
        Exception: if a live TFManager for the same cluster_id already exists
            (forces Spark to retry the reservation on another executor), if
            TensorBoard cannot be located, if duplicate executors are found in
            the reservation list, or if background mode prerequisites are unmet.
    """
    import pyspark

    # Note: consuming the input iterator helps Pyspark re-use this worker,
    for i in iter:
        executor_id = i

    # check that there are enough available GPUs (if using tensorflow-gpu) before committing reservation on this node
    # note: for Spark 3+ w/ GPU allocation, the required number of GPUs should be guaranteed by the resource manager
    # fix: compare parsed Version objects, not base_version *strings* — string
    # comparison is lexicographic and misorders multi-digit versions (e.g. '10.0.0' < '3.0.0').
    if version.parse(version.parse(pyspark.__version__).base_version) < version.parse('3.0.0'):
        if gpu_info.is_gpu_available():
            num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1
            gpus_to_use = gpu_info.get_gpus(num_gpus)

    # assign TF job/task based on provided cluster_spec template (or use default/null values)
    job_name = 'default'
    task_index = -1
    cluster_id = cluster_meta['id']
    cluster_template = cluster_meta['cluster_template']
    for jobtype in cluster_template:
        nodes = cluster_template[jobtype]
        if executor_id in nodes:
            job_name = jobtype
            task_index = nodes.index(executor_id)
            break

    # get unique key (hostname, executor_id) for this executor
    host = util.get_ip_address()
    util.write_executor_id(executor_id)
    port = 0

    # check for existing TFManagers
    if TFSparkNode.mgr is not None and str(TFSparkNode.mgr.get('state')) != "'stopped'":
        if TFSparkNode.cluster_id == cluster_id:
            # raise an exception to force Spark to retry this "reservation" task on another executor
            raise Exception("TFManager already started on {0}, executor={1}, state={2}".format(
                host, executor_id, str(TFSparkNode.mgr.get("state"))))
        else:
            # old state, just continue with creating new manager
            # fix: logger.warn is a deprecated alias of logger.warning
            logger.warning("Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}".format(
                TFSparkNode.cluster_id, cluster_id))

    # start a TFManager and get a free port
    # use a random uuid as the authkey
    authkey = uuid.uuid4().bytes
    addr = None
    if job_name in ('ps', 'evaluator'):
        # PS nodes must be remotely accessible in order to shutdown from Spark driver.
        TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'], 'remote')
        addr = (host, TFSparkNode.mgr.address[1])
    else:
        # worker nodes only need to be locally accessible within the executor for data feeding
        TFSparkNode.mgr = TFManager.start(authkey, queues)
        addr = TFSparkNode.mgr.address

    # initialize mgr state
    TFSparkNode.mgr.set('state', 'running')
    TFSparkNode.cluster_id = cluster_id

    # expand Hadoop classpath wildcards for JNI (Spark 2.x)
    if 'HADOOP_PREFIX' in os.environ:
        classpath = os.environ['CLASSPATH']
        hadoop_path = os.path.join(os.environ['HADOOP_PREFIX'], 'bin', 'hadoop')
        hadoop_classpath = subprocess.check_output([hadoop_path, 'classpath', '--glob']).decode()
        logger.debug("CLASSPATH: {0}".format(hadoop_classpath))
        os.environ['CLASSPATH'] = classpath + os.pathsep + hadoop_classpath

    # start TensorBoard if requested, on 'worker:0' if available (for backwards-compatibility), otherwise on 'chief:0' or 'master:0'
    job_names = sorted([k for k in cluster_template.keys() if k in ['chief', 'master', 'worker']])
    tb_job_name = 'worker' if 'worker' in job_names else job_names[0]
    tb_pid = 0
    tb_port = 0
    if tensorboard and job_name == tb_job_name and task_index == 0:
        # grab a free port for TensorBoard by binding an ephemeral socket,
        # then immediately releasing it (small race window is accepted here)
        tb_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tb_sock.bind(('', 0))
        tb_port = tb_sock.getsockname()[1]
        tb_sock.close()
        logdir = log_dir if log_dir else "tensorboard_%d" % executor_id

        # search for tensorboard in python/bin, PATH, and PYTHONPATH
        pypath = sys.executable
        pydir = os.path.dirname(pypath)
        sys_path = os.pathsep.join(sys.path)
        # fix: use os.environ.get() so a missing PATH/PYTHONPATH (PYTHONPATH is
        # frequently unset) doesn't raise KeyError
        search_path = os.pathsep.join([pydir, sys_path, os.environ.get('PATH', ''), os.environ.get('PYTHONPATH', '')])
        tb_path = util.find_in_path(search_path, 'tensorboard')  # executable in PATH
        if not tb_path:
            tb_path = util.find_in_path(search_path, 'tensorboard/main.py')  # TF 1.3+
        if not tb_path:
            tb_path = util.find_in_path(search_path, 'tensorflow/tensorboard/__main__.py')  # TF 1.2-
        if not tb_path:
            raise Exception("Unable to find 'tensorboard' in: {}".format(search_path))

        # launch tensorboard
        if version.parse(TF_VERSION) >= version.parse('2.0.0'):
            tb_proc = subprocess.Popen(
                [pypath, tb_path, "--reload_multifile=True", "--logdir=%s" % logdir, "--port=%d" % tb_port],
                env=os.environ)
        else:
            tb_proc = subprocess.Popen(
                [pypath, tb_path, "--logdir=%s" % logdir, "--port=%d" % tb_port],
                env=os.environ)
        tb_pid = tb_proc.pid

    # check server to see if this task is being retried (i.e. already reserved)
    client = reservation.Client(cluster_meta['server_addr'])
    cluster_info = client.get_reservations()
    tmp_sock = None
    node_meta = None
    for node in cluster_info:
        (nhost, nexec) = (node['host'], node['executor_id'])
        if nhost == host and nexec == executor_id:
            node_meta = node
            port = node['port']

    # if not already done, register everything we need to set up the cluster
    if node_meta is None:
        # first, find a free port for TF; keep the socket open until just
        # before launching TF so no other process grabs the port
        tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tmp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        tmp_sock.bind(('', port))
        port = tmp_sock.getsockname()[1]

        node_meta = {
            'executor_id': executor_id,
            'host': host,
            'job_name': job_name,
            'task_index': task_index,
            'port': port,
            'tb_pid': tb_pid,
            'tb_port': tb_port,
            'addr': addr,
            'authkey': authkey
        }
        # register node metadata with server
        logger.info("TFSparkNode.reserve: {0}".format(node_meta))
        client.register(node_meta)

    # wait for other nodes to finish reservations (also on task retry, so the
    # reservation list is complete before building the cluster spec)
    cluster_info = client.await_reservations()
    client.close()

    # construct a TensorFlow clusterspec from cluster_info
    sorted_cluster_info = sorted(cluster_info, key=lambda k: k['executor_id'])
    cluster_spec = {}
    last_executor_id = -1
    for node in sorted_cluster_info:
        if (node['executor_id'] == last_executor_id):
            raise Exception("Duplicate worker/task in cluster_info")
        last_executor_id = node['executor_id']
        logger.info("node: {0}".format(node))
        (njob, nhost, nport) = (node['job_name'], node['host'], node['port'])
        hosts = [] if njob not in cluster_spec else cluster_spec[njob]
        hosts.append("{0}:{1}".format(nhost, nport))
        cluster_spec[njob] = hosts

    # update TF_CONFIG if cluster spec has a 'master' node (i.e. tf.estimator)
    if 'master' in cluster_spec or 'chief' in cluster_spec:
        tf_config = json.dumps({
            'cluster': cluster_spec,
            'task': {'type': job_name, 'index': task_index},
            'environment': 'cloud'
        })
        logger.info("export TF_CONFIG: {}".format(tf_config))
        os.environ['TF_CONFIG'] = tf_config

    # reserve GPU(s) again, just before launching TF process (in case situation has changed)
    # and setup CUDA_VISIBLE_DEVICES accordingly
    if gpu_info.is_gpu_available():
        gpus_to_use = None
        # For Spark 3+, try to get GPU resources from TaskContext first
        # fix: compare parsed Version objects, not base_version strings (see above)
        if version.parse(version.parse(pyspark.__version__).base_version) >= version.parse("3.0.0"):
            from pyspark import TaskContext
            context = TaskContext()
            if 'gpu' in context.resources():
                # use ALL GPUs assigned by resource manager
                gpus = context.resources()['gpu'].addresses
                num_gpus = len(gpus)
                gpus_to_use = ','.join(gpus)

        if not gpus_to_use:
            # compute my index relative to other nodes on the same host (for GPU allocation)
            my_addr = cluster_spec[job_name][task_index]
            my_host = my_addr.split(':')[0]
            flattened = [v for sublist in cluster_spec.values() for v in sublist]
            local_peers = [p for p in flattened if p.startswith(my_host)]
            my_index = local_peers.index(my_addr)

            # default to one GPU if not specified explicitly
            num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1
            gpus_to_use = gpu_info.get_gpus(num_gpus, my_index)

        gpu_str = "GPUs" if num_gpus > 1 else "GPU"
        logger.info("Requested {} {}, setting CUDA_VISIBLE_DEVICES={}".format(num_gpus, gpu_str, gpus_to_use))
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use

    # create a context object to hold metadata for TF
    ctx = TFNodeContext(executor_id, job_name, task_index, cluster_spec,
                        cluster_meta['default_fs'], cluster_meta['working_dir'], TFSparkNode.mgr)

    # release port reserved for TF as late as possible
    if tmp_sock is not None:
        tmp_sock.close()

    # Background mode relies reuse of python worker in Spark.
    if background:
        # However, reuse of python worker can't work on Windows, we need to check if the current
        # script runs on Windows or not.
        if os.name == 'nt' or platform.system() == 'Windows':
            raise Exception("Background mode is not supported on Windows.")
        # Check if the config of reuse python worker is enabled on Spark.
        if not os.environ.get("SPARK_REUSE_WORKER"):
            raise Exception("Background mode relies reuse of python worker on Spark. This config 'spark.python.worker.reuse' is not enabled on Spark. Please enable it before using background.")

    def wrapper_fn(args, context):
        """Wrapper function that sets the sys.argv of the executor."""
        if isinstance(args, list):
            sys.argv = args
        fn(args, context)

    def wrapper_fn_background(args, context):
        """Wrapper function that signals exceptions to foreground process."""
        errq = TFSparkNode.mgr.get_queue('error')
        try:
            wrapper_fn(args, context)
        except Exception:
            errq.put(traceback.format_exc())

    if job_name in ('ps', 'evaluator') or background:
        # invoke the TensorFlow main function in a background thread
        logger.info("Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process".format(
            job_name, task_index, job_name, executor_id))
        p = multiprocessing.Process(target=wrapper_fn_background, args=(tf_args, ctx))
        if job_name in ('ps', 'evaluator'):
            # daemonize so the child dies with the executor
            p.daemon = True
        p.start()

        # for ps and evaluator nodes, wait indefinitely in foreground thread for a "control" event (None == "stop")
        if job_name in ('ps', 'evaluator'):
            queue = TFSparkNode.mgr.get_queue('control')
            equeue = TFSparkNode.mgr.get_queue('error')
            done = False
            while not done:
                # poll until either a control message or a background error arrives
                while (queue.empty() and equeue.empty()):
                    time.sleep(1)
                if (not equeue.empty()):
                    # propagate background-process failures to Spark
                    e_str = equeue.get()
                    raise Exception("Exception in " + job_name + ":\n" + e_str)
                msg = queue.get(block=True)
                logger.info("Got msg: {0}".format(msg))
                if msg is None:
                    logger.info("Terminating {}".format(job_name))
                    TFSparkNode.mgr.set('state', 'stopped')
                    done = True
                queue.task_done()
    else:
        # otherwise, just run TF function in the main executor/worker thread
        logger.info("Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread".format(
            job_name, task_index, executor_id))
        wrapper_fn(tf_args, ctx)
        logger.info("Finished TensorFlow {0}:{1} on cluster node {2}".format(
            job_name, task_index, executor_id))
def _get_gpus(cluster_spec=None):
    """Resolve which GPUs this node should use and export CUDA_VISIBLE_DEVICES.

    Preference order: GPU addresses assigned via the Spark 3 resources API,
    then gpu_info-based discovery/allocation.  The fallback is skipped inside
    K8s pods (Nvidia device_plugin can report GPUs for non-GPU pods hosted on
    GPU nodes).  Uses closure variables ``tf_args``, ``job_name`` and
    ``task_index``.

    Args:
        cluster_spec: optional TF cluster spec dict, used to compute this
            node's index among peers on the same host for GPU allocation.

    Raises:
        Exception: if an explicitly requested GPU count cannot be satisfied
            by the fallback allocator.
    """
    assigned = []
    running_in_k8s = 'SPARK_EXECUTOR_POD_IP' in os.environ

    # honor an explicitly configured tf_args.num_gpus, if present
    explicit = 'num_gpus' in tf_args
    want = tf_args.num_gpus if explicit else 0

    # first, try Spark 3 resources API, returning all visible GPUs
    # note: num_gpus arg is only used (if supplied) to limit/truncate visible devices
    if _has_spark_resource_api():
        from pyspark import TaskContext
        context = TaskContext()
        resources = context.resources()
        if resources and 'gpu' in resources:
            # get all GPUs assigned by resource manager
            assigned = context.resources()['gpu'].addresses
            logger.info("Spark gpu resources: {}".format(assigned))
            if explicit:
                if want < len(assigned):
                    # override/truncate list, if explicitly configured
                    logger.warn("Requested {} GPU(s), but {} available".format(want, len(assigned)))
                    assigned = assigned[:want]
            else:
                # implicitly requested by Spark 3
                want = len(assigned)

    # if not in K8s pod and GPUs available, just use original allocation code (defaulting to 1 GPU if available)
    # Note: for K8s, there is a bug with the Nvidia device_plugin which can show GPUs for non-GPU pods
    # that are hosted on GPU nodes
    if not running_in_k8s and gpu_info.is_gpu_available() and not assigned:
        if not explicit:
            # default to one GPU if not specified explicitly
            want = max(1, want)
        if want > 0:
            if cluster_spec:
                # compute my index relative to other nodes on the same host (for GPU allocation)
                my_addr = cluster_spec[job_name][task_index]
                my_host = my_addr.split(':')[0]
                everyone = [v for sublist in cluster_spec.values() for v in sublist]
                peers_on_host = [p for p in everyone if p.startswith(my_host)]
                my_index = peers_on_host.index(my_addr)
            else:
                my_index = 0

            # try to allocate a GPU
            assigned = gpu_info.get_gpus(want, my_index, format=gpu_info.AS_LIST)
            if explicit and len(assigned) < want:
                raise Exception("Unable to allocate {} GPU(s) from available GPUs: {}".format(want, assigned))

    gpus_to_use = ','.join(assigned)
    if assigned:
        logger.info("Requested {} GPU(s), setting CUDA_VISIBLE_DEVICES={}".format(
            want if explicit else len(assigned), gpus_to_use))
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use