Example #1
0
    def _mapfn(iter):
        """Reserve this executor as a TensorFlow cluster node, then run the user function.

        The steps, in order: read the executor id from the input iterator, pick a
        job name/task index from the cluster template, start a TFManager,
        optionally launch TensorBoard, register with the reservation server and
        wait for the other nodes, build the cluster spec (exporting TF_CONFIG when
        a 'master' or 'chief' job exists), set CUDA_VISIBLE_DEVICES if GPUs are
        available, and finally invoke ``fn`` either in the foreground or in a
        background process.

        Besides ``iter``, this function reads several names that are not defined
        in this block (``fn``, ``tf_args``, ``cluster_meta``, ``tensorboard``,
        ``log_dir``, ``queues``, ``background``, ...); they must be provided by
        the enclosing scope.

        Args:
            iter: iterator whose last item is used as this executor's id.

        Raises:
            Exception: if no executor id is supplied, if a TFManager for the same
                cluster_id is already active on this worker, if TensorBoard is
                requested but cannot be found, if the cluster info contains a
                duplicate executor id, if background mode is requested on
                Windows or without python worker reuse, or if a 'ps'/'evaluator'
                background process reports an error.
        """
        import pyspark

        # Note: consuming the input iterator helps Pyspark re-use this worker,
        executor_id = None
        for i in iter:
            executor_id = i
        if executor_id is None:
            # fail with a clear message instead of an UnboundLocalError further down
            raise Exception("No executor_id received from input iterator")

        # Compare parsed versions rather than base_version strings: string comparison
        # is lexicographic, so e.g. '10.0.0' < '3.0.0' would evaluate to True.
        spark_version = version.parse(
            version.parse(pyspark.__version__).base_version)
        is_spark3 = spark_version >= version.parse('3.0.0')

        # check that there are enough available GPUs (if using tensorflow-gpu) before committing reservation on this node
        # note: for Spark 3+ w/ GPU allocation, the required number of GPUs should be guaranteed by the resource manager
        if not is_spark3:
            if gpu_info.is_gpu_available():
                num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1
                # the result is discarded here; the actual selection happens again
                # just before launching TF (presumably this call fails if the GPUs
                # cannot be obtained -- confirm against gpu_info.get_gpus)
                gpu_info.get_gpus(num_gpus)

        # assign TF job/task based on provided cluster_spec template (or use default/null values)
        job_name = 'default'
        task_index = -1
        cluster_id = cluster_meta['id']
        cluster_template = cluster_meta['cluster_template']
        for jobtype in cluster_template:
            nodes = cluster_template[jobtype]
            if executor_id in nodes:
                job_name = jobtype
                task_index = nodes.index(executor_id)
                break

        # get unique key (hostname, executor_id) for this executor
        host = util.get_ip_address()
        util.write_executor_id(executor_id)
        port = 0

        # check for existing TFManagers
        if TFSparkNode.mgr is not None and str(
                TFSparkNode.mgr.get('state')) != "'stopped'":
            if TFSparkNode.cluster_id == cluster_id:
                # raise an exception to force Spark to retry this "reservation" task on another executor
                raise Exception(
                    "TFManager already started on {0}, executor={1}, state={2}"
                    .format(host, executor_id,
                            str(TFSparkNode.mgr.get("state"))))
            else:
                # old state, just continue with creating new manager
                logger.warning(
                    "Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}"
                    .format(TFSparkNode.cluster_id, cluster_id))

        # start a TFManager and get a free port
        # use a random uuid as the authkey
        authkey = uuid.uuid4().bytes
        addr = None
        if job_name in ('ps', 'evaluator'):
            # PS nodes must be remotely accessible in order to shutdown from Spark driver.
            TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'],
                                              'remote')
            addr = (host, TFSparkNode.mgr.address[1])
        else:
            # worker nodes only need to be locally accessible within the executor for data feeding
            TFSparkNode.mgr = TFManager.start(authkey, queues)
            addr = TFSparkNode.mgr.address

        # initialize mgr state
        TFSparkNode.mgr.set('state', 'running')
        TFSparkNode.cluster_id = cluster_id

        # expand Hadoop classpath wildcards for JNI (Spark 2.x)
        if 'HADOOP_PREFIX' in os.environ:
            # CLASSPATH may be unset; treat that the same as empty
            classpath = os.environ.get('CLASSPATH', '')
            hadoop_path = os.path.join(os.environ['HADOOP_PREFIX'], 'bin',
                                       'hadoop')
            hadoop_classpath = subprocess.check_output(
                [hadoop_path, 'classpath', '--glob']).decode()
            logger.debug("CLASSPATH: {0}".format(hadoop_classpath))
            if classpath:
                os.environ['CLASSPATH'] = (classpath + os.pathsep +
                                           hadoop_classpath)
            else:
                os.environ['CLASSPATH'] = hadoop_classpath

        # start TensorBoard if requested, on 'worker:0' if available (for backwards-compatibility), otherwise on 'chief:0' or 'master:0'
        job_names = sorted([
            k for k in cluster_template.keys()
            if k in ['chief', 'master', 'worker']
        ])
        if 'worker' in job_names:
            tb_job_name = 'worker'
        else:
            # no eligible job type at all: nothing can host TensorBoard
            tb_job_name = job_names[0] if job_names else None
        tb_pid = 0
        tb_port = 0
        if tensorboard and job_name == tb_job_name and task_index == 0:
            # ask the OS for a free port, then release it for TensorBoard to bind
            tb_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tb_sock.bind(('', 0))
            tb_port = tb_sock.getsockname()[1]
            tb_sock.close()
            logdir = log_dir if log_dir else "tensorboard_%d" % executor_id

            # search for tensorboard in python/bin, PATH, and PYTHONPATH
            pypath = sys.executable
            pydir = os.path.dirname(pypath)
            sys_path = os.pathsep.join(sys.path)
            # PATH/PYTHONPATH may be unset on the executor; skip missing/empty parts
            search_path = os.pathsep.join(
                part for part in (pydir, sys_path,
                                  os.environ.get('PATH', ''),
                                  os.environ.get('PYTHONPATH', '')) if part)

            tb_path = util.find_in_path(search_path,
                                        'tensorboard')  # executable in PATH
            if not tb_path:
                tb_path = util.find_in_path(search_path,
                                            'tensorboard/main.py')  # TF 1.3+
            if not tb_path:
                tb_path = util.find_in_path(
                    search_path,
                    'tensorflow/tensorboard/__main__.py')  # TF 1.2-
            if not tb_path:
                raise Exception(
                    "Unable to find 'tensorboard' in: {}".format(search_path))

            # launch tensorboard (the extra flag is only passed for TF 2.0.0+)
            tb_cmd = [pypath, tb_path]
            if version.parse(TF_VERSION) >= version.parse('2.0.0'):
                tb_cmd.append("--reload_multifile=True")
            tb_cmd.extend(["--logdir=%s" % logdir, "--port=%d" % tb_port])
            tb_proc = subprocess.Popen(tb_cmd, env=os.environ)

            tb_pid = tb_proc.pid

        # check server to see if this task is being retried (i.e. already reserved)
        client = reservation.Client(cluster_meta['server_addr'])
        tmp_sock = None
        node_meta = None
        try:
            cluster_info = client.get_reservations()
            for node in cluster_info:
                (nhost, nexec) = (node['host'], node['executor_id'])
                if nhost == host and nexec == executor_id:
                    node_meta = node
                    port = node['port']

            # if not already done, register everything we need to set up the cluster
            if node_meta is None:
                # first, find a free port for TF
                tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                tmp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                tmp_sock.bind(('', port))
                port = tmp_sock.getsockname()[1]

                node_meta = {
                    'executor_id': executor_id,
                    'host': host,
                    'job_name': job_name,
                    'task_index': task_index,
                    'port': port,
                    'tb_pid': tb_pid,
                    'tb_port': tb_port,
                    'addr': addr,
                    'authkey': authkey
                }
                # register node metadata with server
                logger.info("TFSparkNode.reserve: {0}".format(node_meta))
                client.register(node_meta)
                # wait for other nodes to finish reservations
                cluster_info = client.await_reservations()
        finally:
            # close the client on every path (including retried tasks and errors)
            client.close()

        # construct a TensorFlow clusterspec from cluster_info
        sorted_cluster_info = sorted(cluster_info,
                                     key=lambda k: k['executor_id'])
        cluster_spec = {}
        last_executor_id = -1
        for node in sorted_cluster_info:
            # entries are sorted by executor_id, so duplicates are adjacent
            if (node['executor_id'] == last_executor_id):
                raise Exception("Duplicate worker/task in cluster_info")
            last_executor_id = node['executor_id']
            logger.info("node: {0}".format(node))
            (njob, nhost, nport) = (node['job_name'], node['host'],
                                    node['port'])
            cluster_spec.setdefault(njob, []).append("{0}:{1}".format(
                nhost, nport))

        # update TF_CONFIG if cluster spec has a 'master' node (i.e. tf.estimator)
        if 'master' in cluster_spec or 'chief' in cluster_spec:
            tf_config = json.dumps({
                'cluster': cluster_spec,
                'task': {
                    'type': job_name,
                    'index': task_index
                },
                'environment': 'cloud'
            })
            logger.info("export TF_CONFIG: {}".format(tf_config))
            os.environ['TF_CONFIG'] = tf_config

        # reserve GPU(s) again, just before launching TF process (in case situation has changed)
        # and setup CUDA_VISIBLE_DEVICES accordingly
        if gpu_info.is_gpu_available():

            gpus_to_use = None
            # For Spark 3+, try to get GPU resources from TaskContext first
            if is_spark3:
                from pyspark import TaskContext
                context = TaskContext()
                resources = context.resources()
                if 'gpu' in resources:
                    # use ALL GPUs assigned by resource manager
                    gpus = resources['gpu'].addresses
                    num_gpus = len(gpus)
                    gpus_to_use = ','.join(gpus)

            if not gpus_to_use:
                # compute my index relative to other nodes on the same host (for GPU allocation)
                my_addr = cluster_spec[job_name][task_index]
                my_host = my_addr.split(':')[0]
                flattened = [
                    v for sublist in cluster_spec.values() for v in sublist
                ]
                # match the host part exactly; a plain prefix test would also
                # count e.g. '10.0.0.10:port' as a peer of host '10.0.0.1'
                local_peers = [
                    p for p in flattened if p.split(':')[0] == my_host
                ]
                my_index = local_peers.index(my_addr)

                # default to one GPU if not specified explicitly
                num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1
                gpus_to_use = gpu_info.get_gpus(num_gpus, my_index)

            gpu_str = "GPUs" if num_gpus > 1 else "GPU"
            logger.info(
                "Requested {} {}, setting CUDA_VISIBLE_DEVICES={}".format(
                    num_gpus, gpu_str, gpus_to_use))
            os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use

        # create a context object to hold metadata for TF
        ctx = TFNodeContext(executor_id, job_name, task_index, cluster_spec,
                            cluster_meta['default_fs'],
                            cluster_meta['working_dir'], TFSparkNode.mgr)

        # release port reserved for TF as late as possible
        if tmp_sock is not None:
            tmp_sock.close()

        # Background mode relies on reuse of the python worker in Spark.
        if background:
            # However, reuse of python worker can't work on Windows, we need to check if the current
            # script runs on Windows or not.
            if os.name == 'nt' or platform.system() == 'Windows':
                raise Exception("Background mode is not supported on Windows.")
            # Check if the config of reuse python worker is enabled on Spark.
            if not os.environ.get("SPARK_REUSE_WORKER"):
                raise Exception(
                    "Background mode relies reuse of python worker on Spark. This config 'spark.python.worker.reuse' is not enabled on Spark. Please enable it before using background."
                )

        def wrapper_fn(args, context):
            """Wrapper function that sets the sys.argv of the executor."""
            if isinstance(args, list):
                sys.argv = args
            fn(args, context)

        def wrapper_fn_background(args, context):
            """Wrapper function that signals exceptions to foreground process."""
            errq = TFSparkNode.mgr.get_queue('error')
            try:
                wrapper_fn(args, context)
            except Exception:
                # forward the formatted traceback through the 'error' queue
                errq.put(traceback.format_exc())

        if job_name in ('ps', 'evaluator') or background:
            # invoke the TensorFlow main function in a background process
            logger.info(
                "Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process"
                .format(job_name, task_index, job_name, executor_id))

            p = multiprocessing.Process(target=wrapper_fn_background,
                                        args=(tf_args, ctx))
            if job_name in ('ps', 'evaluator'):
                p.daemon = True
            p.start()

            # for ps and evaluator nodes, wait indefinitely in foreground thread for a "control" event (None == "stop")
            if job_name in ('ps', 'evaluator'):
                queue = TFSparkNode.mgr.get_queue('control')
                equeue = TFSparkNode.mgr.get_queue('error')
                done = False
                while not done:
                    # poll both queues once per second until either has an item
                    while (queue.empty() and equeue.empty()):
                        time.sleep(1)
                    if (not equeue.empty()):
                        e_str = equeue.get()
                        raise Exception("Exception in " + job_name + ":\n" +
                                        e_str)
                    msg = queue.get(block=True)
                    logger.info("Got msg: {0}".format(msg))
                    if msg is None:
                        logger.info("Terminating {}".format(job_name))
                        TFSparkNode.mgr.set('state', 'stopped')
                        done = True
                    queue.task_done()
        else:
            # otherwise, just run TF function in the main executor/worker thread
            logger.info(
                "Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread"
                .format(job_name, task_index, executor_id))
            wrapper_fn(tf_args, ctx)
            logger.info(
                "Finished TensorFlow {0}:{1} on cluster node {2}".format(
                    job_name, task_index, executor_id))
Example #2
0
        def _get_gpus(cluster_spec=None):
            """Select the GPUs for this node and export them via CUDA_VISIBLE_DEVICES.

            GPUs reported by the Spark resources API are preferred (truncated to
            ``tf_args.num_gpus`` when that is set and smaller). Otherwise, when
            not running in a K8s pod and a GPU is available, GPUs are obtained
            from ``gpu_info.get_gpus`` (defaulting to one GPU when ``num_gpus``
            is not set). CUDA_VISIBLE_DEVICES is always written, and is the empty
            string when no GPU was selected.

            Reads ``tf_args``, ``job_name`` and ``task_index`` from the enclosing
            scope (not visible in this block).

            Args:
                cluster_spec: optional mapping of job name to a list of
                    "host:port" strings; when given, it is used to compute this
                    node's index among the nodes on the same host.

            Raises:
                Exception: if ``tf_args.num_gpus`` was set and fewer GPUs than
                    requested could be obtained.
            """
            gpus = []
            is_k8s = 'SPARK_EXECUTOR_POD_IP' in os.environ

            # handle explicitly configured tf_args.num_gpus
            if 'num_gpus' in tf_args:
                requested_gpus = tf_args.num_gpus
                user_requested = True
            else:
                requested_gpus = 0
                user_requested = False

            # first, try Spark 3 resources API, returning all visible GPUs
            # note: num_gpus arg is only used (if supplied) to limit/truncate visible devices
            if _has_spark_resource_api():
                from pyspark import TaskContext
                context = TaskContext()
                resources = context.resources()
                if resources and 'gpu' in resources:
                    # get all GPUs assigned by resource manager
                    gpus = resources['gpu'].addresses
                    logger.info("Spark gpu resources: {}".format(gpus))
                    if user_requested:
                        if requested_gpus < len(gpus):
                            # override/truncate list, if explicitly configured
                            logger.warning(
                                "Requested {} GPU(s), but {} available".format(
                                    requested_gpus, len(gpus)))
                            gpus = gpus[:requested_gpus]
                    else:
                        # implicitly requested by Spark 3
                        requested_gpus = len(gpus)

            # if not in K8s pod and GPUs available, just use original allocation code (defaulting to 1 GPU if available)
            # Note: for K8s, there is a bug with the Nvidia device_plugin which can show GPUs for non-GPU pods
            # that are hosted on GPU nodes
            if not is_k8s and gpu_info.is_gpu_available() and not gpus:
                # default to one GPU if not specified explicitly
                if not user_requested:
                    requested_gpus = max(1, requested_gpus)
                if requested_gpus > 0:
                    if cluster_spec:
                        # compute my index relative to other nodes on the same host (for GPU allocation)
                        my_addr = cluster_spec[job_name][task_index]
                        my_host = my_addr.split(':')[0]
                        flattened = [
                            v for sublist in cluster_spec.values()
                            for v in sublist
                        ]
                        # match the host part exactly; a plain prefix test would
                        # also count e.g. '10.0.0.10:port' as a peer of '10.0.0.1'
                        local_peers = [
                            p for p in flattened
                            if p.split(':')[0] == my_host
                        ]
                        my_index = local_peers.index(my_addr)
                    else:
                        my_index = 0

                    # try to allocate a GPU
                    gpus = gpu_info.get_gpus(requested_gpus,
                                             my_index,
                                             format=gpu_info.AS_LIST)

            if user_requested and len(gpus) < requested_gpus:
                raise Exception(
                    "Unable to allocate {} GPU(s) from available GPUs: {}".
                    format(requested_gpus, gpus))

            gpus_to_use = ','.join(gpus)
            if gpus:
                logger.info(
                    "Requested {} GPU(s), setting CUDA_VISIBLE_DEVICES={}".
                    format(requested_gpus if user_requested else len(gpus),
                           gpus_to_use))
            # always exported; an empty value is written when no GPU was selected
            os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use