def _wrapper_fun(iter):
    for i in iter:
        executor_num = i

    tb_pid = 0
    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    t = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    global local_logdir_bool

    try:
        #Arguments
        if args_dict:
            argcount = six.get_function_code(map_fun).co_argcount
            names = six.get_function_code(map_fun).co_varnames

            args = []
            argIndex = 0
            param_string = ''
            while argcount > 0:
                #Get args for executor and run function
                param_name = names[argIndex]
                param_val = args_dict[param_name][executor_num]
                param_string += str(param_name) + '=' + str(param_val) + '.'
                args.append(param_val)
                argcount -= 1
                argIndex += 1
            param_string = param_string[:-1]

            val = _get_metric(param_string, app_id, generation_id, run_id)

            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                app_id, run_id, param_string, 'differential_evolution',
                sub_type='generation.' + str(generation_id))

            pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
            hopshdfs.init_logger()
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                local_logdir=local_logdir_bool)

            gpu_str = '\nChecking for GPUs in the environment' + devices.get_gpu_info()
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task ' + param_string + '\n')
            if val:
                print('Reading returned metric from previous run: ' + str(val))
            hopshdfs.log('Started running task ' + param_string)
            task_start = datetime.datetime.now()
            if not val:
                val = map_fun(*args)
            task_end = datetime.datetime.now()
            time_str = 'Finished task ' + param_string + ' - took ' + util.time_diff(task_start, task_end)
            print('\n' + time_str)
            hopshdfs.log(time_str)

            try:
                castval = int(val)
            except:
                raise ValueError(
                    'Your function needs to return a metric (number) which should be maximized or minimized')

            metric_file = hdfs_exec_logdir + '/metric'
            fs_handle = hopshdfs.get_fs()
            try:
                fd = fs_handle.open_file(metric_file, mode='w')
            except:
                fd = fs_handle.open_file(metric_file, flags='w')

            fd.write(str(float(val)).encode())
            fd.flush()
            fd.close()
            print('Returning metric ' + str(val))
            print('-------------------------------------------------------')
    except:
        #Always do cleanup
        if tb_hdfs_path:
            _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
        raise
    finally:
        if local_logdir_bool:
            local_tb = tensorboard.local_logdir_path
            util.store_local_tensorboard(local_tb, hdfs_exec_logdir)

    hopshdfs.log('Finished running')
    if tb_hdfs_path:
        _cleanup(tb_hdfs_path)
    if devices.get_num_gpus() > 0:
        t.do_run = False
        t.join()
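# --- Illustrative note (not part of the original module) ---------------------
# A minimal sketch of the `args_dict` shape the wrapper above expects, assuming
# a hypothetical user function `def train(lr, dropout): ...` and two executors:
#
#   args_dict = {'lr': [0.01, 0.001], 'dropout': [0.3, 0.5]}
#
# Executor 0 would then call train(0.01, 0.3) and log its task under the
# suffix 'lr=0.01.dropout=0.3' (the '=' and '.' separators come from the
# param_string construction above).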
def _wrapper_fun(iter):
    for i in iter:
        executor_num = i

    tb_hdfs_path = ''
    hdfs_exec_logdir = ''

    t = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t.start()

    try:
        #Arguments
        if args_dict:
            argcount = six.get_function_code(map_fun).co_argcount
            names = six.get_function_code(map_fun).co_varnames

            args = []
            argIndex = 0
            param_string = ''
            while argcount > 0:
                #Get args for executor and run function
                param_name = names[argIndex]
                param_val = args_dict[param_name][executor_num]
                param_string += str(param_name) + '=' + str(param_val) + '.'
                args.append(param_val)
                argcount -= 1
                argIndex += 1
            param_string = param_string[:-1]

            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                app_id, run_id, param_string, 'grid_search')

            pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
            hopshdfs.init_logger()
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir, hdfs_appid_logdir, executor_num,
                local_logdir=local_logdir)

            gpu_str = '\nChecking for GPUs in the environment' + devices.get_gpu_info()
            hopshdfs.log(gpu_str)
            print(gpu_str)
            print('-------------------------------------------------------')
            print('Started running task ' + param_string + '\n')
            hopshdfs.log('Started running task ' + param_string)
            task_start = datetime.datetime.now()
            retval = map_fun(*args)
            task_end = datetime.datetime.now()
            _handle_return(retval, hdfs_exec_logdir)
            time_str = 'Finished task ' + param_string + ' - took ' + util.time_diff(task_start, task_end)
            print('\n' + time_str)
            print('-------------------------------------------------------')
            hopshdfs.log(time_str)
    except:
        #Always do cleanup
        _cleanup(tb_hdfs_path)
        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()
        raise
    finally:
        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            util.store_local_tensorboard(local_tb, hdfs_exec_logdir)

    _cleanup(tb_hdfs_path)
    if devices.get_num_gpus() > 0:
        t.do_run = False
        t.join()
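# --- Usage sketch (illustrative driver-side code, not part of the original module) ---
# The wrapper above consumes a partition iterator that yields a single executor
# index, so a hypothetical driver could dispatch one task per hyperparameter
# combination roughly like this (`sc` is the SparkContext and `num_tasks` the
# number of combinations in `args_dict`):
#
#   sc.parallelize(range(num_tasks), num_tasks).foreachPartition(_wrapper_fun)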
def _mapfn(iter):
    # Note: consuming the input iterator helps Pyspark re-use this worker,
    for i in iter:
        executor_id = i

    # assign TF job/task based on provided cluster_spec template (or use default/null values)
    job_name = 'default'
    task_index = -1
    cluster_id = cluster_meta['id']
    cluster_template = cluster_meta['cluster_template']
    for jobtype in cluster_template:
        nodes = cluster_template[jobtype]
        if executor_id in nodes:
            job_name = jobtype
            task_index = nodes.index(executor_id)
            break

    # get unique key (hostname, executor_id) for this executor
    host = util.get_ip_address()
    util.write_executor_id(executor_id)
    port = 0

    # check for existing TFManagers
    if TFSparkNode.mgr is not None and str(TFSparkNode.mgr.get('state')) != "'stopped'":
        if TFSparkNode.cluster_id == cluster_id:
            # raise an exception to force Spark to retry this "reservation" task on another executor
            raise Exception("TFManager already started on {0}, executor={1}, state={2}".format(
                host, executor_id, str(TFSparkNode.mgr.get("state"))))
        else:
            # old state, just continue with creating new manager
            logging.warn("Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}".format(
                TFSparkNode.cluster_id, cluster_id))

    gpu_present = gpu_info.detect_gpu_present()
    client = reservation.Client(cluster_meta['server_addr'])
    logging.info("TFSparkNode.run register: {0}".format(gpu_present))
    client.register_gpu_presence(gpu_present)
    gpus_are_present_on_executors = client.await_gpu_check()
    logging.info("TFSparkNode.run await_gpu_check: {0}".format(gpus_are_present_on_executors))

    # check for existing TFManagers
    if TFSparkNode.mgr is not None and str(TFSparkNode.mgr.get('state')) != "'stopped'":
        if TFSparkNode.cluster_id == cluster_id:
            # raise an exception to force Spark to retry this "reservation" task on another executor
            raise Exception("TFManager already started on {0}, state={1}".format(
                host, str(TFSparkNode.mgr.get("state"))))
        else:
            # old state, just continue with creating new manager
            logging.warn("Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}".format(
                TFSparkNode.cluster_id, cluster_id))

    # start a TFManager and get a free port
    # use a random uuid as the authkey
    authkey = uuid.uuid4().bytes
    addr = None
    if (gpus_are_present_on_executors):
        #Valid PS, does not have GPUs, will be started as a PS
        if job_name == 'ps' and gpu_present == False:
            # PS nodes must be remotely accessible in order to shutdown from Spark driver.
            TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'], 'remote')
            addr = (host, TFSparkNode.mgr.address[1])
        #Invalid worker, all workers should have GPUs, this one will assume role as PS
        elif job_name == 'worker' and gpu_present == False:
            # PS nodes must be remotely accessible in order to shutdown from Spark driver.
            TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'], 'remote')
            addr = (host, TFSparkNode.mgr.address[1])
        #Correct worker
        else:
            # worker nodes only need to be locally accessible within the executor for data feeding
            TFSparkNode.mgr = TFManager.start(authkey, queues)
            addr = TFSparkNode.mgr.address
    else:
        if job_name == 'ps':
            # PS nodes must be remotely accessible in order to shutdown from Spark driver.
            TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'], 'remote')
            addr = (host, TFSparkNode.mgr.address[1])
        else:
            # worker nodes only need to be locally accessible within the executor for data feeding
            TFSparkNode.mgr = TFManager.start(authkey, queues)
            addr = TFSparkNode.mgr.address

    # initialize mgr state
    TFSparkNode.mgr.set('state', 'running')
    TFSparkNode.cluster_id = cluster_id

    # expand Hadoop classpath wildcards for JNI (Spark 2.x)
    if 'HADOOP_PREFIX' in os.environ:
        classpath = os.environ['CLASSPATH']
        hadoop_path = os.path.join(os.environ['HADOOP_PREFIX'], 'bin', 'hadoop')
        hadoop_classpath = subprocess.check_output([hadoop_path, 'classpath', '--glob']).decode()
        logging.debug("CLASSPATH: {0}".format(hadoop_classpath))
        os.environ['CLASSPATH'] = classpath + os.pathsep + hadoop_classpath

    # start TensorBoard if requested
    tb_pid = 0
    tb_port = 0

    # check server to see if this task is being retried (i.e. already reserved)
    client = reservation.Client(cluster_meta['server_addr'])
    cluster_info = client.get_reservations()
    tmp_sock = None
    node_meta = None
    for node in cluster_info:
        (nhost, nexec) = (node['host'], node['executor_id'])
        if nhost == host and nexec == executor_id:
            node_meta = node
            port = node['port']

    # if not already done, register everything we need to set up the cluster
    if node_meta is None:
        # first, find a free port for TF
        tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tmp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        tmp_sock.bind(('', port))
        port = tmp_sock.getsockname()[1]

        node_meta = {
            'executor_id': executor_id,
            'host': host,
            'job_name': job_name,
            'task_index': task_index,
            'port': port,
            'tb_pid': tb_pid,
            'tb_port': tb_port,
            'addr': addr,
            'authkey': authkey,
            'gpu_present': gpu_present
        }
        # register node metadata with server
        logging.info("TFSparkNode.run register: {0}".format(node_meta))
        client.register(node_meta)

    # wait for other nodes to finish reservations
    cluster_info = client.await_reservations()
    logging.info("TFSparkNode.run await_reservations: {0}".format(cluster_info))
    client.close()

    # construct a TensorFlow clusterspec from cluster_info
    sorted_cluster_info = sorted(cluster_info, key=lambda k: k['executor_id'])
    spec = {}
    last_executor_id = -1
    for node in sorted_cluster_info:
        if (node['executor_id'] == last_executor_id):
            raise Exception("Duplicate worker/task in cluster_info")
        last_executor_id = node['executor_id']
        logging.info("node: {0}".format(node))
        (njob, nhost, nport) = (node['job_name'], node['host'], node['port'])
        hosts = [] if njob not in spec else spec[njob]
        hosts.append("{0}:{1}".format(nhost, nport))
        spec[njob] = hosts

    for node in cluster_info:
        if ((node_meta['host'] == node['host']) and (node_meta['authkey'] == node['authkey'])):
            job_name = node['job_name']
            task_index = node['task_index']
            executor_id = node['executor_id']
            break

    hdfs_exec_logdir = ''
    if gpus_are_present_on_executors and gpu_present and job_name == 'worker' and task_index == 0:
        # When running with GPUs
        hdfs_exec_logdir, hdfs_appid_logdir = hdfs.create_directories(
            app_id, run_id, None, 'tensorflowonspark')
        tb_proc = tensorboard.register(hdfs_exec_logdir, hdfs_appid_logdir, 0,
                                       local_logdir=local_logdir)
    elif not gpus_are_present_on_executors and job_name == 'worker' and task_index == 0:
        # When running with no GPUs
        hdfs_exec_logdir, hdfs_appid_logdir = hdfs.create_directories(
            app_id, run_id, None, 'tensorflowonspark')
        tb_proc = tensorboard.register(hdfs_exec_logdir, hdfs_appid_logdir, 0,
                                       local_logdir=local_logdir)

    # construct a TensorFlow clusterspec from cluster_info
    sorted_cluster_info = sorted(cluster_info, key=lambda k: k['executor_id'])
    spec = {}
    for node in sorted_cluster_info:
        logging.info("node: {0}".format(node))
        (njob, nhost, nport) = (node['job_name'], node['host'], node['port'])
        hosts = [] if njob not in spec else spec[njob]
        hosts.append("{0}:{1}".format(nhost, nport))
        spec[njob] = hosts

    # update TF_CONFIG and reserve GPU for tf.estimator based code
    # Note: this will execute but be ignored by non-tf.estimator code
    tf_config = json.dumps({
        'cluster': spec,
        'task': {
            'type': job_name,
            'index': task_index
        },
        'environment': 'cloud'
    })
    os.environ['TF_CONFIG'] = tf_config

    # create a context object to hold metadata for TF
    ctx = TFNodeContext(executor_id, job_name, task_index, spec,
                        cluster_meta['default_fs'], cluster_meta['working_dir'],
                        TFSparkNode.mgr)

    # release port reserved for TF as late as possible
    if tmp_sock is not None:
        tmp_sock.close()

    # Background mode relies reuse of python worker in Spark.
    if background:
        # However, reuse of python worker can't work on Windows, we need to check if the current
        # script runs on Windows or not.
        if os.name == 'nt' or platform.system() == 'Windows':
            raise Exception("Background mode is not supported on Windows.")
        # Check if the config of reuse python worker is enabled on Spark.
        if not os.environ.get("SPARK_REUSE_WORKER"):
            raise Exception(
                "Background mode relies reuse of python worker on Spark. This config "
                "'spark.python.worker.reuse' is not enabled on Spark. Please enable it before using background.")

    def wrapper_fn(args, context):
        """Wrapper function that sets the sys.argv of the executor."""
        if isinstance(args, list):
            sys.argv = args
        fn(args, context)

    def wrapper_fn_background(args, context):
        """Wrapper function that signals exceptions to foreground process."""
        errq = TFSparkNode.mgr.get_queue('error')
        try:
            wrapper_fn(args, context)
        except Exception:
            errq.put(traceback.format_exc())
            errq.join()

    if job_name == 'ps' or background:
        # invoke the TensorFlow main function in a background thread
        logging.info(
            "Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process".format(
                job_name, task_index, job_name, executor_id))
        p = multiprocessing.Process(target=wrapper_fn_background, args=(tf_args, ctx))
        if job_name == 'ps':
            p.daemon = True
        p.start()

        # for ps nodes only, wait indefinitely in foreground thread for a "control" event (None == "stop")
        if job_name == 'ps':
            queue = TFSparkNode.mgr.get_queue('control')
            equeue = TFSparkNode.mgr.get_queue('error')
            done = False
            while not done:
                while (queue.empty() and equeue.empty()):
                    time.sleep(1)
                if (not equeue.empty()):
                    e_str = equeue.get()
                    equeue.task_done()
                    raise Exception("exception in ps:\n" + e_str)
                msg = queue.get(block=True)
                logging.info("Got msg: {0}".format(msg))
                if msg == None:
                    logging.info("Terminating PS")
                    TFSparkNode.mgr.set('state', 'stopped')
                    done = True
                queue.task_done()
    else:
        t = threading.Thread(target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t.start()

        # otherwise, just run TF function in the main executor/worker thread
        logging.info(
            "Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread".format(
                job_name, task_index, executor_id))

        try:
            wrapper_fn(tf_args, ctx)
        except:
            raise
        finally:
            if local_logdir:
                if gpus_are_present_on_executors and gpu_present and job_name == 'worker' and task_index == 0:
                    # When running with GPUs
                    local_tb = tensorboard.local_logdir_path
                    hopsutil.store_local_tensorboard(local_tb, hdfs_exec_logdir)
                elif not gpus_are_present_on_executors and job_name == 'worker' and task_index == 0:
                    # When running with no GPUs
                    local_tb = tensorboard.local_logdir_path
                    hopsutil.store_local_tensorboard(local_tb, hdfs_exec_logdir)

        if devices.get_num_gpus() > 0:
            t.do_run = False
            t.join()

        logging.info("Finished TensorFlow {0}:{1} on cluster node {2}".format(
            job_name, task_index, executor_id))
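# --- Illustrative note (not part of the original module) ---------------------
# For a cluster with one ps and two workers, the TF_CONFIG exported above would
# look roughly like this (hosts and ports are hypothetical):
#
#   {"cluster": {"ps": ["10.0.0.1:2222"],
#                "worker": ["10.0.0.2:2223", "10.0.0.3:2224"]},
#    "task": {"type": "worker", "index": 0},
#    "environment": "cloud"}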
def _wrapper_fun(iter):
    for i in iter:
        executor_num = i

    client = coordination_server.Client(server_addr)

    node_meta = {
        'host': get_ip_address(),
        'executor_cwd': os.getcwd(),
        'cuda_visible_devices_ordinals': devices.get_minor_gpu_device_numbers()
    }

    client.register(node_meta)

    t_gpus = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t_gpus.start()

    # Only spark executor with index 0 should create necessary HDFS directories and start mpirun
    # Other executors simply block until index 0 reports mpirun is finished
    clusterspec = client.await_reservations()

    #pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
    #hopshdfs.init_logger()
    #hopshdfs.log('Starting Spark executor with arguments')

    gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info()
    #hopshdfs.log(gpu_str)
    print(gpu_str)

    mpi_logfile_path = os.getcwd() + '/mpirun.log'
    if os.path.exists(mpi_logfile_path):
        os.remove(mpi_logfile_path)

    mpi_logfile = open(mpi_logfile_path, 'w')

    py_runnable = localize_scripts(nb_path, clusterspec)

    # non-chief executor should not do mpirun
    if not executor_num == 0:
        client.await_mpirun_finished()
    else:
        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, param_string='Horovod')
        tb_hdfs_path, tb_pid = tensorboard.register(hdfs_exec_logdir, hdfs_appid_logdir, 0)

        mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                  ' mpirun -np ' + str(get_num_ps(clusterspec)) + ' --hostfile ' + get_hosts_file(clusterspec) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x LD_LIBRARY_PATH ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  ' -mca pml ob1 -mca btl ^openib ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

        mpi = subprocess.Popen(mpi_cmd,
                               shell=True,
                               stdout=mpi_logfile,
                               stderr=mpi_logfile,
                               preexec_fn=util.on_executor_exit('SIGTERM'))

        t_log = threading.Thread(target=print_log)
        t_log.start()

        mpi.wait()

        client.register_mpirun_finished()

        if devices.get_num_gpus() > 0:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if return_code != 0:
            cleanup(tb_hdfs_path)
            t_log.do_run = False
            t_log.join()
            raise Exception('mpirun FAILED, look in the logs for the error')

        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()
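# --- Illustrative note (not part of the original module) ---------------------
# The mpi_cmd string assembled above renders to roughly the following shell
# command (the logdir, host file, process count and script path are hypothetical):
#
#   HOROVOD_TIMELINE=<logdir>/timeline.json TENSORBOARD_LOGDIR=<logdir> \
#     mpirun -np 4 --hostfile <hostfile> -bind-to none -map-by slot \
#     -x LD_LIBRARY_PATH -x HOROVOD_TIMELINE -x TENSORBOARD_LOGDIR -x NCCL_DEBUG=INFO \
#     -mca pml ob1 -mca btl ^openib <pyspark_python> <py_runnable>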
def _wrapper_fun(iter):
    for i in iter:
        executor_num = i

    hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
        app_id, run_id, None, 'horovod')

    tb_pid = 0
    tb_hdfs_path = ''

    pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
    hopshdfs.init_logger()
    hopshdfs.log('Starting Spark executor with arguments')
    if executor_num == 0:
        tb_hdfs_path, tb_pid = tensorboard.register(
            hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)

    gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info()
    hopshdfs.log(gpu_str)
    print(gpu_str)

    #1. Download notebook file
    fs_handle = hopshdfs.get_fs()

    try:
        fd = fs_handle.open_file(nb_path, flags='r')
    except:
        fd = fs_handle.open_file(nb_path, mode='r')

    notebook = ''
    for line in fd:
        notebook += line

    path, filename = os.path.split(nb_path)
    f_nb = open(filename, "w+")
    f_nb.write(notebook)
    f_nb.flush()
    f_nb.close()

    # 2. Convert notebook to py file
    jupyter_runnable = os.path.abspath(
        os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
    conversion_cmd = jupyter_runnable + ' nbconvert --to python ' + filename
    conversion = subprocess.Popen(conversion_cmd,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    conversion.wait()
    stdout, stderr = conversion.communicate()
    print(stdout)
    print(stderr)

    # 3. Make py file runnable
    py_runnable = os.getcwd() + '/' + filename.split('.')[0] + '.py'
    st = os.stat(py_runnable)
    os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

    t_gpus = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t_gpus.start()

    mpi_logfile_path = os.getcwd() + '/mpirun.log'
    if os.path.exists(mpi_logfile_path):
        os.remove(mpi_logfile_path)

    mpi_logfile = open(mpi_logfile_path, 'w')

    # 4. Run allreduce
    mpi_np = os.environ['MPI_NP']
    mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
              ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
              ' mpirun -np ' + str(mpi_np) + \
              ' -bind-to none -map-by slot ' + \
              ' -x HOROVOD_TIMELINE ' + \
              ' -x TENSORBOARD_LOGDIR ' + \
              ' -x NCCL_DEBUG=INFO ' + \
              os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable
    mpi = subprocess.Popen(mpi_cmd,
                           shell=True,
                           stdout=mpi_logfile,
                           stderr=mpi_logfile,
                           preexec_fn=util.on_executor_exit('SIGTERM'))

    t_log = threading.Thread(target=print_log)
    t_log.start()

    mpi.wait()

    if devices.get_num_gpus() > 0:
        t_gpus.do_run = False
        t_gpus.join()

    return_code = mpi.returncode

    if local_logdir:
        local_tb = tensorboard.local_logdir_path
        pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

    if return_code != 0:
        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()
        raise Exception('mpirun FAILED, look in the logs for the error')

    cleanup(tb_hdfs_path)
    t_log.do_run = False
    t_log.join()

    hopshdfs.kill_logger()
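# --- Illustrative note (not part of the original module) ---------------------
# Steps 1-3 above amount to copying the notebook to the executor's working
# directory and turning it into an executable script, roughly equivalent to
# (hypothetical filename):
#
#   jupyter nbconvert --to python train.ipynb   # produces train.py
#   chmod +x train.py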
def begin(spark, name='no-name', local_logdir=False, versioned_resources=None, description=None):
    """ Start an experiment

    Args:
        :spark: SparkSession object
        :name: (optional) name of the job
        :local_logdir: (optional) write TensorBoard logs to the local filesystem before syncing to HDFS
        :versioned_resources: (optional) resources to version with this experiment
        :description: (optional) description of the experiment
    """
    global running
    if running:
        raise RuntimeError(
            "An experiment is currently running. Please call experiment.stop() to stop it.")

    try:
        global app_id
        global experiment_json
        global elastic_id
        global run_id
        global driver_tensorboard_hdfs_path

        running = True

        sc = spark.sparkContext
        app_id = str(sc.applicationId)

        run_id = run_id + 1

        versioned_path = util.version_resources(versioned_resources, get_logdir(app_id))

        experiment_json = None
        experiment_json = util.populate_experiment(sc, name, 'experiment', 'begin',
                                                   get_logdir(app_id), None,
                                                   versioned_path, description)

        util.version_resources(versioned_resources, get_logdir(app_id))

        util.put_elastic(hopshdfs.project_name(), app_id, elastic_id, experiment_json)

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'begin')

        pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())

        hopshdfs.init_logger()

        driver_tensorboard_hdfs_path, _ = tensorboard.register(
            hdfs_exec_logdir, hdfs_appid_logdir, 0,
            local_logdir=local_logdir, tensorboard_driver=True)
    except:
        exception_handler()
        raise

    return
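# --- Usage sketch (illustrative, not part of the original module) -------------
# A minimal call sequence, assuming a SparkSession named `spark` is available
# and this module is importable as `experiment`:
#
#   experiment.begin(spark, name='mnist', local_logdir=True)
#   # ... training code that writes summaries under tensorboard.logdir() ...
#   experiment.stop()   # as referenced in the RuntimeError message above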