def _restart_debugging(interactive=True):
    """Restart the running TensorBoard process with a debugger port attached.

    Kills the TensorBoard identified by the module-global ``tb_pid``, picks a
    free port for the debugger, and relaunches TensorBoard with the same
    logdir/port plus the appropriate debugger flag.

    Args:
        :interactive: if True use the interactive debugger plugin flag
            (``--debugger_port``); otherwise use the tfdbg grpc data-server
            flag (``--debugger_data_server_grpc_port``).

    Returns:
        The debugger address as a ``'localhost:<port>'`` string.
    """
    global tb_pid
    global pypath
    global tb_path
    global tb_port

    # Kill the existing TensorBoard before spawning the replacement.
    proc = subprocess.Popen(["kill", str(tb_pid)])
    proc.wait()

    # Find a free port by binding to port 0 and immediately releasing it.
    # NOTE(review): this is racy — the port could be taken again before
    # TensorBoard binds it.
    debugger_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    debugger_socket.bind(('', 0))
    _, debugger_port = debugger_socket.getsockname()
    debugger_socket.close()

    tb_env = os.environ.copy()
    # TensorBoard itself must not reserve any GPUs.
    tb_env['CUDA_VISIBLE_DEVICES'] = ''

    # The two modes differ only in which debugger flag is passed; build the
    # flag once instead of duplicating the whole Popen call.
    if interactive:
        debugger_flag = "--debugger_port=%d" % debugger_port
    else:
        debugger_flag = "--debugger_data_server_grpc_port=%d" % debugger_port

    tb_proc = subprocess.Popen(
        [
            pypath, tb_path,
            "--logdir=%s" % logdir(),
            "--port=%d" % tb_port,
            debugger_flag
        ],
        env=tb_env,
        preexec_fn=util.on_executor_exit('SIGTERM'))
    tb_pid = tb_proc.pid

    # Give TensorBoard a moment to come up before callers try to connect.
    time.sleep(2)

    return 'localhost:' + str(debugger_port)
def visualize(spark_session, hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS.

    This is intended for use after running TensorFlow jobs to visualize them
    all in the same TensorBoard. tflauncher.launch returns the path in HopsFS
    which should be handed as argument for this method to visualize all runs.

    Args:
        :spark_session: SparkSession object
        :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """
    sc = spark_session.sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    # Always start from an empty local event directory.
    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    # Find a free port by binding to port 0 and releasing the socket.
    # NOTE(review): racy — the port could be reused before TensorBoard binds it.
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    _, tb_port = tb_socket.getsockname()
    tb_path = util.find_tensorboard()
    tb_socket.close()

    tb_env = os.environ.copy()
    # TensorBoard must not reserve any GPUs.
    tb_env['CUDA_VISIBLE_DEVICES'] = ''

    tb_proc = subprocess.Popen(
        [pypath, tb_path, "--logdir=%s" % logdir, "--port=%d" % tb_port],
        env=tb_env,
        preexec_fn=util.on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs.get_experiments_dir(
    ) + "/" + app_id + "/TensorBoard.driver"
    # Dump tb host:port to HDFS so clients can discover this TensorBoard.
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    # Localize every non-.log entry so TensorBoard can index the events.
    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if extension != '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    # Block until TensorBoard exits, then surface its output.
    tb_proc.wait()
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
def _wrapper_fun(iter):
    """Executor-side wrapper for a coordinated Horovod mpirun.

    Runs once per Spark executor. Registers the executor with the
    coordination server, waits for all reservations, and then executor 0
    (the chief) localizes the scripts, starts ``mpirun`` and reports
    completion; every other executor blocks until the chief reports that
    mpirun has finished.

    Args:
        :iter: partition iterator; each element is treated as this
            executor's index (presumably a single-element partition —
            TODO confirm against the caller).
    """
    # The loop only extracts the executor index from the partition.
    for i in iter:
        executor_num = i

    # Register this executor's host, cwd and GPU ordinals with the
    # coordination server so the chief can build a cluster spec.
    client = coordination_server.Client(server_addr)

    node_meta = {
        'host': get_ip_address(),
        'executor_cwd': os.getcwd(),
        'cuda_visible_devices_ordinals': devices.get_minor_gpu_device_numbers()
    }

    client.register(node_meta)

    # Background thread that periodically prints GPU utilization; started
    # only when GPUs are present, and stopped later via its do_run flag.
    t_gpus = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t_gpus.start()

    # Only spark executor with index 0 should create necessary HDFS directories and start mpirun
    # Other executors simply block until index 0 reports mpirun is finished
    clusterspec = client.await_reservations()

    #pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
    #hopshdfs.init_logger()
    #hopshdfs.log('Starting Spark executor with arguments')

    gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info(
    )
    #hopshdfs.log(gpu_str)
    print(gpu_str)

    # Start mpirun with a fresh logfile.
    mpi_logfile_path = os.getcwd() + '/mpirun.log'
    if os.path.exists(mpi_logfile_path):
        os.remove(mpi_logfile_path)
    mpi_logfile = open(mpi_logfile_path, 'w')

    py_runnable = localize_scripts(nb_path, clusterspec)

    # non-chief executor should not do mpirun
    if not executor_num == 0:
        # Block until the chief signals that mpirun has completed.
        client.await_mpirun_finished()
    else:
        # Chief: create HDFS run directories and register TensorBoard for
        # this run before launching mpirun.
        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, param_string='Horovod')
        tb_hdfs_path, tb_pid = tensorboard.register(
            hdfs_exec_logdir, hdfs_appid_logdir, 0)

        # Build the mpirun command line. HOROVOD_TIMELINE/TENSORBOARD_LOGDIR
        # are exported to all ranks via the -x flags below.
        mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                  ' mpirun -np ' + str(get_num_ps(clusterspec)) + ' --hostfile ' + get_hosts_file(clusterspec) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x LD_LIBRARY_PATH ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  ' -mca pml ob1 -mca btl ^openib ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

        # shell=True is required because mpi_cmd embeds env-var assignments.
        mpi = subprocess.Popen(mpi_cmd,
                               shell=True,
                               stdout=mpi_logfile,
                               stderr=mpi_logfile,
                               preexec_fn=util.on_executor_exit('SIGTERM'))

        # Tail the mpirun logfile in the background while mpirun runs.
        t_log = threading.Thread(target=print_log)
        t_log.start()

        mpi.wait()

        # Unblock the non-chief executors waiting above.
        client.register_mpirun_finished()

        if devices.get_num_gpus() > 0:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if return_code != 0:
            # Clean up TensorBoard state and stop the log thread before
            # surfacing the failure.
            cleanup(tb_hdfs_path)
            t_log.do_run = False
            t_log.join()
            raise Exception(
                'mpirun FAILED, look in the logs for the error')

        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()
def _wrapper_fun(iter):
    """Executor-side wrapper that converts a notebook and runs Horovod allreduce.

    Runs once per Spark executor: downloads the notebook from HDFS, converts
    it to an executable .py file via nbconvert, then launches ``mpirun`` on
    it with MPI_NP processes and mirrors results/logs back to HDFS.

    Args:
        :iter: partition iterator; each element is treated as this
            executor's index (presumably a single-element partition —
            TODO confirm against the caller).
    """
    # The loop only extracts the executor index from the partition.
    for i in iter:
        executor_num = i

    hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
        app_id, run_id, None, 'horovod')

    tb_pid = 0
    tb_hdfs_path = ''

    # Create/truncate the executor logfile in HDFS and attach the logger.
    pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
    hopshdfs.init_logger()
    hopshdfs.log('Starting Spark executor with arguments')
    # Only executor 0 runs a TensorBoard for this job.
    if executor_num == 0:
        tb_hdfs_path, tb_pid = tensorboard.register(
            hdfs_exec_logdir, hdfs_appid_logdir, 0, local_logdir=local_logdir)

    gpu_str = '\n\nChecking for GPUs in the environment\n' + devices.get_gpu_info(
    )
    hopshdfs.log(gpu_str)
    print(gpu_str)

    #1. Download notebook file
    fs_handle = hopshdfs.get_fs()

    # NOTE(review): fallback between flags=/mode= presumably papers over a
    # pydoop API difference between versions — confirm; the bare except is
    # intentionally broad here.
    try:
        fd = fs_handle.open_file(nb_path, flags='r')
    except:
        fd = fs_handle.open_file(nb_path, mode='r')

    notebook = ''
    for line in fd:
        notebook += line

    path, filename = os.path.split(nb_path)
    f_nb = open(filename, "w+")
    f_nb.write(notebook)
    f_nb.flush()
    f_nb.close()

    # 2. Convert notebook to py file
    # The jupyter binary is located next to the PYSPARK_PYTHON interpreter.
    jupyter_runnable = os.path.abspath(
        os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
    conversion_cmd = jupyter_runnable + ' nbconvert --to python ' + filename
    conversion = subprocess.Popen(conversion_cmd,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    conversion.wait()
    stdout, stderr = conversion.communicate()
    print(stdout)
    print(stderr)

    # 3. Make py file runnable
    py_runnable = os.getcwd() + '/' + filename.split('.')[0] + '.py'
    st = os.stat(py_runnable)
    os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

    # Background thread that periodically prints GPU utilization; started
    # only when GPUs are present, stopped later via its do_run flag.
    t_gpus = threading.Thread(target=devices.print_periodic_gpu_utilization)
    if devices.get_num_gpus() > 0:
        t_gpus.start()

    # Run mpirun with a fresh logfile.
    mpi_logfile_path = os.getcwd() + '/mpirun.log'
    if os.path.exists(mpi_logfile_path):
        os.remove(mpi_logfile_path)
    mpi_logfile = open(mpi_logfile_path, 'w')

    # 4. Run allreduce
    mpi_np = os.environ['MPI_NP']
    # HOROVOD_TIMELINE/TENSORBOARD_LOGDIR are exported to all ranks via -x.
    mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
              ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
              ' mpirun -np ' + str(mpi_np) + \
              ' -bind-to none -map-by slot ' + \
              ' -x HOROVOD_TIMELINE ' + \
              ' -x TENSORBOARD_LOGDIR ' + \
              ' -x NCCL_DEBUG=INFO ' + \
              os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable
    # shell=True is required because mpi_cmd embeds env-var assignments.
    mpi = subprocess.Popen(mpi_cmd,
                           shell=True,
                           stdout=mpi_logfile,
                           stderr=mpi_logfile,
                           preexec_fn=util.on_executor_exit('SIGTERM'))

    # Tail the mpirun logfile in the background while mpirun runs.
    t_log = threading.Thread(target=print_log)
    t_log.start()

    mpi.wait()

    if devices.get_num_gpus() > 0:
        t_gpus.do_run = False
        t_gpus.join()

    return_code = mpi.returncode

    # Mirror the locally-written TensorBoard events back to HDFS.
    if local_logdir:
        local_tb = tensorboard.local_logdir_path
        pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

    if return_code != 0:
        # Clean up TensorBoard state and stop the log thread before
        # surfacing the failure.
        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()
        raise Exception('mpirun FAILED, look in the logs for the error')

    cleanup(tb_hdfs_path)
    t_log.do_run = False
    t_log.join()

    hopshdfs.kill_logger()
def register(hdfs_exec_dir,
             endpoint_dir,
             exec_num,
             local_logdir=False,
             tensorboard_driver=False):
    """Start a TensorBoard for this executor and publish its endpoint in HDFS.

    Kills any TensorBoard previously started by this module, launches a new
    one on a free port pointing either at a fresh local logdir or at the
    HDFS execution dir, and writes the ``host:port`` URL to an endpoint file
    under ``endpoint_dir``.

    Args:
        :hdfs_exec_dir: HDFS directory containing the TensorBoard events.
        :endpoint_dir: HDFS directory where the endpoint file is written.
        :exec_num: executor index, used to name the task endpoint file.
        :local_logdir: if True, point TensorBoard at a freshly created
            local ``local_logdir`` directory instead of ``hdfs_exec_dir``.
        :tensorboard_driver: if True, publish as ``TensorBoard.driver``
            instead of ``TensorBoard.task<exec_num>``.

    Returns:
        Tuple of (endpoint path in HDFS, TensorBoard pid).
    """
    global tb_pid

    # A TensorBoard from a previous run is still alive: kill it and reset
    # the module-level state (presumably _reset_global() zeroes tb_pid so
    # the relaunch below runs — confirm in _reset_global).
    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])
        _reset_global()

    global events_logdir
    events_logdir = hdfs_exec_dir

    global local_logdir_bool
    local_logdir_bool = local_logdir

    if tb_pid == 0:
        global pypath
        pypath = os.getenv("PYSPARK_PYTHON")

        # Find a free port by binding to port 0 and releasing the socket.
        # NOTE(review): racy — the port could be reused before TB binds it.
        tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tb_socket.bind(('', 0))
        global tb_port
        _, tb_port = tb_socket.getsockname()
        global tb_path
        tb_path = util.find_tensorboard()
        tb_socket.close()

        tb_env = os.environ.copy()
        # TensorBoard must not reserve any GPUs.
        tb_env['CUDA_VISIBLE_DEVICES'] = ''

        # The two launch variants differ only in the logdir; compute it once
        # instead of duplicating the Popen call.
        global local_logdir_path
        if local_logdir:
            # Recreate an empty local logdir for TensorBoard to watch.
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
            os.makedirs(local_logdir_path)
            tb_logdir = local_logdir_path
        else:
            tb_logdir = events_logdir

        tb_proc = subprocess.Popen(
            [
                pypath, tb_path,
                "--logdir=%s" % tb_logdir,
                "--port=%d" % tb_port
            ],
            env=tb_env,
            preexec_fn=util.on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        host = socket.gethostname()
        global tb_url
        tb_url = "http://{0}:{1}".format(host, tb_port)

    global endpoint
    if tensorboard_driver:
        endpoint = endpoint_dir + "/TensorBoard.driver"
    else:
        endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

    #dump tb host:port to hdfs
    pydoop.hdfs.dump(tb_url, endpoint, user=hopshdfs.project_user())

    return endpoint, tb_pid