def _restart_debugging(interactive=True):
    """Restart the running TensorBoard process with a debugger port attached.

    Kills the TensorBoard process recorded in the module-global ``tb_pid``,
    picks a free port for the debugger, and relaunches TensorBoard on the
    same ``tb_port`` with the debugger flag appropriate for the mode.

    Args:
        :interactive: If True, attach the interactive debugger
            (``--debugger_port``); otherwise attach the non-interactive gRPC
            data server (``--debugger_data_server_grpc_port``).

    Returns:
        The debugger endpoint as a ``'localhost:<port>'`` string.
    """
    global tb_pid

    # Kill existing TensorBoard and wait for the kill command to finish.
    proc = subprocess.Popen(["kill", str(tb_pid)])
    proc.wait()

    # Bind to port 0 to let the OS pick a free port, then release it so
    # TensorBoard can bind it. NOTE(review): small race window between
    # close() and the relaunch below — another process could grab the port.
    debugger_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    debugger_socket.bind(('', 0))
    debugger_addr, debugger_port = debugger_socket.getsockname()
    debugger_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''
    tb_env['LC_ALL'] = 'C'
    tb_env['TMPDIR'] = os.getcwd()

    # The two modes differ only in the name of the debugger flag, so build
    # that one argument conditionally instead of duplicating the launch.
    if interactive:
        debugger_flag = "--debugger_port=%d" % debugger_port
    else:
        debugger_flag = "--debugger_data_server_grpc_port=%d" % debugger_port

    tb_proc = subprocess.Popen(
        [
            pypath, tb_path,
            "--logdir=%s" % logdir(),
            "--port=%d" % tb_port,
            debugger_flag,
            "--host=%s" % "0.0.0.0"
        ],
        env=tb_env,
        preexec_fn=util._on_executor_exit('SIGTERM'))
    tb_pid = tb_proc.pid

    # Give TensorBoard a moment to come up before handing out the endpoint.
    time.sleep(2)

    return 'localhost:' + str(debugger_port)
def visualize(hdfs_root_logdir):
    """ Visualize all TensorBoard events for a given path in HopsFS. This is intended for
        use after running TensorFlow jobs to visualize them all in the same TensorBoard.
        tflauncher.launch returns the path in HopsFS which should be handed as argument
        for this method to visualize all runs.

    Args:
        :hdfs_root_logdir: the path in HopsFS to enter as the logdir for TensorBoard
    """
    sc = util._find_spark().sparkContext
    app_id = str(sc.applicationId)

    pypath = os.getenv("PYSPARK_PYTHON")

    # Start from an empty local events directory: remove any stale copy,
    # then (re)create it unconditionally.
    logdir = os.getcwd() + '/tensorboard_events/'
    if os.path.exists(logdir):
        shutil.rmtree(logdir)
    os.makedirs(logdir)

    # Find a free port by binding to port 0, then release it for TensorBoard.
    # NOTE(review): small race window between close() and the Popen below.
    tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tb_socket.bind(('', 0))
    tb_addr, tb_port = tb_socket.getsockname()
    tb_path = util._find_tensorboard()
    tb_socket.close()

    tb_env = os.environ.copy()
    tb_env['CUDA_VISIBLE_DEVICES'] = ''
    tb_env['LC_ALL'] = 'C'

    tb_proc = subprocess.Popen(
        [
            pypath, tb_path,
            "--logdir=%s" % logdir,
            "--port=%d" % tb_port,
            "--host=%s" % "0.0.0.0"
        ],
        env=tb_env,
        preexec_fn=util._on_executor_exit('SIGTERM'))

    host = socket.gethostname()
    tb_url = "http://{0}:{1}".format(host, tb_port)
    tb_endpoint = hopshdfs._get_experiments_dir(
    ) + "/" + app_id + "/TensorBoard.visualize"
    # dump tb host:port to hdfs so clients can discover the endpoint
    pydoop.hdfs.dump(tb_url, tb_endpoint, user=hopshdfs.project_user())

    # Pull every non-.log entry under the root logdir down to the local
    # events directory so a single TensorBoard can show all runs.
    handle = hopshdfs.get()
    hdfs_logdir_entries = handle.list_directory(hdfs_root_logdir)
    for entry in hdfs_logdir_entries:
        file_name, extension = splitext(entry['name'])
        if not extension == '.log':
            pydoop.hdfs.get(entry['name'], logdir)

    # Block until TensorBoard exits.
    tb_proc.wait()
    # NOTE(review): stdout/stderr were not piped, so communicate() returns
    # (None, None) here and these prints emit "None" — confirm intent.
    stdout, stderr = tb_proc.communicate()
    print(stdout)
    print(stderr)
def start_beam_jobserver(flink_session_name,
                         artifacts_dir="Resources",
                         jobserver_jar=None,
                         sdk_worker_parallelism=1):
    """
    Start the Java Beam job server that connects to the flink session cluster. User needs to provide the
    job name that started the Flink session and optionally the worker parallelism.

    Args:
        :flink_session_name: Job name that runs the Flink session.
        :sdk_worker_parallelism: Default parallelism for SDK worker processes. This option is only applied when the
        pipeline option sdkWorkerParallelism is set to 0.Default is 1, If 0, worker parallelism will be dynamically
        decided by runner.See also: sdkWorkerParallelism Pipeline Option (default: 1). For further documentation,
        please refer to Apache Beam docs.
    Returns:
        artifact_port, expansion_port, job_host, job_port, jobserver.pid
    """
    if jobserver_jar is None:
        jobserver_jar = os.path.join(
            util.get_flink_conf_dir(),
            "beam-runners-flink-1.8-job-server-2.15.0.jar")

    # Get Flink master URL (flink session cluster) from an ExecutionDTO
    method = constants.HTTP_CONFIG.HTTP_GET
    resource_url = constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   hopsfs.project_id() + constants.DELIMITERS.SLASH_DELIMITER + \
                   "jobs" + constants.DELIMITERS.SLASH_DELIMITER + \
                   flink_session_name + constants.DELIMITERS.SLASH_DELIMITER + \
                   "executions" + \
                   "?limit=1&offset=0&sort_by=submissionTime:desc"
    response = util.send_request(method, resource_url)
    response_object = response.json()
    flink_master_url = response_object['items'][0]['flinkMasterURL']

    # NOTE(review): random high ports are not checked for availability —
    # a collision with a bound port will surface as a jobserver startup error.
    artifact_port = randint(10000, 65000)
    expansion_port = randint(10000, 65000)
    job_port = randint(10000, 65000)
    job_host = socket.getfqdn()

    log_base_path = ""
    if 'LOG_DIRS' in os.environ:
        log_base_path += os.environ['LOG_DIRS'] + "/"

    beam_jobserver_log = log_base_path + "beamjobserver-" + hopsfs.project_name().lower() + "-" + \
                         flink_session_name + \
                         "-" + str(job_port) + ".log"
    # Route both stdout and stderr of the job server to the same log file.
    # Open it ONCE: the previous code opened the same path twice in "wb"
    # mode, giving stdout and stderr independent file positions that
    # overwrote each other's output.
    with open(beam_jobserver_log, "wb") as log_file:
        jobserver = subprocess.Popen(
            [
                "java", "-jar", jobserver_jar,
                "--artifacts-dir=%s" % hopsfs.project_path() + artifacts_dir,
                "--flink-master-url=%s" % flink_master_url,
                "--artifact-port=%d" % artifact_port,
                "--expansion-port=%d" % expansion_port,
                "--job-host=%s" % job_host,
                "--job-port=%d" % job_port,
                "--sdk-worker-parallelism=%d" % sdk_worker_parallelism
            ],
            stdout=log_file,
            stderr=log_file,
            preexec_fn=util._on_executor_exit('SIGTERM'))

    global clusters
    clusters.append(flink_session_name)
    global jobserver_host
    jobserver_host = job_host
    global jobserver_port
    jobserver_port = job_port
    return {
        "jobserver_log": beam_jobserver_log,
        "artifact_port": artifact_port,
        "expansion_port": expansion_port,
        "job_host": job_host,
        "job_port": job_port,
        "jobserver.pid": jobserver.pid
    }
def _register(hdfs_exec_dir, endpoint_dir, exec_num, local_logdir=False):
    """Launch TensorBoard for one executor and publish its endpoint to HDFS.

    Kills any previously-registered TensorBoard, starts a new one on a free
    port pointed at either a local scratch logdir or the HDFS events dir,
    and dumps the ``host:port`` URL to an endpoint file under ``endpoint_dir``.

    Args:
        :hdfs_exec_dir: HDFS directory holding this executor's event files.
        :endpoint_dir: Directory in which to write the TensorBoard endpoint file.
        :exec_num: Executor number, used to name the endpoint file.
        :local_logdir: If True, point TensorBoard at a local ``local_logdir``
            scratch directory instead of the HDFS events dir.

    Returns:
        Tuple of (endpoint path, TensorBoard pid).
    """
    global tb_pid

    if tb_pid != 0:
        subprocess.Popen(["kill", str(tb_pid)])
        # NOTE(review): _reset_global() is expected to zero tb_pid so the
        # relaunch branch below runs — confirm against its definition.
        _reset_global()

    global events_logdir
    events_logdir = hdfs_exec_dir

    global local_logdir_bool
    local_logdir_bool = local_logdir

    if tb_pid == 0:
        global pypath
        pypath = os.getenv("PYSPARK_PYTHON")

        # Find a free port by binding to port 0, then release it so the
        # TensorBoard child process can bind it.
        tb_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        tb_socket.bind(('', 0))
        global tb_port
        tb_addr, tb_port = tb_socket.getsockname()
        global tb_path
        tb_path = experiment_utils._find_tensorboard()
        tb_socket.close()
        tb_env = _init_tb_env()

        # The launch is identical for both modes except for the logdir, so
        # resolve the logdir first and start TensorBoard exactly once.
        global local_logdir_path
        if local_logdir:
            local_logdir_path = os.getcwd() + '/local_logdir'
            if os.path.exists(local_logdir_path):
                shutil.rmtree(local_logdir_path)
            os.makedirs(local_logdir_path)
            local_logdir_path = local_logdir_path + '/'
            tb_logdir = local_logdir_path
        else:
            tb_logdir = events_logdir

        tb_proc = subprocess.Popen(
            [
                pypath, tb_path,
                "--logdir=%s" % tb_logdir,
                "--port=%d" % tb_port,
                "--host=%s" % "0.0.0.0"
            ],
            env=tb_env,
            preexec_fn=util._on_executor_exit('SIGTERM'))

        tb_pid = tb_proc.pid

        host = socket.gethostname()
        global tb_url
        tb_url = "http://{0}:{1}".format(host, tb_port)
        global endpoint
        endpoint = endpoint_dir + "/TensorBoard.task" + str(exec_num)

        # dump tb host:port to hdfs so the driver can discover this endpoint
        pydoop.hdfs.dump(tb_url, endpoint, user=hopshdfs.project_user())

    return endpoint, tb_pid
def start_beam_jobserver(flink_session_name,
                         artifacts_dir="Resources",
                         jobserver_jar=None):
    """
    Start the Java Beam job server that connects to the flink session cluster. User needs to provide the
    job name that started the Flink session and optionally the worker parallelism.

    Args:
        :flink_session_name: Job name that runs the Flink session.
        :artifacts_dir: Default dataset to store artifacts.
        :jobserver_jar: Portability framework jar filename.

    Returns:
        artifact_port, expansion_port, job_host, job_port, jobserver.pid
    """
    if jobserver_jar is None:
        jobserver_jar = os.path.join(util.get_flink_conf_dir(),
                                     "beam-runners-flink-1.9-job-server-2.19.0.jar")

    # Get Flink master URL (flink session cluster) from an ExecutionDTO
    method = constants.HTTP_CONFIG.HTTP_GET
    resource_url = constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   hopsfs.project_id() + constants.DELIMITERS.SLASH_DELIMITER + \
                   "jobs" + constants.DELIMITERS.SLASH_DELIMITER + \
                   flink_session_name + constants.DELIMITERS.SLASH_DELIMITER + \
                   "executions" + \
                   "?limit=1&offset=0&sort_by=submissionTime:desc"
    response = util.send_request(method, resource_url)
    response_object = response.json()
    flink_master_url = response_object['items'][0]['flinkMasterURL']

    # NOTE(review): random high ports are not checked for availability —
    # a collision with a bound port will surface as a jobserver startup error.
    artifact_port = randint(10000, 65000)
    expansion_port = randint(10000, 65000)
    job_port = randint(10000, 65000)
    job_host = socket.getfqdn()

    log_base_path = ""
    if 'LOG_DIRS' in os.environ:
        log_base_path += os.environ['LOG_DIRS'] + "/"

    beam_jobserver_log = log_base_path + "beamjobserver-" + hopsfs.project_name().lower() + "-" + flink_session_name + \
                         "-" + str(job_port) + ".log"
    # Route both stdout and stderr of the job server to the same log file.
    # Open it ONCE: the previous code opened the same path twice in "wb"
    # mode, giving stdout and stderr independent file positions that
    # overwrote each other's output.
    with open(beam_jobserver_log, "wb") as log_file:
        jobserver = subprocess.Popen(
            [
                "java", "-jar", jobserver_jar,
                "--artifacts-dir=%s" % hopsfs.project_path() + artifacts_dir,
                "--flink-master-url=%s" % flink_master_url,
                "--artifact-port=%d" % artifact_port,
                "--expansion-port=%d" % expansion_port,
                "--job-host=%s" % job_host,
                "--job-port=%d" % job_port
            ],
            stdout=log_file,
            stderr=log_file,
            preexec_fn=util._on_executor_exit('SIGTERM'))

    global clusters
    clusters.append(flink_session_name)
    global jobserver_host
    jobserver_host = job_host
    global jobserver_port
    jobserver_port = job_port
    return {"jobserver_log": beam_jobserver_log,
            "artifact_port": artifact_port,
            "expansion_port": expansion_port,
            "job_host": job_host,
            "job_port": job_port,
            "jobserver.pid": jobserver.pid}
def start_beam_jobserver(
        flink_session_name,
        artifacts_dir="Resources",
        jobserver_jar=None,
        jobserver_main_class="org.apache.beam.runners.flink.FlinkJobServerDriver",
        service_discover_jar=None):
    """
    Start the Java Beam job server that connects to the flink session cluster. User needs to provide the
    job name that started the Flink session and optionally the worker parallelism.

    Args:
        :flink_session_name: Job name that runs the Flink session.
        :artifacts_dir: Default dataset to store artifacts.
        :jobserver_jar: Portability framework jar filename.
        :jobserver_main_class: Main class of the Beam job server driver.
        :service_discover_jar: Service discovery client jar filename.

    Returns:
        artifact_port, expansion_port, job_host, job_port, jobserver.pid
    """
    # Resolve jar paths lazily instead of in the signature: default-argument
    # expressions run at import time, which would call util.get_flink_lib_dir()
    # (and could fail) even when this function is never used. This also matches
    # the None-default convention of the other jobserver helpers.
    if jobserver_jar is None:
        jobserver_jar = os.path.join(
            util.get_flink_lib_dir(),
            "beam-runners-flink-1.9-job-server-2.24.0.jar")
    if service_discover_jar is None:
        service_discover_jar = os.path.join(
            util.get_flink_lib_dir(),
            "service-discovery-client-0.5-SNAPSHOT.jar")

    # Get Flink master URL (flink session cluster) from an ExecutionDTO
    method = constants.HTTP_CONFIG.HTTP_GET
    resource_url = constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_REST_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   constants.REST_CONFIG.HOPSWORKS_PROJECT_RESOURCE + constants.DELIMITERS.SLASH_DELIMITER + \
                   hopsfs.project_id() + constants.DELIMITERS.SLASH_DELIMITER + \
                   "jobs" + constants.DELIMITERS.SLASH_DELIMITER + \
                   flink_session_name + constants.DELIMITERS.SLASH_DELIMITER + \
                   "executions" + \
                   "?limit=1&offset=0&sort_by=submissionTime:desc"
    response = util.send_request(method, resource_url)
    response_object = response.json()
    flink_master_url = response_object['items'][0]['flinkMasterURL']

    # NOTE(review): random high ports are not checked for availability —
    # a collision with a bound port will surface as a jobserver startup error.
    artifact_port = randint(10000, 65000)
    expansion_port = randint(10000, 65000)
    job_port = randint(10000, 65000)
    job_host = socket.getfqdn()

    log_base_path = ""
    if 'LOG_DIRS' in os.environ:
        log_base_path += os.environ['LOG_DIRS'] + "/"

    beam_jobserver_log = log_base_path + "beamjobserver-" + hopsfs.project_name().lower() + "-" + flink_session_name + \
                         "-" + str(job_port) + ".log"
    # Route both stdout and stderr of the job server to the same log file.
    # Open it ONCE: the previous code opened the same path twice in "wb"
    # mode, giving stdout and stderr independent file positions that
    # overwrote each other's output.
    with open(beam_jobserver_log, "wb") as log_file:
        # Get the hadoop glob classpath and filter out service-discover-client as there is a shading issue with
        # jackson dependency
        jobserver_cp_list = list(
            filter(
                lambda x: "service-discovery" not in x and x.endswith(".jar"),
                util.get_hadoop_classpath_glob().split(":")))
        jobserver_cp_list.extend((service_discover_jar, jobserver_jar))
        jobserver_cp_path = ":".join(jobserver_cp_list).replace("\n", "")

        jobserver = subprocess.Popen(
            [
                "java", "-cp",
                "%s" % jobserver_cp_path, jobserver_main_class,
                "--artifacts-dir=%s" % hopsfs.project_path() + artifacts_dir,
                "--flink-master-url=%s" % flink_master_url,
                "--artifact-port=%d" % artifact_port,
                "--expansion-port=%d" % expansion_port,
                "--job-host=%s" % job_host,
                "--job-port=%d" % job_port
            ],
            stdout=log_file,
            stderr=log_file,
            preexec_fn=util._on_executor_exit('SIGTERM'))

    global clusters
    clusters.append(flink_session_name)
    global jobserver_host
    jobserver_host = job_host
    global jobserver_port
    jobserver_port = job_port
    return {
        "jobserver_log": beam_jobserver_log,
        "artifact_port": artifact_port,
        "expansion_port": expansion_port,
        "job_host": job_host,
        "job_port": job_port,
        "jobserver.pid": jobserver.pid
    }