def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts, workflow_level_opts, ep_d):
    """
    Create job resource dirs and set up log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")

    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path, stdout_level)

    log.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()), "pbsmrtpipe::pbsmrtpipe.log",
                                    FileTypes.LOG.file_type_id, pb_log_path)
    master_log_df = DataStoreFile(str(uuid.uuid4()), "pbsmrtpipe::master.log",
                                  FileTypes.LOG.file_type_id, master_log_path)

    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df, master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(workflow_level_opts,
                            os.path.join(job_resources.workflow, 'options-workflow.json'))

    log.info("Workflow Options:")
    log.info(pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)

    try:
        sa_system, sa_components = IO.get_smrtanalysis_system_and_components_from_env()
        log.info(sa_system)
        for c in sa_components:
            log.info(c)
    except Exception:
        # black hole exception
        log.warn("unable to determine SMRT Analysis version.")

    slog.info("completed setting up job directory resources and logs in {r}".format(r=job_root_dir))
    return job_resources, ds

def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts, workflow_level_opts, ep_d):
    """
    Create job resource dirs and set up log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")

    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path, stdout_level)

    log.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType
    ds = write_and_initialize_data_store_json(job_resources.datastore_json, [])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(workflow_level_opts,
                            os.path.join(job_resources.workflow, 'options-workflow.json'))

    log.info("Workflow Options:")
    log.info(pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)

    try:
        sa_system, sa_components = IO.get_smrtanalysis_system_and_components_from_env()
        log.info(sa_system)
        for c in sa_components:
            log.info(c)
    except Exception:
        # black hole exception
        log.warn("unable to determine SMRT Analysis version.")

    slog.info("completed setting up job directory resources and logs in {r}".format(r=job_root_dir))
    return job_resources, ds

def run_task_on_cluster(runnable_task, task_manifest_path, output_dir, debug_mode):
    """
    :param runnable_task:
    :param output_dir:
    :param debug_mode:
    :return:

    :type runnable_task: RunnableTask
    """
    def _to_p(x_):
        return os.path.join(output_dir, x_)

    stdout_ = _to_p('stdout')
    stderr_ = _to_p('stderr')

    if runnable_task.task.is_distributed is False:
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    if runnable_task.cluster is None:
        log.warn("No cluster provided. Running task locally.")
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    os.chdir(runnable_task.task.output_dir)

    env_json = os.path.join(output_dir, '.cluster-env.json')
    IO.write_env_to_json(env_json)

    # sloppy API
    if isinstance(runnable_task.cluster, ClusterTemplateRender):
        render = runnable_task.cluster
    else:
        ctmpls = [ClusterTemplate(name, tmpl) for name, tmpl in runnable_task.cluster.iteritems()]
        render = ClusterTemplateRender(ctmpls)

    job_id = to_random_job_id(runnable_task.task.task_id)
    log.debug("Using job id {i}".format(i=job_id))

    qstdout = _to_p('cluster.stdout')
    qstderr = _to_p('cluster.stderr')
    qshell = _to_p('cluster.sh')
    rcmd_shell = _to_p('run.sh')

    # This needs to be flattened due to the new RTC layer
    # Task Manifest Runner output
    stdout = _to_p('stdout')
    stderr = _to_p('stderr')
    mstdout = _to_p('mstdout')
    mstderr = _to_p('mstderr')

    with open(qstdout, 'w+') as f:
        f.write("Creating cluster stdout for Job {i} {r}\n".format(i=job_id, r=runnable_task))

    debug_str = " --debug "
    exe = _resolve_exe("pbtools-runner")
    _d = dict(x=exe, t=task_manifest_path, o=stdout, e=stderr, d=debug_str, m=stdout, n=stderr, r=output_dir)
    cmd = "{x} run {d} --output-dir=\"{r}\" --task-stderr=\"{e}\" --task-stdout=\"{o}\" \"{t}\" > \"{m}\" 2> \"{n}\"".format(**_d)

    with open(rcmd_shell, 'w+') as x:
        x.write(cmd + "\n")

    # Make +x
    os.chmod(rcmd_shell, os.stat(rcmd_shell).st_mode | stat.S_IEXEC)

    cluster_cmd = render.render(ClusterConstants.START, rcmd_shell, job_id, qstdout, qstderr, runnable_task.task.nproc)
    log.debug(cluster_cmd)

    with open(qshell, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write(cluster_cmd + "\n")

    os.chmod(qshell, os.stat(qshell).st_mode | stat.S_IEXEC)

    # host = socket.getfqdn()
    host = platform.node()

    # so core dumps are written to the job dir
    os.chdir(output_dir)
    rcode, cstdout, cstderr, run_time = backticks("bash {q}".format(q=qshell))

    if rcode == 0:
        err_msg = ""
        warn_msg = ""
    else:
        p_err_msg = "task {i} failed (exit-code {x}) after {r:.2f} sec".format(i=runnable_task.task.task_id, r=run_time, x=rcode)
        raw_stderr = _extract_last_nlines(stderr)
        cluster_raw_stderr = _extract_last_nlines(cstderr)
        err_msg = "\n".join([p_err_msg, raw_stderr, cluster_raw_stderr])
        warn_msg = ""

    msg_ = "Completed running cluster command in {t:.2f} sec. Exit code {r}".format(r=rcode, t=run_time)
    log.info(msg_)

    with open(qstdout, 'a') as qf:
        qf.write(str(cstdout) + "\n")
        qf.write(msg_ + "\n")

    with open(qstderr, 'a') as f:
        if rcode != 0:
            f.write(str(cstderr) + "\n")
            f.write(msg_ + "\n")

    r = to_task_report(host, runnable_task.task.task_id, run_time, rcode, err_msg, warn_msg)
    task_report_path = os.path.join(output_dir, 'task-report.json')
    msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
    log.info(msg)
    r.write_json(task_report_path)

    return rcode, run_time

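# `backticks` (used above and in the later revisions) is defined elsewhere in
# this codebase. Inferring only from the call sites -- it blocks on a shell
# command and returns (exit_code, stdout_lines, stderr_lines, run_time) -- a
# minimal sketch of that contract could look like the following. This is an
# illustration of the assumed contract, not the real implementation.
import subprocess
import time


def _backticks_sketch(cmd):
    """Run `cmd` in a shell, block until it exits, and return
    (exit_code, stdout_lines, stderr_lines, run_time_sec).

    Sketch only: the real `backticks` may differ in error handling and in
    how it captures and splits output.
    """
    started_at = time.time()
    p = subprocess.Popen(cmd, shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    run_time = time.time() - started_at
    return p.returncode, stdout.splitlines(), stderr.splitlines(), run_time
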
def run_task(runnable_task, output_dir, task_stdout, task_stderr, debug_mode):
    """
    Run a runnable task locally.

    :param runnable_task:
    :type runnable_task: RunnableTask

    :return: exit code, run_time
    :rtype: (int, int)
    """
    started_at = time.time()
    rcode = -1
    err_msg = ""
    # host = socket.getfqdn()
    host = platform.node()
    ncmds = len(runnable_task.task.cmds)

    # so core dumps are written to the job dir
    os.chdir(output_dir)

    env_json = os.path.join(output_dir, '.env.json')
    IO.write_env_to_json(env_json)

    with open(task_stdout, 'w') as stdout_fh:
        with open(task_stderr, 'w') as stderr_fh:
            stdout_fh.write(repr(runnable_task) + "\n")
            stdout_fh.write("Created at {x} on {h}\n".format(x=datetime.datetime.now(), h=host))
            stderr_fh.write("Running task in {o}\n".format(o=output_dir))

            # Validate Inputs
            for input_file in runnable_task.task.input_files:
                if os.path.exists(input_file):
                    stdout_fh.write("Validated INPUT file '{i}'\n".format(i=input_file))
                else:
                    err_msg = "Unable to find INPUT file '{i}'".format(i=input_file)
                    stderr_fh.write(err_msg + "\n")
                    sys.stderr.write(err_msg + "\n")
                    break

            # Create resources if necessary
            #if runnable_task.task.resources:
            #    create_tmp_resources_ignore_error(runnable_task.task.resources)

            # Run commands
            for i, cmd in enumerate(runnable_task.task.cmds):
                log.debug("Running command \n" + cmd)

                rcode, out, error, run_time = run_command(cmd, stdout_fh, stderr_fh, time_out=None)

                if rcode != 0:
                    err_msg_ = "Failed task {i} exit code {r} in {s:.2f} sec".format(i=runnable_task.task.task_id, r=rcode, s=run_time)
                    t_error_msg = _extract_last_nlines(task_stderr)
                    err_msg = "\n".join([err_msg_, t_error_msg])

                    log.error(err_msg)
                    log.error(error)
                    stderr_fh.write(str(error) + "\n")
                    sys.stderr.write(str(error) + "\n")
                    stderr_fh.write(err_msg + "\n")
                    sys.stderr.write(err_msg + "\n")
                    break
                else:
                    stdout_fh.write("completed running cmd {i} of {n}. exit code {x} in {s:.2f} sec on host {h}\n".format(x=rcode, s=run_time, h=host, i=i + 1, n=ncmds))

            smsg_ = "completed running commands. Exit code {i}".format(i=rcode)
            log.debug(smsg_)

            if rcode == 0:
                # Validate output files of a successful task.
                for ix, output_file in enumerate(runnable_task.task.output_files):
                    if os.path.exists(output_file):
                        stdout_fh.write("Successfully validated {i} output file '{o}' on {h}\n".format(o=output_file, i=ix, h=host))
                    else:
                        err_msg = "Unable to find {i} output file '{x}'. Marking task as failed.".format(x=output_file, i=ix)
                        stderr_fh.write(err_msg + "\n")
                        stdout_fh.write(err_msg + "\n")
                        sys.stderr.write(err_msg + "\n")
                        rcode = -1

            total_run_time = time.time() - started_at

            # FIXME. There should be a better way to communicate warnings
            warn_msg = ""

            r = to_task_report(host, runnable_task.task.task_id, total_run_time, rcode, err_msg, warn_msg)
            task_report_path = os.path.join(output_dir, 'task-report.json')
            msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
            log.info(msg)
            stdout_fh.write(msg + "\n")
            r.write_json(task_report_path)
            stderr_fh.flush()
            stdout_fh.flush()

    # Cleanup resource files
    if not debug_mode and runnable_task.task.resources:
        try:
            cleanup_resources(runnable_task)
            log.debug("successfully cleaned up {n} resources.".format(n=len(runnable_task.task.resources)))
        except Exception as e:
            log.error(str(e))
            log.error("failed to successfully cleanup resources. {f}".format(f=runnable_task.task.resources))

    run_time = time.time() - started_at
    return rcode, run_time

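# `run_command` above is defined elsewhere; the call sites show the contract:
# run one shell command, stream into the already-open task stdout/stderr file
# handles, and return (exit_code, stdout, stderr, run_time). A minimal sketch
# under those assumptions (the real implementation also honors `time_out` and
# returns captured output, neither of which is reproduced here):
import subprocess
import time


def _run_command_sketch(cmd, stdout_fh, stderr_fh, time_out=None):
    """Sketch of the assumed `run_command` contract, not the real helper."""
    started_at = time.time()
    # Stream directly into the task's stdout/stderr files.
    p = subprocess.Popen(cmd, shell=True, stdout=stdout_fh, stderr=stderr_fh)
    p.wait()
    run_time = time.time() - started_at
    # Output went to the file handles, so nothing is returned for
    # stdout/stderr in this sketch; the real helper may capture them.
    return p.returncode, "", "", run_time
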
def run_task_on_cluster(runnable_task, task_manifest_path, output_dir, debug_mode):
    """
    :param runnable_task:
    :param output_dir:
    :param debug_mode:
    :return:

    :type runnable_task: RunnableTask
    """
    def _to_p(x_):
        return os.path.join(output_dir, x_)

    stdout_ = _to_p('stdout')
    stderr_ = _to_p('stderr')

    if runnable_task.task.is_distributed is False:
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    if runnable_task.cluster is None:
        log.warn("No cluster provided. Running task locally.")
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    env_json = os.path.join(output_dir, 'env.json')
    IO.write_env_to_json(env_json)

    # sloppy API
    if isinstance(runnable_task.cluster, ClusterTemplateRender):
        render = runnable_task.cluster
    else:
        ctmpls = [ClusterTemplate(name, tmpl) for name, tmpl in runnable_task.cluster.iteritems()]
        render = ClusterTemplateRender(ctmpls)

    job_id = to_random_job_id(runnable_task.task.task_id)
    log.debug("Using job id {i}".format(i=job_id))

    qstdout = _to_p('cluster.stdout')
    qstderr = _to_p('cluster.stderr')
    qshell = _to_p('cluster.sh')
    rcmd_shell = _to_p('run.sh')

    # Task Manifest Runner output
    stdout = _to_p('stdout')
    stderr = _to_p('stderr')
    mstdout = _to_p('mstdout')
    mstderr = _to_p('mstderr')

    with open(qstdout, 'w+') as f:
        f.write("Creating cluster stdout for Job {i} {r}\n".format(i=job_id, r=runnable_task))

    debug_str = " --debug "
    _d = dict(t=task_manifest_path, o=stdout, e=stderr, d=debug_str, m=mstdout, n=mstderr)
    cmd = "pbtools-runner run {d} --task-stderr=\"{e}\" --task-stdout=\"{o}\" \"{t}\" > \"{m}\" 2> \"{n}\"".format(**_d)

    with open(rcmd_shell, 'w+') as x:
        x.write(cmd + "\n")

    # Make +x
    os.chmod(rcmd_shell, os.stat(rcmd_shell).st_mode | stat.S_IEXEC)

    cluster_cmd = render.render('interactive', rcmd_shell, job_id, qstdout, qstderr, runnable_task.task.nproc)
    log.debug(cluster_cmd)

    with open(qshell, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write(cluster_cmd + "\n")

    os.chmod(qshell, os.stat(qshell).st_mode | stat.S_IEXEC)

    # host = socket.getfqdn()
    host = platform.node()

    # so core dumps are written to the job dir
    os.chdir(output_dir)
    rcode, cstdout, cstderr, run_time = backticks("bash {q}".format(q=qshell))

    if rcode == 0:
        err_msg = ""
        warn_msg = ""
    else:
        # not sure how to scrape this from the stderr/stdout
        err_msg = "task {i} failed".format(i=runnable_task.task.task_id)
        warn_msg = ""

    msg_ = "Completed running cluster command in {t:.2f} sec. Exit code {r}".format(r=rcode, t=run_time)
    log.info(msg_)

    with open(qstdout, 'a') as qf:
        qf.write(str(cstdout) + "\n")
        qf.write(msg_ + "\n")

    with open(qstderr, 'a') as f:
        if rcode != 0:
            f.write(str(cstderr) + "\n")
            f.write(msg_ + "\n")

    r = to_task_report(host, runnable_task.task.task_id, run_time, rcode, err_msg, warn_msg)
    task_report_path = os.path.join(output_dir, 'task-report.json')
    msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
    log.info(msg)
    r.write_json(task_report_path)

    return rcode, run_time

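# The ClusterTemplate/ClusterTemplateRender pair above expands a named cluster
# template (e.g. 'interactive', or ClusterConstants.START in later revisions)
# into a concrete submission command. For illustration only: a hypothetical
# SGE-style template with made-up placeholder names -- the real templates ship
# with the cluster config, and their variable names may differ.
from string import Template

_START_TMPL_SKETCH = Template(
    'qsub -S /bin/bash -V -N ${JOB_ID} '
    '-o "${STDOUT_FILE}" -e "${STDERR_FILE}" -pe smp ${NPROC} "${CMD}"')


def _render_start_sketch(cmd, job_id, stdout, stderr, nproc):
    # Mirrors the shape of render.render('interactive', rcmd_shell, job_id,
    # qstdout, qstderr, nproc) above, using the hypothetical template.
    return _START_TMPL_SKETCH.substitute(CMD=cmd, JOB_ID=job_id,
                                         STDOUT_FILE=stdout,
                                         STDERR_FILE=stderr, NPROC=nproc)
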
def run_task(runnable_task, output_dir, task_stdout, task_stderr, debug_mode):
    """
    Run a runnable task locally.

    :param runnable_task:
    :type runnable_task: RunnableTask

    :return: exit code, run_time
    :rtype: (int, int)
    """
    started_at = time.time()
    rcode = -1
    err_msg = ""
    # host = socket.getfqdn()
    host = platform.node()
    ncmds = len(runnable_task.task.cmds)

    # so core dumps are written to the job dir
    os.chdir(output_dir)

    env_json = os.path.join(output_dir, 'env.json')
    IO.write_env_to_json(env_json)

    with open(task_stdout, 'w') as stdout_fh:
        with open(task_stderr, 'w') as stderr_fh:
            stdout_fh.write(repr(runnable_task) + "\n")
            stdout_fh.write("Created at {x} on {h}\n".format(x=datetime.datetime.now(), h=host))

            # Validate Inputs
            for input_file in runnable_task.task.input_files:
                if os.path.exists(input_file):
                    stdout_fh.write("Validated INPUT file '{i}'\n".format(i=input_file))
                else:
                    err_msg = "Unable to find INPUT file '{i}'".format(i=input_file)
                    stderr_fh.write(err_msg + "\n")
                    sys.stderr.write(err_msg + "\n")
                    break

            # Create resources if necessary
            #if runnable_task.task.resources:
            #    create_tmp_resources_ignore_error(runnable_task.task.resources)

            # Run commands
            for i, cmd in enumerate(runnable_task.task.cmds):
                log.debug("Running command \n" + cmd)

                rcode, out, error, run_time = run_command(cmd, stdout_fh, stderr_fh, time_out=None)

                if rcode != 0:
                    err_msg = "Failed task {i} exit code {r} in {s:.2f} sec".format(i=runnable_task.task.task_id, r=rcode, s=run_time)
                    log.error(err_msg)
                    log.error(error)
                    stderr_fh.write(str(error) + "\n")
                    sys.stderr.write(str(error) + "\n")
                    stderr_fh.write(err_msg + "\n")
                    sys.stderr.write(err_msg + "\n")
                    break
                else:
                    stdout_fh.write("completed running cmd {i} of {n}. exit code {x} in {s:.2f} sec on host {h}\n".format(x=rcode, s=run_time, h=host, i=i + 1, n=ncmds))

            smsg_ = "completed running commands. Exit code {i}".format(i=rcode)
            log.debug(smsg_)

            if rcode == 0:
                # Validate output files of a successful task.
                for output_file in runnable_task.task.output_files:
                    if os.path.exists(output_file):
                        stdout_fh.write("Successfully validated file '{o}'\n".format(o=output_file))
                    else:
                        err_msg = "Unable to find file '{x}'".format(x=output_file)
                        stderr_fh.write(err_msg + "\n")
                        stdout_fh.write(err_msg + "\n")
                        sys.stderr.write(err_msg + "\n")
                        rcode = -1

            total_run_time = time.time() - started_at
            warn_msg = ""

            r = to_task_report(host, runnable_task.task.task_id, total_run_time, rcode, err_msg, warn_msg)
            task_report_path = os.path.join(output_dir, 'task-report.json')
            msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
            log.info(msg)
            stdout_fh.write(msg + "\n")
            r.write_json(task_report_path)
            stderr_fh.flush()
            stdout_fh.flush()

    # Cleanup resource files
    if not debug_mode and runnable_task.task.resources:
        try:
            cleanup_resources(runnable_task)
            log.debug("successfully cleaned up {n} resources.".format(n=len(runnable_task.task.resources)))
        except Exception as e:
            log.error(str(e))
            log.error("failed to successfully cleanup resources. {f}".format(f=runnable_task.task.resources))

    run_time = time.time() - started_at
    return rcode, run_time

def run_task_on_cluster(runnable_task, task_manifest_path, output_dir, debug_mode):
    """
    :param runnable_task:
    :param output_dir:
    :param debug_mode:
    :return:

    :type runnable_task: RunnableTask
    """
    def _to_p(x_):
        return os.path.join(output_dir, x_)

    stdout_ = _to_p('stdout')
    stderr_ = _to_p('stderr')

    if runnable_task.task.is_distributed is False:
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    if runnable_task.cluster is None:
        log.warn("No cluster provided. Running task locally.")
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    os.chdir(runnable_task.task.output_dir)

    env_json = os.path.join(output_dir, '.cluster-env.json')
    IO.write_env_to_json(env_json)

    # sloppy API
    if isinstance(runnable_task.cluster, ClusterTemplateRender):
        render = runnable_task.cluster
    else:
        ctmpls = [ClusterTemplate(name, tmpl) for name, tmpl in runnable_task.cluster.iteritems()]
        render = ClusterTemplateRender(ctmpls)

    job_id = to_random_job_id(runnable_task.task.task_id)
    log.debug("Using job id {i}".format(i=job_id))

    qstdout = _to_p('cluster.stdout')
    qstderr = _to_p('cluster.stderr')
    qshell = _to_p('cluster.sh')
    rcmd_shell = _to_p('run.sh')

    # This needs to be flattened due to the new RTC layer
    # Task Manifest Runner output
    stdout = _to_p('stdout')
    stderr = _to_p('stderr')

    with open(qstdout, 'w+') as f:
        f.write("Creating cluster stdout for Job {i} {r}\n".format(i=job_id, r=runnable_task))

    debug_str = " --debug "
    exe = _resolve_exe("pbtools-runner")
    _d = dict(x=exe, t=task_manifest_path, o=stdout, e=stderr, d=debug_str, m=stdout, n=stderr, r=output_dir)

    # the quoting here is explicitly to handle spaces in paths
    cmd = "{x} run {d} --output-dir=\"{r}\" --task-stderr=\"{e}\" --task-stdout=\"{o}\" \"{t}\" > \"{m}\" 2> \"{n}\"".format(**_d)

    # write the pbtools-runner exe command
    with open(rcmd_shell, 'w+') as x:
        x.write(cmd + "\n")

    chmod_x(rcmd_shell)

    cluster_cmd = render.render(ClusterConstants.START, rcmd_shell, job_id, qstdout, qstderr, runnable_task.task.nproc)
    log.info("Job submission command: " + cluster_cmd)

    with open(qshell, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("set -o errexit\n")
        f.write("set -o pipefail\n")
        f.write("set -o nounset\n")
        f.write(cluster_cmd.rstrip("\n") + " ${1+\"$@\"}\n")
        f.write("exit $?")

    chmod_x(qshell)

    host = platform.node()

    # so core dumps are written to the job dir
    os.chdir(output_dir)

    # print the underlying jms command if using runjmscmd
    if re.search(r'/runjmscmd\b', cluster_cmd):
        rcode, cstdout, cstderr, run_time = backticks("bash {q} --printcmd".format(q=qshell))
        if rcode == 0:
            log.info("Underlying JMS job submission command: " + "\n".join(cstdout))

    # Blocking call
    rcode, cstdout, cstderr, run_time = backticks("bash {q}".format(q=qshell))
    log.info("Cluster command return code {r} in {s:.2f} sec".format(r=rcode, s=run_time))

    msg_t = "{n} Completed running cluster command in {t:.2f} sec. Exit code {r} (task-type {i})"
    msg_ = msg_t.format(r=rcode, t=run_time, i=runnable_task.task.task_type_id, n=datetime.datetime.now())
    log.info(msg_)

    # Append the bash cluster.sh stderr and stdout call to
    # the cluster.stderr and cluster.stdout
    with open(qstdout, 'a') as qf:
        if cstdout:
            qf.write("\n".join(cstdout) + "\n")
        qf.write(msg_ + "\n")

    with open(qstderr, 'a') as f:
        if rcode != 0:
            if cstderr:
                f.write(str(cstderr) + "\n")

    # fundamental output error str message of this func
    err_msg = ""
    warn_msg = ""

    if rcode != 0:
        p_err_msg = "task {i} failed (exit-code {x}) after {r:.2f} sec".format(i=runnable_task.task.task_id, r=run_time, x=rcode)
        raw_stderr = _extract_last_nlines(stderr)
        cluster_raw_stderr = _extract_last_nlines(qstderr)
        err_msg = "\n".join([p_err_msg, raw_stderr, cluster_raw_stderr])
        warn_msg = ""

    # write the result status message to stderr if task failure
    # doing this here to avoid having a double message
    with open(qstderr, 'a') as f:
        if rcode != 0:
            if cstderr:
                f.write(msg_ + "\n")

    r = to_task_report(host, runnable_task.task.task_id, run_time, rcode, err_msg, warn_msg)
    task_report_path = os.path.join(output_dir, 'task-report.json')
    msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
    log.info(msg)
    r.write_json(task_report_path)

    return rcode, err_msg, run_time

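# This revision swaps the inline `os.chmod(p, os.stat(p).st_mode |
# stat.S_IEXEC)` calls from the earlier versions for a `chmod_x` helper
# defined elsewhere. A sketch, assuming it simply factors out that pattern:
import os
import stat


def _chmod_x_sketch(path):
    """Add the executable bit to `path` (sketch of the assumed `chmod_x`)."""
    os.chmod(path, os.stat(path).st_mode | stat.S_IEXEC)
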
def run_task(runnable_task, output_dir, task_stdout, task_stderr, debug_mode):
    """
    Run a runnable task locally.

    :param debug_mode: Enabling debug mode will not cleanup temp resources upon failure
    :type debug_mode: bool

    :param runnable_task: Runnable task instance
    :type runnable_task: RunnableTask

    :param output_dir: Path to output dir
    :type output_dir: str

    :param task_stderr: Absolute path to task stderr file
    :type task_stderr: str

    :param task_stdout: Absolute path to task stdout file
    :type task_stdout: str

    :return: (exit code, error message, run_time)
    :rtype: (int, str, int)
    """
    started_at = time.time()

    def get_run_time():
        return time.time() - started_at

    # Default general catch all
    rcode = 1
    err_msg = ""
    # host = socket.getfqdn()
    host = platform.node()
    ncmds = len(runnable_task.task.cmds)

    # so core dumps are written to the job dir
    os.chdir(output_dir)

    env_json = os.path.join(output_dir, '.env.json')
    IO.write_env_to_json(env_json)

    with open(task_stdout, 'w') as stdout_fh:
        with open(task_stderr, 'w') as stderr_fh:
            stdout_fh.write(repr(runnable_task) + "\n")
            stdout_fh.write("Created at {x} on {h}\n".format(x=datetime.datetime.now(), h=host))
            stdout_fh.write("Running task in {o}\n".format(o=output_dir))

            # Validate Inputs
            for input_file in runnable_task.task.input_files:
                if os.path.exists(input_file):
                    stdout_fh.write("Validated INPUT file '{i}'\n".format(i=input_file))
                else:
                    err_msg = "Unable to find INPUT file '{i}'".format(i=input_file)
                    stderr_fh.write(err_msg + "\n")
                    log.error(err_msg)
                    break

            # Create resources if necessary
            #if runnable_task.task.resources:
            #    create_tmp_resources_ignore_error(runnable_task.task.resources)

            stdout_fh.write("Starting to run {n} cmds.\n".format(n=len(runnable_task.task.cmds)))
            stdout_fh.flush()
            stderr_fh.flush()

            for i, cmd in enumerate(runnable_task.task.cmds):
                log.info("Running command \n" + cmd)

                # see run_command API for future fixes
                rcode, _, _, run_time = run_command(cmd, stdout_fh, stderr_fh, time_out=None)

                if rcode != 0:
                    err_msg_ = "Failed task {i} exit code {r} in {s:.2f} sec (See file '{f}'.)".format(i=runnable_task.task.task_id, r=rcode, s=run_time, f=task_stderr)
                    stderr_fh.write(err_msg + "\n")
                    stderr_fh.flush()

                    t_error_msg = _extract_last_nlines(task_stderr)
                    err_msg = "\n".join([err_msg_, "Extracted from stderr", t_error_msg])
                    log.error(err_msg)
                    stdout_fh.write("breaking out. Unable to run remaining task commands.\n")
                    break
                else:
                    stdout_fh.write("completed running cmd {i} of {n}. exit code {x} in {s:.2f} sec on host {h}\n".format(x=rcode, s=run_time, h=host, i=i + 1, n=ncmds))

            smsg_ = "completed running commands. Exit code {i}".format(i=rcode)
            log.debug(smsg_)

            if rcode == 0:
                log.info("Core RTC runner was successful. Validating output files.")
                # Validate output files of a successful task.
                for ix, output_file in enumerate(runnable_task.task.output_files):
                    if os.path.exists(output_file):
                        stdout_fh.write("Successfully validated {i} output file '{o}' on {h}\n".format(o=output_file, i=ix, h=host))
                    else:
                        rcode = 127
                        err_msg = "Unable to find {i} output file '{x}'. Marking task as failed. Setting exit code to {r}".format(x=output_file, i=ix, r=rcode)
                        stderr_fh.write(err_msg + "\n")
                        stdout_fh.write(err_msg + "\n")
                        log.error(err_msg)

            # FIXME. There should be a better way to communicate warnings
            warn_msg = ""

            # Write the task summary to a pbcommand Report object
            r = to_task_report(host, runnable_task.task.task_id, get_run_time(), rcode, err_msg, warn_msg)
            task_report_path = os.path.join(output_dir, 'task-report.json')
            msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
            log.info(msg)
            stdout_fh.write(msg + "\n")
            r.write_json(task_report_path)
            stderr_fh.flush()
            stdout_fh.flush()

    # Cleanup resource files
    if not debug_mode and runnable_task.task.resources:
        try:
            cleanup_resources(runnable_task)
            log.debug("successfully cleaned up {n} resources.".format(n=len(runnable_task.task.resources)))
        except Exception as e:
            log.error(str(e))
            log.error("failed to successfully cleanup resources. {f}".format(f=runnable_task.task.resources))

    return rcode, err_msg, get_run_time()

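# `_extract_last_nlines` is used throughout to pull the tail of a stderr file
# into the error message, but is not defined in this section. A minimal
# sketch, assuming it takes a path and returns the last `nlines` lines as a
# single string (the default of 25 is a guess):
from collections import deque


def _extract_last_nlines_sketch(path, nlines=25):
    """Return the last `nlines` lines of a text file as one string.

    Sketch only: the real helper may differ in signature, default count,
    and handling of missing or empty files.
    """
    with open(path, 'r') as f:
        tail = deque(f, maxlen=nlines)
    return "".join(tail).rstrip("\n")
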
def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts, workflow_level_opts, ep_d):
    """
    Create job resource dirs and set up log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")

    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path, stdout_level)

    log.info("Starting pbsmrtpipe v{v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()), "pbsmrtpipe::pbsmrtpipe.log",
                                    FileTypes.LOG.file_type_id, pb_log_path,
                                    name="Analysis Log", description="pbsmrtpipe log")
    master_log_df = DataStoreFile(str(uuid.uuid4()), "pbsmrtpipe::master.log",
                                  FileTypes.LOG.file_type_id, master_log_path,
                                  name="Master Log", description="Master log")

    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df, master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(workflow_level_opts,
                            os.path.join(job_resources.workflow, 'options-workflow.json'))

    if workflow_level_opts.system_message is not None:
        slog.info("Command: {m}".format(m=workflow_level_opts.system_message))

    slog.info("Entry Points:")
    slog.info("\n" + pprint.pformat(ep_d, indent=4))

    slog.info("Workflow Options:")
    slog.info("\n" + pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    slog.info("Task Options:")
    slog.info("\n" + pprint.pformat(task_opts, indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)
    log.info("wrote current env to {e}".format(e=env_path))

    try:
        sa_system, sa_components = IO.get_smrtanalysis_system_and_components_from_env()
        log.info(sa_system)
        for c in sa_components:
            log.info(c)
    except Exception:
        # black hole exception
        log.warn("unable to determine SMRT Analysis version.")

    slog.info("completed setting up job directory resources and logs in {r}".format(r=job_root_dir))
    return job_resources, ds, master_log_df

def run_task_on_cluster(runnable_task, task_manifest_path, output_dir, debug_mode):
    """
    :param runnable_task:
    :param output_dir:
    :param debug_mode:
    :return:

    :type runnable_task: RunnableTask
    """
    def _to_p(x_):
        return os.path.join(output_dir, x_)

    stdout_ = _to_p('stdout')
    stderr_ = _to_p('stderr')

    if runnable_task.task.is_distributed is False:
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    if runnable_task.cluster is None:
        log.warn("No cluster provided. Running task locally.")
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    os.chdir(runnable_task.task.output_dir)

    env_json = os.path.join(output_dir, '.cluster-env.json')
    IO.write_env_to_json(env_json)

    # sloppy API
    if isinstance(runnable_task.cluster, ClusterTemplateRender):
        render = runnable_task.cluster
    else:
        ctmpls = [ClusterTemplate(name, tmpl) for name, tmpl in runnable_task.cluster.iteritems()]
        render = ClusterTemplateRender(ctmpls)

    job_id = to_random_job_id(runnable_task.task.task_id)
    log.debug("Using job id {i}".format(i=job_id))

    qstdout = _to_p('cluster.stdout')
    qstderr = _to_p('cluster.stderr')
    qshell = _to_p('cluster.sh')
    rcmd_shell = _to_p('run.sh')

    # This needs to be flattened due to the new RTC layer
    # Task Manifest Runner output
    stdout = _to_p('stdout')
    stderr = _to_p('stderr')
    mstdout = _to_p('mstdout')
    mstderr = _to_p('mstderr')

    with open(qstdout, 'w+') as f:
        f.write("Creating cluster stdout for Job {i} {r}\n".format(i=job_id, r=runnable_task))

    debug_str = " --debug "
    exe = _resolve_exe("pbtools-runner")
    _d = dict(x=exe, t=task_manifest_path, o=stdout, e=stderr, d=debug_str, m=stdout, n=stderr, r=output_dir)
    cmd = "{x} run {d} --output-dir=\"{r}\" --task-stderr=\"{e}\" --task-stdout=\"{o}\" \"{t}\" > \"{m}\" 2> \"{n}\"".format(**_d)

    with open(rcmd_shell, 'w+') as x:
        x.write(cmd + "\n")

    # Make +x
    os.chmod(rcmd_shell, os.stat(rcmd_shell).st_mode | stat.S_IEXEC)

    cluster_cmd = render.render(ClusterConstants.START, rcmd_shell, job_id, qstdout, qstderr, runnable_task.task.nproc)
    log.debug(cluster_cmd)

    with open(qshell, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("set -o errexit\n")
        f.write("set -o pipefail\n")
        f.write("set -o nounset\n")
        f.write(cluster_cmd + "\n")
        f.write("exit $?")

    os.chmod(qshell, os.stat(qshell).st_mode | stat.S_IEXEC)

    # host = socket.getfqdn()
    host = platform.node()

    # so core dumps are written to the job dir
    os.chdir(output_dir)
    rcode, cstdout, cstderr, run_time = backticks("bash {q}".format(q=qshell))
    log.info("Cluster command return code {r} in {s:.2f} sec".format(r=rcode, s=run_time))

    if rcode == 0:
        err_msg = ""
        warn_msg = ""
    else:
        p_err_msg = "task {i} failed (exit-code {x}) after {r:.2f} sec".format(i=runnable_task.task.task_id, r=run_time, x=rcode)
        raw_stderr = _extract_last_nlines(stderr)
        cluster_raw_stderr = _extract_last_nlines(qstderr)
        err_msg = "\n".join([p_err_msg, raw_stderr, cluster_raw_stderr])
        warn_msg = ""

    msg_ = "Completed running cluster command in {t:.2f} sec. Exit code {r} (task-type {i})".format(r=rcode, t=run_time, i=runnable_task.task.task_type_id)
    log.info(msg_)

    with open(qstdout, 'a') as qf:
        qf.write(str(cstdout) + "\n")
        qf.write(msg_ + "\n")

    with open(qstderr, 'a') as f:
        if rcode != 0:
            f.write(str(cstderr) + "\n")
            f.write(msg_ + "\n")

    r = to_task_report(host, runnable_task.task.task_id, run_time, rcode, err_msg, warn_msg)
    task_report_path = os.path.join(output_dir, 'task-report.json')
    msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
    log.info(msg)
    r.write_json(task_report_path)

    return rcode, err_msg, run_time

def job_resource_create_and_setup_logs(job_root_dir, bg, task_opts, workflow_level_opts, ep_d):
    """
    Create job resource dirs and set up log handlers

    :type job_root_dir: str
    :type bg: BindingsGraph
    :type task_opts: dict
    :type workflow_level_opts: WorkflowLevelOptions
    :type ep_d: dict
    """
    job_resources = to_job_resources_and_create_dirs(job_root_dir)

    pb_log_path = os.path.join(job_resources.logs, 'pbsmrtpipe.log')
    master_log_path = os.path.join(job_resources.logs, "master.log")

    master_log_level = logging.INFO
    stdout_level = logging.INFO
    if workflow_level_opts.debug_mode:
        master_log_level = logging.DEBUG
        stdout_level = logging.DEBUG

    setup_internal_logs(master_log_path, master_log_level, pb_log_path, stdout_level)

    log.info("Starting pbsmrtpipe {v}".format(v=pbsmrtpipe.get_version()))
    log.info("\n" + _log_pbsmrptipe_header())

    BU.write_binding_graph_images(bg, job_resources.workflow)
    write_entry_points_json(job_resources.entry_points_json, ep_d)

    # Need to map entry points to a FileType and store in the DataStore? or
    # does DataStore only represent outputs?

    # For historical reasons, this is a bit non-obvious. The "master" log now
    # lives at the SMRT Link level, so the pbsmrtpipe "master" log (i.e.,
    # master.log) has been promoted to be the Analysis Details Log, using the
    # pbsmrtpipe::pbsmrtpipe.log source id. There's also a friction point of
    # marketing using "Analysis" vs "pbsmrtpipe", which has generated some
    # inconsistency.
    smrtpipe_log_df = DataStoreFile(str(uuid.uuid4()), GlobalConstants.SOURCE_ID_INFO_LOG,
                                    FileTypes.LOG.file_type_id, pb_log_path,
                                    name="Analysis Log", description="pbsmrtpipe INFO log")
    master_log_df = DataStoreFile(str(uuid.uuid4()), GlobalConstants.SOURCE_ID_MASTER_LOG,
                                  FileTypes.LOG.file_type_id, master_log_path,
                                  name="Analysis Details Log", description="Analysis Details log")

    ds = write_and_initialize_data_store_json(job_resources.datastore_json,
                                              [smrtpipe_log_df, master_log_df])
    slog.info("successfully initialized datastore.")

    write_workflow_settings(workflow_level_opts,
                            os.path.join(job_resources.workflow, 'options-workflow.json'))

    if workflow_level_opts.system_message is not None:
        slog.info("Command: {m}".format(m=workflow_level_opts.system_message))

    slog.info("Entry Points:")
    slog.info("\n" + pprint.pformat(ep_d, indent=4))

    slog.info("Workflow Options:")
    slog.info("\n" + pprint.pformat(workflow_level_opts.to_dict(), indent=4))

    slog.info("Task Options:")
    slog.info("\n" + pprint.pformat(task_opts, indent=4))

    task_opts_path = os.path.join(job_resources.workflow, 'options-task.json')
    with open(task_opts_path, 'w') as f:
        f.write(json.dumps(task_opts, sort_keys=True, indent=4))

    env_path = os.path.join(job_resources.workflow, '.env.json')
    IO.write_env_to_json(env_path)
    log.info("wrote current env to {e}".format(e=env_path))

    slog.info("completed setting up job directory resources and logs in {r}".format(r=job_root_dir))
    return job_resources, ds, master_log_df

def run_task_on_cluster(runnable_task, task_manifest_path, output_dir, debug_mode):
    """
    :param runnable_task:
    :param output_dir:
    :param debug_mode:
    :return:

    :type runnable_task: RunnableTask
    """
    def _to_p(x_):
        return os.path.join(output_dir, x_)

    stdout_ = _to_p('stdout')
    stderr_ = _to_p('stderr')

    if runnable_task.task.is_distributed is False:
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    if runnable_task.cluster is None:
        log.warn("No cluster provided. Running task locally.")
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    os.chdir(runnable_task.task.output_dir)

    env_json = os.path.join(output_dir, '.cluster-env.json')
    IO.write_env_to_json(env_json)

    # sloppy API
    if isinstance(runnable_task.cluster, ClusterTemplateRender):
        render = runnable_task.cluster
    else:
        ctmpls = [ClusterTemplate(name, tmpl) for name, tmpl in runnable_task.cluster.iteritems()]
        render = ClusterTemplateRender(ctmpls)

    job_id = to_random_job_id(runnable_task.task.task_id)
    log.debug("Using job id {i}".format(i=job_id))

    qstdout = _to_p('cluster.stdout')
    qstderr = _to_p('cluster.stderr')
    qshell = _to_p('cluster.sh')
    rcmd_shell = _to_p('run.sh')

    # This needs to be flattened due to the new RTC layer
    # Task Manifest Runner output
    stdout = _to_p('stdout')
    stderr = _to_p('stderr')

    with open(qstdout, 'w+') as f:
        f.write("Creating cluster stdout for Job {i} {r}\n".format(i=job_id, r=runnable_task))

    debug_str = " --debug "
    exe = _resolve_exe("pbtools-runner")
    _d = dict(x=exe, t=task_manifest_path, o=stdout, e=stderr, d=debug_str, m=stdout, n=stderr, r=output_dir)

    # the quoting here is explicitly to handle spaces in paths
    cmd = "{x} run {d} --output-dir=\"{r}\" --task-stderr=\"{e}\" --task-stdout=\"{o}\" \"{t}\" > \"{m}\" 2> \"{n}\"".format(**_d)

    # write the pbtools-runner exe command
    with open(rcmd_shell, 'w+') as x:
        x.write(cmd + "\n")

    chmod_x(rcmd_shell)

    cluster_cmd = render.render(ClusterConstants.START, rcmd_shell, job_id, qstdout, qstderr, runnable_task.task.nproc)
    log.debug(cluster_cmd)

    with open(qshell, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("set -o errexit\n")
        f.write("set -o pipefail\n")
        f.write("set -o nounset\n")
        f.write(cluster_cmd + "\n")
        f.write("exit $?")

    chmod_x(qshell)

    host = platform.node()

    # so core dumps are written to the job dir
    os.chdir(output_dir)

    # Blocking call
    rcode, cstdout, cstderr, run_time = backticks("bash {q}".format(q=qshell))
    log.info("Cluster command return code {r} in {s:.2f} sec".format(r=rcode, s=run_time))

    msg_t = "{n} Completed running cluster command in {t:.2f} sec. Exit code {r} (task-type {i})"
    msg_ = msg_t.format(r=rcode, t=run_time, i=runnable_task.task.task_type_id, n=datetime.datetime.now())
    log.info(msg_)

    # Append the bash cluster.sh stderr and stdout call to
    # the cluster.stderr and cluster.stdout
    with open(qstdout, 'a') as qf:
        if cstdout:
            qf.write("\n".join(cstdout) + "\n")
        qf.write(msg_ + "\n")

    with open(qstderr, 'a') as f:
        if rcode != 0:
            if cstderr:
                f.write(str(cstderr) + "\n")

    # fundamental output error str message of this func
    err_msg = ""
    warn_msg = ""

    if rcode != 0:
        p_err_msg = "task {i} failed (exit-code {x}) after {r:.2f} sec".format(i=runnable_task.task.task_id, r=run_time, x=rcode)
        raw_stderr = _extract_last_nlines(stderr)
        cluster_raw_stderr = _extract_last_nlines(qstderr)
        err_msg = "\n".join([p_err_msg, raw_stderr, cluster_raw_stderr])
        warn_msg = ""

    # write the result status message to stderr if task failure
    # doing this here to avoid having a double message
    with open(qstderr, 'a') as f:
        if rcode != 0:
            if cstderr:
                f.write(msg_ + "\n")

    r = to_task_report(host, runnable_task.task.task_id, run_time, rcode, err_msg, warn_msg)
    task_report_path = os.path.join(output_dir, 'task-report.json')
    msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
    log.info(msg)
    r.write_json(task_report_path)

    return rcode, err_msg, run_time