def from_d(d):
    """
    Rebuild a RunnableTask from its dict form.

    Reads the ``'cluster'``, ``'task'`` and ``'env'`` keys of ``d``. A falsy
    ``'cluster'`` value yields a RunnableTask with no cluster renderer;
    otherwise every ``name -> template`` item becomes a ClusterTemplate and
    the collection is wrapped in a ClusterTemplateRender.

    :param d: dict with the keys ``'cluster'``, ``'task'`` and ``'env'``
    :return: RunnableTask built from ``d``
    """
    # fixme (note kept from the original, attached to this function-scope import)
    from pbsmrtpipe.cluster import ClusterTemplateRender, ClusterTemplate

    cluster_d = d['cluster']
    renderer = None
    if cluster_d:
        renderer = ClusterTemplateRender(
            [ClusterTemplate(name, tmpl) for name, tmpl in cluster_d.iteritems()])

    return RunnableTask(Task.from_d(d['task']), renderer, d['env'])
def run_task_on_cluster(runnable_task, task_manifest_path, output_dir, debug_mode):
    """
    Run a task through the 'interactive' cluster template, or locally when
    the task is not distributed or no cluster configuration is attached.

    On the cluster path the function writes ``env.json``, ``run.sh`` (the
    ``pbtools-runner`` invocation) and ``cluster.sh`` (the rendered cluster
    command) into ``output_dir``, changes the working directory to
    ``output_dir``, runs ``cluster.sh`` via ``backticks``, appends the
    captured output to ``cluster.stdout`` / ``cluster.stderr`` and writes
    ``task-report.json``.

    :param runnable_task: task to run; ``runnable_task.cluster`` may be None,
        a ClusterTemplateRender, or a mapping of template name to template
    :param task_manifest_path: path handed to ``pbtools-runner run``
    :param output_dir: directory that receives every file written here
    :param debug_mode: forwarded to ``run_task`` on the local paths only; the
        cluster path always passes ``--debug`` to the runner
    :return: whatever ``run_task`` returns on the local paths, otherwise the
        tuple ``(rcode, run_time)``

    :type runnable_task: RunnableTask
    """
    def _to_p(x_):
        return os.path.join(output_dir, x_)

    def _make_executable(path):
        # add the owner-execute bit while keeping the existing mode bits
        os.chmod(path, os.stat(path).st_mode | stat.S_IEXEC)

    def _as_text(captured):
        # A list/tuple of lines is joined so the log holds the text itself
        # rather than the Python repr of the list.
        if isinstance(captured, (list, tuple)):
            return "\n".join(captured)
        return str(captured)

    # Task stdout/stderr; used by the local fallback and by the runner command
    stdout = _to_p('stdout')
    stderr = _to_p('stderr')

    if runnable_task.task.is_distributed is False:
        return run_task(runnable_task, output_dir, stdout, stderr, debug_mode)

    if runnable_task.cluster is None:
        log.warning("No cluster provided. Running task locally.")
        return run_task(runnable_task, output_dir, stdout, stderr, debug_mode)

    env_json = os.path.join(output_dir, 'env.json')
    IO.write_env_to_json(env_json)

    # sloppy API: the cluster attribute is either a ready renderer or a
    # mapping of template name -> template string
    if isinstance(runnable_task.cluster, ClusterTemplateRender):
        render = runnable_task.cluster
    else:
        ctmpls = [ClusterTemplate(name, tmpl)
                  for name, tmpl in runnable_task.cluster.iteritems()]
        render = ClusterTemplateRender(ctmpls)

    job_id = to_random_job_id(runnable_task.task.task_id)
    log.debug("Using job id {i}".format(i=job_id))

    qstdout = _to_p('cluster.stdout')
    qstderr = _to_p('cluster.stderr')
    qshell = _to_p('cluster.sh')
    rcmd_shell = _to_p('run.sh')

    # Task Manifest Runner output (shell redirection targets in run.sh)
    mstdout = _to_p('mstdout')
    mstderr = _to_p('mstderr')

    with open(qstdout, 'w+') as f:
        f.write("Creating cluster stdout for Job {i} {r}\n".format(i=job_id, r=runnable_task))

    debug_str = " --debug "
    _d = dict(t=task_manifest_path, o=stdout, e=stderr, d=debug_str, m=mstdout, n=mstderr)
    cmd = "pbtools-runner run {d} --task-stderr=\"{e}\" --task-stdout=\"{o}\" \"{t}\" > \"{m}\" 2> \"{n}\"".format(**_d)

    with open(rcmd_shell, 'w+') as x:
        x.write(cmd + "\n")
    _make_executable(rcmd_shell)

    cluster_cmd = render.render('interactive', rcmd_shell, job_id, qstdout, qstderr, runnable_task.task.nproc)
    log.debug(cluster_cmd)

    with open(qshell, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write(cluster_cmd + "\n")
    _make_executable(qshell)

    host = platform.node()

    # so core dumps are written to the job dir
    os.chdir(output_dir)

    rcode, cstdout, cstderr, run_time = backticks("bash {q}".format(q=qshell))

    # not sure how to scrape a better message from the stderr/stdout
    err_msg = "" if rcode == 0 else "task {i} failed".format(i=runnable_task.task.task_id)
    warn_msg = ""

    msg_ = "Completed running cluster command in {t:.2f} sec. Exit code {r}".format(r=rcode, t=run_time)
    log.info(msg_)

    with open(qstdout, 'a') as qf:
        if cstdout:
            qf.write(_as_text(cstdout) + "\n")
        qf.write(msg_ + "\n")

    # cluster.stderr only receives output when the command failed
    with open(qstderr, 'a') as f:
        if rcode != 0:
            f.write(str(cstderr) + "\n")
            f.write(msg_ + "\n")

    r = to_task_report(host, runnable_task.task.task_id, run_time, rcode, err_msg, warn_msg)
    task_report_path = os.path.join(output_dir, 'task-report.json')
    msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
    log.info(msg)
    r.write_json(task_report_path)

    return rcode, run_time
def run_task_on_cluster(runnable_task, task_manifest_path, output_dir, debug_mode):
    """
    Run a task through the cluster START template, falling back to a local
    ``run_task`` call when the task is not distributed or has no cluster.

    On the cluster path this writes ``.cluster-env.json``, ``run.sh`` (the
    ``pbtools-runner`` invocation) and ``cluster.sh`` (the rendered submission
    command) into ``output_dir``, runs ``cluster.sh`` with ``backticks``,
    appends the captured output to ``cluster.stdout`` / ``cluster.stderr``
    and writes ``task-report.json``. The process working directory is changed
    twice along the way (task output dir, then ``output_dir``).

    :param runnable_task: task to run; ``runnable_task.cluster`` may be None,
        a ClusterTemplateRender, or a mapping of template name to template
    :param task_manifest_path: path handed to ``pbtools-runner run``
    :param output_dir: directory that receives every file written here
    :param debug_mode: forwarded to ``run_task`` on the local paths only; the
        cluster path always passes ``--debug`` to the runner
    :return: whatever ``run_task`` returns on the local paths, otherwise the
        tuple ``(rcode, err_msg, run_time)``

    :type runnable_task: RunnableTask
    """
    def _to_p(x_):
        # resolve a file name inside the job's output directory
        return os.path.join(output_dir, x_)

    stdout_ = _to_p('stdout')
    stderr_ = _to_p('stderr')

    # Local path 1: the task is explicitly flagged as not distributed
    if runnable_task.task.is_distributed is False:
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    # Local path 2: distributed task, but nothing to submit it with
    if runnable_task.cluster is None:
        log.warn("No cluster provided. Running task locally.")
        return run_task(runnable_task, output_dir, stdout_, stderr_, debug_mode)

    os.chdir(runnable_task.task.output_dir)
    env_json = os.path.join(output_dir, '.cluster-env.json')
    IO.write_env_to_json(env_json)

    # sloppy API: cluster is either a ready renderer or a mapping of
    # template name -> template that still has to be wrapped
    if isinstance(runnable_task.cluster, ClusterTemplateRender):
        render = runnable_task.cluster
    else:
        ctmpls = [ClusterTemplate(name, tmpl) for name, tmpl in runnable_task.cluster.iteritems()]
        render = ClusterTemplateRender(ctmpls)

    job_id = to_random_job_id(runnable_task.task.task_id)
    log.debug("Using job id {i}".format(i=job_id))

    qstdout = _to_p('cluster.stdout')
    qstderr = _to_p('cluster.stderr')
    qshell = _to_p('cluster.sh')

    rcmd_shell = _to_p('run.sh')

    # This needs to be flattened due to the new RTC layer
    # Task Manifest Runner output
    stdout = _to_p('stdout')
    stderr = _to_p('stderr')

    # 'w+' truncates any cluster.stdout left over from a previous run
    with open(qstdout, 'w+') as f:
        f.write("Creating cluster stdout for Job {i} {r}\n".format(i=job_id, r=runnable_task))

    # debug_mode is not consulted here: the runner is always given --debug
    debug_str = " --debug "
    exe = _resolve_exe("pbtools-runner")
    # m/n (shell redirection targets) point at the same files as o/e
    # (--task-stdout / --task-stderr)
    _d = dict(x=exe, t=task_manifest_path, o=stdout, e=stderr, d=debug_str, m=stdout, n=stderr, r=output_dir)

    # the quoting here is explicitly to handle spaces in paths
    cmd = "{x} run {d} --output-dir=\"{r}\" --task-stderr=\"{e}\" --task-stdout=\"{o}\" \"{t}\" > \"{m}\" 2> \"{n}\"".format(**_d)

    # write the pbtools-runner exe command
    with open(rcmd_shell, 'w+') as x:
        x.write(cmd + "\n")

    chmod_x(rcmd_shell)

    cluster_cmd = render.render(ClusterConstants.START, rcmd_shell, job_id, qstdout, qstderr, runnable_task.task.nproc)
    log.info("Job submission command: " + cluster_cmd)

    with open(qshell, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("set -o errexit\n")
        f.write("set -o pipefail\n")
        f.write("set -o nounset\n")
        # ${1+"$@"} forwards any arguments given to cluster.sh to the cluster
        # command; this is how --printcmd below reaches it
        f.write(cluster_cmd.rstrip("\n") + " ${1+\"$@\"}\n")
        f.write("exit $?")

    chmod_x(qshell)

    host = platform.node()

    # so core dumps are written to the job dir
    os.chdir(output_dir)

    # print the underlying jms command if using runjmscmd
    if re.search(r'/runjmscmd\b', cluster_cmd):
        rcode, cstdout, cstderr, run_time = backticks("bash {q} --printcmd".format(q=qshell))
        if rcode == 0:
            # cstdout is joined with newlines, so it is presumably a list of
            # lines -- confirm against backticks
            log.info("Underlying JMS job submission command: " + "\n".join(cstdout))

    # Blocking call; rebinds rcode/cstdout/cstderr/run_time from the
    # --printcmd probe above
    rcode, cstdout, cstderr, run_time = backticks("bash {q}".format(q=qshell))

    log.info("Cluster command return code {r} in {s:.2f} sec".format(r=rcode, s=run_time))

    msg_t = "{n} Completed running cluster command in {t:.2f} sec. Exit code {r} (task-type {i})"
    msg_ = msg_t.format(r=rcode, t=run_time, i=runnable_task.task.task_type_id, n=datetime.datetime.now())
    log.info(msg_)

    # Append the bash cluster.sh stderr and stdout call to
    # the cluster.stderr and cluster.stdout
    with open(qstdout, 'a') as qf:
        if cstdout:
            qf.write("\n".join(cstdout) + "\n")
        qf.write(msg_ + "\n")

    # cluster.stderr is opened for append in every case, but only written on
    # failure with non-empty captured stderr
    with open(qstderr, 'a') as f:
        if rcode != 0:
            if cstderr:
                f.write(str(cstderr) + "\n")

    # fundamental output error str message of this func
    err_msg = ""
    warn_msg = ""

    if rcode != 0:
        p_err_msg = "task {i} failed (exit-code {x}) after {r:.2f} sec".format(i=runnable_task.task.task_id, r=run_time, x=rcode)
        # Both files are read before the status line is appended below, so
        # the extracted text does not contain msg_
        raw_stderr = _extract_last_nlines(stderr)
        cluster_raw_stderr = _extract_last_nlines(qstderr)
        err_msg = "\n".join([p_err_msg, raw_stderr, cluster_raw_stderr])
        warn_msg = ""

    # write the result status message to stderr if task failure
    # doing this here to avoid having a double message
    with open(qstderr, 'a') as f:
        if rcode != 0:
            if cstderr:
                f.write(msg_ + "\n")

    r = to_task_report(host, runnable_task.task.task_id, run_time, rcode, err_msg, warn_msg)
    task_report_path = os.path.join(output_dir, 'task-report.json')
    msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
    log.info(msg)
    r.write_json(task_report_path)

    return rcode, err_msg, run_time
def test_cluster_template_bad_template_type(self):
    """Constructing a ClusterTemplate with the type 'BAD TYPE' raises ValueError."""
    with self.assertRaises(ValueError) as ctx:
        ClusterTemplate('BAD TYPE', 'qdel ${JOB_ID}')
    # Log after the block, and log the exception itself: a statement placed
    # after the raising call inside the block never runs, and the context
    # object's own str() carries no error text.
    log.error(ctx.exception)
def test_cluster_template_str(self):
    """str() and repr() of a 'kill' ClusterTemplate both return a value."""
    template = ClusterTemplate('kill', 'qdel ${JOB_ID}')
    for to_text in (str, repr):
        self.assertIsNotNone(to_text(template))
def run_task_on_cluster(runnable_task, task_manifest_path, output_dir, debug_mode):
    """
    Run a task through the cluster START template, or locally when the task
    is not distributed or no cluster configuration is attached.

    On the cluster path the function writes ``.cluster-env.json``, ``run.sh``
    (the ``pbtools-runner`` invocation) and ``cluster.sh`` (the rendered
    cluster command) into ``output_dir``, runs ``cluster.sh`` via
    ``backticks``, appends the captured output to ``cluster.stdout`` /
    ``cluster.stderr`` and writes ``task-report.json``. The process working
    directory is changed twice along the way (task output dir, then
    ``output_dir``).

    :param runnable_task: task to run; ``runnable_task.cluster`` may be None,
        a ClusterTemplateRender, or a mapping of template name to template
    :param task_manifest_path: path handed to ``pbtools-runner run``
    :param output_dir: directory that receives every file written here
    :param debug_mode: forwarded to ``run_task`` on the local paths only; the
        cluster path always passes ``--debug`` to the runner
    :return: whatever ``run_task`` returns on the local paths, otherwise the
        tuple ``(rcode, err_msg, run_time)``

    :type runnable_task: RunnableTask
    """
    def _to_p(x_):
        return os.path.join(output_dir, x_)

    def _make_executable(path):
        # add the owner-execute bit while keeping the existing mode bits
        os.chmod(path, os.stat(path).st_mode | stat.S_IEXEC)

    def _as_text(captured):
        # A list/tuple of lines is joined so the log holds the text itself
        # rather than the Python repr of the list.
        if isinstance(captured, (list, tuple)):
            return "\n".join(captured)
        return str(captured)

    # Task stdout/stderr; used by the local fallback and by the runner command
    stdout = _to_p('stdout')
    stderr = _to_p('stderr')

    if runnable_task.task.is_distributed is False:
        return run_task(runnable_task, output_dir, stdout, stderr, debug_mode)

    if runnable_task.cluster is None:
        log.warning("No cluster provided. Running task locally.")
        return run_task(runnable_task, output_dir, stdout, stderr, debug_mode)

    os.chdir(runnable_task.task.output_dir)
    env_json = os.path.join(output_dir, '.cluster-env.json')
    IO.write_env_to_json(env_json)

    # sloppy API: the cluster attribute is either a ready renderer or a
    # mapping of template name -> template string
    if isinstance(runnable_task.cluster, ClusterTemplateRender):
        render = runnable_task.cluster
    else:
        ctmpls = [ClusterTemplate(name, tmpl)
                  for name, tmpl in runnable_task.cluster.iteritems()]
        render = ClusterTemplateRender(ctmpls)

    job_id = to_random_job_id(runnable_task.task.task_id)
    log.debug("Using job id {i}".format(i=job_id))

    qstdout = _to_p('cluster.stdout')
    qstderr = _to_p('cluster.stderr')
    qshell = _to_p('cluster.sh')
    rcmd_shell = _to_p('run.sh')

    with open(qstdout, 'w+') as f:
        f.write("Creating cluster stdout for Job {i} {r}\n".format(i=job_id, r=runnable_task))

    # This needs to be flattened due to the new RTC layer: the shell
    # redirection targets (m/n) are the same files as the task's
    # --task-stdout / --task-stderr (o/e).
    debug_str = " --debug "
    exe = _resolve_exe("pbtools-runner")
    _d = dict(x=exe, t=task_manifest_path, o=stdout, e=stderr, d=debug_str, m=stdout, n=stderr, r=output_dir)
    cmd = "{x} run {d} --output-dir=\"{r}\" --task-stderr=\"{e}\" --task-stdout=\"{o}\" \"{t}\" > \"{m}\" 2> \"{n}\"".format(**_d)

    with open(rcmd_shell, 'w+') as x:
        x.write(cmd + "\n")
    _make_executable(rcmd_shell)

    cluster_cmd = render.render(ClusterConstants.START, rcmd_shell, job_id, qstdout, qstderr, runnable_task.task.nproc)
    log.debug(cluster_cmd)

    with open(qshell, 'w') as f:
        f.write("#!/bin/bash\n")
        f.write("set -o errexit\n")
        f.write("set -o pipefail\n")
        f.write("set -o nounset\n")
        f.write(cluster_cmd + "\n")
        f.write("exit $?")
    _make_executable(qshell)

    host = platform.node()

    # so core dumps are written to the job dir
    os.chdir(output_dir)

    rcode, cstdout, cstderr, run_time = backticks("bash {q}".format(q=qshell))

    log.info("Cluster command return code {r} in {s:.2f} sec".format(r=rcode, s=run_time))

    err_msg = ""
    warn_msg = ""
    if rcode != 0:
        p_err_msg = "task {i} failed (exit-code {x}) after {r:.2f} sec".format(i=runnable_task.task.task_id, r=run_time, x=rcode)
        # Read both files before anything is appended to cluster.stderr below
        raw_stderr = _extract_last_nlines(stderr)
        cluster_raw_stderr = _extract_last_nlines(qstderr)
        err_msg = "\n".join([p_err_msg, raw_stderr, cluster_raw_stderr])

    msg_ = "Completed running cluster command in {t:.2f} sec. Exit code {r} (task-type {i})".format(r=rcode, t=run_time, i=runnable_task.task.task_type_id)
    log.info(msg_)

    with open(qstdout, 'a') as qf:
        if cstdout:
            qf.write(_as_text(cstdout) + "\n")
        qf.write(msg_ + "\n")

    # cluster.stderr only receives output when the command failed
    with open(qstderr, 'a') as f:
        if rcode != 0:
            f.write(str(cstderr) + "\n")
            f.write(msg_ + "\n")

    r = to_task_report(host, runnable_task.task.task_id, run_time, rcode, err_msg, warn_msg)
    task_report_path = os.path.join(output_dir, 'task-report.json')
    msg = "Writing task id {i} task report to {r}".format(r=task_report_path, i=runnable_task.task.task_id)
    log.info(msg)
    r.write_json(task_report_path)

    return rcode, err_msg, run_time
def test_cluster_template_str(self):
    """Both text forms of a STOP ClusterTemplate are produced (not None)."""
    stop_template = ClusterTemplate(ClusterConstants.STOP, 'qdel ${JOB_ID}')

    as_str = str(stop_template)
    self.assertIsNotNone(as_str)

    as_repr = repr(stop_template)
    self.assertIsNotNone(as_repr)