def __exe_workflow(global_registry, ep_d, bg, task_opts, workflow_opts, output_dir, workers, shutdown_event, service_uri_or_none): """ Core runner of a workflow. :type bg: BindingsGraph :type workflow_opts: WorkflowLevelOptions :type output_dir: str :type service_uri_or_none: str | None :param workers: {taskid:Worker} :return: The function is doing way too much. This is really terrible. """ job_id = random.randint(100000, 999999) started_at = time.time() m_ = "Distributed" if workflow_opts.distributed_mode is not None else "Local" slog.info("starting to execute {m} workflow with assigned job_id {i}".format(i=job_id, m=m_)) log.info("exe'ing workflow Cluster renderer {c}".format(c=global_registry.cluster_renderer)) slog.info("Service URI: {i} {t}".format(i=service_uri_or_none, t=type(service_uri_or_none))) slog.info("Max number of Chunks {n} ".format(n=workflow_opts.max_nchunks)) slog.info("Max number of nproc {n}".format(n=workflow_opts.max_nproc)) slog.info("Max number of workers {n}".format(n=workflow_opts.max_nworkers)) # Setup logger, job directory and initialize DS slog.info("creating job resources in {o}".format(o=output_dir)) job_resources, ds = DU.job_resource_create_and_setup_logs(output_dir, bg, task_opts, workflow_opts, ep_d) slog.info("successfully created job resources.") # Some Pre-flight checks # Help initialize graph/epoints B.resolve_entry_points(bg, ep_d) # Update Task-esque EntryBindingPoint B.resolve_entry_binding_points(bg) # initialization File node attributes for eid, path in ep_d.iteritems(): B.resolve_entry_point(bg, eid, path) B.resolve_successor_binding_file_path(bg) # Mark Chunkable tasks B.label_chunkable_tasks(bg, global_registry.chunk_operators) slog.info("validating binding graph") # Check the degree of the nodes B.validate_binding_graph_integrity(bg) slog.info("successfully validated binding graph.") # Add scattered # This will add new nodes to the graph if necessary B.apply_chunk_operator(bg, global_registry.chunk_operators, global_registry.tasks, workflow_opts.max_nchunks) # log.info(BU.to_binding_graph_summary(bg)) # "global" file type id counter {str: int} that will be # used to generate ids file_type_id_to_count = {file_type.file_type_id: 0 for _, file_type in global_registry.file_types.iteritems()} # Create a closure to allow file types to assign unique id # this returns a func to_resolve_files_func = B.to_resolve_files(file_type_id_to_count) # Local vars max_total_nproc = workflow_opts.total_max_nproc max_nworkers = workflow_opts.max_nworkers max_nproc = workflow_opts.max_nproc max_nchunks = workflow_opts.max_nchunks tmp_dir = workflow_opts.tmp_dir q_out = multiprocessing.Queue() worker_sleep_time = 1 # To store all the reports that are displayed in the analysis.html # {id:task-id, report_path:path/to/report.json} analysis_file_links = [] # Flag for pipeline execution failure has_failed = False # Time to sleep between each step the execution loop # after the first 1 minute of exe, update the sleep time to 2 sec sleep_time = 1 # Running total of current number of slots/cpu's used total_nproc = 0 # Define a bunch of util funcs to try to make the main driver while loop # more understandable. Not the greatest model. def _to_run_time(): return time.time() - started_at def write_analysis_report(analysis_file_links_): analysis_report_html = os.path.join(job_resources.html, 'analysis.html') R.write_analysis_link_report(analysis_file_links_, analysis_report_html) def update_analysis_file_links(task_id_, report_path_): analysis_link = AnalysisLink(task_id_, report_path_) log.info("updating report analysis file links {a}".format(a=analysis_link)) analysis_file_links.append(analysis_link) write_analysis_report(analysis_file_links) # factories for getting a Worker instance # utils for getting the running func and worker type def _to_worker(w_is_distributed, wid, task_id, manifest_path_): # the IO loading will forceful set this to None # if the cluster manager not defined or cluster_mode is False if global_registry.cluster_renderer is None or not w_is_distributed: r_func = T.run_task_manifest else: r_func = T.run_task_manifest_on_cluster return TaskManifestWorker(q_out, shutdown_event, worker_sleep_time, r_func, task_id, manifest_path_, name=wid) # Define a bunch of util funcs to try to make the main driver while loop # more understandable. Not the greatest model. def write_report_(bg_, current_state_, was_successful_): return DU.write_main_workflow_report(job_id, job_resources, workflow_opts, task_opts, bg_, current_state_, was_successful_, _to_run_time()) def write_task_summary_report(bg_): task_summary_report = DU.to_task_summary_report(bg_) p = os.path.join(job_resources.html, 'task_summary.html') R.write_report_to_html(task_summary_report, p) def services_log_update_progress(source_id_, level_, message_): if service_uri_or_none is not None: total_log_uri = "{u}/log".format(u=service_uri_or_none) WS.log_pbsmrtpipe_progress(total_log_uri, message_, level_, source_id_, ignore_errors=True) def services_add_datastore_file(datastore_file_): if service_uri_or_none is not None: total_ds_uri = "{u}/datastore".format(u=service_uri_or_none) WS.add_datastore_file(total_ds_uri, datastore_file_, ignore_errors=True) def _update_analysis_reports_and_datastore(tnode_, task_): for file_type_, path_ in zip(tnode_.meta_task.output_types, task_.output_files): source_id = "{t}-{f}".format(t=task_.task_id, f=file_type_.file_type_id) ds_uuid = _get_dataset_uuid_or_create_uuid(path_) ds_file_ = DataStoreFile(ds_uuid, source_id, file_type_.file_type_id, path_) ds.add(ds_file_) ds.write_update_json(job_resources.datastore_json) # Update Services services_add_datastore_file(ds_file_) dsr = DU.datastore_to_report(ds) R.write_report_to_html(dsr, os.path.join(job_resources.html, 'datastore.html')) if file_type_ == FileTypes.REPORT: T.write_task_report(job_resources, task_.task_id, path_, DU._get_images_in_dir(task_.output_dir)) update_analysis_file_links(tnode_.idx, path_) def _log_task_failure_and_call_services(path_to_stderr, task_id_): """log the error messages extracted from stderr""" lines = _get_last_lines_of_stderr(20, path_to_stderr) for line_ in lines: log.error(line_.strip()) # these already have newlines sys.stderr.write(line_) sys.stderr.write("\n") services_log_update_progress("pbsmrtpipe::{i}".format(i=task_id_), WS.LogLevels.ERROR, "\n".join(lines)) def has_available_slots(n): if max_total_nproc is None: return True return total_nproc + n <= max_total_nproc # Misc setup write_report_(bg, TaskStates.CREATED, False) # write empty analysis reports write_analysis_report(analysis_file_links) BU.write_binding_graph_images(bg, job_resources.workflow) # For book-keeping # task id -> tnode tid_to_tnode = {} # tnode -> Task instance tnode_to_task = {} # local loop for adjusting sleep time, this will get reset after each new # task is created niterations = 0 dt_ramp = 0.25 # number of iterations before switching to steady state sleep stead_state_n = 50 # sleep for 5 sec dt_stead_state = 4 try: log.debug("Starting execution loop... in process {p}".format(p=os.getpid())) # After the initial startup, bump up the time to reduce resource usage # (since multiple instances will be launched from the services) niterations += 1 if niterations < stead_state_n: sleep_time = dt_ramp else: sleep_time = dt_stead_state while True: # Convert Task -> ScatterAble task (emits a Chunk.json file) B.apply_scatterable(bg, global_registry.chunk_operators, global_registry.tasks) # This will add new TaskBinding nodes to the graph if necessary B.apply_chunk_operator(bg, global_registry.chunk_operators, global_registry.tasks, max_nchunks) # B.write_binding_graph_images(bg, job_resources.workflow) # If a TaskScatteredBindingNode is completed successfully and # output chunk.json is resolved, read in the file and # generate the new chunked tasks. This mutates the graph # significantly. B.add_gather_to_completed_task_chunks(bg, global_registry.chunk_operators, global_registry.tasks, job_resources.tasks) if not _are_workers_alive(workers): for tix_, w_ in workers.iteritems(): if not w_.is_alive(): log.warn("Worker {i} (pid {p}) is not alive for task {x}. Worker exit code {e}.".format(i=w_.name, p=w_.pid, e=w_.exitcode, x=tix_)) #w_.terminate() # Check if Any tasks are running or that there still runnable tasks is_completed = bg.is_workflow_complete() has_task_running = B.has_running_task(bg) if not has_task_running: if not is_completed: if not B.has_task_in_states(bg, TaskStates.RUNNABLE_STATES()): if not B.has_next_runnable_task(bg): msg = "Unable to find runnable task or any tasks running and workflow is NOT completed." log.error(msg) log.error(BU.to_binding_graph_summary(bg)) services_log_update_progress("pbsmrtpipe", WS.LogLevels.ERROR, msg) raise PipelineRuntimeError(msg) time.sleep(sleep_time) # log.debug("Sleeping for {s}".format(s=sleep_time)) log.debug("\n" + BU.to_binding_graph_summary(bg)) # BU.to_binding_graph_task_summary(bg) # This should only be triggered after events. The main reason # to keep updating it was the html report is up to date with the # runtime write_report_(bg, TaskStates.RUNNING, is_completed) if is_completed: msg_ = "Workflow is completed. breaking out." log.info(msg_) services_log_update_progress("pbsmrtpipe", WS.LogLevels.INFO, msg_) break try: result = q_out.get_nowait() except Queue.Empty: result = None # log.info("Results {r}".format(r=result)) if isinstance(result, TaskResult): niterations = 0 log.debug("Task result {r}".format(r=result)) tid_, state_, msg_, run_time_ = result tnode_ = tid_to_tnode[tid_] task_ = tnode_to_task[tnode_] # Process Successful Task Result if state_ == TaskStates.SUCCESSFUL: msg_ = "Task was successful {r}".format(r=result) slog.info(msg_) # this will raise if a task output is failed to be resolved B.validate_outputs_and_update_task_to_success(bg, tnode_, run_time_, bg.node[tnode_]['task'].output_files) slog.info("Successfully validated outputs of {t}".format(t=repr(tnode_))) services_log_update_progress("pbsmrtpipe::{i}".format(i=tid_), WS.LogLevels.INFO, msg_) B.update_task_output_file_nodes(bg, tnode_, tnode_to_task[tnode_]) B.resolve_successor_binding_file_path(bg) total_nproc -= task_.nproc w_ = workers.pop(tid_) _terminate_worker(w_) # Update Analysis Reports and Register output files to Datastore _update_analysis_reports_and_datastore(tnode_, task_) BU.write_binding_graph_images(bg, job_resources.workflow) else: # Process Non-Successful Task Result B.update_task_state(bg, tnode_, state_) slog.error(result.error_message) log.error(result.error_message + "\n") sys.stderr.write(result.error_message + "\n") _log_task_failure_and_call_services(result.error_message, tid_) # let the remaining running jobs continue w_ = workers.pop(tid_) _terminate_worker(w_) total_nproc -= task_.nproc has_failed = True BU.write_binding_graph_images(bg, job_resources.workflow) _update_msg = _status(bg) log.info(_update_msg) slog.info(_update_msg) s_ = TaskStates.FAILED if has_failed else TaskStates.RUNNING write_report_(bg, s_, False) write_task_summary_report(bg) elif isinstance(result, types.NoneType): pass else: log.error("Unexpected queue result type {t} {r}".format(t=type(result), r=result)) if has_failed: log.error("job has failed. breaking out.") # Just kill everything break # Computational resources are tapped if len(workers) >= max_nworkers: # don't do anything continue tnode = B.get_next_runnable_task(bg) if tnode is None: continue elif isinstance(tnode, TaskBindingNode): niterations = 0 # Found a Runnable Task # base task_id-instance_id tid = '-'.join([tnode.meta_task.task_id, str(tnode.instance_id)]) task_dir = os.path.join(job_resources.tasks, tid) if not os.path.exists(task_dir): os.mkdir(task_dir) to_resources_func = B.to_resolve_di_resources(task_dir, root_tmp_dir=workflow_opts.tmp_dir) input_files = B.get_task_input_files(bg, tnode) # convert metatask -> task try: task = GX.meta_task_to_task(tnode.meta_task, input_files, task_opts, task_dir, max_nproc, max_nchunks, to_resources_func, to_resolve_files_func) except Exception as e: slog.error("Failed to convert metatask {i} to task. {m}".format(i=tnode.meta_task.task_id, m=e.message)) raise # log.debug(task) bg.node[tnode]['nproc'] = task.nproc if not has_available_slots(task.nproc): # not enough slots to run in continue bg.node[tnode]['task'] = task tnode_to_task[tnode] = task if isinstance(tnode.meta_task, (ToolContractMetaTask, ScatterToolContractMetaTask, GatherToolContractMetaTask)): # the task.options have actually already been resolved here, but using this other # code path for clarity if isinstance(tnode.meta_task, ToolContractMetaTask): rtc = IO.static_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, tmp_dir, max_nproc) elif isinstance(tnode.meta_task, ScatterToolContractMetaTask): rtc = IO.static_scatter_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, tmp_dir, max_nproc, max_nchunks, tnode.meta_task.chunk_keys) elif isinstance(tnode.meta_task, GatherToolContractMetaTask): # this should always be a TaskGatherBindingNode which will have a .chunk_key rtc = IO.static_gather_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, tmp_dir, max_nproc, tnode.chunk_key) else: raise TypeError("Unsupported task type {t}".format(t=tnode.meta_task)) # write driver manifest, which calls the resolved-tool-contract.json # there's too many layers of indirection here. Partly due to the pre-tool-contract era # python defined tasks. # Always write the RTC json for debugging purposes tc_path = os.path.join(task_dir, GlobalConstants.TOOL_CONTRACT_JSON) write_tool_contract(tnode.meta_task.tool_contract, tc_path) rtc_json_path = os.path.join(task_dir, GlobalConstants.RESOLVED_TOOL_CONTRACT_JSON) rtc_avro_path = os.path.join(task_dir, GlobalConstants.RESOLVED_TOOL_CONTRACT_AVRO) if rtc.driver.serialization == 'avro': # hack to fix command task.cmds[0] = task.cmds[0].replace('.json', '.avro') write_resolved_tool_contract_avro(rtc, rtc_avro_path) # for debugging write_resolved_tool_contract(rtc, rtc_json_path) runnable_task_path = os.path.join(task_dir, GlobalConstants.RUNNABLE_TASK_JSON) runnable_task = RunnableTask(task, global_registry.cluster_renderer) runnable_task.write_json(runnable_task_path) # Create an instance of Worker w = _to_worker(tnode.meta_task.is_distributed, "worker-task-{i}".format(i=tid), tid, runnable_task_path) workers[tid] = w w.start() total_nproc += task.nproc slog.info("Starting worker {i} ({n} workers running, {m} total proc in use)".format(i=tid, n=len(workers), m=total_nproc)) # Submit job to be run. B.update_task_state(bg, tnode, TaskStates.SUBMITTED) msg_ = "Updating task {t} to SUBMITTED".format(t=tid) log.debug(msg_) tid_to_tnode[tid] = tnode services_log_update_progress("pbsmrtpipe::{i}".format(i=tnode.idx), WS.LogLevels.INFO, msg_) BU.write_binding_graph_images(bg, job_resources.workflow) elif isinstance(tnode, EntryOutBindingFileNode): # Handle EntryPoint types. This is not a particularly elegant design :( bg.node[tnode]['nproc'] = 1 log.info("Marking task as completed {t}".format(t=tnode)) B.update_task_state_to_success(bg, tnode, 0.0) # Update output paths mock_file_index = 0 for fnode in bg.successors(tnode): file_path = "/path/to/mock-file-{i}.txt".format(i=mock_file_index) B.update_file_state_to_resolved(bg, fnode, file_path) mock_file_index += 1 else: raise TypeError("Unsupported node type {t} of '{x}'".format(t=type(tnode), x=tnode)) # Update state of any files B.resolve_successor_binding_file_path(bg) # Final call to write empty analysis reports write_analysis_report(analysis_file_links) # end of while loop _terminate_all_workers(workers.values(), shutdown_event) if has_failed: log.debug("\n" + BU.to_binding_graph_summary(bg)) was_successful = B.was_workflow_successful(bg) s_ = TaskStates.SUCCESSFUL if was_successful else TaskStates.FAILED write_report_(bg, s_, was_successful) except PipelineRuntimeKeyboardInterrupt: write_report_(bg, TaskStates.KILLED, False) write_task_summary_report(bg) BU.write_binding_graph_images(bg, job_resources.workflow) was_successful = False except Exception as e: slog.error("Unexpected error {e}. Writing reports and shutting down".format(e=e.message)) # update workflow reports to failed write_report_(bg, TaskStates.FAILED, False) write_task_summary_report(bg) services_log_update_progress("pbsmrtpipe", WS.LogLevels.ERROR, "Error {e}".format(e=e)) BU.write_binding_graph_images(bg, job_resources.workflow) raise finally: write_task_summary_report(bg) BU.write_binding_graph_images(bg, job_resources.workflow) return True if was_successful else False
def __exe_workflow(global_registry, ep_d, bg, task_opts, workflow_opts, output_dir, workers, shutdown_event, service_uri_or_none): """ Core runner of a workflow. :type bg: BindingsGraph :type workflow_opts: WorkflowLevelOptions :type output_dir: str :type service_uri_or_none: str | None :param workers: {taskid:Worker} :return: The function is doing way too much. This is really terrible. """ job_id = random.randint(100000, 999999) started_at = time.time() m_ = "Distributed" if workflow_opts.distributed_mode is not None else "Local" slog.info("starting to execute {m} workflow with assigned job_id {i}".format(i=job_id, m=m_)) log.info("exe'ing workflow Cluster renderer {c}".format(c=global_registry.cluster_renderer)) slog.info("Service URI: {i} {t}".format(i=service_uri_or_none, t=type(service_uri_or_none))) slog.info("Max number of Chunks {n} ".format(n=workflow_opts.max_nchunks)) slog.info("Max number of nproc {n}".format(n=workflow_opts.max_nproc)) slog.info("Max number of workers {n}".format(n=workflow_opts.max_nworkers)) # Setup logger, job directory and initialize DS slog.info("creating job resources in {o}".format(o=output_dir)) job_resources, ds = DU.job_resource_create_and_setup_logs(output_dir, bg, task_opts, workflow_opts, ep_d) slog.info("successfully created job resources.") # Some Pre-flight checks # Help initialize graph/epoints B.resolve_entry_points(bg, ep_d) # Update Task-esque EntryBindingPoint B.resolve_entry_binding_points(bg) # initialization File node attributes for eid, path in ep_d.iteritems(): B.resolve_entry_point(bg, eid, path) B.resolve_successor_binding_file_path(bg) # Mark Chunkable tasks B.label_chunkable_tasks(bg, global_registry.chunk_operators) slog.info("validating binding graph") # Check the degree of the nodes B.validate_binding_graph_integrity(bg) slog.info("successfully validated binding graph.") # Add scattered # This will add new nodes to the graph if necessary B.apply_chunk_operator(bg, global_registry.chunk_operators, global_registry.tasks, workflow_opts.max_nchunks) # log.info(BU.to_binding_graph_summary(bg)) # "global" file type id counter {str: int} that will be # used to generate ids file_type_id_to_count = {file_type.file_type_id: 0 for _, file_type in global_registry.file_types.iteritems()} # Create a closure to allow file types to assign unique id # this returns a func to_resolve_files_func = B.to_resolve_files(file_type_id_to_count) # Local vars max_total_nproc = workflow_opts.total_max_nproc max_nworkers = workflow_opts.max_nworkers max_nproc = workflow_opts.max_nproc max_nchunks = workflow_opts.max_nchunks q_out = multiprocessing.Queue() worker_sleep_time = 1 # To store all the reports that are displayed in the analysis.html # {id:task-id, report_path:path/to/report.json} analysis_file_links = [] # Flag for pipeline execution failure has_failed = False # Time to sleep between each step the execution loop # after the first 1 minute of exe, update the sleep time to 2 sec sleep_time = 1 # Running total of current number of slots/cpu's used total_nproc = 0 # Define a bunch of util funcs to try to make the main driver while loop # more understandable. Not the greatest model. def _to_run_time(): return time.time() - started_at def write_analysis_report(analysis_file_links_): analysis_report_html = os.path.join(job_resources.html, 'analysis.html') R.write_analysis_link_report(analysis_file_links_, analysis_report_html) def update_analysis_file_links(task_id_, report_path_): analysis_link = AnalysisLink(task_id_, report_path_) log.info("updating report analysis file links {a}".format(a=analysis_link)) analysis_file_links.append(analysis_link) write_analysis_report(analysis_file_links) # factories for getting a Worker instance # utils for getting the running func and worker type def _to_worker(w_is_distributed, wid, task_id, manifest_path_): # the IO loading will forceful set this to None # if the cluster manager not defined or cluster_mode is False if global_registry.cluster_renderer is None or not w_is_distributed: r_func = T.run_task_manifest else: r_func = T.run_task_manifest_on_cluster return TaskManifestWorker(q_out, shutdown_event, worker_sleep_time, r_func, task_id, manifest_path_, name=wid) # Define a bunch of util funcs to try to make the main driver while loop # more understandable. Not the greatest model. def write_report_(bg_, current_state_, was_successful_): return DU.write_main_workflow_report(job_id, job_resources, workflow_opts, task_opts, bg_, current_state_, was_successful_, _to_run_time()) def write_task_summary_report(bg_): task_summary_report = DU.to_task_summary_report(bg_) p = os.path.join(job_resources.html, 'task_summary.html') R.write_report_to_html(task_summary_report, p) def services_log_update_progress(source_id_, level_, message_): if service_uri_or_none is not None: total_log_uri = "{u}/log".format(u=service_uri_or_none) WS.log_pbsmrtpipe_progress(total_log_uri, message_, level_, source_id_, ignore_errors=True) def services_add_datastore_file(datastore_file_): if service_uri_or_none is not None: total_ds_uri = "{u}/datastore".format(u=service_uri_or_none) WS.add_datastore_file(total_ds_uri, datastore_file_, ignore_errors=True) def _update_analysis_reports_and_datastore(tnode_, task_): for file_type_, path_ in zip(tnode_.meta_task.output_types, task_.output_files): source_id = "{t}-{f}".format(t=task_.task_id, f=file_type_.file_type_id) ds_uuid = _get_dataset_uuid_or_create_uuid(path_) ds_file_ = DataStoreFile(ds_uuid, source_id, file_type_.file_type_id, path_) ds.add(ds_file_) ds.write_update_json(job_resources.datastore_json) # Update Services services_add_datastore_file(ds_file_) dsr = DU.datastore_to_report(ds) R.write_report_to_html(dsr, os.path.join(job_resources.html, 'datastore.html')) if file_type_ == FileTypes.REPORT: T.write_task_report(job_resources, task_.task_id, path_, DU._get_images_in_dir(task_.output_dir)) update_analysis_file_links(tnode_.idx, path_) def _log_task_failure_and_call_services(path_to_stderr, task_id_): """log the error messages extracted from stderr""" lines = _get_last_lines_of_stderr(20, path_to_stderr) for line_ in lines: log.error(line_.strip()) # these already have newlines sys.stderr.write(line_) sys.stderr.write("\n") services_log_update_progress("pbsmrtpipe::{i}".format(i=task_id_), WS.LogLevels.ERROR, "\n".join(lines)) def has_available_slots(n): if max_total_nproc is None: return True return total_nproc + n <= max_total_nproc # Misc setup write_report_(bg, TaskStates.CREATED, False) # write empty analysis reports write_analysis_report(analysis_file_links) BU.write_binding_graph_images(bg, job_resources.workflow) # For book-keeping # task id -> tnode tid_to_tnode = {} # tnode -> Task instance tnode_to_task = {} # local loop for adjusting sleep time, this will get reset after each new # task is created niterations = 0 dt_ramp = 0.25 # number of iterations before switching to steady state sleep stead_state_n = 50 # sleep for 5 sec dt_stead_state = 4 try: log.debug("Starting execution loop... in process {p}".format(p=os.getpid())) # After the initial startup, bump up the time to reduce resource usage # (since multiple instances will be launched from the services) niterations += 1 if niterations < stead_state_n: sleep_time = dt_ramp else: sleep_time = dt_stead_state while True: # Convert Task -> ScatterAble task (emits a Chunk.json file) B.apply_scatterable(bg, global_registry.chunk_operators, global_registry.tasks) # This will add new TaskBinding nodes to the graph if necessary B.apply_chunk_operator(bg, global_registry.chunk_operators, global_registry.tasks, max_nchunks) # B.write_binding_graph_images(bg, job_resources.workflow) # If a TaskScatteredBindingNode is completed successfully and # output chunk.json is resolved, read in the file and # generate the new chunked tasks. This mutates the graph # significantly. B.add_gather_to_completed_task_chunks(bg, global_registry.chunk_operators, global_registry.tasks, job_resources.tasks) if not _are_workers_alive(workers): for tix_, w_ in workers.iteritems(): if not w_.is_alive(): log.warn("Worker {i} (pid {p}) is not alive for task {x}. Worker exit code {e}.".format(i=w_.name, p=w_.pid, e=w_.exitcode, x=tix_)) #w_.terminate() # Check if Any tasks are running or that there still runnable tasks is_completed = bg.is_workflow_complete() has_task_running = B.has_running_task(bg) if not has_task_running: if not is_completed: if not B.has_task_in_states(bg, TaskStates.RUNNABLE_STATES()): if not B.has_next_runnable_task(bg): msg = "Unable to find runnable task or any tasks running and workflow is NOT completed." log.error(msg) log.error(BU.to_binding_graph_summary(bg)) services_log_update_progress("pbsmrtpipe", WS.LogLevels.ERROR, msg) raise PipelineRuntimeError(msg) time.sleep(sleep_time) # log.debug("Sleeping for {s}".format(s=sleep_time)) log.debug("\n" + BU.to_binding_graph_summary(bg)) # BU.to_binding_graph_task_summary(bg) # This should only be triggered after events. The main reason # to keep updating it was the html report is up to date with the # runtime write_report_(bg, TaskStates.RUNNING, is_completed) if is_completed: msg_ = "Workflow is completed. breaking out." log.info(msg_) services_log_update_progress("pbsmrtpipe", WS.LogLevels.INFO, msg_) break try: result = q_out.get_nowait() except Queue.Empty: result = None # log.info("Results {r}".format(r=result)) if isinstance(result, TaskResult): niterations = 0 log.debug("Task result {r}".format(r=result)) tid_, state_, msg_, run_time_ = result tnode_ = tid_to_tnode[tid_] task_ = tnode_to_task[tnode_] # Process Successful Task Result if state_ == TaskStates.SUCCESSFUL: msg_ = "Task was successful {r}".format(r=result) slog.info(msg_) # this will raise if a task output is failed to be resolved B.validate_outputs_and_update_task_to_success(bg, tnode_, run_time_, bg.node[tnode_]['task'].output_files) slog.info("Successfully validated outputs of {t}".format(t=repr(tnode_))) services_log_update_progress("pbsmrtpipe::{i}".format(i=tid_), WS.LogLevels.INFO, msg_) B.update_task_output_file_nodes(bg, tnode_, tnode_to_task[tnode_]) B.resolve_successor_binding_file_path(bg) total_nproc -= task_.nproc w_ = workers.pop(tid_) _terminate_worker(w_) # Update Analysis Reports and Register output files to Datastore _update_analysis_reports_and_datastore(tnode_, task_) BU.write_binding_graph_images(bg, job_resources.workflow) else: # Process Non-Successful Task Result B.update_task_state(bg, tnode_, state_) msg_ = "Task was not successful {r}".format(r=result) slog.error(msg_) log.error(msg_ + "\n") sys.stderr.write(msg_ + "\n") stderr_ = os.path.join(task_.output_dir, "stderr") _log_task_failure_and_call_services(stderr_, tid_) # let the remaining running jobs continue w_ = workers.pop(tid_) _terminate_worker(w_) total_nproc -= task_.nproc has_failed = True BU.write_binding_graph_images(bg, job_resources.workflow) _update_msg = _status(bg) log.info(_update_msg) slog.info(_update_msg) s_ = TaskStates.FAILED if has_failed else TaskStates.RUNNING write_report_(bg, s_, False) write_task_summary_report(bg) elif isinstance(result, types.NoneType): pass else: log.error("Unexpected queue result type {t} {r}".format(t=type(result), r=result)) if has_failed: log.error("job has failed. breaking out.") # Just kill everything break # Computational resources are tapped if len(workers) >= max_nworkers: # don't do anything continue tnode = B.get_next_runnable_task(bg) if tnode is None: continue elif isinstance(tnode, TaskBindingNode): niterations = 0 # Found a Runnable Task # base task_id-instance_id tid = '-'.join([tnode.meta_task.task_id, str(tnode.instance_id)]) task_dir = os.path.join(job_resources.tasks, tid) if not os.path.exists(task_dir): os.mkdir(task_dir) to_resources_func = B.to_resolve_di_resources(task_dir, root_tmp_dir=workflow_opts.tmp_dir) input_files = B.get_task_input_files(bg, tnode) # convert metatask -> task try: task = GX.meta_task_to_task(tnode.meta_task, input_files, task_opts, task_dir, max_nproc, max_nchunks, to_resources_func, to_resolve_files_func) except Exception as e: slog.error("Failed to convert metatask {i} to task. {m}".format(i=tnode.meta_task.task_id, m=e.message)) raise # log.debug(task) bg.node[tnode]['nproc'] = task.nproc if not has_available_slots(task.nproc): # not enough slots to run in continue bg.node[tnode]['task'] = task tnode_to_task[tnode] = task if isinstance(tnode.meta_task, (ToolContractMetaTask, ScatterToolContractMetaTask, GatherToolContractMetaTask)): # the task.options have actually already been resolved here, but using this other # code path for clarity if isinstance(tnode.meta_task, ToolContractMetaTask): rtc = IO.static_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, '/tmp', max_nproc) elif isinstance(tnode.meta_task, ScatterToolContractMetaTask): rtc = IO.static_scatter_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, '/tmp', max_nproc, max_nchunks, tnode.meta_task.chunk_keys) elif isinstance(tnode.meta_task, GatherToolContractMetaTask): # this should always be a TaskGatherBindingNode which will have a .chunk_key rtc = IO.static_gather_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, '/tmp', max_nproc, tnode.chunk_key) else: raise TypeError("Unsupported task type {t}".format(t=tnode.meta_task)) # write driver manifest, which calls the resolved-tool-contract.json # there's too many layers of indirection here. Partly due to the pre-tool-contract era # python defined tasks. # Always write the RTC json for debugging purposes tc_path = os.path.join(task_dir, GlobalConstants.TOOL_CONTRACT_JSON) write_tool_contract(tnode.meta_task.tool_contract, tc_path) rtc_json_path = os.path.join(task_dir, GlobalConstants.RESOLVED_TOOL_CONTRACT_JSON) rtc_avro_path = os.path.join(task_dir, GlobalConstants.RESOLVED_TOOL_CONTRACT_AVRO) if rtc.driver.serialization == 'avro': # hack to fix command task.cmds[0] = task.cmds[0].replace('.json', '.avro') write_resolved_tool_contract_avro(rtc, rtc_avro_path) # for debugging write_resolved_tool_contract(rtc, rtc_json_path) runnable_task_path = os.path.join(task_dir, GlobalConstants.RUNNABLE_TASK_JSON) runnable_task = RunnableTask(task, global_registry.cluster_renderer) runnable_task.write_json(runnable_task_path) # Create an instance of Worker w = _to_worker(tnode.meta_task.is_distributed, "worker-task-{i}".format(i=tid), tid, runnable_task_path) workers[tid] = w w.start() total_nproc += task.nproc slog.info("Starting worker {i} ({n} workers running, {m} total proc in use)".format(i=tid, n=len(workers), m=total_nproc)) # Submit job to be run. B.update_task_state(bg, tnode, TaskStates.SUBMITTED) msg_ = "Updating task {t} to SUBMITTED".format(t=tid) log.debug(msg_) tid_to_tnode[tid] = tnode services_log_update_progress("pbsmrtpipe::{i}".format(i=tnode.idx), WS.LogLevels.INFO, msg_) BU.write_binding_graph_images(bg, job_resources.workflow) elif isinstance(tnode, EntryOutBindingFileNode): # Handle EntryPoint types. This is not a particularly elegant design :( bg.node[tnode]['nproc'] = 1 log.info("Marking task as completed {t}".format(t=tnode)) B.update_task_state_to_success(bg, tnode, 0.0) # Update output paths mock_file_index = 0 for fnode in bg.successors(tnode): file_path = "/path/to/mock-file-{i}.txt".format(i=mock_file_index) B.update_file_state_to_resolved(bg, fnode, file_path) mock_file_index += 1 else: raise TypeError("Unsupported node type {t} of '{x}'".format(t=type(tnode), x=tnode)) # Update state of any files B.resolve_successor_binding_file_path(bg) # Final call to write empty analysis reports write_analysis_report(analysis_file_links) # end of while loop _terminate_all_workers(workers.values(), shutdown_event) if has_failed: log.debug("\n" + BU.to_binding_graph_summary(bg)) was_successful = B.was_workflow_successful(bg) s_ = TaskStates.SUCCESSFUL if was_successful else TaskStates.FAILED write_report_(bg, s_, was_successful) except PipelineRuntimeKeyboardInterrupt: write_report_(bg, TaskStates.KILLED, False) write_task_summary_report(bg) BU.write_binding_graph_images(bg, job_resources.workflow) was_successful = False except Exception as e: slog.error("Unexpected error {e}. Writing reports and shutting down".format(e=e.message)) # update workflow reports to failed write_report_(bg, TaskStates.FAILED, False) write_task_summary_report(bg) services_log_update_progress("pbsmrtpipe", WS.LogLevels.ERROR, "Error {e}".format(e=e)) BU.write_binding_graph_images(bg, job_resources.workflow) raise finally: write_task_summary_report(bg) BU.write_binding_graph_images(bg, job_resources.workflow) return True if was_successful else False