Ejemplo n.º 1
0
    def test_all_sane(self):
        """Test that all pipelines are well defined"""
        errors = []
        rtasks, rfiles_types, chunk_operators, pipelines = L.load_all()

        for pipeline_id, pipeline in pipelines.items():
            emsg = "Pipeline {p} is not valid.".format(p=pipeline_id)
            log.debug("Checking Sanity of registered Pipeline {i}".format(
                i=pipeline_id))
            log.info(pipeline_id)
            log.debug(pipeline)
            try:
                # Validate with Avro
                d = pipeline_template_to_dict(pipeline, rtasks)
                _ = validate_pipeline_template(d)
                name = pipeline_id + "_pipeline_template.avro"
                output_file = get_temp_file(suffix=name)
                log.info(
                    "{p} converted to avro successfully".format(p=pipeline_id))

                bg = BU.binding_strs_to_binding_graph(rtasks,
                                                      pipeline.all_bindings)
                BU.validate_binding_graph_integrity(bg)
                BU.validate_compatible_binding_file_types(bg)
                validate_entry_points(d)
                # pprint.pprint(d)

                # for debugging purposes
                output_json = output_file.replace(".avro", '.json')
                log.info("writing pipeline to {p}".format(p=output_json))
                with open(output_json, 'w') as j:
                    j.write(json.dumps(d, sort_keys=True, indent=4))

                log.info(
                    "writing pipeline template to {o}".format(o=output_file))

                # Test writing to avro if the pipeline is actually valid
                write_pipeline_template_to_avro(pipeline, rtasks, output_file)
                log.info("Pipeline {p} is valid.".format(p=pipeline_id))

                log.info("Loading avro {i} from {p}".format(i=pipeline_id,
                                                            p=output_file))
                pipeline_d = load_pipeline_template_from_avro(output_file)
                self.assertIsInstance(pipeline_d, dict)

            except Exception as e:
                m = emsg + " Error: " + e.message
                log.error(m)
                errors.append(m)
                log.error(emsg)
                log.error(e)

        msg = "\n".join(errors) if errors else ""
        self.assertEqual([], errors, msg)
Ejemplo n.º 2
0
def registry_runner(registry_, rtasks, output_dir, emit_xml=False):
    """

    :type registry_: PipelineRegistry
    :param rtasks:
    :type output_dir: str
    :type emit_xml: bool

    :rtype: int
    """
    # this will emit the PTs to an output dir

    import pbsmrtpipe.pb_io as IO
    # get around circular imports
    import pbsmrtpipe.graph.bgraph as B

    r = registry_

    log.info("Validating pipelines")
    # Validating pipeline integrity
    for i, p in r.pipelines.iteritems():
        log.debug("Validating pipeline {}".format(i))
        bg = B.binding_strs_to_binding_graph(rtasks, list(p.all_bindings))
        # or this will raise
        is_valid = B.validate_binding_graph_integrity(bg)
        log.info("Pipeline {} is valid? {}".format(i, is_valid))

    # Make the dir if it' doesn't exist
    output_pipeline_dir = os.path.abspath(os.path.expanduser(output_dir))

    log.info("Writing {x} Pipeline Templates to {o}".format(o=output_dir,
                                                            x=len(
                                                                r.pipelines)))
    print "Emitting pipelines to output dir {d}".format(d=output_pipeline_dir)

    IO.write_pipeline_templates_to_json(r.pipelines.values(), rtasks,
                                        output_pipeline_dir)

    for p in r.pipelines.values():
        if emit_xml:
            file_name = p.idx + "_pipeline.xml"
            path = os.path.join(output_dir, file_name)
            xml = IO.pipeline_to_xml(p)
            with open(path, 'w') as f:
                f.write(str(xml))
            log.info("writing pipeline {x}".format(x=path))

    _d = dict(n=len(r.pipelines),
              d=output_pipeline_dir,
              x=len(r.original_pipeline_ids),
              a=len(r.all_pipelines))
    log.info(
        "Successfully wrote {n} new pipelines (previously loaded {x} all pipelines {a} to {d}"
        .format(**_d))
    return 0
Ejemplo n.º 3
0
def registry_runner(registry_, rtasks, output_dir, emit_xml=False):
    """

    :type registry_: PipelineRegistry
    :param rtasks:
    :type output_dir: str
    :type emit_xml: bool

    :rtype: int
    """
    # this will emit the PTs to an output dir

    import pbsmrtpipe.pb_io as IO
    # get around circular imports
    import pbsmrtpipe.graph.bgraph as B

    r = registry_

    log.info("Validating pipelines")
    # Validating pipeline integrity
    for i, p in r.pipelines.iteritems():
        log.debug("Validating pipeline {}".format(i))
        bg = B.binding_strs_to_binding_graph(rtasks, list(p.all_bindings))
        # or this will raise
        is_valid = B.validate_binding_graph_integrity(bg)
        log.info("Pipeline {} is valid? {}".format(i, is_valid))

    # Make the dir if it' doesn't exist
    output_pipeline_dir = os.path.abspath(os.path.expanduser(output_dir))

    log.info("Writing {x} Pipeline Templates to {o}".format(o=output_dir, x=len(r.pipelines)))
    print "Emitting pipelines to output dir {d}".format(d=output_pipeline_dir)

    IO.write_pipeline_templates_to_json(r.pipelines.values(), rtasks, output_pipeline_dir)

    for p in r.pipelines.values():
        if emit_xml:
            file_name = p.idx + "_pipeline.xml"
            path = os.path.join(output_dir, file_name)
            xml = IO.pipeline_to_xml(p)
            with open(path, 'w') as f:
                f.write(str(xml))
            log.info("writing pipeline {x}".format(x=path))

    _d = dict(n=len(r.pipelines),
              d=output_pipeline_dir,
              x=len(r.original_pipeline_ids),
              a=len(r.all_pipelines))
    log.info("Successfully wrote {n} new pipelines (previously loaded {x} all pipelines {a} to {d}".format(**_d))
    return 0
Ejemplo n.º 4
0
def __exe_workflow(global_registry, ep_d, bg, task_opts, workflow_opts, output_dir,
                   workers, shutdown_event, service_uri_or_none):
    """
    Core runner of a workflow.

    :type bg: BindingsGraph
    :type workflow_opts: WorkflowLevelOptions
    :type output_dir: str
    :type service_uri_or_none: str | None

    :param workers: {taskid:Worker}
    :return:

    The function is doing way too much. This is really terrible.
    """
    job_id = random.randint(100000, 999999)
    started_at = time.time()

    m_ = "Distributed" if workflow_opts.distributed_mode is not None else "Local"

    slog.info("starting to execute {m} workflow with assigned job_id {i}".format(i=job_id, m=m_))
    log.info("exe'ing workflow Cluster renderer {c}".format(c=global_registry.cluster_renderer))
    slog.info("Service URI: {i} {t}".format(i=service_uri_or_none, t=type(service_uri_or_none)))
    slog.info("Max number of Chunks  {n} ".format(n=workflow_opts.max_nchunks))
    slog.info("Max number of nproc   {n}".format(n=workflow_opts.max_nproc))
    slog.info("Max number of workers {n}".format(n=workflow_opts.max_nworkers))

    # Setup logger, job directory and initialize DS
    slog.info("creating job resources in {o}".format(o=output_dir))
    job_resources, ds = DU.job_resource_create_and_setup_logs(output_dir, bg, task_opts, workflow_opts, ep_d)
    slog.info("successfully created job resources.")

    # Some Pre-flight checks
    # Help initialize graph/epoints
    B.resolve_entry_points(bg, ep_d)
    # Update Task-esque EntryBindingPoint
    B.resolve_entry_binding_points(bg)

    # initialization File node attributes
    for eid, path in ep_d.iteritems():
        B.resolve_entry_point(bg, eid, path)
        B.resolve_successor_binding_file_path(bg)

    # Mark Chunkable tasks
    B.label_chunkable_tasks(bg, global_registry.chunk_operators)

    slog.info("validating binding graph")
    # Check the degree of the nodes
    B.validate_binding_graph_integrity(bg)
    slog.info("successfully validated binding graph.")

    # Add scattered
    # This will add new nodes to the graph if necessary
    B.apply_chunk_operator(bg, global_registry.chunk_operators, global_registry.tasks, workflow_opts.max_nchunks)

    # log.info(BU.to_binding_graph_summary(bg))

    # "global" file type id counter {str: int} that will be
    # used to generate ids
    file_type_id_to_count = {file_type.file_type_id: 0 for _, file_type in global_registry.file_types.iteritems()}

    # Create a closure to allow file types to assign unique id
    # this returns a func
    to_resolve_files_func = B.to_resolve_files(file_type_id_to_count)

    # Local vars
    max_total_nproc = workflow_opts.total_max_nproc
    max_nworkers = workflow_opts.max_nworkers
    max_nproc = workflow_opts.max_nproc
    max_nchunks = workflow_opts.max_nchunks
    tmp_dir = workflow_opts.tmp_dir

    q_out = multiprocessing.Queue()

    worker_sleep_time = 1
    # To store all the reports that are displayed in the analysis.html
    # {id:task-id, report_path:path/to/report.json}
    analysis_file_links = []

    # Flag for pipeline execution failure
    has_failed = False
    # Time to sleep between each step the execution loop
    # after the first 1 minute of exe, update the sleep time to 2 sec
    sleep_time = 1
    # Running total of current number of slots/cpu's used
    total_nproc = 0

    # Define a bunch of util funcs to try to make the main driver while loop
    # more understandable. Not the greatest model.

    def _to_run_time():
        return time.time() - started_at

    def write_analysis_report(analysis_file_links_):
        analysis_report_html = os.path.join(job_resources.html, 'analysis.html')
        R.write_analysis_link_report(analysis_file_links_, analysis_report_html)

    def update_analysis_file_links(task_id_, report_path_):
        analysis_link = AnalysisLink(task_id_, report_path_)
        log.info("updating report analysis file links {a}".format(a=analysis_link))
        analysis_file_links.append(analysis_link)
        write_analysis_report(analysis_file_links)

    # factories for getting a Worker instance
    # utils for getting the running func and worker type
    def _to_worker(w_is_distributed, wid, task_id, manifest_path_):
        # the IO loading will forceful set this to None
        # if the cluster manager not defined or cluster_mode is False
        if global_registry.cluster_renderer is None or not w_is_distributed:
            r_func = T.run_task_manifest
        else:
            r_func = T.run_task_manifest_on_cluster
        return TaskManifestWorker(q_out, shutdown_event, worker_sleep_time,
                                  r_func, task_id, manifest_path_, name=wid)

    # Define a bunch of util funcs to try to make the main driver while loop
    # more understandable. Not the greatest model.
    def write_report_(bg_, current_state_, was_successful_):
        return DU.write_main_workflow_report(job_id, job_resources, workflow_opts,
                                             task_opts, bg_, current_state_, was_successful_, _to_run_time())

    def write_task_summary_report(bg_):
        task_summary_report = DU.to_task_summary_report(bg_)
        p = os.path.join(job_resources.html, 'task_summary.html')
        R.write_report_to_html(task_summary_report, p)

    def services_log_update_progress(source_id_, level_, message_):
        if service_uri_or_none is not None:
            total_log_uri = "{u}/log".format(u=service_uri_or_none)
            WS.log_pbsmrtpipe_progress(total_log_uri, message_, level_, source_id_, ignore_errors=True)

    def services_add_datastore_file(datastore_file_):
        if service_uri_or_none is not None:
            total_ds_uri = "{u}/datastore".format(u=service_uri_or_none)
            WS.add_datastore_file(total_ds_uri, datastore_file_, ignore_errors=True)

    def _update_analysis_reports_and_datastore(tnode_, task_):
        for file_type_, path_ in zip(tnode_.meta_task.output_types, task_.output_files):
            source_id = "{t}-{f}".format(t=task_.task_id, f=file_type_.file_type_id)
            ds_uuid = _get_dataset_uuid_or_create_uuid(path_)
            ds_file_ = DataStoreFile(ds_uuid, source_id, file_type_.file_type_id, path_)
            ds.add(ds_file_)
            ds.write_update_json(job_resources.datastore_json)

            # Update Services
            services_add_datastore_file(ds_file_)

            dsr = DU.datastore_to_report(ds)
            R.write_report_to_html(dsr, os.path.join(job_resources.html, 'datastore.html'))
            if file_type_ == FileTypes.REPORT:
                T.write_task_report(job_resources, task_.task_id, path_, DU._get_images_in_dir(task_.output_dir))
                update_analysis_file_links(tnode_.idx, path_)

    def _log_task_failure_and_call_services(path_to_stderr, task_id_):
        """log the error messages extracted from stderr"""
        lines = _get_last_lines_of_stderr(20, path_to_stderr)
        for line_ in lines:
            log.error(line_.strip())
            # these already have newlines
            sys.stderr.write(line_)
            sys.stderr.write("\n")
        services_log_update_progress("pbsmrtpipe::{i}".format(i=task_id_), WS.LogLevels.ERROR, "\n".join(lines))

    def has_available_slots(n):
        if max_total_nproc is None:
            return True
        return total_nproc + n <= max_total_nproc

    # Misc setup
    write_report_(bg, TaskStates.CREATED, False)
    # write empty analysis reports
    write_analysis_report(analysis_file_links)

    BU.write_binding_graph_images(bg, job_resources.workflow)

    # For book-keeping
    # task id -> tnode
    tid_to_tnode = {}
    # tnode -> Task instance
    tnode_to_task = {}

    # local loop for adjusting sleep time, this will get reset after each new
    # task is created
    niterations = 0
    dt_ramp = 0.25
    # number of iterations before switching to steady state sleep
    stead_state_n = 50
    # sleep for 5 sec
    dt_stead_state = 4
    try:
        log.debug("Starting execution loop... in process {p}".format(p=os.getpid()))

        # After the initial startup, bump up the time to reduce resource usage
        # (since multiple instances will be launched from the services)
        niterations += 1
        if niterations < stead_state_n:
            sleep_time = dt_ramp
        else:
            sleep_time = dt_stead_state

        while True:

            # Convert Task -> ScatterAble task (emits a Chunk.json file)
            B.apply_scatterable(bg, global_registry.chunk_operators, global_registry.tasks)

            # This will add new TaskBinding nodes to the graph if necessary
            B.apply_chunk_operator(bg, global_registry.chunk_operators, global_registry.tasks, max_nchunks)
            # B.write_binding_graph_images(bg, job_resources.workflow)
            # If a TaskScatteredBindingNode is completed successfully and
            # output chunk.json is resolved, read in the file and
            # generate the new chunked tasks. This mutates the graph
            # significantly.
            B.add_gather_to_completed_task_chunks(bg, global_registry.chunk_operators, global_registry.tasks, job_resources.tasks)

            if not _are_workers_alive(workers):
                for tix_, w_ in workers.iteritems():
                    if not w_.is_alive():
                        log.warn("Worker {i} (pid {p}) is not alive for task {x}. Worker exit code {e}.".format(i=w_.name, p=w_.pid, e=w_.exitcode, x=tix_))
                        #w_.terminate()

            # Check if Any tasks are running or that there still runnable tasks
            is_completed = bg.is_workflow_complete()
            has_task_running = B.has_running_task(bg)

            if not has_task_running:
                if not is_completed:
                    if not B.has_task_in_states(bg, TaskStates.RUNNABLE_STATES()):
                        if not B.has_next_runnable_task(bg):
                            msg = "Unable to find runnable task or any tasks running and workflow is NOT completed."
                            log.error(msg)
                            log.error(BU.to_binding_graph_summary(bg))
                            services_log_update_progress("pbsmrtpipe", WS.LogLevels.ERROR, msg)
                            raise PipelineRuntimeError(msg)

            time.sleep(sleep_time)

            # log.debug("Sleeping for {s}".format(s=sleep_time))
            log.debug("\n" + BU.to_binding_graph_summary(bg))
            # BU.to_binding_graph_task_summary(bg)

            # This should only be triggered after events. The main reason
            # to keep updating it was the html report is up to date with the
            # runtime
            write_report_(bg, TaskStates.RUNNING, is_completed)

            if is_completed:
                msg_ = "Workflow is completed. breaking out."
                log.info(msg_)
                services_log_update_progress("pbsmrtpipe", WS.LogLevels.INFO, msg_)
                break

            try:
                result = q_out.get_nowait()
            except Queue.Empty:
                result = None

            # log.info("Results {r}".format(r=result))
            if isinstance(result, TaskResult):
                niterations = 0
                log.debug("Task result {r}".format(r=result))

                tid_, state_, msg_, run_time_ = result
                tnode_ = tid_to_tnode[tid_]
                task_ = tnode_to_task[tnode_]

                # Process Successful Task Result
                if state_ == TaskStates.SUCCESSFUL:
                    msg_ = "Task was successful {r}".format(r=result)
                    slog.info(msg_)

                    # this will raise if a task output is failed to be resolved
                    B.validate_outputs_and_update_task_to_success(bg, tnode_, run_time_, bg.node[tnode_]['task'].output_files)
                    slog.info("Successfully validated outputs of {t}".format(t=repr(tnode_)))

                    services_log_update_progress("pbsmrtpipe::{i}".format(i=tid_), WS.LogLevels.INFO, msg_)
                    B.update_task_output_file_nodes(bg, tnode_, tnode_to_task[tnode_])
                    B.resolve_successor_binding_file_path(bg)

                    total_nproc -= task_.nproc
                    w_ = workers.pop(tid_)
                    _terminate_worker(w_)

                    # Update Analysis Reports and Register output files to Datastore
                    _update_analysis_reports_and_datastore(tnode_, task_)

                    BU.write_binding_graph_images(bg, job_resources.workflow)
                else:
                    # Process Non-Successful Task Result
                    B.update_task_state(bg, tnode_, state_)
                    slog.error(result.error_message)
                    log.error(result.error_message + "\n")
                    sys.stderr.write(result.error_message + "\n")

                    _log_task_failure_and_call_services(result.error_message, tid_)

                    # let the remaining running jobs continue
                    w_ = workers.pop(tid_)
                    _terminate_worker(w_)

                    total_nproc -= task_.nproc
                    has_failed = True

                    BU.write_binding_graph_images(bg, job_resources.workflow)

                _update_msg = _status(bg)
                log.info(_update_msg)
                slog.info(_update_msg)

                s_ = TaskStates.FAILED if has_failed else TaskStates.RUNNING

                write_report_(bg, s_, False)
                write_task_summary_report(bg)

            elif isinstance(result, types.NoneType):
                pass
            else:
                log.error("Unexpected queue result type {t} {r}".format(t=type(result), r=result))

            if has_failed:
                log.error("job has failed. breaking out.")
                # Just kill everything
                break

            # Computational resources are tapped
            if len(workers) >= max_nworkers:
                # don't do anything
                continue

            tnode = B.get_next_runnable_task(bg)

            if tnode is None:
                continue
            elif isinstance(tnode, TaskBindingNode):
                niterations = 0
                # Found a Runnable Task

                # base task_id-instance_id
                tid = '-'.join([tnode.meta_task.task_id, str(tnode.instance_id)])

                task_dir = os.path.join(job_resources.tasks, tid)
                if not os.path.exists(task_dir):
                    os.mkdir(task_dir)

                to_resources_func = B.to_resolve_di_resources(task_dir, root_tmp_dir=workflow_opts.tmp_dir)
                input_files = B.get_task_input_files(bg, tnode)

                # convert metatask -> task
                try:
                    task = GX.meta_task_to_task(tnode.meta_task, input_files, task_opts, task_dir, max_nproc, max_nchunks,
                                                to_resources_func, to_resolve_files_func)
                except Exception as e:
                    slog.error("Failed to convert metatask {i} to task. {m}".format(i=tnode.meta_task.task_id, m=e.message))
                    raise

                # log.debug(task)

                bg.node[tnode]['nproc'] = task.nproc

                if not has_available_slots(task.nproc):
                    # not enough slots to run in
                    continue

                bg.node[tnode]['task'] = task
                tnode_to_task[tnode] = task

                if isinstance(tnode.meta_task, (ToolContractMetaTask, ScatterToolContractMetaTask, GatherToolContractMetaTask)):
                    # the task.options have actually already been resolved here, but using this other
                    # code path for clarity
                    if isinstance(tnode.meta_task, ToolContractMetaTask):
                        rtc = IO.static_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, tmp_dir, max_nproc)
                    elif isinstance(tnode.meta_task, ScatterToolContractMetaTask):
                        rtc = IO.static_scatter_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, tmp_dir, max_nproc, max_nchunks, tnode.meta_task.chunk_keys)
                    elif isinstance(tnode.meta_task, GatherToolContractMetaTask):
                        # this should always be a TaskGatherBindingNode which will have a .chunk_key
                        rtc = IO.static_gather_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, tmp_dir, max_nproc, tnode.chunk_key)
                    else:
                        raise TypeError("Unsupported task type {t}".format(t=tnode.meta_task))

                    # write driver manifest, which calls the resolved-tool-contract.json
                    # there's too many layers of indirection here. Partly due to the pre-tool-contract era
                    # python defined tasks.
                    # Always write the RTC json for debugging purposes
                    tc_path = os.path.join(task_dir, GlobalConstants.TOOL_CONTRACT_JSON)
                    write_tool_contract(tnode.meta_task.tool_contract, tc_path)

                    rtc_json_path = os.path.join(task_dir, GlobalConstants.RESOLVED_TOOL_CONTRACT_JSON)
                    rtc_avro_path = os.path.join(task_dir, GlobalConstants.RESOLVED_TOOL_CONTRACT_AVRO)
                    if rtc.driver.serialization == 'avro':
                        # hack to fix command
                        task.cmds[0] = task.cmds[0].replace('.json', '.avro')
                        write_resolved_tool_contract_avro(rtc, rtc_avro_path)
                    # for debugging
                    write_resolved_tool_contract(rtc, rtc_json_path)

                runnable_task_path = os.path.join(task_dir, GlobalConstants.RUNNABLE_TASK_JSON)
                runnable_task = RunnableTask(task, global_registry.cluster_renderer)
                runnable_task.write_json(runnable_task_path)

                # Create an instance of Worker
                w = _to_worker(tnode.meta_task.is_distributed, "worker-task-{i}".format(i=tid), tid, runnable_task_path)

                workers[tid] = w
                w.start()
                total_nproc += task.nproc
                slog.info("Starting worker {i} ({n} workers running, {m} total proc in use)".format(i=tid, n=len(workers), m=total_nproc))

                # Submit job to be run.
                B.update_task_state(bg, tnode, TaskStates.SUBMITTED)
                msg_ = "Updating task {t} to SUBMITTED".format(t=tid)
                log.debug(msg_)
                tid_to_tnode[tid] = tnode
                services_log_update_progress("pbsmrtpipe::{i}".format(i=tnode.idx), WS.LogLevels.INFO, msg_)
                BU.write_binding_graph_images(bg, job_resources.workflow)

            elif isinstance(tnode, EntryOutBindingFileNode):
                # Handle EntryPoint types. This is not a particularly elegant design :(
                bg.node[tnode]['nproc'] = 1
                log.info("Marking task as completed {t}".format(t=tnode))
                B.update_task_state_to_success(bg, tnode, 0.0)
                # Update output paths
                mock_file_index = 0
                for fnode in bg.successors(tnode):
                    file_path = "/path/to/mock-file-{i}.txt".format(i=mock_file_index)
                    B.update_file_state_to_resolved(bg, fnode, file_path)
                    mock_file_index += 1
            else:
                raise TypeError("Unsupported node type {t} of '{x}'".format(t=type(tnode), x=tnode))

            # Update state of any files
            B.resolve_successor_binding_file_path(bg)
            # Final call to write empty analysis reports
            write_analysis_report(analysis_file_links)

        # end of while loop
        _terminate_all_workers(workers.values(), shutdown_event)

        if has_failed:
            log.debug("\n" + BU.to_binding_graph_summary(bg))

        was_successful = B.was_workflow_successful(bg)
        s_ = TaskStates.SUCCESSFUL if was_successful else TaskStates.FAILED
        write_report_(bg, s_, was_successful)

    except PipelineRuntimeKeyboardInterrupt:
        write_report_(bg, TaskStates.KILLED, False)
        write_task_summary_report(bg)
        BU.write_binding_graph_images(bg, job_resources.workflow)
        was_successful = False

    except Exception as e:
        slog.error("Unexpected error {e}. Writing reports and shutting down".format(e=e.message))
        # update workflow reports to failed
        write_report_(bg, TaskStates.FAILED, False)
        write_task_summary_report(bg)
        services_log_update_progress("pbsmrtpipe", WS.LogLevels.ERROR, "Error {e}".format(e=e))
        BU.write_binding_graph_images(bg, job_resources.workflow)
        raise

    finally:
        write_task_summary_report(bg)
        BU.write_binding_graph_images(bg, job_resources.workflow)

    return True if was_successful else False
Ejemplo n.º 5
0
def __exe_workflow(global_registry, ep_d, bg, task_opts, workflow_opts, output_dir,
                   workers, shutdown_event, service_uri_or_none):
    """
    Core runner of a workflow.

    :type bg: BindingsGraph
    :type workflow_opts: WorkflowLevelOptions
    :type output_dir: str
    :type service_uri_or_none: str | None

    :param workers: {taskid:Worker}
    :return:

    The function is doing way too much. This is really terrible.
    """
    job_id = random.randint(100000, 999999)
    started_at = time.time()

    m_ = "Distributed" if workflow_opts.distributed_mode is not None else "Local"

    slog.info("starting to execute {m} workflow with assigned job_id {i}".format(i=job_id, m=m_))
    log.info("exe'ing workflow Cluster renderer {c}".format(c=global_registry.cluster_renderer))
    slog.info("Service URI: {i} {t}".format(i=service_uri_or_none, t=type(service_uri_or_none)))
    slog.info("Max number of Chunks  {n} ".format(n=workflow_opts.max_nchunks))
    slog.info("Max number of nproc   {n}".format(n=workflow_opts.max_nproc))
    slog.info("Max number of workers {n}".format(n=workflow_opts.max_nworkers))

    # Setup logger, job directory and initialize DS
    slog.info("creating job resources in {o}".format(o=output_dir))
    job_resources, ds = DU.job_resource_create_and_setup_logs(output_dir, bg, task_opts, workflow_opts, ep_d)
    slog.info("successfully created job resources.")

    # Some Pre-flight checks
    # Help initialize graph/epoints
    B.resolve_entry_points(bg, ep_d)
    # Update Task-esque EntryBindingPoint
    B.resolve_entry_binding_points(bg)

    # initialization File node attributes
    for eid, path in ep_d.iteritems():
        B.resolve_entry_point(bg, eid, path)
        B.resolve_successor_binding_file_path(bg)

    # Mark Chunkable tasks
    B.label_chunkable_tasks(bg, global_registry.chunk_operators)

    slog.info("validating binding graph")
    # Check the degree of the nodes
    B.validate_binding_graph_integrity(bg)
    slog.info("successfully validated binding graph.")

    # Add scattered
    # This will add new nodes to the graph if necessary
    B.apply_chunk_operator(bg, global_registry.chunk_operators, global_registry.tasks, workflow_opts.max_nchunks)

    # log.info(BU.to_binding_graph_summary(bg))

    # "global" file type id counter {str: int} that will be
    # used to generate ids
    file_type_id_to_count = {file_type.file_type_id: 0 for _, file_type in global_registry.file_types.iteritems()}

    # Create a closure to allow file types to assign unique id
    # this returns a func
    to_resolve_files_func = B.to_resolve_files(file_type_id_to_count)

    # Local vars
    max_total_nproc = workflow_opts.total_max_nproc
    max_nworkers = workflow_opts.max_nworkers
    max_nproc = workflow_opts.max_nproc
    max_nchunks = workflow_opts.max_nchunks

    q_out = multiprocessing.Queue()

    worker_sleep_time = 1
    # To store all the reports that are displayed in the analysis.html
    # {id:task-id, report_path:path/to/report.json}
    analysis_file_links = []

    # Flag for pipeline execution failure
    has_failed = False
    # Time to sleep between each step the execution loop
    # after the first 1 minute of exe, update the sleep time to 2 sec
    sleep_time = 1
    # Running total of current number of slots/cpu's used
    total_nproc = 0

    # Define a bunch of util funcs to try to make the main driver while loop
    # more understandable. Not the greatest model.

    def _to_run_time():
        return time.time() - started_at

    def write_analysis_report(analysis_file_links_):
        analysis_report_html = os.path.join(job_resources.html, 'analysis.html')
        R.write_analysis_link_report(analysis_file_links_, analysis_report_html)

    def update_analysis_file_links(task_id_, report_path_):
        analysis_link = AnalysisLink(task_id_, report_path_)
        log.info("updating report analysis file links {a}".format(a=analysis_link))
        analysis_file_links.append(analysis_link)
        write_analysis_report(analysis_file_links)

    # factories for getting a Worker instance
    # utils for getting the running func and worker type
    def _to_worker(w_is_distributed, wid, task_id, manifest_path_):
        # the IO loading will forceful set this to None
        # if the cluster manager not defined or cluster_mode is False
        if global_registry.cluster_renderer is None or not w_is_distributed:
            r_func = T.run_task_manifest
        else:
            r_func = T.run_task_manifest_on_cluster
        return TaskManifestWorker(q_out, shutdown_event, worker_sleep_time,
                                  r_func, task_id, manifest_path_, name=wid)

    # Define a bunch of util funcs to try to make the main driver while loop
    # more understandable. Not the greatest model.
    def write_report_(bg_, current_state_, was_successful_):
        return DU.write_main_workflow_report(job_id, job_resources, workflow_opts,
                                             task_opts, bg_, current_state_, was_successful_, _to_run_time())

    def write_task_summary_report(bg_):
        task_summary_report = DU.to_task_summary_report(bg_)
        p = os.path.join(job_resources.html, 'task_summary.html')
        R.write_report_to_html(task_summary_report, p)

    def services_log_update_progress(source_id_, level_, message_):
        if service_uri_or_none is not None:
            total_log_uri = "{u}/log".format(u=service_uri_or_none)
            WS.log_pbsmrtpipe_progress(total_log_uri, message_, level_, source_id_, ignore_errors=True)

    def services_add_datastore_file(datastore_file_):
        if service_uri_or_none is not None:
            total_ds_uri = "{u}/datastore".format(u=service_uri_or_none)
            WS.add_datastore_file(total_ds_uri, datastore_file_, ignore_errors=True)

    def _update_analysis_reports_and_datastore(tnode_, task_):
        for file_type_, path_ in zip(tnode_.meta_task.output_types, task_.output_files):
            source_id = "{t}-{f}".format(t=task_.task_id, f=file_type_.file_type_id)
            ds_uuid = _get_dataset_uuid_or_create_uuid(path_)
            ds_file_ = DataStoreFile(ds_uuid, source_id, file_type_.file_type_id, path_)
            ds.add(ds_file_)
            ds.write_update_json(job_resources.datastore_json)

            # Update Services
            services_add_datastore_file(ds_file_)

            dsr = DU.datastore_to_report(ds)
            R.write_report_to_html(dsr, os.path.join(job_resources.html, 'datastore.html'))
            if file_type_ == FileTypes.REPORT:
                T.write_task_report(job_resources, task_.task_id, path_, DU._get_images_in_dir(task_.output_dir))
                update_analysis_file_links(tnode_.idx, path_)

    def _log_task_failure_and_call_services(path_to_stderr, task_id_):
        """log the error messages extracted from stderr"""
        lines = _get_last_lines_of_stderr(20, path_to_stderr)
        for line_ in lines:
            log.error(line_.strip())
            # these already have newlines
            sys.stderr.write(line_)
            sys.stderr.write("\n")
        services_log_update_progress("pbsmrtpipe::{i}".format(i=task_id_), WS.LogLevels.ERROR, "\n".join(lines))

    def has_available_slots(n):
        if max_total_nproc is None:
            return True
        return total_nproc + n <= max_total_nproc

    # Misc setup
    write_report_(bg, TaskStates.CREATED, False)
    # write empty analysis reports
    write_analysis_report(analysis_file_links)

    BU.write_binding_graph_images(bg, job_resources.workflow)

    # For book-keeping
    # task id -> tnode
    tid_to_tnode = {}
    # tnode -> Task instance
    tnode_to_task = {}

    # local loop for adjusting sleep time, this will get reset after each new
    # task is created
    niterations = 0
    dt_ramp = 0.25
    # number of iterations before switching to steady state sleep
    stead_state_n = 50
    # sleep for 5 sec
    dt_stead_state = 4
    try:
        log.debug("Starting execution loop... in process {p}".format(p=os.getpid()))

        # After the initial startup, bump up the time to reduce resource usage
        # (since multiple instances will be launched from the services)
        niterations += 1
        if niterations < stead_state_n:
            sleep_time = dt_ramp
        else:
            sleep_time = dt_stead_state

        while True:

            # Convert Task -> ScatterAble task (emits a Chunk.json file)
            B.apply_scatterable(bg, global_registry.chunk_operators, global_registry.tasks)

            # This will add new TaskBinding nodes to the graph if necessary
            B.apply_chunk_operator(bg, global_registry.chunk_operators, global_registry.tasks, max_nchunks)
            # B.write_binding_graph_images(bg, job_resources.workflow)
            # If a TaskScatteredBindingNode is completed successfully and
            # output chunk.json is resolved, read in the file and
            # generate the new chunked tasks. This mutates the graph
            # significantly.
            B.add_gather_to_completed_task_chunks(bg, global_registry.chunk_operators, global_registry.tasks, job_resources.tasks)

            if not _are_workers_alive(workers):
                for tix_, w_ in workers.iteritems():
                    if not w_.is_alive():
                        log.warn("Worker {i} (pid {p}) is not alive for task {x}. Worker exit code {e}.".format(i=w_.name, p=w_.pid, e=w_.exitcode, x=tix_))
                        #w_.terminate()

            # Check if Any tasks are running or that there still runnable tasks
            is_completed = bg.is_workflow_complete()
            has_task_running = B.has_running_task(bg)

            if not has_task_running:
                if not is_completed:
                    if not B.has_task_in_states(bg, TaskStates.RUNNABLE_STATES()):
                        if not B.has_next_runnable_task(bg):
                            msg = "Unable to find runnable task or any tasks running and workflow is NOT completed."
                            log.error(msg)
                            log.error(BU.to_binding_graph_summary(bg))
                            services_log_update_progress("pbsmrtpipe", WS.LogLevels.ERROR, msg)
                            raise PipelineRuntimeError(msg)

            time.sleep(sleep_time)

            # log.debug("Sleeping for {s}".format(s=sleep_time))
            log.debug("\n" + BU.to_binding_graph_summary(bg))
            # BU.to_binding_graph_task_summary(bg)

            # This should only be triggered after events. The main reason
            # to keep updating it was the html report is up to date with the
            # runtime
            write_report_(bg, TaskStates.RUNNING, is_completed)

            if is_completed:
                msg_ = "Workflow is completed. breaking out."
                log.info(msg_)
                services_log_update_progress("pbsmrtpipe", WS.LogLevels.INFO, msg_)
                break

            try:
                result = q_out.get_nowait()
            except Queue.Empty:
                result = None

            # log.info("Results {r}".format(r=result))
            if isinstance(result, TaskResult):
                niterations = 0
                log.debug("Task result {r}".format(r=result))

                tid_, state_, msg_, run_time_ = result
                tnode_ = tid_to_tnode[tid_]
                task_ = tnode_to_task[tnode_]

                # Process Successful Task Result
                if state_ == TaskStates.SUCCESSFUL:
                    msg_ = "Task was successful {r}".format(r=result)
                    slog.info(msg_)

                    # this will raise if a task output is failed to be resolved
                    B.validate_outputs_and_update_task_to_success(bg, tnode_, run_time_, bg.node[tnode_]['task'].output_files)
                    slog.info("Successfully validated outputs of {t}".format(t=repr(tnode_)))

                    services_log_update_progress("pbsmrtpipe::{i}".format(i=tid_), WS.LogLevels.INFO, msg_)
                    B.update_task_output_file_nodes(bg, tnode_, tnode_to_task[tnode_])
                    B.resolve_successor_binding_file_path(bg)

                    total_nproc -= task_.nproc
                    w_ = workers.pop(tid_)
                    _terminate_worker(w_)

                    # Update Analysis Reports and Register output files to Datastore
                    _update_analysis_reports_and_datastore(tnode_, task_)

                    BU.write_binding_graph_images(bg, job_resources.workflow)
                else:
                    # Process Non-Successful Task Result
                    B.update_task_state(bg, tnode_, state_)
                    msg_ = "Task was not successful {r}".format(r=result)
                    slog.error(msg_)
                    log.error(msg_ + "\n")
                    sys.stderr.write(msg_ + "\n")

                    stderr_ = os.path.join(task_.output_dir, "stderr")

                    _log_task_failure_and_call_services(stderr_, tid_)

                    # let the remaining running jobs continue
                    w_ = workers.pop(tid_)
                    _terminate_worker(w_)

                    total_nproc -= task_.nproc
                    has_failed = True

                    BU.write_binding_graph_images(bg, job_resources.workflow)

                _update_msg = _status(bg)
                log.info(_update_msg)
                slog.info(_update_msg)

                s_ = TaskStates.FAILED if has_failed else TaskStates.RUNNING

                write_report_(bg, s_, False)
                write_task_summary_report(bg)

            elif isinstance(result, types.NoneType):
                pass
            else:
                log.error("Unexpected queue result type {t} {r}".format(t=type(result), r=result))

            if has_failed:
                log.error("job has failed. breaking out.")
                # Just kill everything
                break

            # Computational resources are tapped
            if len(workers) >= max_nworkers:
                # don't do anything
                continue

            tnode = B.get_next_runnable_task(bg)

            if tnode is None:
                continue
            elif isinstance(tnode, TaskBindingNode):
                niterations = 0
                # Found a Runnable Task

                # base task_id-instance_id
                tid = '-'.join([tnode.meta_task.task_id, str(tnode.instance_id)])

                task_dir = os.path.join(job_resources.tasks, tid)
                if not os.path.exists(task_dir):
                    os.mkdir(task_dir)

                to_resources_func = B.to_resolve_di_resources(task_dir, root_tmp_dir=workflow_opts.tmp_dir)
                input_files = B.get_task_input_files(bg, tnode)

                # convert metatask -> task
                try:
                    task = GX.meta_task_to_task(tnode.meta_task, input_files, task_opts, task_dir, max_nproc, max_nchunks,
                                                to_resources_func, to_resolve_files_func)
                except Exception as e:
                    slog.error("Failed to convert metatask {i} to task. {m}".format(i=tnode.meta_task.task_id, m=e.message))
                    raise

                # log.debug(task)

                bg.node[tnode]['nproc'] = task.nproc

                if not has_available_slots(task.nproc):
                    # not enough slots to run in
                    continue

                bg.node[tnode]['task'] = task
                tnode_to_task[tnode] = task

                if isinstance(tnode.meta_task, (ToolContractMetaTask, ScatterToolContractMetaTask, GatherToolContractMetaTask)):
                    # the task.options have actually already been resolved here, but using this other
                    # code path for clarity
                    if isinstance(tnode.meta_task, ToolContractMetaTask):
                        rtc = IO.static_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, '/tmp', max_nproc)
                    elif isinstance(tnode.meta_task, ScatterToolContractMetaTask):
                        rtc = IO.static_scatter_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, '/tmp', max_nproc, max_nchunks, tnode.meta_task.chunk_keys)
                    elif isinstance(tnode.meta_task, GatherToolContractMetaTask):
                        # this should always be a TaskGatherBindingNode which will have a .chunk_key
                        rtc = IO.static_gather_meta_task_to_rtc(tnode.meta_task, task, task_opts, task_dir, '/tmp', max_nproc, tnode.chunk_key)
                    else:
                        raise TypeError("Unsupported task type {t}".format(t=tnode.meta_task))

                    # write driver manifest, which calls the resolved-tool-contract.json
                    # there's too many layers of indirection here. Partly due to the pre-tool-contract era
                    # python defined tasks.
                    # Always write the RTC json for debugging purposes
                    tc_path = os.path.join(task_dir, GlobalConstants.TOOL_CONTRACT_JSON)
                    write_tool_contract(tnode.meta_task.tool_contract, tc_path)

                    rtc_json_path = os.path.join(task_dir, GlobalConstants.RESOLVED_TOOL_CONTRACT_JSON)
                    rtc_avro_path = os.path.join(task_dir, GlobalConstants.RESOLVED_TOOL_CONTRACT_AVRO)
                    if rtc.driver.serialization == 'avro':
                        # hack to fix command
                        task.cmds[0] = task.cmds[0].replace('.json', '.avro')
                        write_resolved_tool_contract_avro(rtc, rtc_avro_path)
                    # for debugging
                    write_resolved_tool_contract(rtc, rtc_json_path)

                runnable_task_path = os.path.join(task_dir, GlobalConstants.RUNNABLE_TASK_JSON)
                runnable_task = RunnableTask(task, global_registry.cluster_renderer)
                runnable_task.write_json(runnable_task_path)

                # Create an instance of Worker
                w = _to_worker(tnode.meta_task.is_distributed, "worker-task-{i}".format(i=tid), tid, runnable_task_path)

                workers[tid] = w
                w.start()
                total_nproc += task.nproc
                slog.info("Starting worker {i} ({n} workers running, {m} total proc in use)".format(i=tid, n=len(workers), m=total_nproc))

                # Submit job to be run.
                B.update_task_state(bg, tnode, TaskStates.SUBMITTED)
                msg_ = "Updating task {t} to SUBMITTED".format(t=tid)
                log.debug(msg_)
                tid_to_tnode[tid] = tnode
                services_log_update_progress("pbsmrtpipe::{i}".format(i=tnode.idx), WS.LogLevels.INFO, msg_)
                BU.write_binding_graph_images(bg, job_resources.workflow)

            elif isinstance(tnode, EntryOutBindingFileNode):
                # Handle EntryPoint types. This is not a particularly elegant design :(
                bg.node[tnode]['nproc'] = 1
                log.info("Marking task as completed {t}".format(t=tnode))
                B.update_task_state_to_success(bg, tnode, 0.0)
                # Update output paths
                mock_file_index = 0
                for fnode in bg.successors(tnode):
                    file_path = "/path/to/mock-file-{i}.txt".format(i=mock_file_index)
                    B.update_file_state_to_resolved(bg, fnode, file_path)
                    mock_file_index += 1
            else:
                raise TypeError("Unsupported node type {t} of '{x}'".format(t=type(tnode), x=tnode))

            # Update state of any files
            B.resolve_successor_binding_file_path(bg)
            # Final call to write empty analysis reports
            write_analysis_report(analysis_file_links)

        # end of while loop
        _terminate_all_workers(workers.values(), shutdown_event)

        if has_failed:
            log.debug("\n" + BU.to_binding_graph_summary(bg))

        was_successful = B.was_workflow_successful(bg)
        s_ = TaskStates.SUCCESSFUL if was_successful else TaskStates.FAILED
        write_report_(bg, s_, was_successful)

    except PipelineRuntimeKeyboardInterrupt:
        write_report_(bg, TaskStates.KILLED, False)
        write_task_summary_report(bg)
        BU.write_binding_graph_images(bg, job_resources.workflow)
        was_successful = False

    except Exception as e:
        slog.error("Unexpected error {e}. Writing reports and shutting down".format(e=e.message))
        # update workflow reports to failed
        write_report_(bg, TaskStates.FAILED, False)
        write_task_summary_report(bg)
        services_log_update_progress("pbsmrtpipe", WS.LogLevels.ERROR, "Error {e}".format(e=e))
        BU.write_binding_graph_images(bg, job_resources.workflow)
        raise

    finally:
        write_task_summary_report(bg)
        BU.write_binding_graph_images(bg, job_resources.workflow)

    return True if was_successful else False
Ejemplo n.º 6
0
 def test_is_validate_binding_graph(self):
     self.assertTrue(B.validate_binding_graph_integrity(self.bgraph))
Ejemplo n.º 7
0
 def test_validate_bindings_graph(self):
     emsg = "Invalid workflow with id '{x}'".format(x=self.PB_PIPELINE_ID)
     self.assertTrue(B.validate_binding_graph_integrity(self.bgraph), emsg)