Esempio n. 1
0
def _resolve_dynamic_workflow_refs(workflow_refs: "List[WorkflowRef]"):
    """Resolve each workflow reference to the output of its step at runtime.

    For every reference the output is looked up through successive fallbacks:

    1. Ask the workflow manager for a cached output reference of the step and
       resolve it into the physical output object.
    2. If there is no cached reference, or resolving it fails, load the step
       output from the workflow storage.
    3. If loading from storage fails as well, resume the step and resolve the
       persisted output of the resumed step.

    Args:
        workflow_refs: The workflow references to resolve. Each reference
            provides the ID of the step it points to as ``step_id``.

    Returns:
        A list holding one resolved output per reference, in input order.
    """
    manager = get_or_create_management_actor()
    step_context = workflow_context.get_workflow_step_context()
    workflow_id = step_context.workflow_id
    storage_url = step_context.storage_url

    resolved_outputs = []
    for ref in workflow_refs:
        cached_ref = ray.get(
            manager.get_cached_step_output.remote(workflow_id, ref.step_id)
        )
        if cached_ref is not None:
            try:
                value, _ = _resolve_object_ref(cached_ref)
            except Exception:
                # Best effort only: an unresolvable cached reference is not
                # fatal, the storage lookup below takes over.
                pass
            else:
                resolved_outputs.append(value)
                continue

        # Obtained lazily, only once the cached path did not produce a value.
        storage = workflow_storage.get_workflow_storage()
        try:
            value = storage.load_step_output(ref.step_id)
        except Exception:
            running_step_id = workflow_context.get_current_step_id()
            logger.warning(
                "Failed to get the output of step "
                f"{ref.step_id}. Trying to resume it. "
                f"Current step: '{running_step_id}'"
            )
            # Last resort: re-run the step and use its persisted output.
            resumed = recovery.resume_workflow_step(
                workflow_id, ref.step_id, storage_url, None
            )
            value, _ = _resolve_object_ref(resumed.persisted_output)
        resolved_outputs.append(value)
    return resolved_outputs
Esempio n. 2
0
    def run_or_resume(
            self,
            job_id: str,
            workflow_id: str,
            ignore_existing: bool = False) -> "WorkflowExecutionResult":
        """Start a workflow, or resume it from its entrypoint step.

        Args:
            job_id: The ID of the job that submits the workflow execution.
            workflow_id: The ID of the workflow.
            ignore_existing: Whether to proceed although an output is already
                recorded for this workflow ID. When false, an exception is
                raised in that situation.

        Returns:
            Workflow execution result that contains the state and output.

        Raises:
            RuntimeError: If an output is already recorded for
                ``workflow_id`` and ``ignore_existing`` is false.
        """
        already_recorded = workflow_id in self._workflow_outputs
        if already_recorded and not ignore_existing:
            raise RuntimeError(
                f"The output of workflow[id={workflow_id}] already exists.")

        store = workflow_storage.WorkflowStorage(workflow_id)
        # Record the start time before the workflow is resumed.
        store.save_workflow_prerun_metadata({"start_time": time.time()})
        entry_step_id = store.get_entrypoint_step_id()

        # Hand over the previously recorded output, if there is one.
        previous_output = None
        try:
            previous_output = self._workflow_outputs[workflow_id].output
        except KeyError:
            pass

        result = recovery.resume_workflow_step(
            job_id, workflow_id, entry_step_id, previous_output)

        newest = LatestWorkflowOutput(
            result.persisted_output, workflow_id, entry_step_id)
        self._workflow_outputs[workflow_id] = newest
        logger.info(f"run_or_resume: {workflow_id}, {entry_step_id},"
                    f"{result.persisted_output.ref}")
        self._step_output_cache[(workflow_id, entry_step_id)] = newest

        self._update_workflow_status(
            workflow_id, common.WorkflowStatus.RUNNING)

        # First time this workflow is seen: start tracking its step statuses.
        if workflow_id not in self._step_status:
            self._step_status[workflow_id] = {}
            logger.info(f"Workflow job [id={workflow_id}] started.")
        return result
Esempio n. 3
0
    def run_or_resume(
            self,
            workflow_id: str,
            ignore_existing: bool = False) -> "WorkflowExecutionResult":
        """Run or resume a workflow.

        Args:
            workflow_id: The ID of the workflow.
            ignore_existing: Whether to proceed although an output is already
                recorded for this workflow ID. When false, an exception is
                raised in that situation.

        Returns:
            Workflow execution result that contains the state and output.

        Raises:
            RuntimeError: If an output is already recorded for
                ``workflow_id`` and ``ignore_existing`` is false.
        """
        if workflow_id in self._workflow_outputs and not ignore_existing:
            raise RuntimeError(f"The output of workflow[id={workflow_id}] "
                               "already exists.")
        wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)
        step_id = wf_store.get_entrypoint_step_id()
        # Hand over the previously recorded output, if there is one.
        try:
            current_output = self._workflow_outputs[workflow_id].output
        except KeyError:
            current_output = None
        result = recovery.resume_workflow_step(workflow_id, step_id,
                                               self._store.storage_url,
                                               current_output)
        latest_output = LatestWorkflowOutput(result.persisted_output,
                                             workflow_id, step_id)
        self._workflow_outputs[workflow_id] = latest_output
        # Report through the module logger rather than stdout so the message
        # follows the logging configuration like the other messages here.
        logger.info(f"run_or_resume: {workflow_id}, {step_id}, "
                    f"{result.persisted_output}")
        self._step_output_cache[(workflow_id, step_id)] = latest_output

        # Persist the RUNNING status in the workflow's metadata.
        wf_store.save_workflow_meta(
            common.WorkflowMetaData(common.WorkflowStatus.RUNNING))

        # First time this workflow is seen: start tracking its step statuses.
        if workflow_id not in self._step_status:
            self._step_status[workflow_id] = {}
            logger.info(f"Workflow job [id={workflow_id}] started.")
        return result