Example 1
def resume_all(with_failed: bool) -> List[Tuple[str, ray.ObjectRef]]:
    filter_set = {WorkflowStatus.RESUMABLE}
    if with_failed:
        filter_set.add(WorkflowStatus.FAILED)
    all_failed = list_all(filter_set)

    try:
        workflow_manager = get_management_actor()
    except Exception as e:
        raise RuntimeError("Failed to get management actor") from e

    job_id = ray.get_runtime_context().job_id.hex()
    reconstructed_refs = []
    reconstructed_workflows = []
    for wid, _ in all_failed:
        context = workflow_context.WorkflowStepContext(workflow_id=wid)
        reconstructed_refs.append(
            (context,
             workflow_manager.reconstruct_workflow.remote(job_id, context)))
    for context, ref in reconstructed_refs:
        try:
            ray.get(ref)  # make sure the workflow is already reconstructed
            reconstructed_workflows.append((
                context.workflow_id,
                workflow_manager.execute_workflow.remote(job_id, context),
            ))
        except Exception:
            # TODO(suquark): Here some workflows get resumed successfully while
            #  others fail, and the user has no idea about this, which is very
            #  weird. Maybe we should raise an exception here instead?
            logger.error(f"Failed to resume workflow {context.workflow_id}")

    return reconstructed_workflows
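Each entry returned above pairs a workflow ID with an ObjectRef for the resumed run. A hypothetical caller-side sketch (the call and the names below are illustrative, not part of the snippet) that blocks on every resumed workflow:

import ray

# Resume everything, including FAILED workflows, then wait for each output.
resumed = resume_all(with_failed=True)
outputs = {workflow_id: ray.get(ref) for workflow_id, ref in resumed}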
Example 2
def resume(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    logger.info(f'Resuming workflow [id="{workflow_id}"].')
    workflow_manager = get_or_create_management_actor()
    if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
        raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures the caller of 'run()' holds a reference to the workflow
    # result. Otherwise, if the actor drops its reference to the
    # workflow output, the caller may fail to resolve the result.
    job_id = ray.get_runtime_context().job_id.hex()

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    ray.get(workflow_manager.reconstruct_workflow.remote(job_id, context))
    result = workflow_manager.execute_workflow.remote(job_id, context)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return result
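A minimal caller-side sketch, assuming this internal resume() is reachable and a workflow with ID "trip1" has been checkpointed; the returned ObjectRef resolves to the workflow's output:

import ray

output_ref = resume("trip1")       # does not block
trip_result = ray.get(output_ref)  # blocks until the workflow finishes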
Example 3
def run(
    dag: DAGNode,
    dag_inputs: DAGInputData,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, dag_inputs, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = get_or_create_management_actor()
        if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
            raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
        if wf_exists:
            return resume(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing))
        return workflow_manager.execute_workflow.remote(job_id, context)
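The wf_exists branch above is what makes re-submitting an existing workflow ID behave like a resume. A hypothetical caller-side sketch (my_dag is a placeholder DAGNode, not defined in the snippet):

import ray

# The first call checkpoints and starts the workflow; a later call with the
# same workflow_id (once the first run has stopped) falls through to resume().
ref = run(my_dag, DAGInputData(), workflow_id="etl_job")
result = ray.get(ref)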
Example 4
def resume_async(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously.

    Resume a workflow and retrieve its output. If the workflow was incomplete,
    it will be re-executed from its checkpointed outputs. If the workflow was
    already complete, its result is returned immediately.

    Examples:
        >>> from ray import workflow
        >>> start_trip = ... # doctest: +SKIP
        >>> trip = start_trip.step() # doctest: +SKIP
        >>> res1 = trip.run_async(workflow_id="trip1") # doctest: +SKIP
        >>> res2 = workflow.resume("trip1") # doctest: +SKIP
        >>> assert ray.get(res1) == ray.get(res2) # doctest: +SKIP

    Args:
        workflow_id: The id of the workflow to resume.

    Returns:
        An object reference that can be used to retrieve the workflow result.
    """
    _ensure_workflow_initialized()
    logger.info(f'Resuming workflow [id="{workflow_id}"].')
    workflow_manager = workflow_access.get_management_actor()
    if ray.get(
            workflow_manager.is_workflow_non_terminating.remote(workflow_id)):
        raise RuntimeError(
            f"Workflow '{workflow_id}' is already running or pending.")
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures the caller of 'run()' holds a reference to the workflow
    # result. Otherwise, if the actor drops its reference to the
    # workflow output, the caller may fail to resolve the result.
    job_id = ray.get_runtime_context().job_id.hex()

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    ray.get(workflow_manager.reconstruct_workflow.remote(job_id, context))
    result = workflow_manager.execute_workflow.remote(job_id, context)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return result
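Because resume_async only hands back an ObjectRef, the caller chooses when to block. A short sketch of the blocking pattern the docstring's example relies on, assuming the public ray.workflow module exposes this function:

import ray
from ray import workflow

ref = workflow.resume_async("trip1")
trip = ray.get(ref)  # blocks until the resumed workflow completes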
Example 5
def resume_all(
        include_failed: bool = False) -> List[Tuple[str, ray.ObjectRef]]:
    """Resume all resumable workflow jobs.

    This can be used after cluster restart to resume all tasks.

    Args:
        include_failed: Whether to resume FAILED workflows.

    Examples:
        >>> from ray import workflow
        >>> failed_job = ... # doctest: +SKIP
        >>> workflow_step = failed_job.step() # doctest: +SKIP
        >>> output = workflow_step.run_async(workflow_id="failed_job") # doctest: +SKIP
        >>> try: # doctest: +SKIP
        ...     ray.get(output) # doctest: +SKIP
        ... except Exception: # doctest: +SKIP
        ...     print("JobFailed") # doctest: +SKIP
        >>> jobs = workflow.list_all() # doctest: +SKIP
        >>> assert jobs == [("failed_job", workflow.FAILED)] # doctest: +SKIP
        >>> resumed = dict(workflow.resume_all( # doctest: +SKIP
        ...     include_failed=True)) # doctest: +SKIP
        >>> assert resumed.get("failed_job") is not None # doctest: +SKIP

    Returns:
        A list of (workflow_id, returned_obj_ref) resumed.
    """
    _ensure_workflow_initialized()
    filter_set = {WorkflowStatus.RESUMABLE}
    if include_failed:
        filter_set.add(WorkflowStatus.FAILED)
    all_failed = list_all(filter_set)

    try:
        workflow_manager = workflow_access.get_management_actor()
    except Exception as e:
        raise RuntimeError("Failed to get management actor") from e

    job_id = ray.get_runtime_context().job_id.hex()
    reconstructed_workflows = []
    for wid, _ in all_failed:
        context = workflow_context.WorkflowStepContext(workflow_id=wid)
        # TODO(suquark): This is not very efficient, but it makes sure
        #  running workflows have higher priority when being reconstructed.
        try:
            ray.get(
                workflow_manager.reconstruct_workflow.remote(job_id, context))
        except Exception as e:
            # TODO(suquark): Here some workflows get resumed successfully while
            #  others fail, and the user has no idea about this, which is very
            #  weird. Maybe we should raise an exception here instead?
            logger.error(f"Failed to resume workflow {context.workflow_id}",
                         exc_info=e)
            raise
        reconstructed_workflows.append(context)

    results = []
    for context in reconstructed_workflows:
        results.append((
            context.workflow_id,
            workflow_manager.execute_workflow.remote(job_id, context),
        ))
    return results
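The return value is a list of (workflow_id, ObjectRef) pairs rather than a mapping, so a caller that wants results keyed by ID has to build the dict itself. A hedged sketch, assuming the public ray.workflow module exposes this function:

import ray
from ray import workflow

resumed = workflow.resume_all(include_failed=True)
# Key the outputs by workflow ID and block on each one.
outputs = {workflow_id: ray.get(ref) for workflow_id, ref in resumed}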
Example 6
def run_async(
    dag: DAGNode,
    *args,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> ray.ObjectRef:
    """Run a workflow asynchronously.

    If the workflow with the given id already exists, it will be resumed.

    Args:
        workflow_id: A unique identifier that can be used to resume the
            workflow. If not specified, a random id will be generated.
        metadata: The metadata to add to the workflow. It has to be
            JSON-serializable.

    Returns:
        An ObjectRef that can be used to retrieve the workflow result.
    """
    _ensure_workflow_initialized()
    if not isinstance(dag, DAGNode):
        raise TypeError("Input should be a DAG.")
    input_data = DAGInputData(*args, **kwargs)
    validate_user_metadata(metadata)
    metadata = metadata or {}

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, input_data, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = WorkflowStorage(workflow_id)
        ws.save_workflow_user_metadata(metadata)

        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = workflow_access.get_management_actor()
        if ray.get(
                workflow_manager.is_workflow_non_terminating.remote(
                    workflow_id)):
            raise RuntimeError(
                f"Workflow '{workflow_id}' is already running or pending.")
        if wf_exists:
            return resume_async(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing))
        return workflow_manager.execute_workflow.remote(job_id, context)
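A short end-to-end sketch of how run_async is typically reached from user code, assuming the standard @ray.remote task and .bind() DAG-building API; the task and workflow ID below are illustrative:

import ray
from ray import workflow

@ray.remote
def add(a: int, b: int) -> int:
    return a + b

# .bind() builds the DAGNode; run_async submits it and returns an ObjectRef.
ref = workflow.run_async(add.bind(1, 2), workflow_id="add_example",
                         metadata={"owner": "docs"})
assert ray.get(ref) == 3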