Example #1
def _process_stop_initiated_pipelines(
    mlmd_handle: metadata.Metadata, task_queue: tq.TaskQueue,
    pipeline_details: Sequence[_PipelineDetail]) -> None:
  """Processes stop initiated pipelines."""
  for detail in pipeline_details:
    pipeline = detail.pipeline_state.pipeline
    execution = detail.pipeline_state.execution
    has_active_executions = False
    for node in _get_all_pipeline_nodes(pipeline):
      # If the node has an ExecNodeTask in the task queue, issue a cancellation.
      # Otherwise, if the node has an active execution in MLMD but no
      # ExecNodeTask enqueued, it may be due to an orchestrator restart after
      # pipeline stop was initiated but before the schedulers could finish. So,
      # enqueue an ExecNodeTask with is_cancelled set to give the scheduler a
      # chance to finish gracefully.
      exec_node_task_id = task_lib.exec_node_task_id_from_pipeline_node(
          pipeline, node)
      if task_queue.contains_task_id(exec_node_task_id):
        task_queue.enqueue(
            task_lib.CancelNodeTask(
                node_uid=task_lib.NodeUid.from_pipeline_node(pipeline, node)))
        has_active_executions = True
      else:
        executions = task_gen_utils.get_executions(mlmd_handle, node)
        exec_node_task = task_gen_utils.generate_task_from_active_execution(
            mlmd_handle, pipeline, node, executions, is_cancelled=True)
        if exec_node_task:
          task_queue.enqueue(exec_node_task)
          has_active_executions = True
    if not has_active_executions:
      updated_execution = copy.deepcopy(execution)
      updated_execution.last_known_state = metadata_store_pb2.Execution.CANCELED
      mlmd_handle.store.put_executions([updated_execution])
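The stop-initiation logic above boils down to a check-then-fallback pattern: cancel through the queue when the node's ExecNodeTask is tracked, otherwise consult MLMD. A minimal, self-contained sketch of the queue interface this assumes (`FakeTask` and `FakeTaskQueue` are hypothetical stand-ins, not the real `tq.TaskQueue`):

# Hypothetical stand-in for the TaskQueue interface used above; illustrative
# only, not the actual tq.TaskQueue implementation.
from dataclasses import dataclass
from typing import Dict


@dataclass(frozen=True)
class FakeTask:
  task_id: str


class FakeTaskQueue:
  """In-memory queue keyed by task id, mirroring contains_task_id/enqueue."""

  def __init__(self):
    self._tasks: Dict[str, FakeTask] = {}

  def contains_task_id(self, task_id: str) -> bool:
    return task_id in self._tasks

  def enqueue(self, task: FakeTask) -> None:
    self._tasks[task.task_id] = task


q = FakeTaskQueue()
q.enqueue(FakeTask('exec:node_a'))
assert q.contains_task_id('exec:node_a')      # Tracked -> issue CancelNodeTask.
assert not q.contains_task_id('exec:node_b')  # Untracked -> fall back to MLMD.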
Example #2
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for all executable nodes in the async pipeline.

        The returned tasks must have `exec_task` populated. List may be empty if no
        nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        result = []
        filtered_nodes = [
            n.pipeline_node for n in self._pipeline.nodes
            if n.pipeline_node.node_info.id not in self._ignore_node_ids
        ]
        for node in filtered_nodes:
            # If a task for the node is already tracked by the task queue, it need
            # not be considered for generation again.
            if self._is_task_id_tracked_fn(
                    task_lib.exec_node_task_id_from_pipeline_node(
                        self._pipeline, node)):
                continue
            task = self._generate_task(self._mlmd_handle, node)
            if task:
                result.append(task)
        return result
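In isolation, the filtering rule in `generate` is: skip any node whose ExecNodeTask id is already tracked by the task queue. A tiny runnable sketch of that rule and of how a driver loop might consume the result (all names hypothetical, not TFX code):

# Hypothetical sketch of the generate-and-filter rule above.
from typing import Callable, List


class FakeAsyncTaskGen:
    """Yields ids of nodes whose tasks are not already tracked."""

    def __init__(self, node_ids: List[str],
                 is_tracked: Callable[[str], bool]):
        self._node_ids = node_ids
        self._is_tracked = is_tracked

    def generate(self) -> List[str]:
        # Same rule as above: a tracked node needs no new task.
        return [n for n in self._node_ids if not self._is_tracked(n)]


tracked = {'trainer'}
gen = FakeAsyncTaskGen(['example_gen', 'trainer'], tracked.__contains__)
assert gen.generate() == ['example_gen']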
Example #3
    def __call__(self) -> List[task_lib.Task]:
        result = []
        for node in [n.pipeline_node for n in self._pipeline.nodes]:
            node_uid = task_lib.NodeUid.from_pipeline_node(
                self._pipeline, node)
            node_id = node.node_info.id

            with self._pipeline_state:
                node_state = self._pipeline_state.get_node_state(node_uid)
                if node_state.state in (pstate.NodeState.STOPPING,
                                        pstate.NodeState.STOPPED):
                    logging.info(
                        'Ignoring node in state \'%s\' for task generation: %s',
                        node_state.state, node_uid)
                    continue

            # If this is a pure service node, there is no ExecNodeTask to generate
            # but we ensure node services and check service status.
            service_status = self._ensure_node_services_if_pure(node_id)
            if service_status is not None:
                if service_status != service_jobs.ServiceStatus.RUNNING:
                    error_msg = f'associated service job failed; node uid: {node_uid}'
                    result.append(
                        task_lib.UpdateNodeStateTask(
                            node_uid=node_uid,
                            state=pstate.NodeState.FAILED,
                            status=status_lib.Status(
                                code=status_lib.Code.ABORTED,
                                message=error_msg)))
                else:
                    result.append(
                        task_lib.UpdateNodeStateTask(
                            node_uid=node_uid, state=pstate.NodeState.RUNNING))
                continue

            # If a task for the node is already tracked by the task queue, it need
            # not be considered for generation again but we ensure node services
            # in case of a mixed service node.
            if self._is_task_id_tracked_fn(
                    task_lib.exec_node_task_id_from_pipeline_node(
                        self._pipeline, node)):
                service_status = self._ensure_node_services_if_mixed(node_id)
                if service_status is not None:
                    if service_status != service_jobs.ServiceStatus.RUNNING:
                        error_msg = f'associated service job failed; node uid: {node_uid}'
                        result.append(
                            task_lib.UpdateNodeStateTask(
                                node_uid=node_uid,
                                state=pstate.NodeState.FAILED,
                                status=status_lib.Status(
                                    code=status_lib.Code.ABORTED,
                                    message=error_msg)))
                continue

            result.extend(
                self._generate_tasks_for_node(self._mlmd_handle, node))
        return result
Example #4
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

        The returned tasks must have `exec_task` populated. List may be empty if
        no nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        for nodes in layers:
            # Boolean that's set if there's at least one successfully executed node
            # in the current layer.
            executed_nodes = False
            for node in nodes:
                if node.node_info.id in self._ignore_node_ids:
                    logging.info(
                        'Ignoring node for task generation: %s',
                        task_lib.NodeUid.from_pipeline_node(
                            self._pipeline, node))
                    continue
                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    continue
                executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                if (executions
                        and task_gen_utils.is_latest_execution_successful(
                            executions)):
                    executed_nodes = True
                    continue
                # If all upstream nodes are executed but current node is not executed,
                # the node is deemed ready for execution.
                if self._upstream_nodes_executed(node):
                    task = self._generate_task(node)
                    if task:
                        result.append(task)
            # If there are no executed nodes in the current layer, downstream nodes
            # need not be checked.
            if not executed_nodes:
                break
        return result
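The layered traversal above assumes `topsort.topsorted_layers` groups nodes so that every node's parents appear in an earlier layer. A Kahn-style sketch of that assumed behavior (hypothetical helper, not the actual TFX `topsort` module):

# Hypothetical layer-by-layer topological sort; sketches the semantics the
# generate() above relies on, not the actual topsort.topsorted_layers.
from typing import Dict, List, Set


def topsorted_layers_sketch(parents: Dict[str, Set[str]]) -> List[List[str]]:
    """Groups node ids into layers; parents always land in earlier layers."""
    remaining = {n: set(p) for n, p in parents.items()}
    layers: List[List[str]] = []
    while remaining:
        # A layer is every node whose parents have all been emitted.
        layer = sorted(n for n, p in remaining.items() if not p)
        if not layer:
            raise ValueError('dependency cycle detected')
        layers.append(layer)
        for n in layer:
            del remaining[n]
        for p in remaining.values():
            p.difference_update(layer)
    return layers


assert topsorted_layers_sketch({
    'example_gen': set(),
    'schema_gen': {'example_gen'},
    'trainer': {'example_gen', 'schema_gen'},
    'pusher': {'trainer'},
}) == [['example_gen'], ['schema_gen'], ['trainer'], ['pusher']]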
Example #5
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for all executable nodes in the async pipeline.

        The returned tasks must have `exec_task` populated. List may be empty if no
        nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        result = []
        for node in [n.pipeline_node for n in self._pipeline.nodes]:
            node_uid = task_lib.NodeUid.from_pipeline_node(
                self._pipeline, node)
            node_id = node.node_info.id
            if node_id in self._ignore_node_ids:
                logging.info('Ignoring node for task generation: %s', node_uid)
                continue

            if self._service_job_manager.is_pure_service_node(
                    self._pipeline_state, node_id):
                service_status = self._service_job_manager.ensure_node_services(
                    self._pipeline_state, node_id)
                if service_status != service_jobs.ServiceStatus.RUNNING:
                    logging.error(
                        'Required service node not running or healthy, node uid: %s',
                        node_uid)
                    result.append(
                        task_lib.FinalizeNodeTask(
                            node_uid=node_uid,
                            status=status_lib.Status(
                                code=status_lib.Code.ABORTED,
                                message=
                                (f'Aborting node execution as the associated service '
                                 f'job is not running or healthy; problematic node '
                                 f'uid: {node_uid}'))))
                continue

            # If a task for the node is already tracked by the task queue, it need
            # not be considered for generation again.
            if self._is_task_id_tracked_fn(
                    task_lib.exec_node_task_id_from_pipeline_node(
                        self._pipeline, node)):
                continue
            task = self._generate_task(self._mlmd_handle, node)
            if task:
                result.append(task)
        return result
Example #6
def _maybe_enqueue_cancellation_task(mlmd_handle: metadata.Metadata,
                                     pipeline: pipeline_pb2.Pipeline,
                                     node: pipeline_pb2.PipelineNode,
                                     task_queue: tq.TaskQueue,
                                     pause: bool = False) -> bool:
    """Enqueues a node cancellation task if not already stopped.

  If the node has an ExecNodeTask in the task queue, issue a cancellation.
  Otherwise, when pause=False, if the node has an active execution in MLMD but
  no ExecNodeTask enqueued, it may be due to orchestrator restart after stopping
  was initiated but before the schedulers could finish. So, enqueue an
  ExecNodeTask with is_cancelled set to give a chance for the scheduler to
  finish gracefully.

  Args:
    mlmd_handle: A handle to the MLMD db.
    pipeline: The pipeline containing the node to cancel.
    node: The node to cancel.
    task_queue: A `TaskQueue` instance into which any cancellation tasks will be
      enqueued.
    pause: Whether the cancellation is to pause the node rather than cancelling
      the execution.

  Returns:
    `True` if a cancellation task was enqueued. `False` if node is already
    stopped or no cancellation was required.
  """
    exec_node_task_id = task_lib.exec_node_task_id_from_pipeline_node(
        pipeline, node)
    if task_queue.contains_task_id(exec_node_task_id):
        task_queue.enqueue(
            task_lib.CancelNodeTask(
                node_uid=task_lib.NodeUid.from_pipeline_node(pipeline, node),
                pause=pause))
        return True
    if not pause:
        executions = task_gen_utils.get_executions(mlmd_handle, node)
        exec_node_task = task_gen_utils.generate_task_from_active_execution(
            mlmd_handle, pipeline, node, executions, is_cancelled=True)
        if exec_node_task:
            task_queue.enqueue(exec_node_task)
            return True
    return False
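A caller-side sketch of how this helper might be used when stopping a node (`stop_node_once` is a hypothetical wrapper, not part of the source): a node is only considered fully stopped once no cancellation task had to be enqueued.

# Hypothetical usage sketch; stop_node_once is illustrative, not TFX code.
def stop_node_once(mlmd_handle, pipeline, node, task_queue) -> bool:
    """Returns True when the node can be considered fully stopped."""
    enqueued = _maybe_enqueue_cancellation_task(
        mlmd_handle, pipeline, node, task_queue, pause=False)
    # If nothing was enqueued, no active work remains to wind down.
    return not enqueued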
Example #7
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for all executable nodes in the async pipeline.

        The returned tasks must have `exec_task` populated. List may be empty if no
        nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        result = []
        for node in [n.pipeline_node for n in self._pipeline.nodes]:
            node_uid = task_lib.NodeUid.from_pipeline_node(
                self._pipeline, node)
            node_id = node.node_info.id
            if node_id in self._ignore_node_ids:
                logging.info('Ignoring node for task generation: %s', node_uid)
                continue

            # If this is a pure service node, there is no ExecNodeTask to generate
            # but we ensure node services and check service status.
            service_status = self._ensure_node_services_if_pure(node_id)
            if service_status is not None:
                if service_status != service_jobs.ServiceStatus.RUNNING:
                    result.append(self._abort_node_task(node_uid))
                continue

            # If a task for the node is already tracked by the task queue, it need
            # not be considered for generation again but we ensure node services
            # in case of a mixed service node.
            if self._is_task_id_tracked_fn(
                    task_lib.exec_node_task_id_from_pipeline_node(
                        self._pipeline, node)):
                service_status = self._ensure_node_services_if_mixed(node_id)
                if service_status is not None:
                    if service_status != service_jobs.ServiceStatus.RUNNING:
                        result.append(self._abort_node_task(node_uid))
                continue
            task = self._generate_task(self._mlmd_handle, node)
            if task:
                result.append(task)
        return result
Example #8
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for all executable nodes in the async pipeline.

        The returned tasks must have `exec_task` populated. List may be empty if no
        nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        result = []
        # TODO(b/170231077): Reuse connection instead of reconnecting as the latter
        # is expensive.
        with self._mlmd_connection as m:
            for node in [node.pipeline_node for node in self._pipeline.nodes]:
                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    continue
                task = self._generate_task(m, node)
                if task:
                    result.append(task)
        return result
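The TODO above notes that reconnecting on every `generate` call is expensive. One generic way to hold a connection open for the generator's lifetime is an `ExitStack`; a sketch of that idea (hypothetical class, assuming `mlmd_connection` is a context manager yielding a handle; this is not the fix actually adopted in TFX):

# Hypothetical connection-reuse sketch illustrating the TODO above.
import contextlib


class ReusedHandleTaskGen:

    def __init__(self, mlmd_connection):
        self._stack = contextlib.ExitStack()
        # Enter the connection once; the handle stays valid across calls.
        self._mlmd_handle = self._stack.enter_context(mlmd_connection)

    def close(self) -> None:
        # Exits the connection context entered in __init__.
        self._stack.close()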
Example #9
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

        The returned tasks must have `exec_task` populated. List may be empty if
        no nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        for layer_num, nodes in enumerate(layers):
            # Ids of nodes in the current layer that completed successfully.
            completed_node_ids = set()
            for node in nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id
                if self._service_job_manager.is_pure_service_node(
                        self._pipeline_state, node.node_info.id):
                    if not self._upstream_nodes_executed(node):
                        continue
                    service_status = self._service_job_manager.ensure_node_services(
                        self._pipeline_state, node_id)
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node completed successfully: %s',
                                     node_uid)
                        completed_node_ids.add(node_id)
                    elif service_status == service_jobs.ServiceStatus.FAILED:
                        logging.error('Failed service node: %s', node_uid)
                        return [
                            task_lib.FinalizePipelineTask(
                                pipeline_uid=self._pipeline_state.pipeline_uid,
                                status=status_lib.Status(
                                    code=status_lib.Code.ABORTED,
                                    message=
                                    (f'Aborting pipeline execution due to service '
                                     f'node failure; failed node uid: {node_uid}'
                                     )))
                        ]
                    else:
                        logging.info('Pure service node in progress: %s',
                                     node_uid)
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    continue
                executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                if (executions
                        and task_gen_utils.is_latest_execution_successful(
                            executions)):
                    completed_node_ids.add(node_id)
                    continue
                # If all upstream nodes are executed but current node is not executed,
                # the node is deemed ready for execution.
                if self._upstream_nodes_executed(node):
                    task = self._generate_task(node)
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)
            # If there are no completed nodes in the current layer, downstream nodes
            # need not be checked.
            if not completed_node_ids:
                break
            # If all nodes in the final layer completed successfully, the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if layer_num == len(layers) - 1 and completed_node_ids == set(
                    node.node_info.id for node in nodes):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_state.pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
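The finalization condition at the end of the loop can be isolated into a small predicate (hypothetical helper, not part of the source): the pipeline is finalizable when every node in the last layer has completed.

# Hypothetical helper isolating the final-layer check above; not TFX code.
from typing import List, Set


def can_finalize(layers: List[List[str]], completed: Set[str]) -> bool:
    return bool(layers) and set(layers[-1]) <= completed


assert can_finalize([['a'], ['b', 'c']], {'a', 'b', 'c'})
assert not can_finalize([['a'], ['b', 'c']], {'a', 'b'})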
Example #10
    def _generate_tasks_for_node(
            self, node: pipeline_pb2.PipelineNode) -> List[task_lib.Task]:
        """Generates list of tasks for the given node."""
        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
        node_id = node.node_info.id
        result = []

        node_state = self._node_states_dict[node_uid]
        if node_state.state in (pstate.NodeState.STOPPING,
                                pstate.NodeState.STOPPED):
            logging.info(
                'Ignoring node in state \'%s\' for task generation: %s',
                node_state.state, node_uid)
            return result

        # If this is a pure service node, there is no ExecNodeTask to generate
        # but we ensure node services and check service status.
        service_status = self._ensure_node_services_if_pure(node_id)
        if service_status is not None:
            if service_status == service_jobs.ServiceStatus.FAILED:
                error_msg = f'service job failed; node uid: {node_uid}'
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid,
                        state=pstate.NodeState.FAILED,
                        status=status_lib.Status(code=status_lib.Code.ABORTED,
                                                 message=error_msg)))
            elif service_status == service_jobs.ServiceStatus.SUCCESS:
                logging.info('Service node successful: %s', node_uid)
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid, state=pstate.NodeState.COMPLETE))
            elif service_status == service_jobs.ServiceStatus.RUNNING:
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid, state=pstate.NodeState.RUNNING))
            return result

        # If a task for the node is already tracked by the task queue, it need
        # not be considered for generation again but we ensure node services
        # in case of a mixed service node.
        if self._is_task_id_tracked_fn(
                task_lib.exec_node_task_id_from_pipeline_node(
                    self._pipeline, node)):
            service_status = self._ensure_node_services_if_mixed(node_id)
            if service_status == service_jobs.ServiceStatus.FAILED:
                error_msg = f'associated service job failed; node uid: {node_uid}'
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid,
                        state=pstate.NodeState.FAILED,
                        status=status_lib.Status(code=status_lib.Code.ABORTED,
                                                 message=error_msg)))
            return result

        node_executions = task_gen_utils.get_executions(
            self._mlmd_handle, node)
        latest_execution = task_gen_utils.get_latest_execution(node_executions)

        # If the latest execution is successful, we're done.
        if latest_execution and execution_lib.is_execution_successful(
                latest_execution):
            logging.info('Node successful: %s', node_uid)
            result.append(
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.COMPLETE))
            return result

        # If the latest execution failed or was cancelled, the node is marked
        # FAILED if it is not in state STARTING. For nodes in state STARTING,
        # a new execution is created instead.
        if (latest_execution
                and not execution_lib.is_execution_active(latest_execution)
                and node_state.state != pstate.NodeState.STARTING):
            error_msg_value = latest_execution.custom_properties.get(
                constants.EXECUTION_ERROR_MSG_KEY)
            error_msg = data_types_utils.get_metadata_value(
                error_msg_value) if error_msg_value else ''
            error_msg = f'node failed; node uid: {node_uid}; error: {error_msg}'
            result.append(
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.FAILED,
                                             status=status_lib.Status(
                                                 code=status_lib.Code.ABORTED,
                                                 message=error_msg)))
            return result

        exec_node_task = task_gen_utils.generate_task_from_active_execution(
            self._mlmd_handle, self._pipeline, node, node_executions)
        if exec_node_task:
            result.append(
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.RUNNING))
            result.append(exec_node_task)
            return result

        # Finally, we are ready to generate tasks for the node by resolving inputs.
        result.extend(self._resolve_inputs_and_generate_tasks_for_node(node))
        return result
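For pure service nodes, the branching above amounts to a direct mapping from service status to node state. A compressed sketch with hypothetical enums mirroring the values referenced above (not the real `service_jobs.ServiceStatus` or `pstate.NodeState` definitions):

# Hypothetical enums summarizing the pure-service-node branch above.
import enum


class ServiceStatus(enum.Enum):
    RUNNING = 1
    SUCCESS = 2
    FAILED = 3


class NodeState(enum.Enum):
    RUNNING = 1
    COMPLETE = 2
    FAILED = 3


_PURE_SERVICE_NODE_STATE = {
    ServiceStatus.RUNNING: NodeState.RUNNING,
    ServiceStatus.SUCCESS: NodeState.COMPLETE,
    ServiceStatus.FAILED: NodeState.FAILED,
}

assert _PURE_SERVICE_NODE_STATE[ServiceStatus.SUCCESS] is NodeState.COMPLETE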
Example #11
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

        The returned tasks must have `exec_task` populated. List may be empty if
        no nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        successful_node_ids = set()
        for layer_num, layer_nodes in enumerate(layers):
            for node in layer_nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id

                if self._in_successful_nodes_cache(node_uid):
                    successful_node_ids.add(node_id)
                    continue

                if not self._upstream_nodes_successful(node,
                                                       successful_node_ids):
                    continue

                # If this is a pure service node, there is no ExecNodeTask to generate
                # but we ensure node services and check service status.
                service_status = self._ensure_node_services_if_pure(node_id)
                if service_status is not None:
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'service job failed; node uid: {node_uid}')
                        ]
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node successful: %s', node_uid)
                        successful_node_ids.add(node_id)
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again but we ensure node services
                # in case of a mixed service node.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    service_status = self._ensure_node_services_if_mixed(
                        node_id)
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'associated service job failed; node uid: {node_uid}'
                            )
                        ]
                    continue

                node_executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                latest_execution = task_gen_utils.get_latest_execution(
                    node_executions)

                # If the latest execution is successful, we're done.
                if latest_execution and execution_lib.is_execution_successful(
                        latest_execution):
                    logging.info('Node successful: %s', node_uid)
                    successful_node_ids.add(node_id)
                    continue

                # If the latest execution failed, the pipeline should be aborted.
                if latest_execution and not execution_lib.is_execution_active(
                        latest_execution):
                    error_msg_value = latest_execution.custom_properties.get(
                        constants.EXECUTION_ERROR_MSG_KEY)
                    error_msg = data_types_utils.get_metadata_value(
                        error_msg_value) if error_msg_value else ''
                    return [
                        self._abort_task(
                            f'node failed; node uid: {node_uid}; error: {error_msg}'
                        )
                    ]

                # Finally, we are ready to generate an ExecNodeTask for the node.
                task = self._maybe_generate_task(node, node_executions,
                                                 successful_node_ids)
                if task:
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)

            layer_node_ids = set(node.node_info.id for node in layer_nodes)
            successful_layer_node_ids = layer_node_ids & successful_node_ids
            self._update_successful_nodes_cache(successful_layer_node_ids)

            # If all nodes in the final layer completed successfully, the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if (layer_num == len(layers) - 1
                    and successful_layer_node_ids == layer_node_ids):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
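The `_in_successful_nodes_cache` / `_update_successful_nodes_cache` pair above suggests a simple memo of nodes already known to have succeeded, so later passes can skip their MLMD lookups. A minimal sketch under that assumption (hypothetical class; the real cache keys on node uids and may differ):

# Hypothetical successful-nodes cache sketch; not the TFX implementation.
from typing import Iterable, Set


class SuccessfulNodesCache:

    def __init__(self):
        self._successful: Set[str] = set()

    def update(self, node_ids: Iterable[str]) -> None:
        self._successful.update(node_ids)

    def __contains__(self, node_id: str) -> bool:
        return node_id in self._successful


cache = SuccessfulNodesCache()
cache.update({'example_gen'})
assert 'example_gen' in cache and 'trainer' not in cache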