Example #1
    def test_generate_task_from_active_execution(self):
        with self._mlmd_connection as m:
            # No tasks generated without active execution.
            executions = task_gen_utils.get_executions(m, self._trainer)
            self.assertIsNone(
                task_gen_utils.generate_task_from_active_execution(
                    m, self._pipeline, self._trainer, executions))

        # Next, ensure an active execution for trainer.
        otu.fake_trainer_output(self._mlmd_connection, self._trainer)
        with self._mlmd_connection as m:
            execution = m.store.get_executions()[0]
            execution.last_known_state = metadata_store_pb2.Execution.RUNNING
            m.store.put_executions([execution])

            # Check that task can be generated.
            executions = task_gen_utils.get_executions(m, self._trainer)
            task = task_gen_utils.generate_task_from_active_execution(
                m, self._pipeline, self._trainer, executions)
            self.assertEqual(execution.id, task.execution.id)

            # Mark execution complete. No tasks should be generated.
            execution = m.store.get_executions()[0]
            execution.last_known_state = metadata_store_pb2.Execution.COMPLETE
            m.store.put_executions([execution])
            executions = task_gen_utils.get_executions(m, self._trainer)
            self.assertIsNone(
                task_gen_utils.generate_task_from_active_execution(
                    m, self._pipeline, self._trainer, executions))
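For orientation, here is a minimal sketch of what `generate_task_from_active_execution` plausibly does, inferred from how the examples in this collection call it. The body, including the final `create_task` call, is an assumption rather than the library's actual implementation; `task_gen_utils` and `execution_lib` are the real TFX modules used elsewhere in these examples.

# Sketch only; module paths assumed from TFX's experimental orchestrator layout.
from tfx.orchestration.experimental.core import task_gen_utils
from tfx.orchestration.portable.mlmd import execution_lib

def generate_task_from_active_execution_sketch(metadata_handler, pipeline, node,
                                               executions):
  """Returns a task for the node's single active execution, or None."""
  # metadata_handler mirrors the real call signature; unused in this sketch.
  active = [e for e in executions if execution_lib.is_execution_active(e)]
  if not active:
    return None
  # Assumes at most one active execution per node, as Example #8 enforces.
  return task_gen_utils.create_task(pipeline, node, active[0])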
Example #2
def _process_stop_initiated_pipelines(
    mlmd_handle: metadata.Metadata, task_queue: tq.TaskQueue,
    pipeline_details: Sequence[_PipelineDetail]) -> None:
  """Processes stop initiated pipelines."""
  for detail in pipeline_details:
    pipeline = detail.pipeline_state.pipeline
    execution = detail.pipeline_state.execution
    has_active_executions = False
    for node in _get_all_pipeline_nodes(pipeline):
      # If the node has an ExecNodeTask in the task queue, issue a cancellation.
      # Otherwise, if the node has an active execution in MLMD but no
      # ExecNodeTask enqueued, it may be due to an orchestrator restart after
      # pipeline stop was initiated but before the schedulers could finish. So,
      # enqueue an ExecNodeTask with is_cancelled set to give the scheduler a
      # chance to finish gracefully.
      exec_node_task_id = task_lib.exec_node_task_id_from_pipeline_node(
          pipeline, node)
      if task_queue.contains_task_id(exec_node_task_id):
        task_queue.enqueue(
            task_lib.CancelNodeTask(
                node_uid=task_lib.NodeUid.from_pipeline_node(pipeline, node)))
        has_active_executions = True
      else:
        executions = task_gen_utils.get_executions(mlmd_handle, node)
        exec_node_task = task_gen_utils.generate_task_from_active_execution(
            mlmd_handle, pipeline, node, executions, is_cancelled=True)
        if exec_node_task:
          task_queue.enqueue(exec_node_task)
          has_active_executions = True
    if not has_active_executions:
      updated_execution = copy.deepcopy(execution)
      updated_execution.last_known_state = metadata_store_pb2.Execution.CANCELED
      mlmd_handle.store.put_executions([updated_execution])
Example #3
  def test_stop_node_wait_for_inactivation(self):
    pipeline = pipeline_pb2.Pipeline()
    self.load_proto_from_text(
        os.path.join(
            os.path.dirname(__file__), 'testdata', 'async_pipeline.pbtxt'),
        pipeline)
    trainer = pipeline.nodes[2].pipeline_node
    test_utils.fake_component_output(
        self._mlmd_connection, trainer, active=True)
    pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
    node_uid = task_lib.NodeUid(node_id='my_trainer', pipeline_uid=pipeline_uid)
    with self._mlmd_connection as m:
      pstate.PipelineState.new(m, pipeline).commit()

      def _inactivate(execution):
        time.sleep(2.0)
        with pipeline_ops._PIPELINE_OPS_LOCK:
          execution.last_known_state = metadata_store_pb2.Execution.COMPLETE
          m.store.put_executions([execution])

      execution = task_gen_utils.get_executions(m, trainer)[0]
      thread = threading.Thread(
          target=_inactivate, args=(copy.deepcopy(execution),))
      thread.start()
      pipeline_ops.stop_node(m, node_uid, timeout_secs=5.0)
      thread.join()

      pipeline_state = pstate.PipelineState.load(m, pipeline_uid)
      self.assertEqual(status_lib.Code.CANCELLED,
                       pipeline_state.node_stop_initiated_reason(node_uid).code)

      # Restart node.
      pipeline_state = pipeline_ops.initiate_node_start(m, node_uid)
      self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))
Example #4
def get_all_node_executions(
    pipeline: pipeline_pb2.Pipeline, mlmd_handle: metadata.Metadata
) -> Dict[str, List[metadata_store_pb2.Execution]]:
    """Returns all executions of all pipeline nodes if present."""
    return {
        node.node_info.id: task_gen_utils.get_executions(mlmd_handle, node)
        for node in get_all_pipeline_nodes(pipeline)
    }
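A short usage sketch for `get_all_node_executions`; the `pipeline` proto and the `mlmd_connection` handle are assumed to be set up elsewhere, as in the surrounding examples.

with mlmd_connection as mlmd_handle:  # Assumed metadata.Metadata connection.
  executions_by_node = get_all_node_executions(pipeline, mlmd_handle)
  for node_id, executions in executions_by_node.items():
    # Each value is the (possibly empty) list of MLMD executions for that node.
    print(f'{node_id}: {len(executions)} execution(s)')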
Example #5
    def _generate_task(
            self, metadata_handler: metadata.Metadata,
            node: pipeline_pb2.PipelineNode) -> Optional[task_pb2.Task]:
        """Generates a node execution task.

    If a node execution is not feasible, `None` is returned.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      Returns a `Task` or `None` if task generation is deemed infeasible.
    """
        if not task_gen_utils.is_feasible_node(node):
            return None

        executions = task_gen_utils.get_executions(metadata_handler, node)
        result = task_gen_utils.generate_task_from_active_execution(
            self._pipeline, node, executions)
        if result:
            return result

        resolved_info = task_gen_utils.generate_resolved_info(
            metadata_handler, node)
        if resolved_info.input_artifacts is None:
            logging.info(
                'Task cannot be generated for node %s since no input artifacts '
                'are resolved.', node.node_info.id)
            return None

        # If the latest successful execution had the same resolved input artifacts,
        # the component should not be triggered, so task is not generated.
        # TODO(b/170231077): This logic should be handled by the resolver when
        # it's implemented. Also, currently only the artifact ids of the previous
        # execution are checked to decide if a new execution is warranted, but it
        # may also be necessary to factor in differences in execution properties.
        latest_exec = task_gen_utils.get_latest_successful_execution(
            executions)
        if latest_exec:
            artifact_ids_by_event_type = (
                execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                    metadata_handler, latest_exec.id))
            latest_exec_input_artifact_ids = artifact_ids_by_event_type.get(
                metadata_store_pb2.Event.INPUT, set())
            current_exec_input_artifact_ids = set(
                a.id for a in itertools.chain(
                    *resolved_info.input_artifacts.values()))
            if latest_exec_input_artifact_ids == current_exec_input_artifact_ids:
                return None

        execution = execution_publish_utils.register_execution(
            metadata_handler=metadata_handler,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties)
        return task_gen_utils.create_task(self._pipeline, node, execution)
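The duplicate-input check above recurs in several of these generators (see also Example #19). One way to factor it out is a small helper like the sketch below; the name `_inputs_unchanged_since_latest_success` is illustrative and not part of the library.

import itertools

from ml_metadata.proto import metadata_store_pb2
from tfx.orchestration.portable.mlmd import execution_lib

def _inputs_unchanged_since_latest_success(metadata_handler, latest_exec,
                                           resolved_info):
  """True if the latest successful execution consumed the same input artifacts."""
  artifact_ids_by_event_type = (
      execution_lib.get_artifact_ids_by_event_type_for_execution_id(
          metadata_handler, latest_exec.id))
  previous_input_ids = artifact_ids_by_event_type.get(
      metadata_store_pb2.Event.INPUT, set())
  current_input_ids = set(
      a.id for a in itertools.chain(*resolved_info.input_artifacts.values()))
  return previous_input_ids == current_input_ids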
Example #6
  def test_cached_execution(self):
    """Tests that cached execution is used if one is available."""

    # Fake ExampleGen run.
    example_gen_exec = otu.fake_example_gen_run(self._mlmd_connection,
                                                self._example_gen, 1, 1)

    # Invoking generator should produce an ExecNodeTask for StatsGen.
    [stats_gen_task] = self._generate_and_test(
        False,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self.assertEqual('my_statistics_gen', stats_gen_task.node_uid.node_id)

    # Finish StatsGen execution.
    otu.fake_execute_node(self._mlmd_connection, stats_gen_task)

    # Prepare another pipeline with a new pipeline_run_id.
    pipeline_run_id = str(uuid.uuid4())
    new_pipeline = self._make_pipeline(self._pipeline_root, pipeline_run_id)

    with self._mlmd_connection as m:
      contexts = m.store.get_contexts_by_execution(example_gen_exec.id)
      # We use node context as cache context for ease of testing.
      cache_context = [c for c in contexts if c.name == 'my_example_gen'][0]
    # Fake example_gen cached execution.
    otu.fake_cached_execution(self._mlmd_connection, cache_context,
                              otu.get_node(new_pipeline, 'my_example_gen'))

    stats_gen = otu.get_node(new_pipeline, 'my_statistics_gen')

    # Invoking generator for the new pipeline should result in:
    # 1. StatsGen execution succeeds with state "CACHED" but no ExecNodeTask
    #    generated.
    # 2. An ExecNodeTask is generated for SchemaGen (component downstream of
    #    StatsGen) with an active execution in MLMD.
    [schema_gen_task] = self._generate_and_test(
        False,
        pipeline=new_pipeline,
        num_initial_executions=3,
        num_tasks_generated=1,
        num_new_executions=2,
        num_active_executions=1)
    self.assertEqual('my_schema_gen', schema_gen_task.node_uid.node_id)

    # Check that StatsGen execution is successful in state "CACHED".
    with self._mlmd_connection as m:
      executions = task_gen_utils.get_executions(m, stats_gen)
      self.assertLen(executions, 1)
      execution = executions[0]
      self.assertTrue(execution_lib.is_execution_successful(execution))
      self.assertEqual(metadata_store_pb2.Execution.CACHED,
                       execution.last_known_state)
Example #7
    def _generate_task(
            self, metadata_handler: metadata.Metadata,
            node: pipeline_pb2.PipelineNode) -> Optional[task_lib.Task]:
        """Generates a node execution task.

    If node execution is not feasible, `None` is returned.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      Returns a `Task` or `None` if task generation is deemed infeasible.
    """
        if not task_gen_utils.is_feasible_node(node):
            return None

        executions = task_gen_utils.get_executions(metadata_handler, node)
        result = task_gen_utils.generate_task_from_active_execution(
            metadata_handler, self._pipeline, node, executions)
        if result:
            return result

        resolved_info = task_gen_utils.generate_resolved_info(
            metadata_handler, node)
        if resolved_info.input_artifacts is None:
            # TODO(goutham): If the pipeline can't make progress, there should be a
            # standard mechanism to surface it to the user.
            logging.warning(
                'Task cannot be generated for node %s since no input artifacts '
                'are resolved.', node.node_info.id)
            return None

        execution = execution_publish_utils.register_execution(
            metadata_handler=metadata_handler,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties)
        outputs_resolver = outputs_utils.OutputsResolver(
            node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
            self._pipeline.execution_mode)
        return task_lib.ExecNodeTask(
            node_uid=task_lib.NodeUid.from_pipeline_node(self._pipeline, node),
            execution=execution,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties,
            output_artifacts=outputs_resolver.generate_output_artifacts(
                execution.id),
            executor_output_uri=outputs_resolver.get_executor_output_uri(
                execution.id),
            stateful_working_dir=outputs_resolver.
            get_stateful_working_directory(execution.id))
Example #8
def stop_node(
        mlmd_handle: metadata.Metadata,
        node_uid: task_lib.NodeUid,
        timeout_secs: float = DEFAULT_WAIT_FOR_INACTIVATION_TIMEOUT_SECS
) -> None:
    """Stops a node in an async pipeline.

  Initiates a node stop operation and waits for the node execution to become
  inactive.

  Args:
    mlmd_handle: A handle to the MLMD db.
    node_uid: Uid of the node to be stopped.
    timeout_secs: Amount of time in seconds to wait for node to stop.

  Raises:
    status_lib.StatusNotOkError: Failure to stop the node.
  """
    with _PIPELINE_OPS_LOCK:
        with pstate.PipelineState.load(
                mlmd_handle, node_uid.pipeline_uid) as pipeline_state:
            nodes = pstate.get_all_pipeline_nodes(pipeline_state.pipeline)
            filtered_nodes = [
                n for n in nodes if n.node_info.id == node_uid.node_id
            ]
            if len(filtered_nodes) != 1:
                raise status_lib.StatusNotOkError(
                    code=status_lib.Code.INTERNAL,
                    message=
                    (f'`stop_node` operation failed, unable to find node to stop: '
                     f'{node_uid}'))
            node = filtered_nodes[0]
            pipeline_state.initiate_node_stop(
                node_uid,
                status_lib.Status(code=status_lib.Code.CANCELLED,
                                  message='Cancellation requested by client.'))

        executions = task_gen_utils.get_executions(mlmd_handle, node)
        active_executions = [
            e for e in executions if execution_lib.is_execution_active(e)
        ]
        if not active_executions:
            # If there are no active executions, we're done.
            return
        if len(active_executions) > 1:
            raise status_lib.StatusNotOkError(
                code=status_lib.Code.INTERNAL,
                message=
                (f'Unexpected multiple active executions for node: {node_uid}'
                 ))
    _wait_for_inactivation(mlmd_handle,
                           active_executions[0],
                           timeout_secs=timeout_secs)
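`_wait_for_inactivation` is called above but not shown in this collection. A plausible polling implementation is sketched below; the poll interval, the `DEADLINE_EXCEEDED` error code, and the `status_lib` import path are assumptions.

import time

from tfx.utils import status as status_lib  # Import path is an assumption.

def _wait_for_inactivation_sketch(mlmd_handle, execution, timeout_secs):
  """Polls MLMD until the execution becomes inactive or the timeout expires."""
  deadline = time.time() + timeout_secs
  while time.time() < deadline:
    [execution] = mlmd_handle.store.get_executions_by_id([execution.id])
    if not execution_lib.is_execution_active(execution):
      return
    time.sleep(1.0)  # Poll interval is an assumption.
  raise status_lib.StatusNotOkError(
      code=status_lib.Code.DEADLINE_EXCEEDED,
      message=f'Timed out waiting for execution {execution.id} to stop.')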
Example #9
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

    The returned tasks must have `exec_task` populated. List may be empty if
    no nodes are ready for execution.

    Returns:
      A `list` of tasks to execute.
    """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        for nodes in layers:
            # Boolean that's set if there's at least one successfully executed node
            # in the current layer.
            executed_nodes = False
            for node in nodes:
                if node.node_info.id in self._ignore_node_ids:
                    logging.info(
                        'Ignoring node for task generation: %s',
                        task_lib.NodeUid.from_pipeline_node(
                            self._pipeline, node))
                    continue
                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    continue
                executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                if (executions
                        and task_gen_utils.is_latest_execution_successful(
                            executions)):
                    executed_nodes = True
                    continue
                # If all upstream nodes are executed but current node is not executed,
                # the node is deemed ready for execution.
                if self._upstream_nodes_executed(node):
                    task = self._generate_task(node)
                    if task:
                        result.append(task)
            # If there are no executed nodes in the current layer, downstream nodes
            # need not be checked.
            if not executed_nodes:
                break
        return result
Example #10
def resume_manual_node(mlmd_handle: metadata.Metadata,
                       node_uid: task_lib.NodeUid) -> None:
    """Resumes a manual node.

  Args:
    mlmd_handle: A handle to the MLMD db.
    node_uid: Uid of the manual node to be resumed.

  Raises:
    status_lib.StatusNotOkError: Failure to resume a manual node.
  """
    logging.info('Received request to resume manual node; node uid: %s',
                 node_uid)
    with pstate.PipelineState.load(mlmd_handle,
                                   node_uid.pipeline_uid) as pipeline_state:
        nodes = pstate.get_all_pipeline_nodes(pipeline_state.pipeline)
        filtered_nodes = [
            n for n in nodes if n.node_info.id == node_uid.node_id
        ]
        if len(filtered_nodes) != 1:
            raise status_lib.StatusNotOkError(
                code=status_lib.Code.NOT_FOUND,
                message=(f'Unable to find manual node to resume: {node_uid}'))
        node = filtered_nodes[0]
        node_type = node.node_info.type.name
        if node_type != constants.MANUAL_NODE_TYPE:
            raise status_lib.StatusNotOkError(
                code=status_lib.Code.INVALID_ARGUMENT,
                message=('Unable to resume a non-manual node. '
                         f'Got non-manual node id: {node_uid}'))

    executions = task_gen_utils.get_executions(mlmd_handle, node)
    active_executions = [
        e for e in executions if execution_lib.is_execution_active(e)
    ]
    if not active_executions:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.NOT_FOUND,
            message=(
                f'Unable to find active manual node to resume: {node_uid}'))
    if len(active_executions) > 1:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.INTERNAL,
            message=(f'Unexpected multiple active executions for manual node: '
                     f'{node_uid}'))
    with mlmd_state.mlmd_execution_atomic_op(
            mlmd_handle=mlmd_handle,
            execution_id=active_executions[0].id) as execution:
        completed_state = manual_task_scheduler.ManualNodeState(
            state=manual_task_scheduler.ManualNodeState.COMPLETED)
        completed_state.set_mlmd_value(
            execution.custom_properties.get_or_create(
                manual_task_scheduler.NODE_STATE_PROPERTY_KEY))
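`mlmd_state.mlmd_execution_atomic_op` is used above to read-modify-write an execution. A simplified sketch of that pattern follows; the real helper presumably also handles locking and caching, which this sketch omits.

import contextlib

@contextlib.contextmanager
def mlmd_execution_atomic_op_sketch(mlmd_handle, execution_id):
  """Yields an execution proto and writes back any mutations on exit."""
  [execution] = mlmd_handle.store.get_executions_by_id([execution_id])
  yield execution
  # Persist whatever the caller mutated on the yielded proto.
  mlmd_handle.store.put_executions([execution])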
Example #11
    def _generate_task(self, node: pipeline_pb2.PipelineNode) -> task_lib.Task:
        """Generates a node execution task.

    If node execution is not feasible, `None` is returned.

    Args:
      node: The pipeline node for which to generate a task.

    Returns:
      Returns an `ExecNodeTask` if node can be executed. If an error occurs,
      a `FinalizePipelineTask` is returned to abort the pipeline execution.
    """
        executions = task_gen_utils.get_executions(self._mlmd_handle, node)
        result = task_gen_utils.generate_task_from_active_execution(
            self._mlmd_handle, self._pipeline, node, executions)
        if result:
            return result

        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
        resolved_info = task_gen_utils.generate_resolved_info(
            self._mlmd_handle, node)
        if resolved_info.input_artifacts is None:
            return task_lib.FinalizePipelineTask(
                pipeline_uid=self._pipeline_state.pipeline_uid,
                status=status_lib.Status(
                    code=status_lib.Code.ABORTED,
                    message=
                    (f'Aborting pipeline execution due to failure to resolve '
                     f'inputs; problematic node uid: {node_uid}')))

        execution = execution_publish_utils.register_execution(
            metadata_handler=self._mlmd_handle,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties)
        outputs_resolver = outputs_utils.OutputsResolver(
            node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
            self._pipeline.execution_mode)
        return task_lib.ExecNodeTask(
            node_uid=node_uid,
            execution=execution,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties,
            output_artifacts=outputs_resolver.generate_output_artifacts(
                execution.id),
            executor_output_uri=outputs_resolver.get_executor_output_uri(
                execution.id),
            stateful_working_dir=outputs_resolver.
            get_stateful_working_directory(execution.id),
            pipeline=self._pipeline)
Example #12
 def test_get_latest_successful_execution(self):
     otu.fake_transform_output(self._mlmd_connection, self._transform)
     otu.fake_transform_output(self._mlmd_connection, self._transform)
     otu.fake_transform_output(self._mlmd_connection, self._transform)
     with self._mlmd_connection as m:
         execs = sorted(m.store.get_executions(), key=lambda e: e.id)
         execs[2].last_known_state = metadata_store_pb2.Execution.FAILED
         m.store.put_executions([execs[2]])
         execs = sorted(task_gen_utils.get_executions(m, self._transform),
                        key=lambda e: e.id)
         self.assertEqual(
             execs[1],
             task_gen_utils.get_latest_successful_execution(execs))
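Based on the assertions in this test, `get_latest_successful_execution` plausibly reduces to the sketch below; ordering by execution id is an assumption, but it is consistent with the sort used in the test.

def get_latest_successful_execution_sketch(executions):
  """Returns the successful execution with the highest id, or None."""
  successful = [
      e for e in executions if execution_lib.is_execution_successful(e)
  ]
  return max(successful, key=lambda e: e.id) if successful else None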
Example #13
  def _upstream_nodes_executed(self, cur_node: pipeline_pb2.PipelineNode) -> bool:
    """Returns `True` if all the upstream nodes have been successfully executed."""
    # Note: the parameter must not be named `node`, since the comprehension
    # variable below would shadow it (compare the variant in Example #18).
    upstream_nodes = [
        node for node_id, node in self._node_map.items()
        if node_id in set(cur_node.upstream_nodes)
    ]
   if not upstream_nodes:
     return True
   for node in upstream_nodes:
     upstream_node_executions = task_gen_utils.get_executions(
         self._mlmd_handle, node)
     if not task_gen_utils.is_latest_execution_successful(
         upstream_node_executions):
       return False
   return True
Example #14
    def test_get_executions(self):
        with self._mlmd_connection as m:
            for node in [n.pipeline_node for n in self._pipeline.nodes]:
                self.assertEmpty(task_gen_utils.get_executions(m, node))

        # Create executions for the same nodes under different pipeline contexts.
        self._set_pipeline_context('my_pipeline1')
        otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                 1)
        otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 2,
                                 1)
        otu.fake_transform_output(self._mlmd_connection, self._transform)
        self._set_pipeline_context('my_pipeline2')
        otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                 1)
        otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 2,
                                 1)
        otu.fake_transform_output(self._mlmd_connection, self._transform)

        # Get all executions across all pipeline contexts.
        with self._mlmd_connection as m:
            all_eg_execs = sorted(m.store.get_executions_by_type(
                self._example_gen.node_info.type.name),
                                  key=lambda e: e.id)
            all_transform_execs = sorted(m.store.get_executions_by_type(
                self._transform.node_info.type.name),
                                         key=lambda e: e.id)

        # Check that correct executions are returned for each node in each pipeline.
        self._set_pipeline_context('my_pipeline1')
        with self._mlmd_connection as m:
            self.assertCountEqual(
                all_eg_execs[0:2],
                task_gen_utils.get_executions(m, self._example_gen))
            self.assertCountEqual(
                all_transform_execs[0:1],
                task_gen_utils.get_executions(m, self._transform))
            self.assertEmpty(task_gen_utils.get_executions(m, self._trainer))
        self._set_pipeline_context('my_pipeline2')
        with self._mlmd_connection as m:
            self.assertCountEqual(
                all_eg_execs[2:],
                task_gen_utils.get_executions(m, self._example_gen))
            self.assertCountEqual(
                all_transform_execs[1:],
                task_gen_utils.get_executions(m, self._transform))
            self.assertEmpty(task_gen_utils.get_executions(m, self._trainer))
Example #15
    def generate(self) -> List[task_pb2.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

    The returned tasks must have `exec_task` populated. List may be empty if
    no nodes are ready for execution.

    Returns:
      A `list` of tasks to execute.
    """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        tasks = []
        with self._mlmd_connection as m:
            # TODO(goutham): Cache executions and/or use TaskQueue so that we don't
            # have to make MLMD queries for upstream nodes in each iteration.
            for nodes in layers:
                # Boolean that's set if there's at least one successfully executed node
                # in the current layer.
                executed_nodes = False
                for node in nodes:
                    executions = task_gen_utils.get_executions(m, node)
                    if (executions
                            and task_gen_utils.is_latest_execution_successful(
                                executions)):
                        executed_nodes = True
                        continue
                    # If all upstream nodes are executed but current node is not executed,
                    # the node is deemed ready for execution.
                    if self._upstream_nodes_executed(m, node):
                        task = self._generate_task(m, node)
                        if task:
                            tasks.append(task)
                # If there are no executed nodes in the current layer, downstream nodes
                # need not be checked.
                if not executed_nodes:
                    break
        return tasks
Example #16
def _maybe_enqueue_cancellation_task(mlmd_handle: metadata.Metadata,
                                     pipeline: pipeline_pb2.Pipeline,
                                     node: pipeline_pb2.PipelineNode,
                                     task_queue: tq.TaskQueue,
                                     pause: bool = False) -> bool:
    """Enqueues a node cancellation task if not already stopped.

  If the node has an ExecNodeTask in the task queue, issue a cancellation.
  Otherwise, when pause=False, if the node has an active execution in MLMD but
  no ExecNodeTask enqueued, it may be due to orchestrator restart after stopping
  was initiated but before the schedulers could finish. So, enqueue an
  ExecNodeTask with is_cancelled set to give a chance for the scheduler to
  finish gracefully.

  Args:
    mlmd_handle: A handle to the MLMD db.
    pipeline: The pipeline containing the node to cancel.
    node: The node to cancel.
    task_queue: A `TaskQueue` instance into which any cancellation tasks will be
      enqueued.
    pause: Whether the cancellation is to pause the node rather than cancelling
      the execution.

  Returns:
    `True` if a cancellation task was enqueued. `False` if node is already
    stopped or no cancellation was required.
  """
    exec_node_task_id = task_lib.exec_node_task_id_from_pipeline_node(
        pipeline, node)
    if task_queue.contains_task_id(exec_node_task_id):
        task_queue.enqueue(
            task_lib.CancelNodeTask(
                node_uid=task_lib.NodeUid.from_pipeline_node(pipeline, node),
                pause=pause))
        return True
    if not pause:
        executions = task_gen_utils.get_executions(mlmd_handle, node)
        exec_node_task = task_gen_utils.generate_task_from_active_execution(
            mlmd_handle, pipeline, node, executions, is_cancelled=True)
        if exec_node_task:
            task_queue.enqueue(exec_node_task)
            return True
    return False
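A usage sketch showing how a caller such as the stop-initiated handler in Example #2 might drive `_maybe_enqueue_cancellation_task` across a whole pipeline; `_get_all_pipeline_nodes` is the helper used in Example #2, and the wrapper function here is illustrative.

def _cancel_all_nodes(mlmd_handle, pipeline, task_queue):
  """Returns True if any node still had work that required cancellation."""
  any_pending = False
  for node in _get_all_pipeline_nodes(pipeline):
    if _maybe_enqueue_cancellation_task(mlmd_handle, pipeline, node,
                                        task_queue):
      any_pending = True
  return any_pending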
Example #17
    def _generate_task(
            self, metadata_handler: metadata.Metadata,
            node: pipeline_pb2.PipelineNode) -> Optional[task_pb2.Task]:
        """Generates a node execution task.

    If node execution is not feasible, `None` is returned.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      Returns a `Task` or `None` if task generation is deemed infeasible.
    """
        if not task_gen_utils.is_feasible_node(node):
            return None

        executions = task_gen_utils.get_executions(metadata_handler, node)
        task = task_gen_utils.generate_task_from_active_execution(
            self._pipeline, node, executions)
        if task:
            return task

        resolved_info = task_gen_utils.generate_resolved_info(
            metadata_handler, node)
        if resolved_info.input_artifacts is None:
            # TODO(goutham): If the pipeline can't make progress, there should be a
            # standard mechanism to surface it to the user.
            logging.warning(
                'Task cannot be generated for node %s since no input artifacts '
                'are resolved.', node.node_info.id)
            return None

        execution = execution_publish_utils.register_execution(
            metadata_handler=metadata_handler,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties)
        return task_gen_utils.create_task(self._pipeline, node, execution)
Example #18
 def _upstream_nodes_executed(self,
                              cur_node: pipeline_pb2.PipelineNode) -> bool:
     """Returns `True` if all the upstream nodes have been successfully executed."""
     upstream_nodes = [
         node for node_id, node in self._node_map.items()
         if node_id in set(cur_node.upstream_nodes)
     ]
     for node in upstream_nodes:
         if self._service_job_manager.is_pure_service_node(
                 self._pipeline_state, node.node_info.id):
             service_status = self._service_job_manager.ensure_node_services(
                 self._pipeline_state, node.node_info.id)
             if service_status == service_jobs.ServiceStatus.SUCCESS:
                 continue
             else:
                 return False
         node_executions = task_gen_utils.get_executions(
             self._mlmd_handle, node)
         if not task_gen_utils.is_latest_execution_successful(
                 node_executions):
             return False
     return True
Example #19
    def _generate_task(
            self, metadata_handler: metadata.Metadata,
            node: pipeline_pb2.PipelineNode) -> Optional[task_lib.Task]:
        """Generates a node execution task.

    If a node execution is not feasible, `None` is returned.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      Returns a `Task` or `None` if task generation is deemed infeasible.
    """
        executions = task_gen_utils.get_executions(metadata_handler, node)
        result = task_gen_utils.generate_task_from_active_execution(
            metadata_handler, self._pipeline, node, executions)
        if result:
            return result

        resolved_info = task_gen_utils.generate_resolved_info(
            metadata_handler, node)
        if resolved_info.input_artifacts is None or not any(
                resolved_info.input_artifacts.values()):
            logging.info(
                'Task cannot be generated for node %s since no input artifacts '
                'are resolved.', node.node_info.id)
            return None

        # If the latest successful execution had the same resolved input artifacts,
        # the component should not be triggered, so task is not generated.
        # TODO(b/170231077): This logic should be handled by the resolver when
        # it's implemented. Also, currently only the artifact ids of the previous
        # execution are checked to decide if a new execution is warranted, but it
        # may also be necessary to factor in differences in execution properties.
        latest_exec = task_gen_utils.get_latest_successful_execution(
            executions)
        if latest_exec:
            artifact_ids_by_event_type = (
                execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                    metadata_handler, latest_exec.id))
            latest_exec_input_artifact_ids = artifact_ids_by_event_type.get(
                metadata_store_pb2.Event.INPUT, set())
            current_exec_input_artifact_ids = set(
                a.id for a in itertools.chain(
                    *resolved_info.input_artifacts.values()))
            if latest_exec_input_artifact_ids == current_exec_input_artifact_ids:
                return None

        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
        execution = execution_publish_utils.register_execution(
            metadata_handler=metadata_handler,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties)
        outputs_resolver = outputs_utils.OutputsResolver(
            node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
            self._pipeline.execution_mode)

        # For mixed service nodes, we ensure node services and check service
        # status; the node is aborted if its service jobs have failed.
        service_status = self._ensure_node_services_if_mixed(node.node_info.id)
        if service_status is not None:
            if service_status != service_jobs.ServiceStatus.RUNNING:
                return self._abort_node_task(node_uid)

        return task_lib.ExecNodeTask(
            node_uid=node_uid,
            execution=execution,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties,
            output_artifacts=outputs_resolver.generate_output_artifacts(
                execution.id),
            executor_output_uri=outputs_resolver.get_executor_output_uri(
                execution.id),
            stateful_working_dir=outputs_resolver.
            get_stateful_working_directory(execution.id),
            pipeline=self._pipeline)
Example #20
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

    The returned tasks must have `exec_task` populated. List may be empty if
    no nodes are ready for execution.

    Returns:
      A `list` of tasks to execute.
    """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        for layer_num, nodes in enumerate(layers):
            # Ids of nodes in the current layer that completed successfully.
            completed_node_ids = set()
            for node in nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id
                if self._service_job_manager.is_pure_service_node(
                        self._pipeline_state, node.node_info.id):
                    if not self._upstream_nodes_executed(node):
                        continue
                    service_status = self._service_job_manager.ensure_node_services(
                        self._pipeline_state, node_id)
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node completed successfully: %s',
                                     node_uid)
                        completed_node_ids.add(node_id)
                    elif service_status == service_jobs.ServiceStatus.FAILED:
                        logging.error('Failed service node: %s', node_uid)
                        return [
                            task_lib.FinalizePipelineTask(
                                pipeline_uid=self._pipeline_state.pipeline_uid,
                                status=status_lib.Status(
                                    code=status_lib.Code.ABORTED,
                                    message=
                                    (f'Aborting pipeline execution due to service '
                                     f'node failure; failed node uid: {node_uid}'
                                     )))
                        ]
                    else:
                        logging.info('Pure service node in progress: %s',
                                     node_uid)
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    continue
                executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                if (executions
                        and task_gen_utils.is_latest_execution_successful(
                            executions)):
                    completed_node_ids.add(node_id)
                    continue
                # If all upstream nodes are executed but current node is not executed,
                # the node is deemed ready for execution.
                if self._upstream_nodes_executed(node):
                    task = self._generate_task(node)
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)
            # If there are no completed nodes in the current layer, downstream nodes
            # need not be checked.
            if not completed_node_ids:
                break
            # If all nodes in the final layer completed successfully, the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if layer_num == len(layers) - 1 and completed_node_ids == set(
                    node.node_info.id for node in nodes):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_state.pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
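Example #22 below wraps this kind of aborting `FinalizePipelineTask` construction in a `_abort_task` helper. That helper plausibly reduces to the sketch below; the `_pipeline_uid` attribute is an assumption taken from Example #22.

def _abort_task(self, error_msg: str) -> task_lib.FinalizePipelineTask:
  """Returns a FinalizePipelineTask that aborts the pipeline with error_msg."""
  return task_lib.FinalizePipelineTask(
      pipeline_uid=self._pipeline_uid,  # Assumed attribute; see Example #22.
      status=status_lib.Status(
          code=status_lib.Code.ABORTED, message=error_msg))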
Example #21
    def _generate_tasks_for_node(
            self, node: pipeline_pb2.PipelineNode) -> List[task_lib.Task]:
        """Generates list of tasks for the given node."""
        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
        node_id = node.node_info.id
        result = []

        node_state = self._node_states_dict[node_uid]
        if node_state.state in (pstate.NodeState.STOPPING,
                                pstate.NodeState.STOPPED):
            logging.info(
                'Ignoring node in state \'%s\' for task generation: %s',
                node_state.state, node_uid)
            return result

        # If this is a pure service node, there is no ExecNodeTask to generate
        # but we ensure node services and check service status.
        service_status = self._ensure_node_services_if_pure(node_id)
        if service_status is not None:
            if service_status == service_jobs.ServiceStatus.FAILED:
                error_msg = f'service job failed; node uid: {node_uid}'
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid,
                        state=pstate.NodeState.FAILED,
                        status=status_lib.Status(code=status_lib.Code.ABORTED,
                                                 message=error_msg)))
            elif service_status == service_jobs.ServiceStatus.SUCCESS:
                logging.info('Service node successful: %s', node_uid)
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid, state=pstate.NodeState.COMPLETE))
            elif service_status == service_jobs.ServiceStatus.RUNNING:
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid, state=pstate.NodeState.RUNNING))
            return result

        # If a task for the node is already tracked by the task queue, it need
        # not be considered for generation again but we ensure node services
        # in case of a mixed service node.
        if self._is_task_id_tracked_fn(
                task_lib.exec_node_task_id_from_pipeline_node(
                    self._pipeline, node)):
            service_status = self._ensure_node_services_if_mixed(node_id)
            if service_status == service_jobs.ServiceStatus.FAILED:
                error_msg = f'associated service job failed; node uid: {node_uid}'
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid,
                        state=pstate.NodeState.FAILED,
                        status=status_lib.Status(code=status_lib.Code.ABORTED,
                                                 message=error_msg)))
            return result

        node_executions = task_gen_utils.get_executions(
            self._mlmd_handle, node)
        latest_execution = task_gen_utils.get_latest_execution(node_executions)

        # If the latest execution is successful, we're done.
        if latest_execution and execution_lib.is_execution_successful(
                latest_execution):
            logging.info('Node successful: %s', node_uid)
            result.append(
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.COMPLETE))
            return result

        # If the latest execution failed or was cancelled, the pipeline should
        # be aborted if the node is not in state STARTING. For nodes that are in
        # state STARTING, a new execution is created.
        if (latest_execution
                and not execution_lib.is_execution_active(latest_execution)
                and node_state.state != pstate.NodeState.STARTING):
            error_msg_value = latest_execution.custom_properties.get(
                constants.EXECUTION_ERROR_MSG_KEY)
            error_msg = data_types_utils.get_metadata_value(
                error_msg_value) if error_msg_value else ''
            error_msg = f'node failed; node uid: {node_uid}; error: {error_msg}'
            result.append(
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.FAILED,
                                             status=status_lib.Status(
                                                 code=status_lib.Code.ABORTED,
                                                 message=error_msg)))
            return result

        exec_node_task = task_gen_utils.generate_task_from_active_execution(
            self._mlmd_handle, self._pipeline, node, node_executions)
        if exec_node_task:
            result.append(
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.RUNNING))
            result.append(exec_node_task)
            return result

        # Finally, we are ready to generate tasks for the node by resolving inputs.
        result.extend(self._resolve_inputs_and_generate_tasks_for_node(node))
        return result
Example #22
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

    The returned tasks must have `exec_task` populated. List may be empty if
    no nodes are ready for execution.

    Returns:
      A `list` of tasks to execute.
    """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        successful_node_ids = set()
        for layer_num, layer_nodes in enumerate(layers):
            for node in layer_nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id

                if self._in_successful_nodes_cache(node_uid):
                    successful_node_ids.add(node_id)
                    continue

                if not self._upstream_nodes_successful(node,
                                                       successful_node_ids):
                    continue

                # If this is a pure service node, there is no ExecNodeTask to generate
                # but we ensure node services and check service status.
                service_status = self._ensure_node_services_if_pure(node_id)
                if service_status is not None:
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'service job failed; node uid: {node_uid}')
                        ]
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node successful: %s', node_uid)
                        successful_node_ids.add(node_id)
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again but we ensure node services
                # in case of a mixed service node.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    service_status = self._ensure_node_services_if_mixed(
                        node_id)
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'associated service job failed; node uid: {node_uid}'
                            )
                        ]
                    continue

                node_executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                latest_execution = task_gen_utils.get_latest_execution(
                    node_executions)

                # If the latest execution is successful, we're done.
                if latest_execution and execution_lib.is_execution_successful(
                        latest_execution):
                    logging.info('Node successful: %s', node_uid)
                    successful_node_ids.add(node_id)
                    continue

                # If the latest execution failed, the pipeline should be aborted.
                if latest_execution and not execution_lib.is_execution_active(
                        latest_execution):
                    error_msg_value = latest_execution.custom_properties.get(
                        constants.EXECUTION_ERROR_MSG_KEY)
                    error_msg = data_types_utils.get_metadata_value(
                        error_msg_value) if error_msg_value else ''
                    return [
                        self._abort_task(
                            f'node failed; node uid: {node_uid}; error: {error_msg}'
                        )
                    ]

                # Finally, we are ready to generate an ExecNodeTask for the node.
                task = self._maybe_generate_task(node, node_executions,
                                                 successful_node_ids)
                if task:
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)

            layer_node_ids = set(node.node_info.id for node in layer_nodes)
            successful_layer_node_ids = layer_node_ids & successful_node_ids
            self._update_successful_nodes_cache(successful_layer_node_ids)

            # If all nodes in the final layer completed successfully, the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if (layer_num == len(layers) - 1
                    and successful_layer_node_ids == layer_node_ids):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
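The `_in_successful_nodes_cache` / `_update_successful_nodes_cache` pair used above is not shown in this collection. A minimal in-memory sketch is below; the `_successful_node_uids` set attribute is an assumption, and the real cache may well be shared across generator instances.

def _update_successful_nodes_cache(self, node_ids):
  """Records ids of nodes of this pipeline that completed successfully."""
  for node_id in node_ids:
    node_uid = task_lib.NodeUid(
        pipeline_uid=self._pipeline_uid, node_id=node_id)
    self._successful_node_uids.add(node_uid)  # Assumed set attribute.

def _in_successful_nodes_cache(self, node_uid) -> bool:
  return node_uid in self._successful_node_uids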
Example #23
    def _generate_tasks_for_node(
            self, metadata_handler: metadata.Metadata,
            node: pipeline_pb2.PipelineNode) -> List[task_lib.Task]:
        """Generates a node execution task.

    If a node execution is not feasible, `None` is returned.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      Returns a `Task` or `None` if task generation is deemed infeasible.
    """
        result = []
        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)

        executions = task_gen_utils.get_executions(metadata_handler, node)
        exec_node_task = task_gen_utils.generate_task_from_active_execution(
            metadata_handler, self._pipeline, node, executions)
        if exec_node_task:
            result.append(
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.RUNNING))
            result.append(exec_node_task)
            return result

        resolved_info = task_gen_utils.generate_resolved_info(
            metadata_handler, node)
        # TODO(b/207038460): Update async pipeline to support ForEach.
        if (resolved_info is None or not resolved_info.input_artifacts
                or resolved_info.input_artifacts[0] is None
                or not any(resolved_info.input_artifacts[0].values())):
            logging.info(
                'Task cannot be generated for node %s since no input artifacts '
                'are resolved.', node.node_info.id)
            return result
        input_artifact = resolved_info.input_artifacts[0]

        executor_spec_fingerprint = hashlib.sha256()
        executor_spec = task_gen_utils.get_executor_spec(
            self._pipeline_state.pipeline, node.node_info.id)
        if executor_spec is not None:
            executor_spec_fingerprint.update(
                executor_spec.SerializeToString(deterministic=True))
        resolved_info.exec_properties[constants.EXECUTOR_SPEC_FINGERPRINT_KEY] = (
            executor_spec_fingerprint.hexdigest())

        # If the latest execution had the same resolved input artifacts, execution
        # properties and executor specs, we should not trigger a new execution.
        latest_exec = task_gen_utils.get_latest_execution(executions)
        if latest_exec:
            artifact_ids_by_event_type = (
                execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                    metadata_handler, latest_exec.id))
            latest_exec_input_artifact_ids = artifact_ids_by_event_type.get(
                metadata_store_pb2.Event.INPUT, set())
            current_exec_input_artifact_ids = set(
                a.id for a in itertools.chain(*input_artifact.values()))
            latest_exec_properties = task_gen_utils.extract_properties(
                latest_exec)
            current_exec_properties = resolved_info.exec_properties
            latest_exec_executor_spec_fp = latest_exec_properties[
                constants.EXECUTOR_SPEC_FINGERPRINT_KEY]
            current_exec_executor_spec_fp = resolved_info.exec_properties[
                constants.EXECUTOR_SPEC_FINGERPRINT_KEY]
            if (latest_exec_input_artifact_ids
                    == current_exec_input_artifact_ids
                    and _exec_properties_match(latest_exec_properties,
                                               current_exec_properties)
                    and latest_exec_executor_spec_fp
                    == current_exec_executor_spec_fp):
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid, state=pstate.NodeState.STARTED))
                return result

        execution = execution_publish_utils.register_execution(
            metadata_handler=metadata_handler,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=input_artifact,
            exec_properties=resolved_info.exec_properties)
        outputs_resolver = outputs_utils.OutputsResolver(
            node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
            self._pipeline.execution_mode)

        # For mixed service nodes, we ensure node services and check service
        # status; the node is aborted if its service jobs have failed.
        service_status = self._ensure_node_services_if_mixed(node.node_info.id)
        if service_status is not None:
            if service_status != service_jobs.ServiceStatus.RUNNING:
                error_msg = f'associated service job failed; node uid: {node_uid}'
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid,
                        state=pstate.NodeState.FAILED,
                        status=status_lib.Status(code=status_lib.Code.ABORTED,
                                                 message=error_msg)))
                return result

        output_artifacts = outputs_resolver.generate_output_artifacts(
            execution.id)
        outputs_utils.make_output_dirs(output_artifacts)
        result.append(
            task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                         state=pstate.NodeState.RUNNING))
        result.append(
            task_lib.ExecNodeTask(
                node_uid=node_uid,
                execution_id=execution.id,
                contexts=resolved_info.contexts,
                input_artifacts=input_artifact,
                exec_properties=resolved_info.exec_properties,
                output_artifacts=output_artifacts,
                executor_output_uri=outputs_resolver.get_executor_output_uri(
                    execution.id),
                stateful_working_dir=outputs_resolver.
                get_stateful_working_directory(execution.id),
                tmp_dir=outputs_resolver.make_tmp_dir(execution.id),
                pipeline=self._pipeline))
        return result
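`_exec_properties_match` is called above but not shown. One plausible implementation compares the two property dicts while ignoring the executor-spec fingerprint, since the caller already compares that key separately; treating the fingerprint as the only ignored key is an assumption.

def _exec_properties_match(latest_exec_properties, current_exec_properties):
  """Sketch: True if exec properties match, ignoring the fingerprint key."""
  ignored_keys = {constants.EXECUTOR_SPEC_FINGERPRINT_KEY}
  latest = {k: v for k, v in latest_exec_properties.items()
            if k not in ignored_keys}
  current = {k: v for k, v in current_exec_properties.items()
             if k not in ignored_keys}
  return latest == current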