Code example #1
    def _generate_task(
            self, node: pipeline_pb2.PipelineNode) -> Optional[task_lib.Task]:
        """Generates a node execution task.

        If node execution is not feasible, `None` is returned.

        Args:
          node: The pipeline node for which to generate a task.

        Returns:
          Returns an `ExecNodeTask` if node can be executed. If an error occurs,
          a `FinalizePipelineTask` is returned to abort the pipeline execution.
        """
        executions = task_gen_utils.get_executions(self._mlmd_handle, node)
        result = task_gen_utils.generate_task_from_active_execution(
            self._mlmd_handle, self._pipeline, node, executions)
        if result:
            return result

        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
        resolved_info = task_gen_utils.generate_resolved_info(
            self._mlmd_handle, node)
        if resolved_info.input_artifacts is None:
            return task_lib.FinalizePipelineTask(
                pipeline_uid=self._pipeline_state.pipeline_uid,
                status=status_lib.Status(
                    code=status_lib.Code.ABORTED,
                    message=
                    (f'Aborting pipeline execution due to failure to resolve '
                     f'inputs; problematic node uid: {node_uid}')))

        execution = execution_publish_utils.register_execution(
            metadata_handler=self._mlmd_handle,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties)
        outputs_resolver = outputs_utils.OutputsResolver(
            node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
            self._pipeline.execution_mode)
        return task_lib.ExecNodeTask(
            node_uid=node_uid,
            execution=execution,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties,
            output_artifacts=outputs_resolver.generate_output_artifacts(
                execution.id),
            executor_output_uri=outputs_resolver.get_executor_output_uri(
                execution.id),
            stateful_working_dir=outputs_resolver.
            get_stateful_working_directory(execution.id),
            pipeline=self._pipeline)
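For context, a caller is expected to dispatch on the type of the returned task. Below is a minimal hypothetical sketch, not taken from the TFX source: `task_gen` and `handle_finalize` are assumed names for illustration, while `task_lib.is_exec_node_task` and `task_queue.enqueue` appear in the other examples on this page.

task = task_gen._generate_task(node)
if task_lib.is_exec_node_task(task):
  task_queue.enqueue(task)  # Handed off to the task manager for scheduling.
else:
  # A FinalizePipelineTask signals the orchestrator to abort the pipeline.
  handle_finalize(task)  # Hypothetical handler.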
Code example #2
  def _abort_node_task(
      self, node_uid: task_lib.NodeUid) -> task_lib.FinalizeNodeTask:
    """Returns task to abort the node execution."""
    logging.error(
        'Required service node not running or healthy, node uid: %s',
        node_uid)
    return task_lib.FinalizeNodeTask(
        node_uid=node_uid,
        status=status_lib.Status(
            code=status_lib.Code.ABORTED,
            message=(f'Aborting node execution as the associated service '
                     f'job is not running or healthy; problematic node '
                     f'uid: {node_uid}')))
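A hypothetical call site for `_abort_node_task`, assumed to mirror the inlined pattern in code example #8 (the fragment belongs inside the task-generation loop over nodes):

if service_status != service_jobs.ServiceStatus.RUNNING:
  # Service job is unhealthy; emit a FinalizeNodeTask instead of an
  # ExecNodeTask for this node.
  result.append(self._abort_node_task(node_uid))
  continue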
Code example #3
File: noop_task_scheduler.py Project: vikrosj/tfx
  def schedule(self) -> ts.TaskSchedulerResult:
    task = typing.cast(task_lib.ExecNodeTask, self.task)
    logging.info('Processing ExecNodeTask: %s', task)
    executor_output = execution_result_pb2.ExecutorOutput()
    executor_output.execution_result.code = status_lib.Code.OK
    for key, artifacts in task.output_artifacts.items():
      for artifact in artifacts:
        executor_output.output_artifacts[key].artifacts.add().CopyFrom(
            artifact.mlmd_artifact)
    result = ts.TaskSchedulerResult(
        status=status_lib.Status(code=status_lib.Code.OK),
        executor_output=executor_output)
    logging.info('Result: %s', result)
    return result
Code example #4
  def test_stop_initiation(self):
    with self._mlmd_connection as m:
      pipeline = _test_pipeline('pipeline1')
      with pstate.PipelineState.new(m, pipeline) as pipeline_state:
        self.assertIsNone(pipeline_state.stop_initiated_reason())
        status = status_lib.Status(
            code=status_lib.Code.CANCELLED, message='foo bar')
        pipeline_state.initiate_stop(status)
        self.assertEqual(status, pipeline_state.stop_initiated_reason())

      # Reload from MLMD and verify.
      with pstate.PipelineState.load(
          m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
        self.assertEqual(status, pipeline_state.stop_initiated_reason())
Code example #5
File: pipeline_ops.py Project: lahlfors/tfx
def stop_node(
    mlmd_handle: metadata.Metadata,
    node_uid: task_lib.NodeUid,
    timeout_secs: float = DEFAULT_WAIT_FOR_INACTIVATION_TIMEOUT_SECS) -> None:
  """Stops a node in an async pipeline.

  Initiates a node stop operation and waits for the node execution to become
  inactive.

  Args:
    mlmd_handle: A handle to the MLMD db.
    node_uid: Uid of the node to be stopped.
    timeout_secs: Amount of time in seconds to wait for node to stop.

  Raises:
    status_lib.StatusNotOkError: Failure to stop the node.
  """
  with _PIPELINE_OPS_LOCK:
    with pstate.PipelineState.load(mlmd_handle,
                                   node_uid.pipeline_uid) as pipeline_state:
      nodes = pstate.get_all_pipeline_nodes(pipeline_state.pipeline)
      filtered_nodes = [n for n in nodes if n.node_info.id == node_uid.node_id]
      if len(filtered_nodes) != 1:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.INTERNAL,
            message=(
                f'`stop_node` operation failed, unable to find node to stop: '
                f'{node_uid}'))
      node = filtered_nodes[0]
      pipeline_state.initiate_node_stop(
          node_uid,
          status_lib.Status(
              code=status_lib.Code.CANCELLED,
              message='Cancellation requested by client.'))

    executions = task_gen_utils.get_executions(mlmd_handle, node)
    active_executions = [
        e for e in executions if execution_lib.is_execution_active(e)
    ]
    if not active_executions:
      # If there are no active executions, we're done.
      return
    if len(active_executions) > 1:
      raise status_lib.StatusNotOkError(
          code=status_lib.Code.INTERNAL,
          message=(
              f'Unexpected multiple active executions for node: {node_uid}'))
  _wait_for_inactivation(
      mlmd_handle, active_executions[0], timeout_secs=timeout_secs)
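`_wait_for_inactivation` is referenced above but not shown on this page. Below is a minimal sketch of such a polling helper, assuming the real one behaves similarly; the `DEADLINE_EXCEEDED` code and the poll interval are assumptions, while `execution_lib.is_execution_active` and `store.get_executions_by_id` appear in the other examples.

import time

def _wait_for_inactivation(mlmd_handle, execution, timeout_secs, poll_secs=1.0):
  """Polls MLMD until the given execution becomes inactive or timeout expires."""
  deadline = time.time() + timeout_secs
  while time.time() < deadline:
    # Re-read the execution from MLMD to observe state changes.
    [execution] = mlmd_handle.store.get_executions_by_id([execution.id])
    if not execution_lib.is_execution_active(execution):
      return
    time.sleep(poll_secs)
  raise status_lib.StatusNotOkError(
      code=status_lib.Code.DEADLINE_EXCEEDED,  # Assumed error code.
      message=f'Timed out waiting for execution {execution.id} to become inactive.')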
Code example #6
File: task_manager_test.py Project: vikrosj/tfx
  def test_scheduler_failure(self):
    # Register a fake task scheduler that returns a failure status.
    self._register_task_scheduler(
        ts.TaskSchedulerResult(
            status=status_lib.Status(code=status_lib.Code.ABORTED),
            executor_output=None))
    task_manager = self._run_task_manager()
    self.assertTrue(task_manager.done())
    self.assertIsNone(task_manager.exception())

    # Check that the task was processed and MLMD execution marked failed.
    self.assertTrue(self._task_queue.is_empty())
    execution = self._get_execution()
    self.assertEqual(metadata_store_pb2.Execution.FAILED,
                     execution.last_known_state)
Code example #7
File: task_manager_test.py Project: vikrosj/tfx
  def test_successful_execution(self):
    # Register a fake task scheduler that returns a successful execution result
    # and `OK` task scheduler status.
    self._register_task_scheduler(
        ts.TaskSchedulerResult(
            status=status_lib.Status(code=status_lib.Code.OK),
            executor_output=_make_executor_output(self._task, code=0)))
    task_manager = self._run_task_manager()
    self.assertTrue(task_manager.done())
    self.assertIsNone(task_manager.exception())

    # Check that the task was processed and MLMD execution marked successful.
    self.assertTrue(self._task_queue.is_empty())
    execution = self._get_execution()
    self.assertEqual(metadata_store_pb2.Execution.COMPLETE,
                     execution.last_known_state)
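The `_make_executor_output` test helper is not shown on this page. Below is a plausible sketch, assumed to mirror the `ExecutorOutput` construction in code example #3; the `msg` parameter is an assumption.

def _make_executor_output(task, code=0, msg=''):
  """Builds an ExecutorOutput proto covering all output artifacts of `task`."""
  executor_output = execution_result_pb2.ExecutorOutput()
  executor_output.execution_result.code = code
  executor_output.execution_result.result_message = msg
  for key, artifacts in task.output_artifacts.items():
    for artifact in artifacts:
      executor_output.output_artifacts[key].artifacts.add().CopyFrom(
          artifact.mlmd_artifact)
  return executor_output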
Code example #8
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for all executable nodes in the async pipeline.

        The returned tasks must have `exec_task` populated. List may be empty
        if no nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        result = []
        for node in [n.pipeline_node for n in self._pipeline.nodes]:
            node_uid = task_lib.NodeUid.from_pipeline_node(
                self._pipeline, node)
            node_id = node.node_info.id
            if node_id in self._ignore_node_ids:
                logging.info('Ignoring node for task generation: %s', node_uid)
                continue

            if self._service_job_manager.is_pure_service_node(
                    self._pipeline_state, node_id):
                service_status = self._service_job_manager.ensure_node_services(
                    self._pipeline_state, node_id)
                if service_status != service_jobs.ServiceStatus.RUNNING:
                    logging.error(
                        'Required service node not running or healthy, node uid: %s',
                        node_uid)
                    result.append(
                        task_lib.FinalizeNodeTask(
                            node_uid=node_uid,
                            status=status_lib.Status(
                                code=status_lib.Code.ABORTED,
                                message=
                                (f'Aborting node execution as the associated service '
                                 f'job is not running or healthy; problematic node '
                                 f'uid: {node_uid}'))))
                continue

            # If a task for the node is already tracked by the task queue, it need
            # not be considered for generation again.
            if self._is_task_id_tracked_fn(
                    task_lib.exec_node_task_id_from_pipeline_node(
                        self._pipeline, node)):
                continue
            task = self._generate_task(self._mlmd_handle, node)
            if task:
                result.append(task)
        return result
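A sketch of how the generated tasks might be consumed, assuming `_is_task_id_tracked_fn` is bound to the task queue's tracking check; the `AsyncPipelineTaskGenerator` class name and the `contains_task_id` method name are assumptions, not confirmed by the code on this page.

# Hypothetical wiring sketch; names are assumed.
generator = AsyncPipelineTaskGenerator(
    mlmd_handle, pipeline_state, task_queue.contains_task_id,
    service_job_manager)
for task in generator.generate():
  task_queue.enqueue(task)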
Code example #9
    def test_scheduler_failure(self):
        # Register a fake task scheduler that returns a failure status.
        self._register_task_scheduler(
            ts.TaskSchedulerResult(status=status_lib.Status(
                code=status_lib.Code.ABORTED, message='foobar error'),
                                   executor_output=None))
        task_manager = self._run_task_manager()
        self.assertTrue(task_manager.done())
        self.assertIsNone(task_manager.exception())

        # Check that the task was processed and MLMD execution marked failed.
        self.assertTrue(self._task_queue.is_empty())
        execution = self._get_execution()
        self.assertEqual(metadata_store_pb2.Execution.FAILED,
                         execution.last_known_state)
        self.assertEqual(
            'foobar error',
            data_types_utils.get_metadata_value(execution.custom_properties[
                constants.EXECUTION_ERROR_MSG_KEY]))
Code example #10
  def node_stop_initiated_reason(
      self, node_uid: task_lib.NodeUid) -> Optional[status_lib.Status]:
    """Returns status object if node stop initiated, `None` otherwise."""
    if node_uid.pipeline_uid != self.pipeline_uid:
      raise RuntimeError(
          f'Node given by uid {node_uid} does not belong to pipeline given '
          f'by uid {self.pipeline_uid}')
    custom_properties = self.execution.custom_properties
    if _get_metadata_value(
        custom_properties.get(_node_stop_initiated_property(node_uid))) == 1:
      code = _get_metadata_value(
          custom_properties.get(_node_status_code_property(node_uid)))
      if code is None:
        code = status_lib.Code.UNKNOWN
      message = _get_metadata_value(
          custom_properties.get(_node_status_msg_property(node_uid)))
      return status_lib.Status(code=code, message=message)
    else:
      return None
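The `_node_stop_initiated_property`, `_node_status_code_property` and `_node_status_msg_property` helpers are not shown here. Below is a hypothetical sketch of their shape, inferred only from how the properties are read above; the exact key format is an assumption.

def _node_stop_initiated_property(node_uid):
  return f'node_stop_initiated_{node_uid.node_id}'  # Assumed key format.

def _node_status_code_property(node_uid):
  return f'node_status_code_{node_uid.node_id}'  # Assumed key format.

def _node_status_msg_property(node_uid):
  return f'node_status_msg_{node_uid.node_id}'  # Assumed key format.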
Code example #11
File: task_manager.py Project: hamzamaiot/tfx
  def _process_exec_node_task(self, scheduler: ts.TaskScheduler,
                              task: task_lib.ExecNodeTask) -> None:
    """Processes an `ExecNodeTask` using the given task scheduler."""
    # This is a blocking call to the scheduler which can take a long time to
    # complete for some types of task schedulers. The scheduler is expected to
    # handle any internal errors gracefully and return the result with an
    # error status. But in case the scheduler raises an exception, it is
    # considered a failed execution and MLMD is updated accordingly.
    try:
      result = scheduler.schedule()
    except Exception as e:  # pylint: disable=broad-except
      logging.info(
          'Exception raised by task scheduler for node uid %s; error: %s',
          task.node_uid, e)
      result = ts.TaskSchedulerResult(
          status=status_lib.Status(
              code=status_lib.Code.ABORTED, message=str(e)))
    _publish_execution_results(
        mlmd_handle=self._mlmd_handle, task=task, result=result)
    with self._tm_lock:
      del self._scheduler_by_node_uid[task.node_uid]
      self._task_queue.task_done(task)
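`_publish_execution_results`, called above, is shown in code examples #14 and #15 below.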
Code example #12
File: pipeline_ops_test.py Project: konny0311/tfx
  def test_stop_pipeline_non_existent_or_inactive(self, pipeline):
    with self._mlmd_connection as m:
      # Stop pipeline without creating one.
      with self.assertRaises(status_lib.StatusNotOkError) as exception_context:
        pipeline_ops.stop_pipeline(m,
                                   task_lib.PipelineUid.from_pipeline(pipeline))
      self.assertEqual(status_lib.Code.NOT_FOUND,
                       exception_context.exception.code)

      # Initiate pipeline start and mark it completed.
      execution = pipeline_ops.initiate_pipeline_start(m, pipeline).execution
      pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
      with pstate.PipelineState.load(m, pipeline_uid) as pipeline_state:
        pipeline_state.initiate_stop(status_lib.Status(code=status_lib.Code.OK))
      execution.last_known_state = metadata_store_pb2.Execution.COMPLETE
      m.store.put_executions([execution])

      # Try to initiate stop again.
      with self.assertRaises(status_lib.StatusNotOkError) as exception_context:
        pipeline_ops.stop_pipeline(m, pipeline_uid)
      self.assertEqual(status_lib.Code.NOT_FOUND,
                       exception_context.exception.code)
Code example #13
File: pipeline_ops_test.py Project: lahlfors/tfx
    def test_handling_finalize_node_task(self, task_gen):
        with self._mlmd_connection as m:
            pipeline = _test_pipeline('pipeline1')
            pipeline.nodes.add().pipeline_node.node_info.id = 'Transform'
            pipeline.nodes.add().pipeline_node.node_info.id = 'Trainer'
            pipeline_ops.initiate_pipeline_start(m, pipeline)
            pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
            finalize_reason = status_lib.Status(code=status_lib.Code.ABORTED,
                                                message='foo bar')
            task_gen.return_value.generate.side_effect = [
                [
                    test_utils.create_exec_node_task(
                        task_lib.NodeUid(pipeline_uid=pipeline_uid,
                                         node_id='Transform')),
                    task_lib.FinalizeNodeTask(node_uid=task_lib.NodeUid(
                        pipeline_uid=pipeline_uid, node_id='Trainer'),
                                              status=finalize_reason)
                ],
            ]

            task_queue = tq.TaskQueue()
            pipeline_ops.orchestrate(m, task_queue,
                                     service_jobs.DummyServiceJobManager())
            task_gen.return_value.generate.assert_called_once()
            task = task_queue.dequeue()
            task_queue.task_done(task)
            self.assertTrue(task_lib.is_exec_node_task(task))
            self.assertEqual(
                test_utils.create_node_uid('pipeline1', 'Transform'),
                task.node_uid)

            # Load pipeline state and verify node stop initiation.
            with pstate.PipelineState.load(m, pipeline_uid) as pipeline_state:
                self.assertEqual(
                    finalize_reason,
                    pipeline_state.node_stop_initiated_reason(
                        task_lib.NodeUid(pipeline_uid=pipeline_uid,
                                         node_id='Trainer')))
Code example #14
File: task_manager.py Project: jeongukjae/tfx
def _publish_execution_results(mlmd_handle: metadata.Metadata,
                               task: task_lib.ExecNodeTask,
                               result: ts.TaskSchedulerResult) -> None:
    """Publishes execution results to MLMD."""
    def _update_state(status: status_lib.Status) -> None:
        assert status.code != status_lib.Code.OK
        if status.code == status_lib.Code.CANCELLED:
            logging.info(
                'Cancelling execution (id: %s); task id: %s; status: %s',
                task.execution.id, task.task_id, status)
            execution_state = metadata_store_pb2.Execution.CANCELED
        else:
            logging.info(
                'Aborting execution (id: %s) due to error (code: %s); task id: %s',
                task.execution.id, status.code, task.task_id)
            execution_state = metadata_store_pb2.Execution.FAILED
        _update_execution_state_in_mlmd(mlmd_handle, task.execution,
                                        execution_state, status.message)

    if result.status.code != status_lib.Code.OK:
        _update_state(result.status)
        return

    publish_params = dict(output_artifacts=task.output_artifacts)
    if result.output_artifacts is not None:
        publish_params['output_artifacts'] = result.output_artifacts
    elif result.executor_output is not None:
        if result.executor_output.execution_result.code != status_lib.Code.OK:
            _update_state(
                status_lib.Status(
                    code=result.executor_output.execution_result.code,
                    message=result.executor_output.execution_result.
                    result_message))
            return
        publish_params['executor_output'] = result.executor_output

    execution_publish_utils.publish_succeeded_execution(
        mlmd_handle, task.execution.id, task.contexts, **publish_params)
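`_update_execution_state_in_mlmd` is referenced in both versions of `_publish_execution_results` but not shown on this page. Below is a minimal sketch, assuming it copies the execution, updates `last_known_state` and writes it back; the `set_metadata_value` helper is an assumption, while the error-message property key appears in code example #9. The optional `error_msg` parameter makes the sketch compatible with both the 3-argument call in example #15 and the 4-argument call in example #14.

def _update_execution_state_in_mlmd(mlmd_handle, execution, new_state,
                                    error_msg=''):
  """Writes the new execution state (and optional error message) to MLMD."""
  updated_execution = metadata_store_pb2.Execution()
  updated_execution.CopyFrom(execution)
  updated_execution.last_known_state = new_state
  if error_msg:
    data_types_utils.set_metadata_value(  # Assumed setter counterpart.
        updated_execution.custom_properties[constants.EXECUTION_ERROR_MSG_KEY],
        error_msg)
  mlmd_handle.store.put_executions([updated_execution])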
Code example #15
def _publish_execution_results(mlmd_handle: metadata.Metadata,
                               task: task_lib.ExecNodeTask,
                               result: ts.TaskSchedulerResult) -> None:
  """Publishes execution results to MLMD."""

  def _update_state(status: status_lib.Status) -> None:
    assert status.code != status_lib.Code.OK
    if status.code == status_lib.Code.CANCELLED:
      execution_state = metadata_store_pb2.Execution.CANCELED
      state_msg = 'cancelled'
    else:
      execution_state = metadata_store_pb2.Execution.FAILED
      state_msg = 'failed'
    logging.info(
        'Got error (status: %s) for task id: %s; marking execution (id: %s) '
        'as %s.', status, task.task_id, task.execution.id, state_msg)
    # TODO(goutham): Also record error code and error message as custom property
    # of the execution.
    _update_execution_state_in_mlmd(mlmd_handle, task.execution,
                                    execution_state)

  if result.status.code != status_lib.Code.OK:
    _update_state(result.status)
    return

  if (result.executor_output and
      result.executor_output.execution_result.code != status_lib.Code.OK):
    _update_state(status_lib.Status(
        code=result.executor_output.execution_result.code,
        message=result.executor_output.execution_result.result_message))
    return

  execution_publish_utils.publish_succeeded_execution(mlmd_handle,
                                                      task.execution.id,
                                                      task.contexts,
                                                      task.output_artifacts,
                                                      result.executor_output)
Code example #16
File: pipeline_ops_test.py Project: konny0311/tfx
  def test_handling_finalize_pipeline_task(self, task_gen):
    with self._mlmd_connection as m:
      pipeline = _test_pipeline('pipeline1', pipeline_pb2.Pipeline.SYNC)
      pipeline_ops.initiate_pipeline_start(m, pipeline)
      pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
      finalize_reason = status_lib.Status(
          code=status_lib.Code.ABORTED, message='foo bar')
      task_gen.return_value.generate.side_effect = [
          [
              task_lib.FinalizePipelineTask(
                  pipeline_uid=pipeline_uid, status=finalize_reason)
          ],
      ]

      task_queue = tq.TaskQueue()
      pipeline_ops.orchestrate(m, task_queue,
                               service_jobs.DummyServiceJobManager())
      task_gen.return_value.generate.assert_called_once()
      self.assertTrue(task_queue.is_empty())

      # Load pipeline state and verify stop initiation.
      with pstate.PipelineState.load(m, pipeline_uid) as pipeline_state:
        self.assertEqual(finalize_reason,
                         pipeline_state.stop_initiated_reason())
Code example #17
  def test_initiate_node_start_stop(self):
    with self._mlmd_connection as m:
      pipeline = _test_pipeline('pipeline1')
      node_uid = task_lib.NodeUid(
          node_id='Trainer',
          pipeline_uid=task_lib.PipelineUid.from_pipeline(pipeline))
      with pstate.PipelineState.new(m, pipeline) as pipeline_state:
        pipeline_state.initiate_node_start(node_uid)
        self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))

      # Reload from MLMD and verify node is started.
      with pstate.PipelineState.load(
          m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
        self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))

        # Stop the node.
        status = status_lib.Status(
            code=status_lib.Code.ABORTED, message='foo bar')
        pipeline_state.initiate_node_stop(node_uid, status)
        self.assertEqual(status,
                         pipeline_state.node_stop_initiated_reason(node_uid))

      # Reload from MLMD and verify node is stopped.
      with pstate.PipelineState.load(
          m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
        self.assertEqual(status,
                         pipeline_state.node_stop_initiated_reason(node_uid))

        # Restart node.
        pipeline_state.initiate_node_start(node_uid)
        self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))

      # Reload from MLMD and verify node is started.
      with pstate.PipelineState.load(
          m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
        self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))
Code example #18
File: pipeline_ops_test.py Project: konny0311/tfx
  def test_active_pipelines_with_stop_initiated_nodes(self,
                                                      mock_gen_task_from_active,
                                                      mock_async_task_gen):
    with self._mlmd_connection as m:
      pipeline = _test_pipeline('pipeline')
      pipeline.nodes.add().pipeline_node.node_info.id = 'ExampleGen'
      pipeline.nodes.add().pipeline_node.node_info.id = 'Transform'
      pipeline.nodes.add().pipeline_node.node_info.id = 'Trainer'
      pipeline.nodes.add().pipeline_node.node_info.id = 'Evaluator'

      mock_service_job_manager = mock.create_autospec(
          service_jobs.ServiceJobManager, instance=True)
      mock_service_job_manager.is_pure_service_node.side_effect = (
          lambda _, node_id: node_id == 'ExampleGen')
      example_gen_node_uid = task_lib.NodeUid.from_pipeline_node(
          pipeline, pipeline.nodes[0].pipeline_node)

      transform_node_uid = task_lib.NodeUid.from_pipeline_node(
          pipeline, pipeline.nodes[1].pipeline_node)
      transform_task = test_utils.create_exec_node_task(
          node_uid=transform_node_uid)

      trainer_node_uid = task_lib.NodeUid.from_pipeline_node(
          pipeline, pipeline.nodes[2].pipeline_node)
      trainer_task = test_utils.create_exec_node_task(node_uid=trainer_node_uid)

      evaluator_node_uid = task_lib.NodeUid.from_pipeline_node(
          pipeline, pipeline.nodes[3].pipeline_node)
      cancelled_evaluator_task = test_utils.create_exec_node_task(
          node_uid=evaluator_node_uid, is_cancelled=True)

      pipeline_ops.initiate_pipeline_start(m, pipeline)
      with pstate.PipelineState.load(
          m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
        # Stop example-gen, trainer and evaluator.
        pipeline_state.initiate_node_stop(
            example_gen_node_uid,
            status_lib.Status(code=status_lib.Code.CANCELLED))
        pipeline_state.initiate_node_stop(
            trainer_node_uid, status_lib.Status(code=status_lib.Code.CANCELLED))
        pipeline_state.initiate_node_stop(
            evaluator_node_uid, status_lib.Status(code=status_lib.Code.ABORTED))

      task_queue = tq.TaskQueue()

      # Simulate a new transform execution being triggered.
      mock_async_task_gen.return_value.generate.return_value = [transform_task]
      # Simulate ExecNodeTask for trainer already present in the task queue.
      task_queue.enqueue(trainer_task)
      # Simulate Evaluator having an active execution in MLMD.
      mock_gen_task_from_active.side_effect = [cancelled_evaluator_task]

      pipeline_ops.orchestrate(m, task_queue, mock_service_job_manager)
      self.assertEqual(1, mock_async_task_gen.return_value.generate.call_count)

      # stop_node_services should be called on example-gen which is a pure
      # service node.
      mock_service_job_manager.stop_node_services.assert_called_once_with(
          mock.ANY, 'ExampleGen')

      # Verify that tasks are enqueued in the expected order:

      # Pre-existing trainer task.
      task = task_queue.dequeue()
      task_queue.task_done(task)
      self.assertEqual(trainer_task, task)

      # CancelNodeTask for trainer.
      task = task_queue.dequeue()
      task_queue.task_done(task)
      self.assertTrue(task_lib.is_cancel_node_task(task))
      self.assertEqual(trainer_node_uid, task.node_uid)

      # ExecNodeTask with is_cancelled=True for evaluator.
      task = task_queue.dequeue()
      task_queue.task_done(task)
      self.assertEqual(cancelled_evaluator_task, task)

      # ExecNodeTask for newly triggered transform node.
      task = task_queue.dequeue()
      task_queue.task_done(task)
      self.assertEqual(transform_task, task)

      # No more tasks.
      self.assertTrue(task_queue.is_empty())
Code example #19
File: pipeline_ops_test.py Project: konny0311/tfx
  def test_stop_initiated_pipelines(self, pipeline, mock_gen_task_from_active,
                                    mock_async_task_gen, mock_sync_task_gen):
    with self._mlmd_connection as m:
      pipeline.nodes.add().pipeline_node.node_info.id = 'ExampleGen'
      pipeline.nodes.add().pipeline_node.node_info.id = 'Transform'
      pipeline.nodes.add().pipeline_node.node_info.id = 'Trainer'
      pipeline.nodes.add().pipeline_node.node_info.id = 'Evaluator'

      mock_service_job_manager = mock.create_autospec(
          service_jobs.ServiceJobManager, instance=True)
      mock_service_job_manager.is_pure_service_node.side_effect = (
          lambda _, node_id: node_id == 'ExampleGen')
      mock_service_job_manager.is_mixed_service_node.side_effect = (
          lambda _, node_id: node_id == 'Transform')

      pipeline_ops.initiate_pipeline_start(m, pipeline)
      with pstate.PipelineState.load(
          m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
        pipeline_state.initiate_stop(
            status_lib.Status(code=status_lib.Code.CANCELLED))
      pipeline_execution = pipeline_state.execution

      task_queue = tq.TaskQueue()

      # For the stop-initiated pipeline, "Transform" execution task is in queue,
      # "Trainer" has an active execution in MLMD but no task in queue,
      # "Evaluator" has no active execution.
      task_queue.enqueue(
          test_utils.create_exec_node_task(
              task_lib.NodeUid(
                  pipeline_uid=task_lib.PipelineUid.from_pipeline(pipeline),
                  node_id='Transform')))
      transform_task = task_queue.dequeue()  # simulates task being processed
      mock_gen_task_from_active.side_effect = [
          test_utils.create_exec_node_task(
              node_uid=task_lib.NodeUid(
                  pipeline_uid=task_lib.PipelineUid.from_pipeline(pipeline),
                  node_id='Trainer'),
              is_cancelled=True), None, None, None, None
      ]

      pipeline_ops.orchestrate(m, task_queue, mock_service_job_manager)

      # There are no active pipelines so these shouldn't be called.
      mock_async_task_gen.assert_not_called()
      mock_sync_task_gen.assert_not_called()

      # stop_node_services should be called for ExampleGen which is a pure
      # service node.
      mock_service_job_manager.stop_node_services.assert_called_once_with(
          mock.ANY, 'ExampleGen')
      mock_service_job_manager.reset_mock()

      task_queue.task_done(transform_task)  # Pop out transform task.

      # CancelNodeTask for the "Transform" ExecNodeTask should be next.
      task = task_queue.dequeue()
      task_queue.task_done(task)
      self.assertTrue(task_lib.is_cancel_node_task(task))
      self.assertEqual('Transform', task.node_uid.node_id)

      # ExecNodeTask (with is_cancelled=True) for "Trainer" is next.
      task = task_queue.dequeue()
      task_queue.task_done(task)
      self.assertTrue(task_lib.is_exec_node_task(task))
      self.assertEqual('Trainer', task.node_uid.node_id)
      self.assertTrue(task.is_cancelled)

      self.assertTrue(task_queue.is_empty())

      mock_gen_task_from_active.assert_has_calls([
          mock.call(
              m,
              pipeline_state.pipeline,
              pipeline.nodes[2].pipeline_node,
              mock.ANY,
              is_cancelled=True),
          mock.call(
              m,
              pipeline_state.pipeline,
              pipeline.nodes[3].pipeline_node,
              mock.ANY,
              is_cancelled=True)
      ])
      self.assertEqual(2, mock_gen_task_from_active.call_count)

      # Pipeline execution should continue to be active since active node
      # executions were found in the last call to `orchestrate`.
      [execution] = m.store.get_executions_by_id([pipeline_execution.id])
      self.assertTrue(execution_lib.is_execution_active(execution))

      # Call `orchestrate` again; this time there are no more active node
      # executions so the pipeline should be marked as cancelled.
      pipeline_ops.orchestrate(m, task_queue, mock_service_job_manager)
      self.assertTrue(task_queue.is_empty())
      [execution] = m.store.get_executions_by_id([pipeline_execution.id])
      self.assertEqual(metadata_store_pb2.Execution.CANCELED,
                       execution.last_known_state)

      # stop_node_services should be called on both ExampleGen and Transform
      # which are service nodes.
      mock_service_job_manager.stop_node_services.assert_has_calls(
          [mock.call(mock.ANY, 'ExampleGen'),
           mock.call(mock.ANY, 'Transform')],
          any_order=True)
Code example #20
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

        The returned tasks must have `exec_task` populated. List may be empty
        if no nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        successful_node_ids = set()
        for layer_num, layer_nodes in enumerate(layers):
            for node in layer_nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id

                if self._in_successful_nodes_cache(node_uid):
                    successful_node_ids.add(node_id)
                    continue

                if not self._upstream_nodes_successful(node,
                                                       successful_node_ids):
                    continue

                # If this is a pure service node, there is no ExecNodeTask to generate
                # but we ensure node services and check service status.
                service_status = self._ensure_node_services_if_pure(node_id)
                if service_status is not None:
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'service job failed; node uid: {node_uid}')
                        ]
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node successful: %s', node_uid)
                        successful_node_ids.add(node_id)
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again but we ensure node services
                # in case of a mixed service node.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    service_status = self._ensure_node_services_if_mixed(
                        node_id)
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'associated service job failed; node uid: {node_uid}'
                            )
                        ]
                    continue

                node_executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                latest_execution = task_gen_utils.get_latest_execution(
                    node_executions)

                # If the latest execution is successful, we're done.
                if latest_execution and execution_lib.is_execution_successful(
                        latest_execution):
                    logging.info('Node successful: %s', node_uid)
                    successful_node_ids.add(node_id)
                    continue

                # If the latest execution failed, the pipeline should be aborted.
                if latest_execution and not execution_lib.is_execution_active(
                        latest_execution):
                    error_msg_value = latest_execution.custom_properties.get(
                        constants.EXECUTION_ERROR_MSG_KEY)
                    error_msg = data_types_utils.get_metadata_value(
                        error_msg_value) if error_msg_value else ''
                    return [
                        self._abort_task(
                            f'node failed; node uid: {node_uid}; error: {error_msg}'
                        )
                    ]

                # Finally, we are ready to generate an ExecNodeTask for the node.
                task = self._maybe_generate_task(node, node_executions,
                                                 successful_node_ids)
                if task:
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)

            layer_node_ids = set(node.node_info.id for node in layer_nodes)
            successful_layer_node_ids = layer_node_ids & successful_node_ids
            self._update_successful_nodes_cache(successful_layer_node_ids)

            # If all nodes in the final layer completed successfully, the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if (layer_num == len(layers) - 1
                    and successful_layer_node_ids == layer_node_ids):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
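The `_abort_task` helper used above is not shown on this page. Below is a plausible sketch, assumed to wrap the error message in a `FinalizePipelineTask` in the same way as code example #1.

  def _abort_task(self, error_msg: str) -> task_lib.FinalizePipelineTask:
    """Returns a task to abort the pipeline execution."""
    logging.error('Aborting pipeline execution: %s', error_msg)
    return task_lib.FinalizePipelineTask(
        pipeline_uid=self._pipeline_uid,
        status=status_lib.Status(
            code=status_lib.Code.ABORTED, message=error_msg))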
Code example #21
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

        The returned tasks must have `exec_task` populated. List may be empty
        if no nodes are ready for execution.

        Returns:
          A `list` of tasks to execute.
        """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        for layer_num, nodes in enumerate(layers):
            # Ids of nodes in the current layer that have completed
            # successfully.
            completed_node_ids = set()
            for node in nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id
                if self._service_job_manager.is_pure_service_node(
                        self._pipeline_state, node.node_info.id):
                    if not self._upstream_nodes_executed(node):
                        continue
                    service_status = self._service_job_manager.ensure_node_services(
                        self._pipeline_state, node_id)
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node completed successfully: %s',
                                     node_uid)
                        completed_node_ids.add(node_id)
                    elif service_status == service_jobs.ServiceStatus.FAILED:
                        logging.error('Failed service node: %s', node_uid)
                        return [
                            task_lib.FinalizePipelineTask(
                                pipeline_uid=self._pipeline_state.pipeline_uid,
                                status=status_lib.Status(
                                    code=status_lib.Code.ABORTED,
                                    message=
                                    (f'Aborting pipeline execution due to service '
                                     f'node failure; failed node uid: {node_uid}'
                                     )))
                        ]
                    else:
                        logging.info('Pure service node in progress: %s',
                                     node_uid)
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    continue
                executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                if (executions
                        and task_gen_utils.is_latest_execution_successful(
                            executions)):
                    completed_node_ids.add(node_id)
                    continue
                # If all upstream nodes are executed but current node is not executed,
                # the node is deemed ready for execution.
                if self._upstream_nodes_executed(node):
                    task = self._generate_task(node)
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)
            # If there are no completed nodes in the current layer, downstream nodes
            # need not be checked.
            if not completed_node_ids:
                break
            # If all nodes in the final layer completed successfully, the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if layer_num == len(layers) - 1 and completed_node_ids == set(
                    node.node_info.id for node in nodes):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_state.pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
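`_upstream_nodes_executed` is referenced above but not shown. Below is a minimal sketch under the assumption that it checks the latest execution of every upstream node, reusing the `task_gen_utils` calls and the `self._node_map` lookup that appear in this example.

    def _upstream_nodes_executed(self, node: pipeline_pb2.PipelineNode) -> bool:
        """Returns `True` if every upstream node's latest execution succeeded."""
        for upstream_node_id in node.upstream_nodes:
            executions = task_gen_utils.get_executions(
                self._mlmd_handle, self._node_map[upstream_node_id])
            if not (executions
                    and task_gen_utils.is_latest_execution_successful(
                        executions)):
                return False
        return True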