コード例 #1
0
    def test_pipeline_failure_strategies(self, fail_fast):
        """Checks that a failed node aborts the run under either strategy.

        Example-validator is marked failed while transform succeeds. A
        finalize-pipeline task with ABORTED status is then expected either on
        the very next generation (`fail_fast` truthy) or only after trainer,
        chore_a and chore_b have each been run (`fail_fast` falsy).

        Args:
          fail_fast: Failure-strategy flag forwarded unchanged to every
            `_run_next` and `_generate` call.
        """
        test_utils.fake_example_gen_run(
            self._mlmd_connection, self._example_gen, 1, 1)

        # Stats-gen and then schema-gen are run, one per generation round.
        for upstream_node in (self._stats_gen, self._schema_gen):
            self._run_next(
                False, expect_nodes=[upstream_node], fail_fast=fail_fast)

        # Example-validator and transform become ready in the same round.
        ev_task, tfm_task = self._generate(False, True, fail_fast=fail_fast)
        self.assertEqual(self._example_validator.node_info.id,
                         ev_task.node_uid.node_id)
        self.assertEqual(self._transform.node_info.id,
                         tfm_task.node_uid.node_id)

        # Transform completes normally ...
        self._finish_node_execution(False, tfm_task)

        # ... whereas the example-validator execution is marked as failed.
        with self._mlmd_connection as mlmd_handle, \
                mlmd_state.mlmd_execution_atomic_op(
                    mlmd_handle, ev_task.execution_id) as failed_exec:
            failed_exec.last_known_state = metadata_store_pb2.Execution.FAILED
            error_msg_property = failed_exec.custom_properties[
                constants.EXECUTION_ERROR_MSG_KEY]
            data_types_utils.set_metadata_value(error_msg_property,
                                                'example-validator error')

        if not fail_fast:
            # Transform has finished and example-validator is not upstream of
            # trainer, so trainer and the nodes after it still get to run.
            # The pipeline only fails once no further progress is possible.
            for runnable_node in (self._trainer, self._chore_a,
                                  self._chore_b):
                self._run_next(
                    False, expect_nodes=[runnable_node], fail_fast=fail_fast)

        # With fail_fast this is reached immediately after the failure; in
        # both cases the only generated task finalizes the pipeline as ABORTED.
        (finalize_task,) = self._generate(False, True, fail_fast=fail_fast)
        self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
        self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
コード例 #2
0
    def test_service_job_failed(self):
        """Checks that a failed example-gen service job aborts the pipeline."""

        def _report_failed_service(unused_pipeline_state, node_id):
            # Only the example-gen node is expected to be queried here.
            self.assertEqual('my_example_gen', node_id)
            return service_jobs.ServiceStatus.FAILED

        ensure_services_mock = (
            self._mock_service_job_manager.ensure_node_services)
        ensure_services_mock.side_effect = _report_failed_service

        generated_tasks, _ = self._generate_and_test(
            True,
            num_initial_executions=0,
            num_tasks_generated=1,
            num_new_executions=0,
            num_active_executions=0)
        # Exactly one task is expected: the pipeline finalization.
        (finalize_task,) = generated_tasks

        self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
        self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
        self.assertRegexMatch(finalize_task.status.message,
                              ['service job failed'])
コード例 #3
0
  def test_node_failed(self, use_task_queue):
    """Checks that a failed node execution yields an ABORTED finalize task.

    Args:
      use_task_queue: Forwarded to the first `_generate_and_test` call; when
        truthy, the task left in the queue is dequeued and marked done after
        the execution has been failed.
    """
    otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

    def _report_successful_service(unused_pipeline_state, node_id):
      # Only the example-gen node is expected to be queried.
      self.assertEqual(self._example_gen.node_info.id, node_id)
      return service_jobs.ServiceStatus.SUCCESS

    ensure_services_mock = self._mock_service_job_manager.ensure_node_services
    ensure_services_mock.side_effect = _report_successful_service

    # The first round generates a task for stats-gen.
    generated, running_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    expected_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline,
                                                       self._stats_gen)
    self.assertEqual(expected_uid, generated[0].node_uid)

    # Mark the stats-gen execution as failed and persist it to MLMD.
    failed_exec = running_executions[0]
    failed_exec.last_known_state = metadata_store_pb2.Execution.FAILED
    error_msg_property = failed_exec.custom_properties[
        constants.EXECUTION_ERROR_MSG_KEY]
    data_types_utils.set_metadata_value(error_msg_property, 'foobar error')
    with self._mlmd_connection as mlmd_handle:
      mlmd_handle.store.put_executions([failed_exec])
    if use_task_queue:
      self._task_queue.task_done(self._task_queue.dequeue())

    # The next round must produce only a FinalizePipelineTask.
    generated, _ = self._generate_and_test(
        True,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=0)
    self.assertLen(generated, 1)
    finalize_task = generated[0]
    self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
    self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
    self.assertRegexMatch(finalize_task.status.message, ['foobar error'])
コード例 #4
0
    def test_node_failed(self, fail_fast):
        """Checks the tasks generated after a node execution has failed.

        After the stats-gen execution is marked failed, the next generation
        is expected to yield an update-node-state task (FAILED) for
        'my_statistics_gen' followed by an ABORTED finalize-pipeline task,
        both carrying the error message.

        Args:
          fail_fast: Failure-strategy flag forwarded to `_generate_and_test`.
        """
        test_utils.fake_example_gen_run(
            self._mlmd_connection, self._example_gen, 1, 1)

        # The first round produces a single task, for stats-gen.
        (stats_gen_task,) = self._generate_and_test(
            False,
            num_initial_executions=1,
            num_tasks_generated=1,
            num_new_executions=1,
            num_active_executions=1,
            ignore_update_node_state_tasks=True,
            fail_fast=fail_fast)
        expected_uid = task_lib.NodeUid.from_pipeline_node(
            self._pipeline, self._stats_gen)
        self.assertEqual(expected_uid, stats_gen_task.node_uid)

        # Mark the stats-gen execution as failed, with an error message.
        with self._mlmd_connection as mlmd_handle, \
                mlmd_state.mlmd_execution_atomic_op(
                    mlmd_handle, stats_gen_task.execution_id) as failed_exec:
            failed_exec.last_known_state = metadata_store_pb2.Execution.FAILED
            error_msg_property = failed_exec.custom_properties[
                constants.EXECUTION_ERROR_MSG_KEY]
            data_types_utils.set_metadata_value(error_msg_property,
                                                'foobar error')

        # The next round yields the node-state update, then the finalization.
        node_state_task, finalize_task = self._generate_and_test(
            True,
            num_initial_executions=2,
            num_tasks_generated=2,
            num_new_executions=0,
            num_active_executions=0,
            fail_fast=fail_fast)

        self.assertTrue(task_lib.is_update_node_state_task(node_state_task))
        self.assertEqual('my_statistics_gen',
                         node_state_task.node_uid.node_id)
        self.assertEqual(pstate.NodeState.FAILED, node_state_task.state)
        self.assertRegexMatch(node_state_task.status.message,
                              ['foobar error'])

        self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
        self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
        self.assertRegexMatch(finalize_task.status.message, ['foobar error'])
コード例 #5
0
  def test_tasks_generated_when_upstream_done(self, use_task_queue):
    """Walks the pipeline and checks a task appears once upstream is done.

    Args:
      use_task_queue: When the task queue is enabled, a new task is generated
        only if the queue holds no task with the same task_id. Passing
        `use_task_queue=False` covers generation against an empty queue (for
        eg: after an orchestrator restart).
    """
    # ExampleGen is treated as having already completed successfully.
    otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

    def _report_successful_service(unused_pipeline_state, node_id):
      queried_node_ids = (self._example_gen.node_info.id,
                          self._transform.node_info.id)
      self.assertIn(node_id, queried_node_ids)
      return service_jobs.ServiceStatus.SUCCESS

    service_manager = self._mock_service_job_manager
    service_manager.ensure_node_services.side_effect = (
        _report_successful_service)

    # Round 1: only stats-gen is runnable.
    generated, running = self._generate_and_test(
        use_task_queue,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    stats_gen_exec = running[0]
    self._verify_exec_node_task(self._stats_gen, stats_gen_exec.id,
                                generated[0])

    service_manager.ensure_node_services.assert_called_with(
        mock.ANY, self._example_gen.node_info.id)
    service_manager.reset_mock()

    # Complete stats-gen.
    self._finish_node_execution(use_task_queue, self._stats_gen,
                                stats_gen_exec)

    # Round 2: schema-gen follows.
    generated, running = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    schema_gen_exec = running[0]
    self._verify_exec_node_task(self._schema_gen, schema_gen_exec.id,
                                generated[0])

    # Complete schema-gen.
    self._finish_node_execution(use_task_queue, self._schema_gen,
                                schema_gen_exec)

    # Round 3: example-validator and transform are generated together.
    generated, running = self._generate_and_test(
        use_task_queue,
        num_initial_executions=3,
        num_tasks_generated=2,
        num_new_executions=2,
        num_active_executions=2)
    example_validator_exec = running[0]
    self._verify_exec_node_task(self._example_validator,
                                example_validator_exec.id, generated[0])
    transform_exec = running[1]
    self._verify_exec_node_task(self._transform, transform_exec.id,
                                generated[1])

    # Transform is a "mixed service node", so its services were ensured.
    service_manager.ensure_node_services.assert_called_once_with(
        mock.ANY, self._transform.node_info.id)
    service_manager.reset_mock()

    # Complete example-validator only.
    self._finish_node_execution(use_task_queue, self._example_validator,
                                example_validator_exec)

    # Transform is still running, so trainer must not be triggered. Without
    # the task queue the transform task is generated again.
    regenerated_count = 0 if use_task_queue else 1
    generated, running = self._generate_and_test(
        use_task_queue,
        num_initial_executions=5,
        num_tasks_generated=regenerated_count,
        num_new_executions=0,
        num_active_executions=1)
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, running[0].id,
                                  generated[0])

    # Complete transform.
    self._finish_node_execution(use_task_queue, self._transform,
                                transform_exec)

    # Every upstream node of trainer is done, so trainer is generated.
    generated, running = self._generate_and_test(
        use_task_queue,
        num_initial_executions=5,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    trainer_exec = running[0]
    self._verify_exec_node_task(self._trainer, trainer_exec.id, generated[0])

    # Complete trainer.
    self._finish_node_execution(use_task_queue, self._trainer, trainer_exec)

    # Nothing is left to execute: expect a single OK FinalizePipelineTask.
    generated, _ = self._generate_and_test(
        use_task_queue,
        num_initial_executions=6,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=0)
    self.assertLen(generated, 1)
    finalize_task = generated[0]
    self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
    self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
    if use_task_queue:
      self.assertTrue(self._task_queue.is_empty())
コード例 #6
0
    def generate(self) -> List[task_lib.Task]:
        """Produces tasks for the pipeline nodes that can execute next.

        Nodes are visited layer by layer in topologically sorted order.
        Traversal stops at the first layer that contains no completed node.

        Returns:
          A `list` of tasks. It is either the tasks collected for nodes that
          are ready to run (possibly empty), or a single
          `FinalizePipelineTask` when a service node failed, when
          `_generate_task` itself returned a finalize task, or when every
          node of the final layer has completed.
        """

        def _parents_of(node):
            return [self._node_map[n] for n in node.upstream_nodes]

        def _children_of(node):
            return [self._node_map[n] for n in node.downstream_nodes]

        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=_parents_of,
            get_child_nodes=_children_of)

        pending_tasks = []
        for layer_num, layer_nodes in enumerate(layers):
            # Ids of the nodes in this layer found to be completed.
            done_ids = set()
            for node in layer_nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id

                if self._service_job_manager.is_pure_service_node(
                        self._pipeline_state, node_id):
                    # Pure service nodes never get a task of their own; only
                    # their service status is inspected once upstream is done.
                    if self._upstream_nodes_executed(node):
                        service_status = (
                            self._service_job_manager.ensure_node_services(
                                self._pipeline_state, node_id))
                        if service_status == service_jobs.ServiceStatus.FAILED:
                            logging.error('Failed service node: %s', node_uid)
                            abort_status = status_lib.Status(
                                code=status_lib.Code.ABORTED,
                                message=(
                                    f'Aborting pipeline execution due to service '
                                    f'node failure; failed node uid: {node_uid}'
                                ))
                            return [
                                task_lib.FinalizePipelineTask(
                                    pipeline_uid=(
                                        self._pipeline_state.pipeline_uid),
                                    status=abort_status)
                            ]
                        if service_status == service_jobs.ServiceStatus.SUCCESS:
                            logging.info(
                                'Service node completed successfully: %s',
                                node_uid)
                            done_ids.add(node_id)
                        else:
                            logging.info('Pure service node in progress: %s',
                                         node_uid)
                    continue

                # A node whose task is already tracked by the task queue is
                # not considered for generation again.
                exec_task_id = task_lib.exec_node_task_id_from_pipeline_node(
                    self._pipeline, node)
                if self._is_task_id_tracked_fn(exec_task_id):
                    continue

                executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                if (executions
                        and task_gen_utils.is_latest_execution_successful(
                            executions)):
                    done_ids.add(node_id)
                    continue

                # The node is ready only when all its upstream nodes have
                # been executed.
                if not self._upstream_nodes_executed(node):
                    continue
                task = self._generate_task(node)
                if task_lib.is_finalize_pipeline_task(task):
                    return [task]
                pending_tasks.append(task)

            # With no completed node in this layer there is no point in
            # looking at the layers downstream of it.
            if not done_ids:
                break

            # Finalize the pipeline once every node of the final layer is
            # completed.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if layer_num != len(layers) - 1:
                continue
            if done_ids == {n.node_info.id for n in layer_nodes}:
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_state.pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return pending_tasks
コード例 #7
0
def _orchestrate_active_pipeline(
        mlmd_handle: metadata.Metadata, task_queue: tq.TaskQueue,
        service_job_manager: service_jobs.ServiceJobManager,
        pipeline_state: pstate.PipelineState) -> None:
    """Runs one orchestration pass over an active pipeline.

    The pipeline execution is moved to RUNNING if it is not already, a task
    generator matching the execution mode is built and run, and each
    generated task is then enqueued or applied to `pipeline_state`.

    Args:
      mlmd_handle: MLMD handle; used to persist the RUNNING state and passed
        on to the task generator.
      task_queue: Queue that receives the generated `ExecNodeTask`s.
      service_job_manager: Used to classify service nodes and to stop their
        services for stop-initiated nodes.
      pipeline_state: State of the pipeline being orchestrated.

    Raises:
      status_lib.StatusNotOkError: With code FAILED_PRECONDITION when the
        pipeline execution mode is neither SYNC nor ASYNC.
    """
    pipeline = pipeline_state.pipeline
    execution = pipeline_state.execution
    running_state = metadata_store_pb2.Execution.RUNNING
    assert execution.last_known_state in (metadata_store_pb2.Execution.NEW,
                                          metadata_store_pb2.Execution.RUNNING)
    if execution.last_known_state != running_state:
        # Persist a modified copy so `execution` itself stays untouched.
        running_execution = copy.deepcopy(execution)
        running_execution.last_known_state = running_state
        mlmd_handle.store.put_executions([running_execution])

    # Build the task generator that matches the pipeline execution mode.
    if pipeline.execution_mode == pipeline_pb2.Pipeline.SYNC:
        generator = sync_pipeline_task_gen.SyncPipelineTaskGenerator(
            mlmd_handle, pipeline_state, task_queue.contains_task_id,
            service_job_manager)
    elif pipeline.execution_mode == pipeline_pb2.Pipeline.ASYNC:
        # Stop-initiated nodes get their services stopped or a cancellation
        # task enqueued before any new task is generated.
        stop_initiated_nodes = _get_stop_initiated_nodes(pipeline_state)
        for node in stop_initiated_nodes:
            if service_job_manager.is_pure_service_node(
                    pipeline_state, node.node_info.id):
                service_job_manager.stop_node_services(pipeline_state,
                                                       node.node_info.id)
                continue
            if _maybe_enqueue_cancellation_task(mlmd_handle, pipeline, node,
                                                task_queue):
                continue
            # Services of a mixed service node are stopped only when no
            # cancellation task was enqueued for it.
            if service_job_manager.is_mixed_service_node(
                    pipeline_state, node.node_info.id):
                service_job_manager.stop_node_services(pipeline_state,
                                                       node.node_info.id)
        stop_initiated_node_ids = {
            n.node_info.id for n in stop_initiated_nodes
        }
        generator = async_pipeline_task_gen.AsyncPipelineTaskGenerator(
            mlmd_handle, pipeline_state, task_queue.contains_task_id,
            service_job_manager, stop_initiated_node_ids)
    else:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.FAILED_PRECONDITION,
            message=(
                f'Only SYNC and ASYNC pipeline execution modes supported; '
                f'found pipeline with execution mode: {pipeline.execution_mode}'
            ))

    tasks = generator.generate()

    with pipeline_state:
        for task in tasks:
            if task_lib.is_exec_node_task(task):
                exec_task = typing.cast(task_lib.ExecNodeTask, task)
                task_queue.enqueue(exec_task)
                continue

            if task_lib.is_finalize_node_task(task):
                assert pipeline.execution_mode == pipeline_pb2.Pipeline.ASYNC
                node_task = typing.cast(task_lib.FinalizeNodeTask, task)
                pipeline_state.initiate_node_stop(node_task.node_uid,
                                                  node_task.status)
                continue

            # The only remaining kind is a pipeline finalization, which must
            # be the sole task of a SYNC pipeline.
            assert task_lib.is_finalize_pipeline_task(task)
            assert pipeline.execution_mode == pipeline_pb2.Pipeline.SYNC
            assert len(tasks) == 1
            pipeline_task = typing.cast(task_lib.FinalizePipelineTask, task)
            if pipeline_task.status.code == status_lib.Code.OK:
                outcome_msg = 'Pipeline run successful; pipeline uid: %s'
            else:
                outcome_msg = 'Pipeline run failed; pipeline uid: %s'
            logging.info(outcome_msg, pipeline_state.pipeline_uid)
            pipeline_state.initiate_stop(pipeline_task.status)
コード例 #8
0
    def generate(self) -> List[task_lib.Task]:
        """Produces tasks for the pipeline nodes that can execute next.

        Nodes are visited layer by layer in topologically sorted order, and a
        node is only considered once `_upstream_nodes_successful` holds for
        it. After each layer, the successful nodes of that layer are recorded
        through `_update_successful_nodes_cache`.

        Returns:
          A `list` of tasks. It is either the tasks collected for nodes that
          are ready to run (possibly empty), or a single task that ends the
          pipeline: an abort task when a service job or a node failed, a
          finalize task returned by `_maybe_generate_task`, or an OK
          `FinalizePipelineTask` when every node of the final layer is
          successful.
        """

        def _parents_of(node):
            return [self._node_map[n] for n in node.upstream_nodes]

        def _children_of(node):
            return [self._node_map[n] for n in node.downstream_nodes]

        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=_parents_of,
            get_child_nodes=_children_of)

        pending_tasks = []
        successful_node_ids = set()
        for layer_num, layer_nodes in enumerate(layers):
            for node in layer_nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id

                if self._in_successful_nodes_cache(node_uid):
                    successful_node_ids.add(node_id)
                    continue

                if not self._upstream_nodes_successful(node,
                                                       successful_node_ids):
                    continue

                # A pure service node has no ExecNodeTask to generate; its
                # services are ensured and the service status is checked.
                pure_status = self._ensure_node_services_if_pure(node_id)
                if pure_status is not None:
                    if pure_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'service job failed; node uid: {node_uid}')
                        ]
                    if pure_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node successful: %s', node_uid)
                        successful_node_ids.add(node_id)
                    continue

                # A node whose task is already tracked by the task queue is
                # not generated again, but the services of a mixed service
                # node are still ensured.
                exec_task_id = task_lib.exec_node_task_id_from_pipeline_node(
                    self._pipeline, node)
                if self._is_task_id_tracked_fn(exec_task_id):
                    mixed_status = self._ensure_node_services_if_mixed(
                        node_id)
                    if mixed_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'associated service job failed; node uid: {node_uid}'
                            )
                        ]
                    continue

                node_executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                latest_execution = task_gen_utils.get_latest_execution(
                    node_executions)

                if latest_execution:
                    # A successful latest execution means the node is done.
                    if execution_lib.is_execution_successful(
                            latest_execution):
                        logging.info('Node successful: %s', node_uid)
                        successful_node_ids.add(node_id)
                        continue

                    # A latest execution that is neither successful nor
                    # active is treated as failed: abort the pipeline.
                    if not execution_lib.is_execution_active(
                            latest_execution):
                        error_msg_value = (
                            latest_execution.custom_properties.get(
                                constants.EXECUTION_ERROR_MSG_KEY))
                        if error_msg_value:
                            error_msg = data_types_utils.get_metadata_value(
                                error_msg_value)
                        else:
                            error_msg = ''
                        return [
                            self._abort_task(
                                f'node failed; node uid: {node_uid}; error: {error_msg}'
                            )
                        ]

                # The node is now a candidate for an ExecNodeTask.
                task = self._maybe_generate_task(node, node_executions,
                                                 successful_node_ids)
                if not task:
                    continue
                if task_lib.is_finalize_pipeline_task(task):
                    return [task]
                pending_tasks.append(task)

            layer_node_ids = {n.node_info.id for n in layer_nodes}
            successful_layer_node_ids = layer_node_ids & successful_node_ids
            self._update_successful_nodes_cache(successful_layer_node_ids)

            # Finalize the pipeline once every node of the final layer is
            # successful.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            is_final_layer = layer_num == len(layers) - 1
            if is_final_layer and successful_layer_node_ids == layer_node_ids:
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return pending_tasks
コード例 #9
0
  def test_tasks_generated_when_upstream_done(self, use_task_queue):
    """Walks the pipeline and checks a task appears once upstream is done.

    Args:
      use_task_queue: When the task queue is enabled, a new task is generated
        only if the queue holds no task with the same task_id. Passing
        `use_task_queue=False` covers generation against an empty queue (for
        eg: after an orchestrator restart).
    """
    # ExampleGen is treated as having already completed successfully.
    otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

    service_manager = self._mock_service_job_manager

    # Round 1: only stats-gen is runnable.
    (stats_gen_task,) = self._generate_and_test(
        use_task_queue,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1,
        expected_exec_nodes=[self._stats_gen])

    service_manager.ensure_node_services.assert_called_with(
        mock.ANY, self._example_gen.node_info.id)
    service_manager.reset_mock()

    # Complete stats-gen.
    self._finish_node_execution(use_task_queue, stats_gen_task)

    # Round 2: schema-gen follows.
    (schema_gen_task,) = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1,
        expected_exec_nodes=[self._schema_gen])

    # Complete schema-gen.
    self._finish_node_execution(use_task_queue, schema_gen_task)

    # Round 3: example-validator and transform are generated together.
    example_validator_task, transform_task = self._generate_and_test(
        use_task_queue,
        num_initial_executions=3,
        num_tasks_generated=2,
        num_new_executions=2,
        num_active_executions=2,
        expected_exec_nodes=[self._example_validator, self._transform])

    # Transform is a "mixed service node", so its services were ensured.
    service_manager.ensure_node_services.assert_called_once_with(
        mock.ANY, self._transform.node_info.id)
    service_manager.reset_mock()

    # Complete example-validator only.
    self._finish_node_execution(use_task_queue, example_validator_task)

    # Transform is still running, so trainer must not be triggered. Without
    # the task queue the transform task is generated again.
    if use_task_queue:
      regenerated_count, regenerated_nodes = 0, []
    else:
      regenerated_count, regenerated_nodes = 1, [self._transform]
    regenerated_tasks = self._generate_and_test(
        use_task_queue,
        num_initial_executions=5,
        num_tasks_generated=regenerated_count,
        num_new_executions=0,
        num_active_executions=1,
        expected_exec_nodes=regenerated_nodes)
    if not use_task_queue:
      transform_task = regenerated_tasks[0]

    # Complete transform.
    self._finish_node_execution(use_task_queue, transform_task)

    # Every upstream node of trainer is done, so trainer is generated.
    (trainer_task,) = self._generate_and_test(
        use_task_queue,
        num_initial_executions=5,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1,
        expected_exec_nodes=[self._trainer])

    # Complete trainer.
    self._finish_node_execution(use_task_queue, trainer_task)

    # Task-only dependencies: chore_a and chore_b have no input or output
    # specs, yet they must still run in DAG order, one after the other.
    chore_rounds = ((6, self._chore_a), (7, self._chore_b))
    for initial_executions, chore_node in chore_rounds:
      (chore_task,) = self._generate_and_test(
          use_task_queue,
          num_initial_executions=initial_executions,
          num_tasks_generated=1,
          num_new_executions=1,
          num_active_executions=1,
          expected_exec_nodes=[chore_node])
      self._finish_node_execution(use_task_queue, chore_task)

    # Nothing is left to execute: expect a single OK FinalizePipelineTask.
    (finalize_task,) = self._generate_and_test(
        use_task_queue,
        num_initial_executions=8,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=0)
    self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
    self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
    if use_task_queue:
      self.assertTrue(self._task_queue.is_empty())
コード例 #10
0
    def test_conditional_execution(self, evaluate):
        """Checks task generation for a pipeline with a conditional node.

        Args:
          evaluate: Whether the conditional evaluator is expected to run.
            When truthy, the trainer output is given the custom property
            `evaluate=1`.
        """
        # The terminal nodes of the DAG must be exactly these three.
        layers = sptg._topsorted_layers(self._pipeline)
        expected_terminal_ids = {
            self._example_validator.node_info.id,
            self._chore_b.node_info.id,
            self._evaluator.node_info.id,
        }
        self.assertEqual(expected_terminal_ids,
                         sptg._terminal_node_ids(layers))

        # Drive the pipeline up to and including the trainer.
        test_utils.fake_example_gen_run(
            self._mlmd_connection, self._example_gen, 1, 1)

        self._run_next(False, expect_nodes=[self._stats_gen])
        self._run_next(False, expect_nodes=[self._schema_gen])
        self._run_next(
            False, expect_nodes=[self._example_validator, self._transform])

        # The evaluator runs conditionally, depending on whether the Model
        # artifact produced by the trainer has custom property evaluate=1.
        trainer_properties = {'evaluate': 1} if evaluate else None
        self._run_next(False,
                       expect_nodes=[self._trainer],
                       artifact_custom_properties=trainer_properties)

        tasks = self._generate(False)

        # Exactly one node-state update must target the evaluator.
        (evaluator_state_task,) = [
            t for t in tasks
            if task_lib.is_update_node_state_task(t)
            and t.node_uid.node_id == 'my_evaluator'
        ]
        if evaluate:
            expected_evaluator_state = pstate.NodeState.RUNNING
        else:
            expected_evaluator_state = pstate.NodeState.SKIPPED
        self.assertEqual(expected_evaluator_state, evaluator_state_task.state)

        exec_node_tasks = [t for t in tasks if task_lib.is_exec_node_task(t)]
        if evaluate:
            # chore_a and the evaluator are both scheduled.
            chore_a_task, evaluator_task = exec_node_tasks
            self.assertEqual('chore_a', chore_a_task.node_uid.node_id)
            self.assertEqual('my_evaluator', evaluator_task.node_uid.node_id)
            self._finish_node_execution(False, chore_a_task)
            self._finish_node_execution(False, evaluator_task)
        else:
            # The evaluator is skipped, so only chore_a is scheduled.
            (chore_a_task,) = exec_node_tasks
            self.assertEqual('chore_a', chore_a_task.node_uid.node_id)
            self._finish_node_execution(False, chore_a_task)

        self._run_next(False, expect_nodes=[self._chore_b])

        # Every node has been handled, so a finalize task must be produced.
        (finalize_task,) = self._generate(False, True)
        self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
コード例 #11
0
    def test_pipeline_succeeds_when_terminal_nodes_succeed(
            self, use_task_queue, fail_fast):
        """Verifies the pipeline finalizes only after terminal nodes succeed.

        The example-validator (a terminal node) is started but left
        unfinished while the rest of the pipeline runs; the finalize task is
        expected only after that execution is completed.

        Args:
          use_task_queue: If task queue is enabled, new tasks are only
            generated if a task with the same task_id does not already exist
            in the queue. `use_task_queue=False` is useful to test the case of
            task generation when task queue is empty (for eg: due to
            orchestrator restart).
          fail_fast: If `True`, pipeline is aborted immediately if any node
            fails.
        """
        # Sanity-check the set of terminal nodes first.
        layers = sptg._topsorted_layers(self._pipeline)
        expected_terminal_ids = {
            self._example_validator.node_info.id,
            self._chore_b.node_info.id,
            # The evaluator runs conditionally and its condition always
            # evaluates to False in this test, so its execution is skipped.
            self._evaluator.node_info.id,
        }
        self.assertEqual(expected_terminal_ids,
                         sptg._terminal_node_ids(layers))

        # Begin executing the pipeline.
        test_utils.fake_example_gen_run(self._mlmd_connection,
                                        self._example_gen, 1, 1)

        # NOTE(review): these two calls do not forward `fail_fast`, unlike
        # every later generate/run call in this test -- confirm intentional.
        for node in (self._stats_gen, self._schema_gen):
            self._run_next(use_task_queue, expect_nodes=[node])

        # example-validator and transform become ready at the same time.
        ready_tasks = self._generate(use_task_queue, True, fail_fast=fail_fast)
        [example_validator_task, transform_task] = ready_tasks
        self.assertEqual(self._example_validator.node_info.id,
                         example_validator_task.node_uid.node_id)
        self.assertEqual(self._transform.node_info.id,
                         transform_task.node_uid.node_id)
        # Leave example-validator in flight (started, not finished) ...
        self._start_processing(use_task_queue, example_validator_task)
        # ... while transform, which sits in the same layer, completes.
        self._finish_node_execution(use_task_queue, transform_task)

        # Without a task queue the still-active example-validator execution
        # keeps producing a task on every generation; with a task queue it
        # does not.
        if use_task_queue:
            still_active = []
        else:
            still_active = [self._example_validator]

        # Transform is done, so the trainer and the nodes after it can run
        # one after another.
        for node in (self._trainer, self._chore_a, self._chore_b):
            self._run_next(use_task_queue,
                           expect_nodes=still_active + [node],
                           finish_nodes=[node],
                           fail_fast=fail_fast)
        # Nothing new is left to run; only example-validator may reappear.
        self._run_next(use_task_queue,
                       expect_nodes=list(still_active),
                       finish_nodes=[],
                       fail_fast=fail_fast)

        # The finalize task is generated only once example-validator finishes.
        test_utils.fake_execute_node(self._mlmd_connection,
                                     example_validator_task)
        self._finish_processing(use_task_queue, example_validator_task)
        [finalize_task] = self._generate(use_task_queue,
                                         True,
                                         fail_fast=fail_fast)
        self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
        self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
コード例 #12
0
  def test_tasks_generated_when_upstream_done(self, use_task_queue):
    """Checks that a node's task is generated once its upstream has finished.

    Walks transform and then trainer through generation and completion,
    regenerating in between to confirm no extra executions appear, and ends by
    expecting a finalize task with an OK status.

    Args:
      use_task_queue: If task queue is enabled, new tasks are only generated if
        a task with the same task_id does not already exist in the queue.
        `use_task_queue=False` is useful to test the case of task generation
        when task queue is empty (for eg: due to orchestrator restart).
    """
    # Pretend ExampleGen has already run to successful completion.
    otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

    # Stand-in for the service job manager: checks it is asked about the
    # example-gen node and reports success.
    def _ensure_node_services(unused_pipeline_state, node_id):
      self.assertEqual(self._example_gen.node_info.id, node_id)
      return service_jobs.ServiceStatus.SUCCESS

    self._mock_service_job_manager.ensure_node_services.side_effect = (
        _ensure_node_services)

    # First generation: one new transform execution and its task.
    with self.subTest(generate=1):
      generated, active = self._generate_and_test(
          use_task_queue,
          num_initial_executions=1,
          num_tasks_generated=1,
          num_new_executions=1,
          num_active_executions=1)
      self._verify_exec_node_task(self._transform, active[0].id, generated[0])

    self._mock_service_job_manager.ensure_node_services.assert_called()

    # Repeated generation should be harmless: no new executions, and the
    # transform task reappears only when the task queue is not in use.
    regeneration_expectations = dict(
        num_initial_executions=2,
        num_tasks_generated=0 if use_task_queue else 1,
        num_new_executions=0,
        num_active_executions=1)
    with self.subTest(generate=2):
      generated, active = self._generate_and_test(use_task_queue,
                                                  **regeneration_expectations)
      if not use_task_queue:
        self._verify_exec_node_task(self._transform, active[0].id,
                                    generated[0])
    with self.subTest(generate=3):
      generated, active = self._generate_and_test(use_task_queue,
                                                  **regeneration_expectations)
      execution_id = active[0].id
      if not use_task_queue:
        self._verify_exec_node_task(self._transform, execution_id,
                                    generated[0])

    # Complete the transform execution.
    otu.fake_transform_output(self._mlmd_connection, self._transform,
                              active[0])
    # When the task queue is enabled, dequeue the matching task too.
    self._dequeue_and_test(use_task_queue, self._transform, execution_id)

    # The next generation should yield the trainer's execution task.
    with self.subTest(generate=4):
      generated, active = self._generate_and_test(
          use_task_queue,
          num_initial_executions=2,
          num_tasks_generated=1,
          num_new_executions=1,
          num_active_executions=1)
      execution_id = active[0].id
      self._verify_exec_node_task(self._trainer, execution_id, generated[0])

    # Complete the trainer execution.
    otu.fake_trainer_output(self._mlmd_connection, self._trainer, active[0])
    # When the task queue is enabled, dequeue the matching task too.
    self._dequeue_and_test(use_task_queue, self._trainer, execution_id)

    # Nothing is left to execute, so a FinalizePipelineTask is expected.
    with self.subTest(generate=5):
      generated, _ = self._generate_and_test(
          use_task_queue,
          num_initial_executions=3,
          num_tasks_generated=1,
          num_new_executions=0,
          num_active_executions=0)
    self.assertLen(generated, 1)
    finalize_task = generated[0]
    self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
    self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
    if use_task_queue:
      self.assertTrue(self._task_queue.is_empty())
コード例 #13
0
ファイル: pipeline_ops.py プロジェクト: jay90099/tfx
def _orchestrate_active_pipeline(
        mlmd_handle: metadata.Metadata, task_queue: tq.TaskQueue,
        service_job_manager: service_jobs.ServiceJobManager,
        pipeline_state: pstate.PipelineState) -> None:
    """Orchestrates active pipeline.

    Performs a single orchestration pass over `pipeline_state`:

    1. Sets the pipeline execution state to RUNNING if it is not already.
       For a SYNC pipeline with a positive `deadline_secs` whose age (measured
       from pipeline creation time) exceeds that deadline, initiates a stop
       with code DEADLINE_EXCEEDED and returns without doing anything else.
    2. Processes nodes in state STOPPING and moves the ones that were stopped
       during this pass to STOPPED.
    3. Generates tasks with the SYNC or ASYNC task generator, depending on the
       pipeline's execution mode.
    4. Applies every generated node-state update, moves nodes still in
       STARTING to STARTED, enqueues exec-node tasks, and, for a finalize
       task, initiates a pipeline stop with that task's status.

    Args:
      mlmd_handle: Metadata handle passed to the cancellation helper and to
        the task generator.
      task_queue: Queue that receives generated exec-node tasks. Its
        `contains_task_id` method is handed to the task generator and the
        queue itself is passed to the cancellation helper.
      service_job_manager: Queried to classify STOPPING nodes as pure or mixed
        service nodes and asked to stop their services; also passed to the
        task generator.
      pipeline_state: State of the pipeline being orchestrated. Asserted to be
        active.

    Raises:
      status_lib.StatusNotOkError: With code FAILED_PRECONDITION when the
        pipeline's execution mode is neither SYNC nor ASYNC.
      AssertionError: If the pipeline is not active, or if a generated task
        that is neither a node-state update nor an exec-node task is not the
        single finalize task of a SYNC pipeline.
    """
    pipeline = pipeline_state.pipeline
    with pipeline_state:
        assert pipeline_state.is_active()
        if pipeline_state.get_pipeline_execution_state() != (
                metadata_store_pb2.Execution.RUNNING):
            pipeline_state.set_pipeline_execution_state(
                metadata_store_pb2.Execution.RUNNING)
        # `orchestration_options` is bound here but also read further below,
        # after this `with` block has exited (for `fail_fast`).
        orchestration_options = pipeline_state.get_orchestration_options()
        logging.info('Orchestration options: %s', orchestration_options)
        deadline_secs = orchestration_options.deadline_secs
        # Deadline enforcement: only SYNC pipelines with a positive deadline
        # are checked; elapsed time is measured from pipeline creation.
        if (pipeline.execution_mode == pipeline_pb2.Pipeline.SYNC
                and deadline_secs > 0 and time.time() -
                pipeline_state.pipeline_creation_time_secs_since_epoch() >
                deadline_secs):
            logging.error(
                'Aborting pipeline due to exceeding deadline (%s secs); '
                'pipeline uid: %s', deadline_secs, pipeline_state.pipeline_uid)
            pipeline_state.initiate_stop(
                status_lib.Status(
                    code=status_lib.Code.DEADLINE_EXCEEDED,
                    message=('Pipeline aborted due to exceeding deadline '
                             f'({deadline_secs} secs)')))
            # Nothing else is done in this pass once a stop was initiated.
            return

    def _filter_by_state(node_infos: List[_NodeInfo],
                         state_str: str) -> List[_NodeInfo]:
        # Keeps only the entries whose node state equals `state_str`.
        return [n for n in node_infos if n.state.state == state_str]

    node_infos = _get_node_infos(pipeline_state)
    stopping_node_infos = _filter_by_state(node_infos,
                                           pstate.NodeState.STOPPING)

    # Tracks nodes stopped in the current iteration.
    stopped_node_infos: List[_NodeInfo] = []

    # Create cancellation tasks for nodes in state STOPPING.
    for node_info in stopping_node_infos:
        if service_job_manager.is_pure_service_node(
                pipeline_state, node_info.node.node_info.id):
            # Pure service node: counted as stopped only if stopping its
            # services returned a truthy result.
            if service_job_manager.stop_node_services(
                    pipeline_state, node_info.node.node_info.id):
                stopped_node_infos.append(node_info)
        elif _maybe_enqueue_cancellation_task(mlmd_handle, pipeline,
                                              node_info.node, task_queue):
            # The helper returned a truthy result; the node is not added to
            # the stopped list in this pass.
            pass
        elif service_job_manager.is_mixed_service_node(
                pipeline_state, node_info.node.node_info.id):
            # Mixed service node for which the cancellation helper returned a
            # falsy result: stop its services and count it as stopped only if
            # that returned a truthy result.
            if service_job_manager.stop_node_services(
                    pipeline_state, node_info.node.node_info.id):
                stopped_node_infos.append(node_info)
        else:
            # Not a service node and the cancellation helper returned a falsy
            # result: treat the node as stopped.
            stopped_node_infos.append(node_info)

    # Change the state of stopped nodes from STOPPING to STOPPED.
    if stopped_node_infos:
        with pipeline_state:
            for node_info in stopped_node_infos:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    pipeline, node_info.node)
                with pipeline_state.node_state_update_context(
                        node_uid) as node_state:
                    # The node's existing status is passed back unchanged.
                    node_state.update(pstate.NodeState.STOPPED,
                                      node_state.status)

    # Initialize task generator for the pipeline.
    if pipeline.execution_mode == pipeline_pb2.Pipeline.SYNC:
        generator = sync_pipeline_task_gen.SyncPipelineTaskGenerator(
            mlmd_handle,
            task_queue.contains_task_id,
            service_job_manager,
            fail_fast=orchestration_options.fail_fast)
    elif pipeline.execution_mode == pipeline_pb2.Pipeline.ASYNC:
        generator = async_pipeline_task_gen.AsyncPipelineTaskGenerator(
            mlmd_handle, task_queue.contains_task_id, service_job_manager)
    else:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.FAILED_PRECONDITION,
            message=(
                f'Only SYNC and ASYNC pipeline execution modes supported; '
                f'found pipeline with execution mode: {pipeline.execution_mode}'
            ))

    # Task generation happens outside of any `with pipeline_state:` block.
    tasks = generator.generate(pipeline_state)

    with pipeline_state:
        # Handle all the UpdateNodeStateTasks by updating node states.
        for task in tasks:
            if task_lib.is_update_node_state_task(task):
                task = typing.cast(task_lib.UpdateNodeStateTask, task)
                with pipeline_state.node_state_update_context(
                        task.node_uid) as node_state:
                    node_state.update(task.state, task.status)

        # Drop the node-state updates that were just applied; what remains
        # is handled by the loop at the end of this block.
        tasks = [t for t in tasks if not task_lib.is_update_node_state_task(t)]

        # If there are still nodes in state STARTING, change them to STARTED.
        for node in pstate.get_all_pipeline_nodes(pipeline_state.pipeline):
            node_uid = task_lib.NodeUid.from_pipeline_node(
                pipeline_state.pipeline, node)
            with pipeline_state.node_state_update_context(
                    node_uid) as node_state:
                if node_state.state == pstate.NodeState.STARTING:
                    node_state.update(pstate.NodeState.STARTED)

        for task in tasks:
            if task_lib.is_exec_node_task(task):
                task = typing.cast(task_lib.ExecNodeTask, task)
                task_queue.enqueue(task)
            else:
                # Any other remaining task must be the single finalize task
                # of a SYNC pipeline.
                assert task_lib.is_finalize_pipeline_task(task)
                assert pipeline.execution_mode == pipeline_pb2.Pipeline.SYNC
                assert len(tasks) == 1
                task = typing.cast(task_lib.FinalizePipelineTask, task)
                if task.status.code == status_lib.Code.OK:
                    logging.info('Pipeline run successful; pipeline uid: %s',
                                 pipeline_state.pipeline_uid)
                else:
                    logging.info('Pipeline run failed; pipeline uid: %s',
                                 pipeline_state.pipeline_uid)
                # The stop is initiated with the finalize task's status for
                # both the successful and the failed outcome.
                pipeline_state.initiate_stop(task.status)