Exemple #1
0
    def test_stop_initiated_async_pipelines(self, mock_gen_task_from_active,
                                            mock_async_task_gen,
                                            mock_sync_task_gen):
        with self._mlmd_connection as m:
            pipeline1 = _test_pipeline('pipeline1')
            pipeline1.nodes.add().pipeline_node.node_info.id = 'Transform'
            pipeline1.nodes.add().pipeline_node.node_info.id = 'Trainer'
            pipeline1.nodes.add().pipeline_node.node_info.id = 'Evaluator'
            pipeline_ops.initiate_pipeline_start(m, pipeline1)
            pipeline1_execution = pipeline_ops._initiate_pipeline_stop(
                m, task_lib.PipelineUid.from_pipeline(pipeline1))

            task_queue = tq.TaskQueue()

            # For the stop-initiated pipeline, "Transform" execution task is in queue,
            # "Trainer" has an active execution in MLMD but no task in queue,
            # "Evaluator" has no active execution.
            task_queue.enqueue(
                test_utils.create_exec_node_task(node_uid=task_lib.NodeUid(
                    pipeline_uid=task_lib.PipelineUid(pipeline_id='pipeline1',
                                                      pipeline_run_id=None),
                    node_id='Transform')))
            transform_task = task_queue.dequeue(
            )  # simulates task being processed
            mock_gen_task_from_active.side_effect = [
                test_utils.create_exec_node_task(node_uid=task_lib.NodeUid(
                    pipeline_uid=task_lib.PipelineUid(pipeline_id='pipeline1',
                                                      pipeline_run_id=None),
                    node_id='Trainer'),
                                                 is_cancelled=True), None,
                None, None, None
            ]

            pipeline_ops.generate_tasks(m, task_queue)

            # There are no active pipelines so these shouldn't be called.
            mock_async_task_gen.assert_not_called()
            mock_sync_task_gen.assert_not_called()

            # Simulate finishing the "Transform" ExecNodeTask.
            task_queue.task_done(transform_task)

            # CancelNodeTask for the "Transform" ExecNodeTask should be next.
            task = task_queue.dequeue()
            task_queue.task_done(task)
            self.assertTrue(task_lib.is_cancel_node_task(task))
            self.assertEqual('Transform', task.node_uid.node_id)

            # ExecNodeTask for "Trainer" is next.
            task = task_queue.dequeue()
            task_queue.task_done(task)
            self.assertTrue(task_lib.is_exec_node_task(task))
            self.assertEqual('Trainer', task.node_uid.node_id)

            self.assertTrue(task_queue.is_empty())

            mock_gen_task_from_active.assert_has_calls([
                mock.call(m,
                          pipeline1,
                          pipeline1.nodes[1].pipeline_node,
                          mock.ANY,
                          is_cancelled=True),
                mock.call(m,
                          pipeline1,
                          pipeline1.nodes[2].pipeline_node,
                          mock.ANY,
                          is_cancelled=True)
            ])
            self.assertEqual(2, mock_gen_task_from_active.call_count)

            # Pipeline execution should continue to be active since active node
            # executions were found in the last call to `generate_tasks`.
            [execution
             ] = m.store.get_executions_by_id([pipeline1_execution.id])
            self.assertTrue(execution_lib.is_execution_active(execution))

            # Call `generate_tasks` again; this time there are no more active node
            # executions so the pipeline should be marked as cancelled.
            pipeline_ops.generate_tasks(m, task_queue)
            self.assertTrue(task_queue.is_empty())
            [execution
             ] = m.store.get_executions_by_id([pipeline1_execution.id])
            self.assertEqual(metadata_store_pb2.Execution.CANCELED,
                             execution.last_known_state)
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

    The returned tasks must have `exec_task` populated. List may be empty if
    no nodes are ready for execution.

    Returns:
      A `list` of tasks to execute.
    """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        successful_node_ids = set()
        for layer_num, layer_nodes in enumerate(layers):
            for node in layer_nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id

                if self._in_successful_nodes_cache(node_uid):
                    successful_node_ids.add(node_id)
                    continue

                if not self._upstream_nodes_successful(node,
                                                       successful_node_ids):
                    continue

                # If this is a pure service node, there is no ExecNodeTask to generate
                # but we ensure node services and check service status.
                service_status = self._ensure_node_services_if_pure(node_id)
                if service_status is not None:
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'service job failed; node uid: {node_uid}')
                        ]
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node successful: %s', node_uid)
                        successful_node_ids.add(node_id)
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again but we ensure node services
                # in case of a mixed service node.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    service_status = self._ensure_node_services_if_mixed(
                        node_id)
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'associated service job failed; node uid: {node_uid}'
                            )
                        ]
                    continue

                node_executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                latest_execution = task_gen_utils.get_latest_execution(
                    node_executions)

                # If the latest execution is successful, we're done.
                if latest_execution and execution_lib.is_execution_successful(
                        latest_execution):
                    logging.info('Node successful: %s', node_uid)
                    successful_node_ids.add(node_id)
                    continue

                # If the latest execution failed, the pipeline should be aborted.
                if latest_execution and not execution_lib.is_execution_active(
                        latest_execution):
                    error_msg_value = latest_execution.custom_properties.get(
                        constants.EXECUTION_ERROR_MSG_KEY)
                    error_msg = data_types_utils.get_metadata_value(
                        error_msg_value) if error_msg_value else ''
                    return [
                        self._abort_task(
                            f'node failed; node uid: {node_uid}; error: {error_msg}'
                        )
                    ]

                # Finally, we are ready to generate an ExecNodeTask for the node.
                task = self._maybe_generate_task(node, node_executions,
                                                 successful_node_ids)
                if task:
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)

            layer_node_ids = set(node.node_info.id for node in layer_nodes)
            successful_layer_node_ids = layer_node_ids & successful_node_ids
            self._update_successful_nodes_cache(successful_layer_node_ids)

            # If all nodes in the final layer are completed successfully , the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if (layer_num == len(layers) - 1
                    and successful_layer_node_ids == layer_node_ids):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
Exemple #3
0
 def _is_inactivated() -> bool:
     [execution] = mlmd_handle.store.get_executions_by_id([execution_id])
     return not execution_lib.is_execution_active(execution)
Exemple #4
0
def resume_pipeline(mlmd_handle: metadata.Metadata,
                    pipeline: pipeline_pb2.Pipeline) -> pstate.PipelineState:
    """Resumes a pipeline run from previously failed nodes.

  Upon success, MLMD is updated to signal that the pipeline must be started.

  Args:
    mlmd_handle: A handle to the MLMD db.
    pipeline: IR of the pipeline to resume.

  Returns:
    The `PipelineState` object upon success.

  Raises:
    status_lib.StatusNotOkError: Failure to resume pipeline. With code
      `ALREADY_EXISTS` if a pipeline is already running. With code
      `status_lib.Code.FAILED_PRECONDITION` if a previous pipeline run
      is not found for resuming.
  """

    logging.info('Received request to resume pipeline; pipeline uid: %s',
                 task_lib.PipelineUid.from_pipeline(pipeline))
    if pipeline.execution_mode != pipeline_pb2.Pipeline.SYNC:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.FAILED_PRECONDITION,
            message=(
                f'Only SYNC pipeline execution modes supported; '
                f'found pipeline with execution mode: {pipeline.execution_mode}'
            ))

    latest_pipeline_view = None
    pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
    views = pstate.PipelineView.load_all(mlmd_handle, pipeline_uid)
    for view in views:
        execution = view.execution
        if execution_lib.is_execution_active(execution):
            raise status_lib.StatusNotOkError(
                code=status_lib.Code.ALREADY_EXISTS,
                message=(
                    f'Can not resume pipeline. An active pipeline is already '
                    f'running with uid {pipeline_uid}.'))
        if (not latest_pipeline_view or execution.create_time_since_epoch >
                latest_pipeline_view.execution.create_time_since_epoch):
            latest_pipeline_view = view

    if not latest_pipeline_view:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.NOT_FOUND,
            message='Pipeline failed to resume. No previous pipeline run found.'
        )
    if latest_pipeline_view.pipeline.execution_mode != pipeline_pb2.Pipeline.SYNC:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.FAILED_PRECONDITION,
            message=
            (f'Only SYNC pipeline execution modes supported; previous pipeline '
             f'run has execution mode: '
             f'{latest_pipeline_view.pipeline.execution_mode}'))

    # Get succeeded nodes in latest pipeline run.
    latest_pipeline_node_states = latest_pipeline_view.get_node_states_dict()
    previously_succeeded_nodes = []
    for node, node_state in latest_pipeline_node_states.items():
        if node_state.is_success():
            previously_succeeded_nodes.append(node)
    pipeline_nodes = [
        node.node_info.id for node in pstate.get_all_pipeline_nodes(pipeline)
    ]
    latest_pipeline_snapshot_settings = pipeline_pb2.SnapshotSettings()
    latest_pipeline_snapshot_settings.latest_pipeline_run_strategy.SetInParent(
    )
    partial_run_option = pipeline_pb2.PartialRun(
        from_nodes=pipeline_nodes,
        to_nodes=pipeline_nodes,
        skip_nodes=previously_succeeded_nodes,
        snapshot_settings=latest_pipeline_snapshot_settings)

    return initiate_pipeline_start(mlmd_handle,
                                   pipeline,
                                   partial_run_option=partial_run_option)
Exemple #5
0
    def test_stop_initiated_pipelines(self, pipeline,
                                      mock_gen_task_from_active,
                                      mock_async_task_gen, mock_sync_task_gen):
        with self._mlmd_connection as m:
            pipeline.nodes.add().pipeline_node.node_info.id = 'ExampleGen'
            pipeline.nodes.add().pipeline_node.node_info.id = 'Transform'
            pipeline.nodes.add().pipeline_node.node_info.id = 'Trainer'
            pipeline.nodes.add().pipeline_node.node_info.id = 'Evaluator'

            mock_service_job_manager = mock.create_autospec(
                service_jobs.ServiceJobManager, instance=True)
            mock_service_job_manager.is_pure_service_node.side_effect = (
                lambda _, node_id: node_id == 'ExampleGen')
            mock_service_job_manager.is_mixed_service_node.side_effect = (
                lambda _, node_id: node_id == 'Transform')

            pipeline_ops.initiate_pipeline_start(m, pipeline)
            with pstate.PipelineState.load(
                    m, task_lib.PipelineUid.from_pipeline(
                        pipeline)) as pipeline_state:
                pipeline_state.initiate_stop(
                    status_lib.Status(code=status_lib.Code.CANCELLED))
                pipeline_execution_id = pipeline_state.execution_id

            task_queue = tq.TaskQueue()

            # For the stop-initiated pipeline, "Transform" execution task is in queue,
            # "Trainer" has an active execution in MLMD but no task in queue,
            # "Evaluator" has no active execution.
            task_queue.enqueue(
                test_utils.create_exec_node_task(
                    task_lib.NodeUid(
                        pipeline_uid=task_lib.PipelineUid.from_pipeline(
                            pipeline),
                        node_id='Transform')))
            transform_task = task_queue.dequeue(
            )  # simulates task being processed
            mock_gen_task_from_active.side_effect = [
                test_utils.create_exec_node_task(node_uid=task_lib.NodeUid(
                    pipeline_uid=task_lib.PipelineUid.from_pipeline(pipeline),
                    node_id='Trainer'),
                                                 is_cancelled=True), None,
                None, None, None
            ]

            pipeline_ops.orchestrate(m, task_queue, mock_service_job_manager)

            # There are no active pipelines so these shouldn't be called.
            mock_async_task_gen.assert_not_called()
            mock_sync_task_gen.assert_not_called()

            # stop_node_services should be called for ExampleGen which is a pure
            # service node.
            mock_service_job_manager.stop_node_services.assert_called_once_with(
                mock.ANY, 'ExampleGen')
            mock_service_job_manager.reset_mock()

            task_queue.task_done(transform_task)  # Pop out transform task.

            # CancelNodeTask for the "Transform" ExecNodeTask should be next.
            task = task_queue.dequeue()
            task_queue.task_done(task)
            self.assertTrue(task_lib.is_cancel_node_task(task))
            self.assertEqual('Transform', task.node_uid.node_id)

            # ExecNodeTask (with is_cancelled=True) for "Trainer" is next.
            task = task_queue.dequeue()
            task_queue.task_done(task)
            self.assertTrue(task_lib.is_exec_node_task(task))
            self.assertEqual('Trainer', task.node_uid.node_id)
            self.assertTrue(task.is_cancelled)

            self.assertTrue(task_queue.is_empty())

            mock_gen_task_from_active.assert_has_calls([
                mock.call(m,
                          pipeline_state.pipeline,
                          pipeline.nodes[2].pipeline_node,
                          mock.ANY,
                          is_cancelled=True),
                mock.call(m,
                          pipeline_state.pipeline,
                          pipeline.nodes[3].pipeline_node,
                          mock.ANY,
                          is_cancelled=True)
            ])
            self.assertEqual(2, mock_gen_task_from_active.call_count)

            # Pipeline execution should continue to be active since active node
            # executions were found in the last call to `orchestrate`.
            [execution] = m.store.get_executions_by_id([pipeline_execution_id])
            self.assertTrue(execution_lib.is_execution_active(execution))

            # Call `orchestrate` again; this time there are no more active node
            # executions so the pipeline should be marked as cancelled.
            pipeline_ops.orchestrate(m, task_queue, mock_service_job_manager)
            self.assertTrue(task_queue.is_empty())
            [execution] = m.store.get_executions_by_id([pipeline_execution_id])
            self.assertEqual(metadata_store_pb2.Execution.CANCELED,
                             execution.last_known_state)

            # stop_node_services should be called on both ExampleGen and Transform
            # which are service nodes.
            mock_service_job_manager.stop_node_services.assert_has_calls(
                [
                    mock.call(mock.ANY, 'ExampleGen'),
                    mock.call(mock.ANY, 'Transform')
                ],
                any_order=True)