def _generate_task(self, node: pipeline_pb2.PipelineNode) -> task_lib.Task:
  """Generates a node execution task.

  If node execution is not feasible, `None` is returned.

  Args:
    node: The pipeline node for which to generate a task.

  Returns:
    Returns an `ExecNodeTask` if node can be executed. If an error occurs,
    a `FinalizePipelineTask` is returned to abort the pipeline execution.
  """
  executions = task_gen_utils.get_executions(self._mlmd_handle, node)
  result = task_gen_utils.generate_task_from_active_execution(
      self._mlmd_handle, self._pipeline, node, executions)
  if result:
    return result

  node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
  resolved_info = task_gen_utils.generate_resolved_info(self._mlmd_handle, node)
  if resolved_info.input_artifacts is None:
    return task_lib.FinalizePipelineTask(
        pipeline_uid=self._pipeline_state.pipeline_uid,
        status=status_lib.Status(
            code=status_lib.Code.ABORTED,
            message=(f'Aborting pipeline execution due to failure to resolve '
                     f'inputs; problematic node uid: {node_uid}')))

  execution = execution_publish_utils.register_execution(
      metadata_handler=self._mlmd_handle,
      execution_type=node.node_info.type,
      contexts=resolved_info.contexts,
      input_artifacts=resolved_info.input_artifacts,
      exec_properties=resolved_info.exec_properties)
  outputs_resolver = outputs_utils.OutputsResolver(
      node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
      self._pipeline.execution_mode)

  return task_lib.ExecNodeTask(
      node_uid=node_uid,
      execution=execution,
      contexts=resolved_info.contexts,
      input_artifacts=resolved_info.input_artifacts,
      exec_properties=resolved_info.exec_properties,
      output_artifacts=outputs_resolver.generate_output_artifacts(execution.id),
      executor_output_uri=outputs_resolver.get_executor_output_uri(
          execution.id),
      stateful_working_dir=outputs_resolver.get_stateful_working_directory(
          execution.id),
      pipeline=self._pipeline)
def _abort_node_task(self,
                     node_uid: task_lib.NodeUid) -> task_lib.FinalizeNodeTask:
  """Returns task to abort the node execution."""
  logging.error('Required service node not running or healthy, node uid: %s',
                node_uid)
  return task_lib.FinalizeNodeTask(
      node_uid=node_uid,
      status=status_lib.Status(
          code=status_lib.Code.ABORTED,
          message=(f'Aborting node execution as the associated service '
                   f'job is not running or healthy; problematic node '
                   f'uid: {node_uid}')))
def schedule(self) -> ts.TaskSchedulerResult:
  task = typing.cast(task_lib.ExecNodeTask, self.task)
  logging.info('Processing ExecNodeTask: %s', task)
  executor_output = execution_result_pb2.ExecutorOutput()
  executor_output.execution_result.code = status_lib.Code.OK
  for key, artifacts in task.output_artifacts.items():
    for artifact in artifacts:
      executor_output.output_artifacts[key].artifacts.add().CopyFrom(
          artifact.mlmd_artifact)
  result = ts.TaskSchedulerResult(
      status=status_lib.Status(code=status_lib.Code.OK),
      executor_output=executor_output)
  logging.info('Result: %s', result)
  return result
def test_stop_initiation(self):
  with self._mlmd_connection as m:
    pipeline = _test_pipeline('pipeline1')
    with pstate.PipelineState.new(m, pipeline) as pipeline_state:
      self.assertIsNone(pipeline_state.stop_initiated_reason())
      status = status_lib.Status(
          code=status_lib.Code.CANCELLED, message='foo bar')
      pipeline_state.initiate_stop(status)
      self.assertEqual(status, pipeline_state.stop_initiated_reason())

    # Reload from MLMD and verify.
    with pstate.PipelineState.load(
        m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
      self.assertEqual(status, pipeline_state.stop_initiated_reason())
def stop_node(
    mlmd_handle: metadata.Metadata,
    node_uid: task_lib.NodeUid,
    timeout_secs: float = DEFAULT_WAIT_FOR_INACTIVATION_TIMEOUT_SECS) -> None:
  """Stops a node in an async pipeline.

  Initiates a node stop operation and waits for the node execution to become
  inactive.

  Args:
    mlmd_handle: A handle to the MLMD db.
    node_uid: Uid of the node to be stopped.
    timeout_secs: Amount of time in seconds to wait for node to stop.

  Raises:
    status_lib.StatusNotOkError: Failure to stop the node.
  """
  with _PIPELINE_OPS_LOCK:
    with pstate.PipelineState.load(mlmd_handle,
                                   node_uid.pipeline_uid) as pipeline_state:
      nodes = pstate.get_all_pipeline_nodes(pipeline_state.pipeline)
      filtered_nodes = [n for n in nodes if n.node_info.id == node_uid.node_id]
      if len(filtered_nodes) != 1:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.INTERNAL,
            message=(
                f'`stop_node` operation failed, unable to find node to stop: '
                f'{node_uid}'))
      node = filtered_nodes[0]
      pipeline_state.initiate_node_stop(
          node_uid,
          status_lib.Status(
              code=status_lib.Code.CANCELLED,
              message='Cancellation requested by client.'))

  executions = task_gen_utils.get_executions(mlmd_handle, node)
  active_executions = [
      e for e in executions if execution_lib.is_execution_active(e)
  ]
  if not active_executions:
    # If there are no active executions, we're done.
    return
  if len(active_executions) > 1:
    raise status_lib.StatusNotOkError(
        code=status_lib.Code.INTERNAL,
        message=(
            f'Unexpected multiple active executions for node: {node_uid}'))
  _wait_for_inactivation(
      mlmd_handle, active_executions[0], timeout_secs=timeout_secs)
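The `_wait_for_inactivation` helper called above is not included in this excerpt. A minimal sketch of what such a helper might look like, assuming a simple MLMD polling loop and a hypothetical `_POLLING_INTERVAL_SECS` constant, is:

import time

_POLLING_INTERVAL_SECS = 10  # Hypothetical constant; not from the original code.


def _wait_for_inactivation(mlmd_handle: metadata.Metadata,
                           execution: metadata_store_pb2.Execution,
                           timeout_secs: float) -> None:
  """Waits for the execution to become inactive (illustrative sketch only)."""
  deadline = time.time() + timeout_secs
  while time.time() < deadline:
    # Re-read the execution from MLMD to observe state changes made elsewhere.
    [updated_execution] = mlmd_handle.store.get_executions_by_id(
        [execution.id])
    if not execution_lib.is_execution_active(updated_execution):
      return
    time.sleep(_POLLING_INTERVAL_SECS)
  raise status_lib.StatusNotOkError(
      code=status_lib.Code.DEADLINE_EXCEEDED,
      message=f'Timed out ({timeout_secs} secs) waiting for inactivation.')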
def test_scheduler_failure(self):
  # Register a fake task scheduler that returns a failure status.
  self._register_task_scheduler(
      ts.TaskSchedulerResult(
          status=status_lib.Status(code=status_lib.Code.ABORTED),
          executor_output=None))
  task_manager = self._run_task_manager()
  self.assertTrue(task_manager.done())
  self.assertIsNone(task_manager.exception())

  # Check that the task was processed and MLMD execution marked failed.
  self.assertTrue(self._task_queue.is_empty())
  execution = self._get_execution()
  self.assertEqual(metadata_store_pb2.Execution.FAILED,
                   execution.last_known_state)
def test_successful_execution(self):
  # Register a fake task scheduler that returns a successful execution result
  # and `OK` task scheduler status.
  self._register_task_scheduler(
      ts.TaskSchedulerResult(
          status=status_lib.Status(code=status_lib.Code.OK),
          executor_output=_make_executor_output(self._task, code=0)))
  task_manager = self._run_task_manager()
  self.assertTrue(task_manager.done())
  self.assertIsNone(task_manager.exception())

  # Check that the task was processed and MLMD execution marked successful.
  self.assertTrue(self._task_queue.is_empty())
  execution = self._get_execution()
  self.assertEqual(metadata_store_pb2.Execution.COMPLETE,
                   execution.last_known_state)
def generate(self) -> List[task_lib.Task]:
  """Generates tasks for all executable nodes in the async pipeline.

  The returned tasks must have `exec_task` populated. List may be empty if no
  nodes are ready for execution.

  Returns:
    A `list` of tasks to execute.
  """
  result = []
  for node in [n.pipeline_node for n in self._pipeline.nodes]:
    node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
    node_id = node.node_info.id
    if node_id in self._ignore_node_ids:
      logging.info('Ignoring node for task generation: %s', node_uid)
      continue
    if self._service_job_manager.is_pure_service_node(self._pipeline_state,
                                                      node_id):
      service_status = self._service_job_manager.ensure_node_services(
          self._pipeline_state, node_id)
      if service_status != service_jobs.ServiceStatus.RUNNING:
        logging.error(
            'Required service node not running or healthy, node uid: %s',
            node_uid)
        result.append(
            task_lib.FinalizeNodeTask(
                node_uid=node_uid,
                status=status_lib.Status(
                    code=status_lib.Code.ABORTED,
                    message=(
                        f'Aborting node execution as the associated service '
                        f'job is not running or healthy; problematic node '
                        f'uid: {node_uid}'))))
      continue
    # If a task for the node is already tracked by the task queue, it need
    # not be considered for generation again.
    if self._is_task_id_tracked_fn(
        task_lib.exec_node_task_id_from_pipeline_node(self._pipeline, node)):
      continue
    task = self._generate_task(self._mlmd_handle, node)
    if task:
      result.append(task)
  return result
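For context, a rough sketch of how an orchestration loop could consume the generated task list; the helper names below (`_enqueue_generated_tasks`, `_handle_finalization`) are illustrative and not part of the original code:

def _enqueue_generated_tasks(generator, task_queue: tq.TaskQueue) -> None:
  """Drains generator output into the task queue (illustrative sketch only)."""
  for task in generator.generate():
    if task_lib.is_exec_node_task(task):
      # ExecNodeTasks are handed off to the task manager via the queue.
      task_queue.enqueue(task)
    else:
      # FinalizeNodeTask / FinalizePipelineTask are acted upon by the
      # orchestrator itself (e.g. by initiating node or pipeline stop).
      _handle_finalization(task)  # Hypothetical handler.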
def test_scheduler_failure(self):
  # Register a fake task scheduler that returns a failure status.
  self._register_task_scheduler(
      ts.TaskSchedulerResult(
          status=status_lib.Status(
              code=status_lib.Code.ABORTED, message='foobar error'),
          executor_output=None))
  task_manager = self._run_task_manager()
  self.assertTrue(task_manager.done())
  self.assertIsNone(task_manager.exception())

  # Check that the task was processed and MLMD execution marked failed.
  self.assertTrue(self._task_queue.is_empty())
  execution = self._get_execution()
  self.assertEqual(metadata_store_pb2.Execution.FAILED,
                   execution.last_known_state)
  self.assertEqual(
      'foobar error',
      data_types_utils.get_metadata_value(
          execution.custom_properties[constants.EXECUTION_ERROR_MSG_KEY]))
def node_stop_initiated_reason(
    self, node_uid: task_lib.NodeUid) -> Optional[status_lib.Status]:
  """Returns status object if node stop initiated, `None` otherwise."""
  if node_uid.pipeline_uid != self.pipeline_uid:
    raise RuntimeError(
        f'Node given by uid {node_uid} does not belong to pipeline given '
        f'by uid {self.pipeline_uid}')
  custom_properties = self.execution.custom_properties
  if _get_metadata_value(
      custom_properties.get(_node_stop_initiated_property(node_uid))) == 1:
    code = _get_metadata_value(
        custom_properties.get(_node_status_code_property(node_uid)))
    if code is None:
      code = status_lib.Code.UNKNOWN
    message = _get_metadata_value(
        custom_properties.get(_node_status_msg_property(node_uid)))
    return status_lib.Status(code=code, message=message)
  else:
    return None
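The read path above implies a corresponding write path. A minimal sketch of how `initiate_node_stop` might record these custom properties, assuming the same `_node_*_property` helpers and direct MLMD proto field writes (the real implementation may use a setter helper instead), is:

def initiate_node_stop(self, node_uid: task_lib.NodeUid,
                       status: status_lib.Status) -> None:
  """Records node stop initiation in custom properties (illustrative sketch)."""
  if node_uid.pipeline_uid != self.pipeline_uid:
    raise RuntimeError(
        f'Node given by uid {node_uid} does not belong to pipeline given '
        f'by uid {self.pipeline_uid}')
  custom_properties = self.execution.custom_properties
  # Direct proto writes shown for illustration only.
  custom_properties[_node_stop_initiated_property(node_uid)].int_value = 1
  custom_properties[_node_status_code_property(node_uid)].int_value = int(
      status.code)
  if status.message:
    custom_properties[_node_status_msg_property(node_uid)].string_value = (
        status.message)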
def _process_exec_node_task(self, scheduler: ts.TaskScheduler,
                            task: task_lib.ExecNodeTask) -> None:
  """Processes an `ExecNodeTask` using the given task scheduler."""
  # This is a blocking call to the scheduler which can take a long time to
  # complete for some types of task schedulers. The scheduler is expected to
  # handle any internal errors gracefully and return the result with an error
  # status. But in case the scheduler raises an exception, it is considered
  # a failed execution and MLMD is updated accordingly.
  try:
    result = scheduler.schedule()
  except Exception as e:  # pylint: disable=broad-except
    logging.info(
        'Exception raised by task scheduler for node uid %s; error: %s',
        task.node_uid, e)
    result = ts.TaskSchedulerResult(
        status=status_lib.Status(
            code=status_lib.Code.ABORTED, message=str(e)))
  _publish_execution_results(
      mlmd_handle=self._mlmd_handle, task=task, result=result)
  with self._tm_lock:
    del self._scheduler_by_node_uid[task.node_uid]
    self._task_queue.task_done(task)
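For reference, a loose sketch of how a task manager loop might hand dequeued tasks to `_process_exec_node_task`; the names `_stop_event`, `_scheduler_for_task` and `_ts_executor` are illustrative, not the actual implementation:

def _dispatch_tasks(self) -> None:
  """Dispatches dequeued ExecNodeTasks to schedulers (illustrative sketch)."""
  while not self._stop_event.is_set():  # Hypothetical stop signal.
    task = self._task_queue.dequeue()
    if task is None or not task_lib.is_exec_node_task(task):
      continue
    scheduler = self._scheduler_for_task(task)  # Hypothetical factory.
    with self._tm_lock:
      self._scheduler_by_node_uid[task.node_uid] = scheduler
    # Run on a pool thread since scheduler.schedule() may block for a while.
    self._ts_executor.submit(self._process_exec_node_task, scheduler, task)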
def test_stop_pipeline_non_existent_or_inactive(self, pipeline):
  with self._mlmd_connection as m:
    # Stop pipeline without creating one.
    with self.assertRaises(status_lib.StatusNotOkError) as exception_context:
      pipeline_ops.stop_pipeline(
          m, task_lib.PipelineUid.from_pipeline(pipeline))
    self.assertEqual(status_lib.Code.NOT_FOUND,
                     exception_context.exception.code)

    # Initiate pipeline start and mark it completed.
    execution = pipeline_ops.initiate_pipeline_start(m, pipeline).execution
    pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
    with pstate.PipelineState.load(m, pipeline_uid) as pipeline_state:
      pipeline_state.initiate_stop(status_lib.Status(code=status_lib.Code.OK))
    execution.last_known_state = metadata_store_pb2.Execution.COMPLETE
    m.store.put_executions([execution])

    # Try to initiate stop again.
    with self.assertRaises(status_lib.StatusNotOkError) as exception_context:
      pipeline_ops.stop_pipeline(m, pipeline_uid)
    self.assertEqual(status_lib.Code.NOT_FOUND,
                     exception_context.exception.code)
def test_handling_finalize_node_task(self, task_gen):
  with self._mlmd_connection as m:
    pipeline = _test_pipeline('pipeline1')
    pipeline.nodes.add().pipeline_node.node_info.id = 'Transform'
    pipeline.nodes.add().pipeline_node.node_info.id = 'Trainer'
    pipeline_ops.initiate_pipeline_start(m, pipeline)
    pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
    finalize_reason = status_lib.Status(
        code=status_lib.Code.ABORTED, message='foo bar')
    task_gen.return_value.generate.side_effect = [
        [
            test_utils.create_exec_node_task(
                task_lib.NodeUid(
                    pipeline_uid=pipeline_uid, node_id='Transform')),
            task_lib.FinalizeNodeTask(
                node_uid=task_lib.NodeUid(
                    pipeline_uid=pipeline_uid, node_id='Trainer'),
                status=finalize_reason)
        ],
    ]

    task_queue = tq.TaskQueue()
    pipeline_ops.orchestrate(m, task_queue,
                             service_jobs.DummyServiceJobManager())
    task_gen.return_value.generate.assert_called_once()
    task = task_queue.dequeue()
    task_queue.task_done(task)
    self.assertTrue(task_lib.is_exec_node_task(task))
    self.assertEqual(
        test_utils.create_node_uid('pipeline1', 'Transform'), task.node_uid)

    # Load pipeline state and verify node stop initiation.
    with pstate.PipelineState.load(m, pipeline_uid) as pipeline_state:
      self.assertEqual(
          finalize_reason,
          pipeline_state.node_stop_initiated_reason(
              task_lib.NodeUid(pipeline_uid=pipeline_uid, node_id='Trainer')))
def _publish_execution_results(mlmd_handle: metadata.Metadata,
                               task: task_lib.ExecNodeTask,
                               result: ts.TaskSchedulerResult) -> None:
  """Publishes execution results to MLMD."""

  def _update_state(status: status_lib.Status) -> None:
    assert status.code != status_lib.Code.OK
    if status.code == status_lib.Code.CANCELLED:
      logging.info('Cancelling execution (id: %s); task id: %s; status: %s',
                   task.execution.id, task.task_id, status)
      execution_state = metadata_store_pb2.Execution.CANCELED
    else:
      logging.info(
          'Aborting execution (id: %s) due to error (code: %s); task id: %s',
          task.execution.id, status.code, task.task_id)
      execution_state = metadata_store_pb2.Execution.FAILED
    _update_execution_state_in_mlmd(mlmd_handle, task.execution,
                                    execution_state, status.message)

  if result.status.code != status_lib.Code.OK:
    _update_state(result.status)
    return

  publish_params = dict(output_artifacts=task.output_artifacts)
  if result.output_artifacts is not None:
    publish_params['output_artifacts'] = result.output_artifacts
  elif result.executor_output is not None:
    if result.executor_output.execution_result.code != status_lib.Code.OK:
      _update_state(
          status_lib.Status(
              code=result.executor_output.execution_result.code,
              message=result.executor_output.execution_result.result_message))
      return
    publish_params['executor_output'] = result.executor_output
  execution_publish_utils.publish_succeeded_execution(
      mlmd_handle, task.execution.id, task.contexts, **publish_params)
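`_update_execution_state_in_mlmd` is referenced here but not shown in this excerpt. A minimal sketch consistent with the four-argument call site above, and with the error message being surfaced under `constants.EXECUTION_ERROR_MSG_KEY` as asserted in the scheduler-failure test, might look like:

def _update_execution_state_in_mlmd(
    mlmd_handle: metadata.Metadata,
    execution: metadata_store_pb2.Execution,
    new_state: metadata_store_pb2.Execution.State,
    error_msg: str) -> None:
  """Sets execution state and error message in MLMD (illustrative sketch)."""
  updated_execution = metadata_store_pb2.Execution()
  updated_execution.CopyFrom(execution)
  updated_execution.last_known_state = new_state
  if error_msg:
    # Recorded so that callers (e.g. task generators and tests) can read back
    # the failure reason from the execution.
    updated_execution.custom_properties[
        constants.EXECUTION_ERROR_MSG_KEY].string_value = error_msg
  mlmd_handle.store.put_executions([updated_execution])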
def _publish_execution_results(mlmd_handle: metadata.Metadata,
                               task: task_lib.ExecNodeTask,
                               result: ts.TaskSchedulerResult) -> None:
  """Publishes execution results to MLMD."""

  def _update_state(status: status_lib.Status) -> None:
    assert status.code != status_lib.Code.OK
    if status.code == status_lib.Code.CANCELLED:
      execution_state = metadata_store_pb2.Execution.CANCELED
      state_msg = 'cancelled'
    else:
      execution_state = metadata_store_pb2.Execution.FAILED
      state_msg = 'failed'
    logging.info(
        'Got error (status: %s) for task id: %s; marking execution (id: %s) '
        'as %s.', status, task.task_id, task.execution.id, state_msg)
    # TODO(goutham): Also record error code and error message as custom
    # property of the execution.
    _update_execution_state_in_mlmd(mlmd_handle, task.execution,
                                    execution_state)

  if result.status.code != status_lib.Code.OK:
    _update_state(result.status)
    return
  if (result.executor_output and
      result.executor_output.execution_result.code != status_lib.Code.OK):
    _update_state(
        status_lib.Status(
            code=result.executor_output.execution_result.code,
            message=result.executor_output.execution_result.result_message))
    return
  execution_publish_utils.publish_succeeded_execution(mlmd_handle,
                                                      task.execution.id,
                                                      task.contexts,
                                                      task.output_artifacts,
                                                      result.executor_output)
def test_handling_finalize_pipeline_task(self, task_gen):
  with self._mlmd_connection as m:
    pipeline = _test_pipeline('pipeline1', pipeline_pb2.Pipeline.SYNC)
    pipeline_ops.initiate_pipeline_start(m, pipeline)
    pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
    finalize_reason = status_lib.Status(
        code=status_lib.Code.ABORTED, message='foo bar')
    task_gen.return_value.generate.side_effect = [
        [
            task_lib.FinalizePipelineTask(
                pipeline_uid=pipeline_uid, status=finalize_reason)
        ],
    ]

    task_queue = tq.TaskQueue()
    pipeline_ops.orchestrate(m, task_queue,
                             service_jobs.DummyServiceJobManager())
    task_gen.return_value.generate.assert_called_once()
    self.assertTrue(task_queue.is_empty())

    # Load pipeline state and verify stop initiation.
    with pstate.PipelineState.load(m, pipeline_uid) as pipeline_state:
      self.assertEqual(finalize_reason,
                       pipeline_state.stop_initiated_reason())
def test_initiate_node_start_stop(self):
  with self._mlmd_connection as m:
    pipeline = _test_pipeline('pipeline1')
    node_uid = task_lib.NodeUid(
        node_id='Trainer',
        pipeline_uid=task_lib.PipelineUid.from_pipeline(pipeline))
    with pstate.PipelineState.new(m, pipeline) as pipeline_state:
      pipeline_state.initiate_node_start(node_uid)
      self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))

    # Reload from MLMD and verify node is started.
    with pstate.PipelineState.load(
        m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
      self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))

      # Stop the node.
      status = status_lib.Status(
          code=status_lib.Code.ABORTED, message='foo bar')
      pipeline_state.initiate_node_stop(node_uid, status)
      self.assertEqual(status,
                       pipeline_state.node_stop_initiated_reason(node_uid))

    # Reload from MLMD and verify node is stopped.
    with pstate.PipelineState.load(
        m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
      self.assertEqual(status,
                       pipeline_state.node_stop_initiated_reason(node_uid))

      # Restart node.
      pipeline_state.initiate_node_start(node_uid)
      self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))

    # Reload from MLMD and verify node is started.
    with pstate.PipelineState.load(
        m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
      self.assertIsNone(pipeline_state.node_stop_initiated_reason(node_uid))
def test_active_pipelines_with_stop_initiated_nodes(self,
                                                    mock_gen_task_from_active,
                                                    mock_async_task_gen):
  with self._mlmd_connection as m:
    pipeline = _test_pipeline('pipeline')
    pipeline.nodes.add().pipeline_node.node_info.id = 'ExampleGen'
    pipeline.nodes.add().pipeline_node.node_info.id = 'Transform'
    pipeline.nodes.add().pipeline_node.node_info.id = 'Trainer'
    pipeline.nodes.add().pipeline_node.node_info.id = 'Evaluator'

    mock_service_job_manager = mock.create_autospec(
        service_jobs.ServiceJobManager, instance=True)
    mock_service_job_manager.is_pure_service_node.side_effect = (
        lambda _, node_id: node_id == 'ExampleGen')
    example_gen_node_uid = task_lib.NodeUid.from_pipeline_node(
        pipeline, pipeline.nodes[0].pipeline_node)

    transform_node_uid = task_lib.NodeUid.from_pipeline_node(
        pipeline, pipeline.nodes[1].pipeline_node)
    transform_task = test_utils.create_exec_node_task(
        node_uid=transform_node_uid)

    trainer_node_uid = task_lib.NodeUid.from_pipeline_node(
        pipeline, pipeline.nodes[2].pipeline_node)
    trainer_task = test_utils.create_exec_node_task(node_uid=trainer_node_uid)

    evaluator_node_uid = task_lib.NodeUid.from_pipeline_node(
        pipeline, pipeline.nodes[3].pipeline_node)
    evaluator_task = test_utils.create_exec_node_task(
        node_uid=evaluator_node_uid)
    cancelled_evaluator_task = test_utils.create_exec_node_task(
        node_uid=evaluator_node_uid, is_cancelled=True)

    pipeline_ops.initiate_pipeline_start(m, pipeline)
    with pstate.PipelineState.load(
        m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
      # Stop example-gen, trainer and evaluator.
      pipeline_state.initiate_node_stop(
          example_gen_node_uid,
          status_lib.Status(code=status_lib.Code.CANCELLED))
      pipeline_state.initiate_node_stop(
          trainer_node_uid, status_lib.Status(code=status_lib.Code.CANCELLED))
      pipeline_state.initiate_node_stop(
          evaluator_node_uid, status_lib.Status(code=status_lib.Code.ABORTED))

    task_queue = tq.TaskQueue()

    # Simulate a new transform execution being triggered.
    mock_async_task_gen.return_value.generate.return_value = [transform_task]
    # Simulate ExecNodeTask for trainer already present in the task queue.
    task_queue.enqueue(trainer_task)
    # Simulate Evaluator having an active execution in MLMD.
    mock_gen_task_from_active.side_effect = [evaluator_task]

    pipeline_ops.orchestrate(m, task_queue, mock_service_job_manager)
    self.assertEqual(1, mock_async_task_gen.return_value.generate.call_count)

    # stop_node_services should be called on example-gen which is a pure
    # service node.
    mock_service_job_manager.stop_node_services.assert_called_once_with(
        mock.ANY, 'ExampleGen')

    # Verify that tasks are enqueued in the expected order:

    # Pre-existing trainer task.
    task = task_queue.dequeue()
    task_queue.task_done(task)
    self.assertEqual(trainer_task, task)

    # CancelNodeTask for trainer.
    task = task_queue.dequeue()
    task_queue.task_done(task)
    self.assertTrue(task_lib.is_cancel_node_task(task))
    self.assertEqual(trainer_node_uid, task.node_uid)

    # ExecNodeTask with is_cancelled=True for evaluator.
    task = task_queue.dequeue()
    task_queue.task_done(task)
    self.assertTrue(cancelled_evaluator_task, task)

    # ExecNodeTask for newly triggered transform node.
    task = task_queue.dequeue()
    task_queue.task_done(task)
    self.assertEqual(transform_task, task)

    # No more tasks.
    self.assertTrue(task_queue.is_empty())
def test_stop_initiated_pipelines(self, pipeline, mock_gen_task_from_active,
                                  mock_async_task_gen, mock_sync_task_gen):
  with self._mlmd_connection as m:
    pipeline.nodes.add().pipeline_node.node_info.id = 'ExampleGen'
    pipeline.nodes.add().pipeline_node.node_info.id = 'Transform'
    pipeline.nodes.add().pipeline_node.node_info.id = 'Trainer'
    pipeline.nodes.add().pipeline_node.node_info.id = 'Evaluator'

    mock_service_job_manager = mock.create_autospec(
        service_jobs.ServiceJobManager, instance=True)
    mock_service_job_manager.is_pure_service_node.side_effect = (
        lambda _, node_id: node_id == 'ExampleGen')
    mock_service_job_manager.is_mixed_service_node.side_effect = (
        lambda _, node_id: node_id == 'Transform')

    pipeline_ops.initiate_pipeline_start(m, pipeline)
    with pstate.PipelineState.load(
        m, task_lib.PipelineUid.from_pipeline(pipeline)) as pipeline_state:
      pipeline_state.initiate_stop(
          status_lib.Status(code=status_lib.Code.CANCELLED))
    pipeline_execution = pipeline_state.execution

    task_queue = tq.TaskQueue()

    # For the stop-initiated pipeline, "Transform" execution task is in queue,
    # "Trainer" has an active execution in MLMD but no task in queue,
    # "Evaluator" has no active execution.
    task_queue.enqueue(
        test_utils.create_exec_node_task(
            task_lib.NodeUid(
                pipeline_uid=task_lib.PipelineUid.from_pipeline(pipeline),
                node_id='Transform')))
    transform_task = task_queue.dequeue()  # simulates task being processed
    mock_gen_task_from_active.side_effect = [
        test_utils.create_exec_node_task(
            node_uid=task_lib.NodeUid(
                pipeline_uid=task_lib.PipelineUid.from_pipeline(pipeline),
                node_id='Trainer'),
            is_cancelled=True), None, None, None, None
    ]

    pipeline_ops.orchestrate(m, task_queue, mock_service_job_manager)

    # There are no active pipelines so these shouldn't be called.
    mock_async_task_gen.assert_not_called()
    mock_sync_task_gen.assert_not_called()

    # stop_node_services should be called for ExampleGen which is a pure
    # service node.
    mock_service_job_manager.stop_node_services.assert_called_once_with(
        mock.ANY, 'ExampleGen')
    mock_service_job_manager.reset_mock()

    task_queue.task_done(transform_task)  # Pop out transform task.

    # CancelNodeTask for the "Transform" ExecNodeTask should be next.
    task = task_queue.dequeue()
    task_queue.task_done(task)
    self.assertTrue(task_lib.is_cancel_node_task(task))
    self.assertEqual('Transform', task.node_uid.node_id)

    # ExecNodeTask (with is_cancelled=True) for "Trainer" is next.
    task = task_queue.dequeue()
    task_queue.task_done(task)
    self.assertTrue(task_lib.is_exec_node_task(task))
    self.assertEqual('Trainer', task.node_uid.node_id)
    self.assertTrue(task.is_cancelled)

    self.assertTrue(task_queue.is_empty())

    mock_gen_task_from_active.assert_has_calls([
        mock.call(
            m,
            pipeline_state.pipeline,
            pipeline.nodes[2].pipeline_node,
            mock.ANY,
            is_cancelled=True),
        mock.call(
            m,
            pipeline_state.pipeline,
            pipeline.nodes[3].pipeline_node,
            mock.ANY,
            is_cancelled=True)
    ])
    self.assertEqual(2, mock_gen_task_from_active.call_count)

    # Pipeline execution should continue to be active since active node
    # executions were found in the last call to `orchestrate`.
    [execution] = m.store.get_executions_by_id([pipeline_execution.id])
    self.assertTrue(execution_lib.is_execution_active(execution))

    # Call `orchestrate` again; this time there are no more active node
    # executions so the pipeline should be marked as cancelled.
    pipeline_ops.orchestrate(m, task_queue, mock_service_job_manager)
    self.assertTrue(task_queue.is_empty())
    [execution] = m.store.get_executions_by_id([pipeline_execution.id])
    self.assertEqual(metadata_store_pb2.Execution.CANCELED,
                     execution.last_known_state)

    # stop_node_services should be called on both ExampleGen and Transform
    # which are service nodes.
    mock_service_job_manager.stop_node_services.assert_has_calls(
        [mock.call(mock.ANY, 'ExampleGen'),
         mock.call(mock.ANY, 'Transform')],
        any_order=True)
def generate(self) -> List[task_lib.Task]:
  """Generates tasks for executing the next executable nodes in the pipeline.

  The returned tasks must have `exec_task` populated. List may be empty if no
  nodes are ready for execution.

  Returns:
    A `list` of tasks to execute.
  """
  layers = topsort.topsorted_layers(
      [node.pipeline_node for node in self._pipeline.nodes],
      get_node_id_fn=lambda node: node.node_info.id,
      get_parent_nodes=(
          lambda node: [self._node_map[n] for n in node.upstream_nodes]),
      get_child_nodes=(
          lambda node: [self._node_map[n] for n in node.downstream_nodes]))
  result = []
  successful_node_ids = set()
  for layer_num, layer_nodes in enumerate(layers):
    for node in layer_nodes:
      node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
      node_id = node.node_info.id

      if self._in_successful_nodes_cache(node_uid):
        successful_node_ids.add(node_id)
        continue

      if not self._upstream_nodes_successful(node, successful_node_ids):
        continue

      # If this is a pure service node, there is no ExecNodeTask to generate
      # but we ensure node services and check service status.
      service_status = self._ensure_node_services_if_pure(node_id)
      if service_status is not None:
        if service_status == service_jobs.ServiceStatus.FAILED:
          return [
              self._abort_task(f'service job failed; node uid: {node_uid}')
          ]
        if service_status == service_jobs.ServiceStatus.SUCCESS:
          logging.info('Service node successful: %s', node_uid)
          successful_node_ids.add(node_id)
        continue

      # If a task for the node is already tracked by the task queue, it need
      # not be considered for generation again but we ensure node services
      # in case of a mixed service node.
      if self._is_task_id_tracked_fn(
          task_lib.exec_node_task_id_from_pipeline_node(self._pipeline, node)):
        service_status = self._ensure_node_services_if_mixed(node_id)
        if service_status == service_jobs.ServiceStatus.FAILED:
          return [
              self._abort_task(
                  f'associated service job failed; node uid: {node_uid}')
          ]
        continue

      node_executions = task_gen_utils.get_executions(self._mlmd_handle, node)
      latest_execution = task_gen_utils.get_latest_execution(node_executions)

      # If the latest execution is successful, we're done.
      if latest_execution and execution_lib.is_execution_successful(
          latest_execution):
        logging.info('Node successful: %s', node_uid)
        successful_node_ids.add(node_id)
        continue

      # If the latest execution failed, the pipeline should be aborted.
      if latest_execution and not execution_lib.is_execution_active(
          latest_execution):
        error_msg_value = latest_execution.custom_properties.get(
            constants.EXECUTION_ERROR_MSG_KEY)
        error_msg = data_types_utils.get_metadata_value(
            error_msg_value) if error_msg_value else ''
        return [
            self._abort_task(
                f'node failed; node uid: {node_uid}; error: {error_msg}')
        ]

      # Finally, we are ready to generate an ExecNodeTask for the node.
      task = self._maybe_generate_task(node, node_executions,
                                       successful_node_ids)
      if task:
        if task_lib.is_finalize_pipeline_task(task):
          return [task]
        else:
          result.append(task)

    layer_node_ids = set(node.node_info.id for node in layer_nodes)
    successful_layer_node_ids = layer_node_ids & successful_node_ids
    self._update_successful_nodes_cache(successful_layer_node_ids)

    # If all nodes in the final layer are completed successfully, the
    # pipeline can be finalized.
    # TODO(goutham): If there are conditional eval nodes, not all nodes may
    # be executed in the final layer. Handle this case when conditionals are
    # supported.
    if (layer_num == len(layers) - 1 and
        successful_layer_node_ids == layer_node_ids):
      return [
          task_lib.FinalizePipelineTask(
              pipeline_uid=self._pipeline_uid,
              status=status_lib.Status(code=status_lib.Code.OK))
      ]
  return result
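`self._abort_task` is used throughout `generate()` but not defined in this excerpt. Given `self._pipeline_uid` and the `FinalizePipelineTask` pattern used elsewhere in this section, a plausible sketch is:

def _abort_task(self, error_msg: str) -> task_lib.FinalizePipelineTask:
  """Returns a task to abort pipeline execution (illustrative sketch only)."""
  error_msg = f'Aborting pipeline execution; error: {error_msg}'
  logging.error(error_msg)
  return task_lib.FinalizePipelineTask(
      pipeline_uid=self._pipeline_uid,
      status=status_lib.Status(
          code=status_lib.Code.ABORTED, message=error_msg))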
def generate(self) -> List[task_lib.Task]:
  """Generates tasks for executing the next executable nodes in the pipeline.

  The returned tasks must have `exec_task` populated. List may be empty if no
  nodes are ready for execution.

  Returns:
    A `list` of tasks to execute.
  """
  layers = topsort.topsorted_layers(
      [node.pipeline_node for node in self._pipeline.nodes],
      get_node_id_fn=lambda node: node.node_info.id,
      get_parent_nodes=(
          lambda node: [self._node_map[n] for n in node.upstream_nodes]),
      get_child_nodes=(
          lambda node: [self._node_map[n] for n in node.downstream_nodes]))
  result = []
  for layer_num, nodes in enumerate(layers):
    # Ids of the nodes in the current layer that completed successfully.
    completed_node_ids = set()
    for node in nodes:
      node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
      node_id = node.node_info.id
      if self._service_job_manager.is_pure_service_node(
          self._pipeline_state, node.node_info.id):
        if not self._upstream_nodes_executed(node):
          continue
        service_status = self._service_job_manager.ensure_node_services(
            self._pipeline_state, node_id)
        if service_status == service_jobs.ServiceStatus.SUCCESS:
          logging.info('Service node completed successfully: %s', node_uid)
          completed_node_ids.add(node_id)
        elif service_status == service_jobs.ServiceStatus.FAILED:
          logging.error('Failed service node: %s', node_uid)
          return [
              task_lib.FinalizePipelineTask(
                  pipeline_uid=self._pipeline_state.pipeline_uid,
                  status=status_lib.Status(
                      code=status_lib.Code.ABORTED,
                      message=(f'Aborting pipeline execution due to service '
                               f'node failure; failed node uid: {node_uid}')))
          ]
        else:
          logging.info('Pure service node in progress: %s', node_uid)
        continue
      # If a task for the node is already tracked by the task queue, it need
      # not be considered for generation again.
      if self._is_task_id_tracked_fn(
          task_lib.exec_node_task_id_from_pipeline_node(self._pipeline, node)):
        continue
      executions = task_gen_utils.get_executions(self._mlmd_handle, node)
      if (executions and
          task_gen_utils.is_latest_execution_successful(executions)):
        completed_node_ids.add(node_id)
        continue
      # If all upstream nodes are executed but current node is not executed,
      # the node is deemed ready for execution.
      if self._upstream_nodes_executed(node):
        task = self._generate_task(node)
        if task_lib.is_finalize_pipeline_task(task):
          return [task]
        else:
          result.append(task)
    # If there are no completed nodes in the current layer, downstream nodes
    # need not be checked.
    if not completed_node_ids:
      break
    # If all nodes in the final layer are completed successfully, the
    # pipeline can be finalized.
    # TODO(goutham): If there are conditional eval nodes, not all nodes may be
    # executed in the final layer. Handle this case when conditionals are
    # supported.
    if layer_num == len(layers) - 1 and completed_node_ids == set(
        node.node_info.id for node in nodes):
      return [
          task_lib.FinalizePipelineTask(
              pipeline_uid=self._pipeline_state.pipeline_uid,
              status=status_lib.Status(code=status_lib.Code.OK))
      ]
  return result