def test_pipeline_failure_strategies(self, fail_fast):
  """Checks how a node failure aborts the pipeline under each strategy.

  Args:
    fail_fast: Forwarded to the task generator. When truthy the test expects
      an ABORTED finalize task right after example-validator fails; otherwise
      it expects trainer, chore_a and chore_b to run first.
  """
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                  1)
  for upstream_node in (self._stats_gen, self._schema_gen):
    self._run_next(False, expect_nodes=[upstream_node], fail_fast=fail_fast)

  # Example-validator and transform become runnable together.
  ev_task, tf_task = self._generate(False, True, fail_fast=fail_fast)
  self.assertEqual(self._example_validator.node_info.id,
                   ev_task.node_uid.node_id)
  self.assertEqual(self._transform.node_info.id, tf_task.node_uid.node_id)

  # Transform succeeds...
  self._finish_node_execution(False, tf_task)

  # ...while the example-validator execution is marked FAILED with an error.
  with self._mlmd_connection as m:
    with mlmd_state.mlmd_execution_atomic_op(
        m, ev_task.execution_id) as failed_exec:
      failed_exec.last_known_state = metadata_store_pb2.Execution.FAILED
      error_property = failed_exec.custom_properties[
          constants.EXECUTION_ERROR_MSG_KEY]
      data_types_utils.set_metadata_value(error_property,
                                          'example-validator error')

  if not fail_fast:
    # Without fail-fast, nodes downstream of transform still execute because
    # example-validator is not upstream of them. The run is only aborted once
    # no further progress can be made.
    for runnable_node in (self._trainer, self._chore_a, self._chore_b):
      self._run_next(False, expect_nodes=[runnable_node], fail_fast=fail_fast)

  # With either strategy the run must end with an ABORTED finalize task.
  (finalize_task,) = self._generate(False, True, fail_fast=fail_fast)
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
def test_service_job_failed(self):
  """Checks that a failed example-gen service job aborts the pipeline."""

  def _report_failed_service(unused_pipeline_state, node_id):
    # Only the example-gen node is expected to be asked about its services.
    self.assertEqual('my_example_gen', node_id)
    return service_jobs.ServiceStatus.FAILED

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _report_failed_service)

  generated_tasks, _ = self._generate_and_test(
      True,
      num_initial_executions=0,
      num_tasks_generated=1,
      num_new_executions=0,
      num_active_executions=0)
  # Exactly one task is expected; unpacking fails otherwise.
  (finalize_task,) = generated_tasks

  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
  self.assertRegexMatch(finalize_task.status.message, ['service job failed'])
def test_node_failed(self, use_task_queue):
  """Checks that a failed node execution yields an ABORTED finalize task.

  Args:
    use_task_queue: Passed to the first `_generate_and_test` call; when truthy
      the stats-gen task is also dequeued and marked done after the failure.
  """
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  def _report_successful_service(unused_pipeline_state, node_id):
    self.assertEqual(self._example_gen.node_info.id, node_id)
    return service_jobs.ServiceStatus.SUCCESS

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _report_successful_service)

  # The first generation should produce a single stats-gen task.
  generated, active = self._generate_and_test(
      use_task_queue,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  expected_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline,
                                                     self._stats_gen)
  self.assertEqual(expected_uid, generated[0].node_uid)

  # Mark the stats-gen execution FAILED, attach an error and persist it.
  failed_exec = active[0]
  failed_exec.last_known_state = metadata_store_pb2.Execution.FAILED
  error_property = failed_exec.custom_properties[
      constants.EXECUTION_ERROR_MSG_KEY]
  data_types_utils.set_metadata_value(error_property, 'foobar error')
  with self._mlmd_connection as m:
    m.store.put_executions([failed_exec])

  if use_task_queue:
    self._task_queue.task_done(self._task_queue.dequeue())

  # The next generation must produce only the FinalizePipelineTask.
  generated, _ = self._generate_and_test(
      True,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=0,
      num_active_executions=0)
  self.assertLen(generated, 1)
  finalize_task = generated[0]
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
  self.assertRegexMatch(finalize_task.status.message, ['foobar error'])
def test_node_failed(self, fail_fast):
  """Checks the tasks generated after a node execution is marked FAILED.

  Args:
    fail_fast: Forwarded unchanged to every `_generate_and_test` call.
  """
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                  1)

  # Only the stats-gen exec task is expected on the first generation.
  (stats_gen_task,) = self._generate_and_test(
      False,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      ignore_update_node_state_tasks=True,
      fail_fast=fail_fast)
  expected_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline,
                                                     self._stats_gen)
  self.assertEqual(expected_uid, stats_gen_task.node_uid)

  # Mark the stats-gen execution FAILED and record an error message.
  with self._mlmd_connection as m:
    with mlmd_state.mlmd_execution_atomic_op(
        m, stats_gen_task.execution_id) as failed_exec:
      failed_exec.last_known_state = metadata_store_pb2.Execution.FAILED
      error_property = failed_exec.custom_properties[
          constants.EXECUTION_ERROR_MSG_KEY]
      data_types_utils.set_metadata_value(error_property, 'foobar error')

  # Next generation: a node-state update followed by the finalize task.
  node_state_task, finalize_task = self._generate_and_test(
      True,
      num_initial_executions=2,
      num_tasks_generated=2,
      num_new_executions=0,
      num_active_executions=0,
      fail_fast=fail_fast)

  self.assertTrue(task_lib.is_update_node_state_task(node_state_task))
  self.assertEqual('my_statistics_gen', node_state_task.node_uid.node_id)
  self.assertEqual(pstate.NodeState.FAILED, node_state_task.state)
  self.assertRegexMatch(node_state_task.status.message, ['foobar error'])

  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.ABORTED, finalize_task.status.code)
  self.assertRegexMatch(finalize_task.status.message, ['foobar error'])
def test_tasks_generated_when_upstream_done(self, use_task_queue):
  """Steps through the pipeline, checking each task follows its upstreams.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated if
      a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when task queue is empty (for eg: due to orchestrator restart).
  """
  # Pretend example-gen has already run successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  def _report_successful_service(unused_pipeline_state, node_id):
    # Only example-gen and transform are expected to have service jobs.
    self.assertIn(
        node_id,
        (self._example_gen.node_info.id, self._transform.node_info.id))
    return service_jobs.ServiceStatus.SUCCESS

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _report_successful_service)

  # First generation: stats-gen.
  generated, active = self._generate_and_test(
      use_task_queue,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  stats_gen_exec = active[0]
  self._verify_exec_node_task(self._stats_gen, stats_gen_exec.id,
                              generated[0])
  self._mock_service_job_manager.ensure_node_services.assert_called_with(
      mock.ANY, self._example_gen.node_info.id)
  self._mock_service_job_manager.reset_mock()
  self._finish_node_execution(use_task_queue, self._stats_gen, stats_gen_exec)

  # Second generation: schema-gen.
  generated, active = self._generate_and_test(
      use_task_queue,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  schema_gen_exec = active[0]
  self._verify_exec_node_task(self._schema_gen, schema_gen_exec.id,
                              generated[0])
  self._finish_node_execution(use_task_queue, self._schema_gen,
                              schema_gen_exec)

  # Third generation: example-validator and transform together.
  generated, active = self._generate_and_test(
      use_task_queue,
      num_initial_executions=3,
      num_tasks_generated=2,
      num_new_executions=2,
      num_active_executions=2)
  example_validator_exec, transform_exec = active[0], active[1]
  self._verify_exec_node_task(self._example_validator,
                              example_validator_exec.id, generated[0])
  self._verify_exec_node_task(self._transform, transform_exec.id,
                              generated[1])

  # Transform is a "mixed service node", so its services are ensured once.
  self._mock_service_job_manager.ensure_node_services.assert_called_once_with(
      mock.ANY, self._transform.node_info.id)
  self._mock_service_job_manager.reset_mock()

  self._finish_node_execution(use_task_queue, self._example_validator,
                              example_validator_exec)

  # Trainer must wait for transform. Without a task queue the still-active
  # transform task is generated again.
  generated, active = self._generate_and_test(
      use_task_queue,
      num_initial_executions=5,
      num_tasks_generated=0 if use_task_queue else 1,
      num_new_executions=0,
      num_active_executions=1)
  if not use_task_queue:
    self._verify_exec_node_task(self._transform, active[0].id, generated[0])

  self._finish_node_execution(use_task_queue, self._transform, transform_exec)

  # Every upstream of trainer is done now, so trainer is generated.
  generated, active = self._generate_and_test(
      use_task_queue,
      num_initial_executions=5,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1)
  trainer_exec = active[0]
  self._verify_exec_node_task(self._trainer, trainer_exec.id, generated[0])
  self._finish_node_execution(use_task_queue, self._trainer, trainer_exec)

  # Nothing left to run: a successful FinalizePipelineTask is expected.
  generated, _ = self._generate_and_test(
      use_task_queue,
      num_initial_executions=6,
      num_tasks_generated=1,
      num_new_executions=0,
      num_active_executions=0)
  self.assertLen(generated, 1)
  finalize_task = generated[0]
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
def generate(self) -> List[task_lib.Task]:
  """Generates tasks for the nodes that are ready to execute.

  Nodes are visited layer by layer in topological order. A single
  `FinalizePipelineTask` is returned instead of node tasks when a pure
  service node fails (ABORTED), when task generation for a node itself yields
  a finalize task, or when every node of the last layer has completed (OK).

  Returns:
    A `list` of tasks to execute; may be empty if no node is ready.
  """

  def _parents(pipeline_node):
    return [self._node_map[n] for n in pipeline_node.upstream_nodes]

  def _children(pipeline_node):
    return [self._node_map[n] for n in pipeline_node.downstream_nodes]

  layers = topsort.topsorted_layers(
      [node.pipeline_node for node in self._pipeline.nodes],
      get_node_id_fn=lambda node: node.node_info.id,
      get_parent_nodes=_parents,
      get_child_nodes=_children)

  tasks = []
  for layer_index, layer in enumerate(layers):
    # Ids of the nodes in this layer found to be completed successfully.
    done_ids = set()
    for node in layer:
      node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
      node_id = node.node_info.id

      if self._service_job_manager.is_pure_service_node(
          self._pipeline_state, node_id):
        # Pure service nodes are handled via the service job manager only.
        if not self._upstream_nodes_executed(node):
          continue
        status = self._service_job_manager.ensure_node_services(
            self._pipeline_state, node_id)
        if status == service_jobs.ServiceStatus.FAILED:
          logging.error('Failed service node: %s', node_uid)
          abort_status = status_lib.Status(
              code=status_lib.Code.ABORTED,
              message=(f'Aborting pipeline execution due to service '
                       f'node failure; failed node uid: {node_uid}'))
          return [
              task_lib.FinalizePipelineTask(
                  pipeline_uid=self._pipeline_state.pipeline_uid,
                  status=abort_status)
          ]
        if status == service_jobs.ServiceStatus.SUCCESS:
          logging.info('Service node completed successfully: %s', node_uid)
          done_ids.add(node_id)
        else:
          logging.info('Pure service node in progress: %s', node_uid)
        continue

      # A node whose task is already tracked by the task queue is not
      # considered for generation again.
      task_id = task_lib.exec_node_task_id_from_pipeline_node(
          self._pipeline, node)
      if self._is_task_id_tracked_fn(task_id):
        continue

      executions = task_gen_utils.get_executions(self._mlmd_handle, node)
      if executions and task_gen_utils.is_latest_execution_successful(
          executions):
        done_ids.add(node_id)
        continue

      # Not yet successfully executed: ready once all upstreams are executed.
      if not self._upstream_nodes_executed(node):
        continue
      task = self._generate_task(node)
      if task_lib.is_finalize_pipeline_task(task):
        return [task]
      tasks.append(task)

    # With no completed node in this layer, later layers need no check.
    if not done_ids:
      break

    # If all nodes in the final layer completed successfully, the pipeline
    # can be finalized.
    # TODO(goutham): If there are conditional eval nodes, not all nodes may be
    # executed in the final layer. Handle this case when conditionals are
    # supported.
    if layer_index == len(layers) - 1 and done_ids == {
        n.node_info.id for n in layer
    }:
      return [
          task_lib.FinalizePipelineTask(
              pipeline_uid=self._pipeline_state.pipeline_uid,
              status=status_lib.Status(code=status_lib.Code.OK))
      ]
  return tasks
def _orchestrate_active_pipeline(
    mlmd_handle: metadata.Metadata, task_queue: tq.TaskQueue,
    service_job_manager: service_jobs.ServiceJobManager,
    pipeline_state: pstate.PipelineState) -> None:
  """Runs one orchestration step for an active pipeline.

  The pipeline execution is moved to RUNNING if it is not already, tasks are
  generated with the generator matching the execution mode, and each task is
  then enqueued or applied to `pipeline_state`.

  Args:
    mlmd_handle: Handle used to persist the updated pipeline execution and
      passed on to the task generator.
    task_queue: Queue that receives generated `ExecNodeTask`s.
    service_job_manager: Used to stop services of stop-initiated nodes and
      passed on to the task generator.
    pipeline_state: State of the pipeline being orchestrated.

  Raises:
    status_lib.StatusNotOkError: With code FAILED_PRECONDITION if the
      execution mode is neither SYNC nor ASYNC.
  """
  pipeline = pipeline_state.pipeline
  execution = pipeline_state.execution
  running_state = metadata_store_pb2.Execution.RUNNING
  assert execution.last_known_state in (metadata_store_pb2.Execution.NEW,
                                        running_state)
  if execution.last_known_state != running_state:
    # Persist a modified copy so the original proto is left untouched.
    running_execution = copy.deepcopy(execution)
    running_execution.last_known_state = running_state
    mlmd_handle.store.put_executions([running_execution])

  # Build the task generator that matches the pipeline's execution mode.
  mode = pipeline.execution_mode
  if mode == pipeline_pb2.Pipeline.SYNC:
    generator = sync_pipeline_task_gen.SyncPipelineTaskGenerator(
        mlmd_handle, pipeline_state, task_queue.contains_task_id,
        service_job_manager)
  elif mode == pipeline_pb2.Pipeline.ASYNC:
    # Stop-initiated nodes get their services stopped or a cancellation task.
    stop_initiated_nodes = _get_stop_initiated_nodes(pipeline_state)
    for node in stop_initiated_nodes:
      node_id = node.node_info.id
      if service_job_manager.is_pure_service_node(pipeline_state, node_id):
        service_job_manager.stop_node_services(pipeline_state, node_id)
        continue
      if _maybe_enqueue_cancellation_task(mlmd_handle, pipeline, node,
                                          task_queue):
        continue
      if service_job_manager.is_mixed_service_node(pipeline_state, node_id):
        service_job_manager.stop_node_services(pipeline_state, node_id)
    generator = async_pipeline_task_gen.AsyncPipelineTaskGenerator(
        mlmd_handle, pipeline_state, task_queue.contains_task_id,
        service_job_manager, {n.node_info.id for n in stop_initiated_nodes})
  else:
    raise status_lib.StatusNotOkError(
        code=status_lib.Code.FAILED_PRECONDITION,
        message=(
            f'Only SYNC and ASYNC pipeline execution modes supported; '
            f'found pipeline with execution mode: {mode}'))

  tasks = generator.generate()

  with pipeline_state:
    for task in tasks:
      if task_lib.is_exec_node_task(task):
        task_queue.enqueue(typing.cast(task_lib.ExecNodeTask, task))
        continue

      if task_lib.is_finalize_node_task(task):
        assert mode == pipeline_pb2.Pipeline.ASYNC
        node_task = typing.cast(task_lib.FinalizeNodeTask, task)
        pipeline_state.initiate_node_stop(node_task.node_uid,
                                          node_task.status)
        continue

      # Anything else must be the lone finalize task of a SYNC pipeline.
      assert task_lib.is_finalize_pipeline_task(task)
      assert mode == pipeline_pb2.Pipeline.SYNC
      assert len(tasks) == 1
      pipeline_task = typing.cast(task_lib.FinalizePipelineTask, task)
      if pipeline_task.status.code == status_lib.Code.OK:
        outcome_log = 'Pipeline run successful; pipeline uid: %s'
      else:
        outcome_log = 'Pipeline run failed; pipeline uid: %s'
      logging.info(outcome_log, pipeline_state.pipeline_uid)
      pipeline_state.initiate_stop(pipeline_task.status)
def generate(self) -> List[task_lib.Task]:
  """Generates tasks for the nodes that are ready to execute.

  Nodes are visited layer by layer in topological order. A node is only
  considered once all of its upstream nodes are known to be successful. A
  single abort task is returned as soon as a service job or the latest
  execution of a node is found to have failed, and a single OK
  `FinalizePipelineTask` is returned once every node of the last layer is
  successful.

  Returns:
    A `list` of tasks to execute; may be empty if no node is ready.
  """

  def _parents(pipeline_node):
    return [self._node_map[n] for n in pipeline_node.upstream_nodes]

  def _children(pipeline_node):
    return [self._node_map[n] for n in pipeline_node.downstream_nodes]

  layers = topsort.topsorted_layers(
      [node.pipeline_node for node in self._pipeline.nodes],
      get_node_id_fn=lambda node: node.node_info.id,
      get_parent_nodes=_parents,
      get_child_nodes=_children)

  tasks = []
  succeeded_ids = set()
  for layer_index, layer in enumerate(layers):
    for node in layer:
      node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
      node_id = node.node_info.id

      if self._in_successful_nodes_cache(node_uid):
        succeeded_ids.add(node_id)
        continue

      if not self._upstream_nodes_successful(node, succeeded_ids):
        continue

      # Pure service nodes never get an ExecNodeTask; their services are
      # ensured and the reported status decides what happens next.
      pure_status = self._ensure_node_services_if_pure(node_id)
      if pure_status is not None:
        if pure_status == service_jobs.ServiceStatus.FAILED:
          return [
              self._abort_task(f'service job failed; node uid: {node_uid}')
          ]
        if pure_status == service_jobs.ServiceStatus.SUCCESS:
          logging.info('Service node successful: %s', node_uid)
          succeeded_ids.add(node_id)
        continue

      # A node whose task is already tracked by the task queue is not
      # generated again, but services of a mixed service node are ensured.
      task_id = task_lib.exec_node_task_id_from_pipeline_node(
          self._pipeline, node)
      if self._is_task_id_tracked_fn(task_id):
        mixed_status = self._ensure_node_services_if_mixed(node_id)
        if mixed_status == service_jobs.ServiceStatus.FAILED:
          return [
              self._abort_task(
                  f'associated service job failed; node uid: {node_uid}')
          ]
        continue

      node_executions = task_gen_utils.get_executions(self._mlmd_handle, node)
      latest = task_gen_utils.get_latest_execution(node_executions)
      if latest:
        # A successful latest execution means the node is done.
        if execution_lib.is_execution_successful(latest):
          logging.info('Node successful: %s', node_uid)
          succeeded_ids.add(node_id)
          continue

        # Neither successful nor active: abort the pipeline.
        if not execution_lib.is_execution_active(latest):
          raw_error = latest.custom_properties.get(
              constants.EXECUTION_ERROR_MSG_KEY)
          if raw_error:
            error_msg = data_types_utils.get_metadata_value(raw_error)
          else:
            error_msg = ''
          return [
              self._abort_task(
                  f'node failed; node uid: {node_uid}; error: {error_msg}')
          ]

      # Finally, try to generate a task for the node.
      task = self._maybe_generate_task(node, node_executions, succeeded_ids)
      if not task:
        continue
      if task_lib.is_finalize_pipeline_task(task):
        return [task]
      tasks.append(task)

    layer_ids = {n.node_info.id for n in layer}
    succeeded_in_layer = layer_ids & succeeded_ids
    self._update_successful_nodes_cache(succeeded_in_layer)

    # If all nodes in the final layer completed successfully, the pipeline
    # can be finalized.
    # TODO(goutham): If there are conditional eval nodes, not all nodes may be
    # executed in the final layer. Handle this case when conditionals are
    # supported.
    if layer_index == len(layers) - 1 and succeeded_in_layer == layer_ids:
      return [
          task_lib.FinalizePipelineTask(
              pipeline_uid=self._pipeline_uid,
              status=status_lib.Status(code=status_lib.Code.OK))
      ]
  return tasks
def test_tasks_generated_when_upstream_done(self, use_task_queue):
  """Steps through the pipeline, checking each task follows its upstreams.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated if
      a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when task queue is empty (for eg: due to orchestrator restart).
  """
  # Pretend example-gen has already run successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # First generation: stats-gen.
  (stats_gen_task,) = self._generate_and_test(
      use_task_queue,
      num_initial_executions=1,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._stats_gen])
  self._mock_service_job_manager.ensure_node_services.assert_called_with(
      mock.ANY, self._example_gen.node_info.id)
  self._mock_service_job_manager.reset_mock()
  self._finish_node_execution(use_task_queue, stats_gen_task)

  # Second generation: schema-gen.
  (schema_gen_task,) = self._generate_and_test(
      use_task_queue,
      num_initial_executions=2,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._schema_gen])
  self._finish_node_execution(use_task_queue, schema_gen_task)

  # Third generation: example-validator and transform together.
  example_validator_task, transform_task = self._generate_and_test(
      use_task_queue,
      num_initial_executions=3,
      num_tasks_generated=2,
      num_new_executions=2,
      num_active_executions=2,
      expected_exec_nodes=[self._example_validator, self._transform])

  # Transform is a "mixed service node", so its services are ensured once.
  self._mock_service_job_manager.ensure_node_services.assert_called_once_with(
      mock.ANY, self._transform.node_info.id)
  self._mock_service_job_manager.reset_mock()

  self._finish_node_execution(use_task_queue, example_validator_task)

  # Trainer must wait for transform. Without a task queue the still-active
  # transform task is generated again and replaces the earlier one.
  regenerated = self._generate_and_test(
      use_task_queue,
      num_initial_executions=5,
      num_tasks_generated=0 if use_task_queue else 1,
      num_new_executions=0,
      num_active_executions=1,
      expected_exec_nodes=[] if use_task_queue else [self._transform])
  if not use_task_queue:
    transform_task = regenerated[0]

  self._finish_node_execution(use_task_queue, transform_task)

  # Every upstream of trainer is done now, so trainer is generated.
  (trainer_task,) = self._generate_and_test(
      use_task_queue,
      num_initial_executions=5,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._trainer])
  self._finish_node_execution(use_task_queue, trainer_task)

  # Task-only dependencies: chore_a and chore_b have no input or output specs
  # but should still be executed in the DAG order.
  (chore_a_task,) = self._generate_and_test(
      use_task_queue,
      num_initial_executions=6,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._chore_a])
  self._finish_node_execution(use_task_queue, chore_a_task)

  (chore_b_task,) = self._generate_and_test(
      use_task_queue,
      num_initial_executions=7,
      num_tasks_generated=1,
      num_new_executions=1,
      num_active_executions=1,
      expected_exec_nodes=[self._chore_b])
  self._finish_node_execution(use_task_queue, chore_b_task)

  # Nothing left to run: a successful FinalizePipelineTask is expected.
  (finalize_task,) = self._generate_and_test(
      use_task_queue,
      num_initial_executions=8,
      num_tasks_generated=1,
      num_new_executions=0,
      num_active_executions=0)
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
def test_conditional_execution(self, evaluate):
  """Checks that the evaluator node runs or is skipped by its condition.

  Args:
    evaluate: Whether to run the conditional evaluator.
  """
  # The pipeline's terminal nodes must be exactly these three.
  expected_terminal_ids = {
      self._example_validator.node_info.id,
      self._chore_b.node_info.id,
      self._evaluator.node_info.id,
  }
  layers = sptg._topsorted_layers(self._pipeline)
  self.assertEqual(expected_terminal_ids, sptg._terminal_node_ids(layers))

  # Run the pipeline up to and including transform.
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                  1)
  self._run_next(False, expect_nodes=[self._stats_gen])
  self._run_next(False, expect_nodes=[self._schema_gen])
  self._run_next(
      False, expect_nodes=[self._example_validator, self._transform])

  # The evaluator runs only if the Model artifact produced by the trainer
  # carries the custom property evaluate=1.
  model_properties = {'evaluate': 1} if evaluate else None
  self._run_next(
      False,
      expect_nodes=[self._trainer],
      artifact_custom_properties=model_properties)

  tasks = self._generate(False)

  # Exactly one node-state update is expected for the evaluator.
  (evaluator_state_task,) = [
      t for t in tasks if task_lib.is_update_node_state_task(t) and
      t.node_uid.node_id == 'my_evaluator'
  ]
  if evaluate:
    expected_state = pstate.NodeState.RUNNING
  else:
    expected_state = pstate.NodeState.SKIPPED
  self.assertEqual(expected_state, evaluator_state_task.state)

  exec_tasks = [t for t in tasks if task_lib.is_exec_node_task(t)]
  if evaluate:
    chore_a_task, evaluator_task = exec_tasks
    self.assertEqual('chore_a', chore_a_task.node_uid.node_id)
    self.assertEqual('my_evaluator', evaluator_task.node_uid.node_id)
    self._finish_node_execution(False, chore_a_task)
    self._finish_node_execution(False, evaluator_task)
  else:
    (chore_a_task,) = exec_tasks
    self.assertEqual('chore_a', chore_a_task.node_uid.node_id)
    self._finish_node_execution(False, chore_a_task)

  self._run_next(False, expect_nodes=[self._chore_b])

  # Every node has been handled, so a finalize task must be produced.
  (finalize_task,) = self._generate(False, True)
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
def test_pipeline_succeeds_when_terminal_nodes_succeed(
    self, use_task_queue, fail_fast):
  """Checks the pipeline finalizes only once all terminal nodes succeed.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated if
      a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when task queue is empty (for eg: due to orchestrator restart).
    fail_fast: If `True`, pipeline is aborted immediately if any node fails.
  """
  # The pipeline's terminal nodes must be exactly these three. Evaluator
  # execution will be skipped as it is run conditionally and the condition
  # always evaluates to False in the current test.
  expected_terminal_ids = {
      self._example_validator.node_info.id,
      self._chore_b.node_info.id,
      self._evaluator.node_info.id,
  }
  layers = sptg._topsorted_layers(self._pipeline)
  self.assertEqual(expected_terminal_ids, sptg._terminal_node_ids(layers))

  # Run the pipeline up to and including schema-gen.
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1,
                                  1)
  self._run_next(use_task_queue, expect_nodes=[self._stats_gen])
  self._run_next(use_task_queue, expect_nodes=[self._schema_gen])

  # Example-validator and transform become runnable together.
  example_validator_task, transform_task = self._generate(
      use_task_queue, True, fail_fast=fail_fast)
  self.assertEqual(self._example_validator.node_info.id,
                   example_validator_task.node_uid.node_id)
  self.assertEqual(self._transform.node_info.id,
                   transform_task.node_uid.node_id)

  # Example-validator starts processing but is left unfinished, while
  # transform, which is in the same layer, finishes.
  self._start_processing(use_task_queue, example_validator_task)
  self._finish_node_execution(use_task_queue, transform_task)

  # Trainer and its downstream nodes can run because transform is finished.
  # Without a task queue the example-validator task is generated again on
  # every round because its execution is still active.
  for downstream_node in (self._trainer, self._chore_a, self._chore_b):
    if use_task_queue:
      expected_nodes = [downstream_node]
    else:
      expected_nodes = [self._example_validator, downstream_node]
    self._run_next(
        use_task_queue,
        expect_nodes=expected_nodes,
        finish_nodes=[downstream_node],
        fail_fast=fail_fast)

  # Only the unfinished example-validator remains at this point.
  self._run_next(
      use_task_queue,
      expect_nodes=[] if use_task_queue else [self._example_validator],
      finish_nodes=[],
      fail_fast=fail_fast)

  # The finalize task shows up only after example-validator has finished.
  test_utils.fake_execute_node(self._mlmd_connection, example_validator_task)
  self._finish_processing(use_task_queue, example_validator_task)
  (finalize_task,) = self._generate(use_task_queue, True, fail_fast=fail_fast)
  self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
  self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
def test_tasks_generated_when_upstream_done(self, use_task_queue):
  """Checks transform then trainer tasks are generated as upstreams finish.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated if
      a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when task queue is empty (for eg: due to orchestrator restart).
  """
  # Pretend example-gen has already run successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  def _report_successful_service(unused_pipeline_state, node_id):
    self.assertEqual(self._example_gen.node_info.id, node_id)
    return service_jobs.ServiceStatus.SUCCESS

  self._mock_service_job_manager.ensure_node_services.side_effect = (
      _report_successful_service)

  # Tasks expected while transform is active: none if the queue tracks it.
  num_regenerated = 0 if use_task_queue else 1

  # First generation: a transform task.
  with self.subTest(generate=1):
    generated, active = self._generate_and_test(
        use_task_queue,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_exec_node_task(self._transform, active[0].id, generated[0])
    self._mock_service_job_manager.ensure_node_services.assert_called()

  # Generating again should be fine and have no new effects.
  with self.subTest(generate=2):
    generated, active = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=num_regenerated,
        num_new_executions=0,
        num_active_executions=1)
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, active[0].id, generated[0])

  with self.subTest(generate=3):
    generated, active = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=num_regenerated,
        num_new_executions=0,
        num_active_executions=1)
    execution_id = active[0].id
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, execution_id, generated[0])

  # Complete the transform execution, then dequeue its task if the task queue
  # is enabled.
  otu.fake_transform_output(self._mlmd_connection, self._transform, active[0])
  self._dequeue_and_test(use_task_queue, self._transform, execution_id)

  # The next generation should produce the trainer task.
  with self.subTest(generate=4):
    generated, active = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    execution_id = active[0].id
    self._verify_exec_node_task(self._trainer, execution_id, generated[0])

  # Complete the trainer execution, then dequeue its task if the task queue
  # is enabled.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer, active[0])
  self._dequeue_and_test(use_task_queue, self._trainer, execution_id)

  # Nothing left to run: a successful FinalizePipelineTask is expected.
  with self.subTest(generate=5):
    generated, _ = self._generate_and_test(
        use_task_queue,
        num_initial_executions=3,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=0)
    self.assertLen(generated, 1)
    finalize_task = generated[0]
    self.assertTrue(task_lib.is_finalize_pipeline_task(finalize_task))
    self.assertEqual(status_lib.Code.OK, finalize_task.status.code)
    if use_task_queue:
      self.assertTrue(self._task_queue.is_empty())
def _orchestrate_active_pipeline(
        mlmd_handle: metadata.Metadata, task_queue: tq.TaskQueue,
        service_job_manager: service_jobs.ServiceJobManager,
        pipeline_state: pstate.PipelineState) -> None:
    """Orchestrates active pipeline.

    Performs one orchestration pass, in this order:
      1. Marks the pipeline execution RUNNING if it is not already.
      2. For SYNC pipelines with a positive deadline, initiates a stop with
         DEADLINE_EXCEEDED and returns if the deadline has passed.
      3. Processes nodes in state STOPPING and moves the ones that are done
         to STOPPED.
      4. Generates tasks using the SYNC or ASYNC task generator.
      5. Applies node state updates carried by the generated tasks, moves any
         remaining STARTING nodes to STARTED, enqueues exec-node tasks, and
         initiates pipeline stop when a finalize task is produced.

    Args:
      mlmd_handle: Metadata handle passed to the cancellation helper and to
        the task generator.
      task_queue: Queue that receives exec-node tasks; its `contains_task_id`
        is also handed to the task generator.
      service_job_manager: Used to classify service nodes, stop their
        services, and passed to the task generator.
      pipeline_state: State of the pipeline to orchestrate. Asserted to be
        active.

    Raises:
      status_lib.StatusNotOkError: With code FAILED_PRECONDITION if the
        pipeline execution mode is neither SYNC nor ASYNC.
    """
    pipeline = pipeline_state.pipeline
    with pipeline_state:
        assert pipeline_state.is_active()
        # Record the pipeline execution as RUNNING unless it already is.
        if pipeline_state.get_pipeline_execution_state() != (
                metadata_store_pb2.Execution.RUNNING):
            pipeline_state.set_pipeline_execution_state(
                metadata_store_pb2.Execution.RUNNING)
        orchestration_options = pipeline_state.get_orchestration_options()
        logging.info('Orchestration options: %s', orchestration_options)
        deadline_secs = orchestration_options.deadline_secs
        # The deadline applies only to SYNC pipelines and only when it is
        # positive; elapsed time is measured from the pipeline creation time.
        if (pipeline.execution_mode == pipeline_pb2.Pipeline.SYNC and
                deadline_secs > 0 and
                time.time() -
                pipeline_state.pipeline_creation_time_secs_since_epoch() >
                deadline_secs):
            logging.error(
                'Aborting pipeline due to exceeding deadline (%s secs); '
                'pipeline uid: %s', deadline_secs, pipeline_state.pipeline_uid)
            pipeline_state.initiate_stop(
                status_lib.Status(
                    code=status_lib.Code.DEADLINE_EXCEEDED,
                    message=('Pipeline aborted due to exceeding deadline '
                             f'({deadline_secs} secs)')))
            # A stop has been initiated; no tasks are generated in this pass.
            return

    def _filter_by_state(node_infos: List[_NodeInfo],
                         state_str: str) -> List[_NodeInfo]:
        # Keeps only the node infos whose node state equals `state_str`.
        return [n for n in node_infos if n.state.state == state_str]

    node_infos = _get_node_infos(pipeline_state)
    stopping_node_infos = _filter_by_state(node_infos,
                                           pstate.NodeState.STOPPING)

    # Tracks nodes stopped in the current iteration.
    stopped_node_infos: List[_NodeInfo] = []

    # Create cancellation tasks for nodes in state STOPPING.
    for node_info in stopping_node_infos:
        if service_job_manager.is_pure_service_node(
                pipeline_state, node_info.node.node_info.id):
            # Pure service node: counted as stopped only when
            # `stop_node_services` returns a truthy value.
            if service_job_manager.stop_node_services(
                    pipeline_state, node_info.node.node_info.id):
                stopped_node_infos.append(node_info)
        elif _maybe_enqueue_cancellation_task(mlmd_handle, pipeline,
                                              node_info.node, task_queue):
            # The helper returned a truthy value (presumably a cancellation
            # task was enqueued -- confirm against the helper); the node is
            # not marked stopped in this pass.
            pass
        elif service_job_manager.is_mixed_service_node(
                pipeline_state, node_info.node.node_info.id):
            # Mixed service node for which the cancellation helper returned a
            # falsy value: stop its services before counting it as stopped.
            if service_job_manager.stop_node_services(
                    pipeline_state, node_info.node.node_info.id):
                stopped_node_infos.append(node_info)
        else:
            # Not a service node and the cancellation helper returned a falsy
            # value: count the node as stopped.
            stopped_node_infos.append(node_info)

    # Change the state of stopped nodes from STOPPING to STOPPED.
    if stopped_node_infos:
        with pipeline_state:
            for node_info in stopped_node_infos:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    pipeline, node_info.node)
                with pipeline_state.node_state_update_context(
                        node_uid) as node_state:
                    # The node's existing status is carried over unchanged.
                    node_state.update(pstate.NodeState.STOPPED,
                                      node_state.status)

    # Initialize task generator for the pipeline.
    if pipeline.execution_mode == pipeline_pb2.Pipeline.SYNC:
        generator = sync_pipeline_task_gen.SyncPipelineTaskGenerator(
            mlmd_handle,
            task_queue.contains_task_id,
            service_job_manager,
            fail_fast=orchestration_options.fail_fast)
    elif pipeline.execution_mode == pipeline_pb2.Pipeline.ASYNC:
        generator = async_pipeline_task_gen.AsyncPipelineTaskGenerator(
            mlmd_handle, task_queue.contains_task_id, service_job_manager)
    else:
        raise status_lib.StatusNotOkError(
            code=status_lib.Code.FAILED_PRECONDITION,
            message=(
                f'Only SYNC and ASYNC pipeline execution modes supported; '
                f'found pipeline with execution mode: {pipeline.execution_mode}'
            ))

    tasks = generator.generate(pipeline_state)

    with pipeline_state:
        # Handle all the UpdateNodeStateTasks by updating node states.
        for task in tasks:
            if task_lib.is_update_node_state_task(task):
                task = typing.cast(task_lib.UpdateNodeStateTask, task)
                with pipeline_state.node_state_update_context(
                        task.node_uid) as node_state:
                    node_state.update(task.state, task.status)

        # The update tasks were applied above; only the others remain.
        tasks = [t for t in tasks if not task_lib.is_update_node_state_task(t)]

        # If there are still nodes in state STARTING, change them to STARTED.
        for node in pstate.get_all_pipeline_nodes(pipeline_state.pipeline):
            node_uid = task_lib.NodeUid.from_pipeline_node(
                pipeline_state.pipeline, node)
            with pipeline_state.node_state_update_context(
                    node_uid) as node_state:
                if node_state.state == pstate.NodeState.STARTING:
                    node_state.update(pstate.NodeState.STARTED)

        for task in tasks:
            if task_lib.is_exec_node_task(task):
                task = typing.cast(task_lib.ExecNodeTask, task)
                task_queue.enqueue(task)
            else:
                # Any other remaining task must be a finalize task, which is
                # expected only for SYNC pipelines and as the sole task.
                assert task_lib.is_finalize_pipeline_task(task)
                assert pipeline.execution_mode == pipeline_pb2.Pipeline.SYNC
                assert len(tasks) == 1
                task = typing.cast(task_lib.FinalizePipelineTask, task)
                if task.status.code == status_lib.Code.OK:
                    logging.info('Pipeline run successful; pipeline uid: %s',
                                 pipeline_state.pipeline_uid)
                else:
                    logging.info('Pipeline run failed; pipeline uid: %s',
                                 pipeline_state.pipeline_uid)
                # The pipeline is stopped with the finalize task's status,
                # whether that status is OK or not.
                pipeline_state.initiate_stop(task.status)