def test_stop_node_wait_for_inactivation_timeout(self):
  """Verifies `stop_node` when the node's execution never becomes inactive.

  `stop_node` should raise DEADLINE_EXCEEDED after `timeout_secs`, but the
  node must still be left stop-initiated so it is not triggered again.
  """
  pipeline = pipeline_pb2.Pipeline()
  pbtxt_path = os.path.join(
      os.path.dirname(__file__), 'testdata', 'async_pipeline.pbtxt')
  self.load_proto_from_text(pbtxt_path, pipeline)
  trainer = pipeline.nodes[2].pipeline_node
  # Seed MLMD with a trainer execution that stays active forever.
  test_utils.fake_trainer_output(self._mlmd_connection, trainer, active=True)
  pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
  node_uid = task_lib.NodeUid(node_id='my_trainer', pipeline_uid=pipeline_uid)
  with self._mlmd_connection as mlmd_handle:
    pstate.PipelineState.new(mlmd_handle, pipeline).commit()
    with self.assertRaisesRegex(
        status_lib.StatusNotOkError,
        'Timed out.*waiting for execution inactivation.') as ctx:
      pipeline_ops.stop_node(mlmd_handle, node_uid, timeout_secs=1.0)
    self.assertEqual(status_lib.Code.DEADLINE_EXCEEDED, ctx.exception.code)
    # Even if `wait_for_inactivation` times out, the node should be stop
    # initiated to prevent future triggers.
    state = pstate.PipelineState.load(mlmd_handle, pipeline_uid)
    self.assertEqual(status_lib.Code.CANCELLED,
                     state.node_stop_initiated_reason(node_uid).code)
def test_stop_node_wait_for_inactivation(self):
  """Tests that `stop_node` returns once the node's execution goes inactive.

  A background thread marks the trainer execution COMPLETE after a short
  delay; `stop_node` should block until then (well within its timeout) and
  leave the node stop-initiated. The node can subsequently be restarted.
  """
  pipeline = pipeline_pb2.Pipeline()
  self.load_proto_from_text(
      os.path.join(os.path.dirname(__file__), 'testdata',
                   'async_pipeline.pbtxt'), pipeline)
  trainer = pipeline.nodes[2].pipeline_node
  # Seed MLMD with an active trainer execution for the node being stopped.
  test_utils.fake_trainer_output(self._mlmd_connection, trainer, active=True)
  pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)
  node_uid = task_lib.NodeUid(node_id='my_trainer', pipeline_uid=pipeline_uid)
  with self._mlmd_connection as m:
    pstate.PipelineState.new(m, pipeline).commit()

    def _inactivate(execution):
      # Sleep long enough for stop_node to begin waiting, then flip the
      # execution to COMPLETE under the ops lock so the waiter observes it.
      time.sleep(2.0)
      with pipeline_ops._PIPELINE_OPS_LOCK:
        execution.last_known_state = metadata_store_pb2.Execution.COMPLETE
        m.store.put_executions([execution])

    execution = task_gen_utils.get_executions(m, trainer)[0]
    # Deep-copy so the background thread mutates its own proto instance.
    thread = threading.Thread(
        target=_inactivate, args=(copy.deepcopy(execution),))
    thread.start()
    # Should return before the 5 second timeout since the thread marks the
    # execution complete after ~2 seconds.
    pipeline_ops.stop_node(m, node_uid, timeout_secs=5.0)
    thread.join()
    pipeline_state = pstate.PipelineState.load(m, pipeline_uid)
    self.assertTrue(pipeline_state.is_node_stop_initiated(node_uid))

    # Restart node.
    pipeline_state = pipeline_ops.initiate_node_start(m, node_uid)
    self.assertFalse(pipeline_state.is_node_stop_initiated(node_uid))
def test_generate_task_from_active_execution(self):
  """A task is generated only while the trainer execution is RUNNING."""
  with self._mlmd_connection as mlmd:
    # No tasks generated without active execution.
    execs = task_gen_utils.get_executions(mlmd, self._trainer)
    self.assertIsNone(
        task_gen_utils.generate_task_from_active_execution(
            mlmd, self._pipeline, self._trainer, execs))

  # Next, ensure an active execution for trainer.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer)
  with self._mlmd_connection as mlmd:
    running_exec = mlmd.store.get_executions()[0]
    running_exec.last_known_state = metadata_store_pb2.Execution.RUNNING
    mlmd.store.put_executions([running_exec])

    # Check that task can be generated.
    execs = task_gen_utils.get_executions(mlmd, self._trainer)
    generated = task_gen_utils.generate_task_from_active_execution(
        mlmd, self._pipeline, self._trainer, execs)
    self.assertEqual(running_exec.id, generated.execution.id)

    # Mark execution complete. No tasks should be generated.
    done_exec = mlmd.store.get_executions()[0]
    done_exec.last_known_state = metadata_store_pb2.Execution.COMPLETE
    mlmd.store.put_executions([done_exec])
    execs = task_gen_utils.get_executions(mlmd, self._trainer)
    self.assertIsNone(
        task_gen_utils.generate_task_from_active_execution(
            mlmd, self._pipeline, self._trainer, execs))
def test_task_generation(self, use_task_queue):
  """Tests async pipeline task generation.

  Args:
    use_task_queue: If task queue is enabled, new tasks are only generated if
      a task with the same task_id does not already exist in the queue.
      `use_task_queue=False` is useful to test the case of task generation
      when task queue is empty (for eg: due to orchestrator restart).
  """
  # NOTE(review): the `generate=4` subTest label was previously used twice;
  # the later subTests are renumbered 5..8 so failures are distinguishable.
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Before generation, there's 1 execution in MLMD.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
  self.assertLen(executions, 1)

  # Generate once.
  with self.subTest(generate=1):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_exec_node_task(self._transform, active_executions[0].id,
                                tasks[0])

  # No new effects if generate called again.
  with self.subTest(generate=2):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=0 if use_task_queue else 1,
        num_new_executions=0,
        num_active_executions=1)
    execution_id = active_executions[0].id
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, execution_id, tasks[0])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])
  # Dequeue the corresponding task if task queue is enabled.
  self._dequeue_and_test(use_task_queue, self._transform,
                         active_executions[0].id)

  # Trainer execution task should be generated next.
  with self.subTest(generate=3):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    execution_id = active_executions[0].id
    self._verify_exec_node_task(self._trainer, execution_id, tasks[0])

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])
  # Dequeue the corresponding task if task queue is enabled.
  self._dequeue_and_test(use_task_queue, self._trainer, execution_id)

  # No more tasks should be generated as there are no new inputs.
  with self.subTest(generate=4):
    self._generate_and_test(
        use_task_queue,
        num_initial_executions=3,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)

  # Fake another ExampleGen run.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Both transform and trainer tasks should be generated as they both find
  # new inputs.
  with self.subTest(generate=5):  # Fixed: was a duplicate `generate=4`.
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=4,
        num_tasks_generated=2,
        num_new_executions=2,
        num_active_executions=2)
    self._verify_exec_node_task(self._transform, active_executions[0].id,
                                tasks[0])
    self._verify_exec_node_task(self._trainer, active_executions[1].id,
                                tasks[1])

  # Re-generation will produce the same tasks when task queue enabled.
  with self.subTest(generate=6):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=6,
        num_tasks_generated=0 if use_task_queue else 2,
        num_new_executions=0,
        num_active_executions=2)
    if not use_task_queue:
      self._verify_exec_node_task(self._transform, active_executions[0].id,
                                  tasks[0])
      self._verify_exec_node_task(self._trainer, active_executions[1].id,
                                  tasks[1])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])
  # Dequeue the corresponding task.
  self._dequeue_and_test(use_task_queue, self._transform,
                         active_executions[0].id)

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[1])
  self._dequeue_and_test(use_task_queue, self._trainer,
                         active_executions[1].id)

  # Trainer should be triggered again due to transform producing new output.
  with self.subTest(generate=7):
    tasks, active_executions = self._generate_and_test(
        use_task_queue,
        num_initial_executions=6,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_exec_node_task(self._trainer, active_executions[0].id,
                                tasks[0])

  # Finally, no new tasks once trainer completes.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])
  # Dequeue corresponding task.
  self._dequeue_and_test(use_task_queue, self._trainer,
                         active_executions[0].id)
  with self.subTest(generate=8):
    self._generate_and_test(
        use_task_queue,
        num_initial_executions=7,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)

  if use_task_queue:
    self.assertTrue(self._task_queue.is_empty())
def test_tasks_generated_when_upstream_done(self, use_task_queue): """Tests that tasks are generated when upstream is done. Args: use_task_queue: If task queue is enabled, new tasks are only generated if a task with the same task_id does not already exist in the queue. `use_task_queue=False` is useful to test the case of task generation when task queue is empty (for eg: due to orchestrator restart). """ # Simulate that ExampleGen has already completed successfully. otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1) # Before generation, there's 1 execution. with self._mlmd_connection as m: executions = m.store.get_executions() self.assertLen(executions, 1) # Generate once. with self.subTest(generate=1): tasks, active_executions = self._generate_and_test( use_task_queue, num_initial_executions=1, num_tasks_generated=1, num_new_executions=1, num_active_executions=1) self._verify_exec_node_task(self._transform, active_executions[0].id, tasks[0]) # Should be fine to regenerate multiple times. There should be no new # effects. with self.subTest(generate=2): tasks, active_executions = self._generate_and_test( use_task_queue, num_initial_executions=2, num_tasks_generated=0 if use_task_queue else 1, num_new_executions=0, num_active_executions=1) if not use_task_queue: self._verify_exec_node_task(self._transform, active_executions[0].id, tasks[0]) with self.subTest(generate=3): tasks, active_executions = self._generate_and_test( use_task_queue, num_initial_executions=2, num_tasks_generated=0 if use_task_queue else 1, num_new_executions=0, num_active_executions=1) execution_id = active_executions[0].id if not use_task_queue: self._verify_exec_node_task(self._transform, execution_id, tasks[0]) # Mark transform execution complete. otu.fake_transform_output(self._mlmd_connection, self._transform, active_executions[0]) # Dequeue the corresponding task if task queue is enabled. 
self._dequeue_and_test(use_task_queue, self._transform, execution_id) # Trainer execution task should be generated when generate called again. with self.subTest(generate=4): tasks, active_executions = self._generate_and_test( use_task_queue, num_initial_executions=2, num_tasks_generated=1, num_new_executions=1, num_active_executions=1) execution_id = active_executions[0].id self._verify_exec_node_task(self._trainer, execution_id, tasks[0]) # Mark the trainer execution complete. otu.fake_trainer_output(self._mlmd_connection, self._trainer, active_executions[0]) # Dequeue the corresponding task if task queue is enabled. self._dequeue_and_test(use_task_queue, self._trainer, execution_id) # No more components to execute so no tasks are generated. with self.subTest(generate=5): self._generate_and_test(use_task_queue, num_initial_executions=3, num_tasks_generated=0, num_new_executions=0, num_active_executions=0) if use_task_queue: self.assertTrue(self._task_queue.is_empty())
def test_task_generation(self):
  """Tests async pipeline task generation across multiple generate calls.

  Walks transform and trainer through execution/completion, then fakes a
  second ExampleGen run to verify both nodes are re-triggered on new inputs.
  """
  # NOTE(review): the `generate=4` subTest label was previously used twice;
  # the later subTests are renumbered 5..8 so failures are distinguishable.
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Before generation, there's 1 execution in MLMD.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
  self.assertLen(executions, 1)

  # Generate once.
  with self.subTest(generate=1):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])

  # No new effects if generate called again.
  with self.subTest(generate=2):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=0,
        num_active_executions=1)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])

  # Trainer execution task should be generated next.
  with self.subTest(generate=3):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._trainer, active_executions[0],
                                     tasks[0])

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])

  # No more tasks should be generated as there are no new inputs.
  with self.subTest(generate=4):
    self._generate_and_test(
        num_initial_executions=3,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)

  # Fake another ExampleGen run.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Both transform and trainer tasks should be generated as they both find
  # new inputs.
  with self.subTest(generate=5):  # Fixed: was a duplicate `generate=4`.
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=4,
        num_tasks_generated=2,
        num_new_executions=2,
        num_active_executions=2)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])
    self._verify_node_execution_task(self._trainer, active_executions[1],
                                     tasks[1])

  # Re-generation will produce the same tasks again.
  with self.subTest(generate=6):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=6,
        num_tasks_generated=2,
        num_new_executions=0,
        num_active_executions=2)
    self._verify_node_execution_task(self._transform, active_executions[0],
                                     tasks[0])
    self._verify_node_execution_task(self._trainer, active_executions[1],
                                     tasks[1])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            active_executions[0])
  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[1])

  # Trainer should be triggered again due to transform producing new output.
  with self.subTest(generate=7):
    tasks, active_executions = self._generate_and_test(
        num_initial_executions=6,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._trainer, active_executions[0],
                                     tasks[0])

  # Finally, no new tasks once trainer completes.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer,
                          active_executions[0])
  with self.subTest(generate=8):
    self._generate_and_test(
        num_initial_executions=7,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)
def test_tasks_generated_when_upstream_done(self, use_task_queue): """Tests that tasks are generated when upstream is done. Args: use_task_queue: If task queue is enabled, new tasks are only generated if a task with the same task_id does not already exist in the queue. `use_task_queue=False` is useful to test the case of task generation when task queue is empty (for eg: due to orchestrator restart). """ # Simulate that ExampleGen has already completed successfully. otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1) def _ensure_node_services(unused_pipeline_state, node_id): self.assertEqual(self._example_gen.node_info.id, node_id) return service_jobs.ServiceStatus.SUCCESS self._mock_service_job_manager.ensure_node_services.side_effect = ( _ensure_node_services) # Generate once. with self.subTest(generate=1): tasks, active_executions = self._generate_and_test( use_task_queue, num_initial_executions=1, num_tasks_generated=1, num_new_executions=1, num_active_executions=1) self._verify_exec_node_task(self._transform, active_executions[0].id, tasks[0]) self._mock_service_job_manager.ensure_node_services.assert_called() # Should be fine to regenerate multiple times. There should be no new # effects. with self.subTest(generate=2): tasks, active_executions = self._generate_and_test( use_task_queue, num_initial_executions=2, num_tasks_generated=0 if use_task_queue else 1, num_new_executions=0, num_active_executions=1) if not use_task_queue: self._verify_exec_node_task(self._transform, active_executions[0].id, tasks[0]) with self.subTest(generate=3): tasks, active_executions = self._generate_and_test( use_task_queue, num_initial_executions=2, num_tasks_generated=0 if use_task_queue else 1, num_new_executions=0, num_active_executions=1) execution_id = active_executions[0].id if not use_task_queue: self._verify_exec_node_task(self._transform, execution_id, tasks[0]) # Mark transform execution complete. 
otu.fake_transform_output(self._mlmd_connection, self._transform, active_executions[0]) # Dequeue the corresponding task if task queue is enabled. self._dequeue_and_test(use_task_queue, self._transform, execution_id) # Trainer execution task should be generated when generate called again. with self.subTest(generate=4): tasks, active_executions = self._generate_and_test( use_task_queue, num_initial_executions=2, num_tasks_generated=1, num_new_executions=1, num_active_executions=1) execution_id = active_executions[0].id self._verify_exec_node_task(self._trainer, execution_id, tasks[0]) # Mark the trainer execution complete. otu.fake_trainer_output(self._mlmd_connection, self._trainer, active_executions[0]) # Dequeue the corresponding task if task queue is enabled. self._dequeue_and_test(use_task_queue, self._trainer, execution_id) # No more components to execute, FinalizePipelineTask should be generated. with self.subTest(generate=5): tasks, _ = self._generate_and_test( use_task_queue, num_initial_executions=3, num_tasks_generated=1, num_new_executions=0, num_active_executions=0) self.assertLen(tasks, 1) self.assertTrue(task_lib.is_finalize_pipeline_task(tasks[0])) self.assertEqual(status_lib.Code.OK, tasks[0].status.code) if use_task_queue: self.assertTrue(self._task_queue.is_empty())
def test_tasks_generated_when_upstream_done(self):
  """Transform task is generated first; trainer only after transform is done."""
  # Simulate that ExampleGen has already completed successfully.
  otu.fake_example_gen_run(self._mlmd_connection, self._example_gen, 1, 1)

  # Before generation, there's 1 execution.
  with self._mlmd_connection as mlmd:
    initial_executions = mlmd.store.get_executions()
  self.assertLen(initial_executions, 1)

  # Generate once.
  with self.subTest(generate=1):
    gen_tasks, running = self._generate_and_test(
        num_initial_executions=1,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._transform, running[0],
                                     gen_tasks[0])

  # Should be fine to regenerate multiple times. There should be no new
  # effects. (subTests 2 and 3 have identical expectations.)
  for attempt in (2, 3):
    with self.subTest(generate=attempt):
      gen_tasks, running = self._generate_and_test(
          num_initial_executions=2,
          num_tasks_generated=1,
          num_new_executions=0,
          num_active_executions=1)
      self._verify_node_execution_task(self._transform, running[0],
                                       gen_tasks[0])

  # Mark transform execution complete.
  otu.fake_transform_output(self._mlmd_connection, self._transform,
                            running[0])

  # Trainer execution task should be generated when generate called again.
  with self.subTest(generate=4):
    gen_tasks, running = self._generate_and_test(
        num_initial_executions=2,
        num_tasks_generated=1,
        num_new_executions=1,
        num_active_executions=1)
    self._verify_node_execution_task(self._trainer, running[0], gen_tasks[0])

  # Mark the trainer execution complete.
  otu.fake_trainer_output(self._mlmd_connection, self._trainer, running[0])

  # No more components to execute so no tasks are generated.
  with self.subTest(generate=5):
    self._generate_and_test(
        num_initial_executions=3,
        num_tasks_generated=0,
        num_new_executions=0,
        num_active_executions=0)