def testRegisterExecutionBackwardCompatibility(self):
  with metadata.Metadata(connection_config=self._connection_config) as m:
    context_id = m.register_run_context_if_not_exists(self._pipeline_info)
    # Puts in execution with more columns needed in MLMD schema first and
    # puts in execution with less columns needed next. Verifies the schema
    # update will not affect backward compatibility.
    exec_properties_one = {'arg_one': 1}
    exec_properties_two = {'arg_one': 1, 'arg_two': 2}
    eid_two = m.register_execution(
        exec_properties=exec_properties_two,
        pipeline_info=self._pipeline_info,
        component_info=self._component_info,
        run_context_id=context_id)
    eid_one = m.register_execution(
        exec_properties=exec_properties_one,
        pipeline_info=self._pipeline_info,
        component_info=self._component_info,
        run_context_id=context_id)
    [execution_one, execution_two] = m.store.get_executions_by_id(
        [eid_one, eid_two])
    self.assertProtoEquals(
        """
        id: 2
        type_id: 2
        properties {
          key: "state"
          value { string_value: "new" }
        }
        properties {
          key: "pipeline_name"
          value { string_value: "my_pipeline" }
        }
        properties {
          key: "pipeline_root"
          value { string_value: "/tmp" }
        }
        properties {
          key: "run_id"
          value { string_value: "my_run_id" }
        }
        properties {
          key: "component_id"
          value { string_value: "my_component" }
        }
        properties {
          key: "arg_one"
          value { string_value: "1" }
        }""", execution_one)
    self.assertProtoEquals(
        """
        id: 1
        type_id: 2
        properties {
          key: "state"
          value { string_value: "new" }
        }
        properties {
          key: "pipeline_name"
          value { string_value: "my_pipeline" }
        }
        properties {
          key: "pipeline_root"
          value { string_value: "/tmp" }
        }
        properties {
          key: "run_id"
          value { string_value: "my_run_id" }
        }
        properties {
          key: "component_id"
          value { string_value: "my_component" }
        }
        properties {
          key: "arg_one"
          value { string_value: "1" }
        }
        properties {
          key: "arg_two"
          value { string_value: "2" }
        }""", execution_two)

def testIrisPipelineSklearnGcp(self, mock_pusher, mock_trainer, _):
  mock_pusher.get_service_name_and_api_version.return_value = ('ml', 'v1')
  mock_trainer.get_service_name_and_api_version.return_value = ('ml', 'v1')
  BeamDagRunner().run(
      iris_pipeline_sklearn_gcp._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          ai_platform_training_args=self._ai_platform_training_args,
          ai_platform_serving_args=self._ai_platform_serving_args,
          beam_pipeline_args=[]))

  self.assertTrue(tf.io.gfile.exists(self._metadata_path))
  mock_trainer.start_aip_training.assert_called_once()
  mock_pusher.deploy_model_for_aip_prediction.assert_called_once()

  expected_execution_count = 6  # 6 components
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)

  self.assertPipelineExecution()

  # Runs pipeline the second time.
  BeamDagRunner().run(
      iris_pipeline_sklearn_gcp._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          ai_platform_training_args=self._ai_platform_training_args,
          ai_platform_serving_args=self._ai_platform_serving_args,
          beam_pipeline_args=[]))

  # All executions are cached, so the artifact count is unchanged.
  with metadata.Metadata(metadata_config) as m:
    self.assertEqual(artifact_count, len(m.store.get_artifacts()))
    artifact_count = len(m.store.get_artifacts())
    self.assertEqual(expected_execution_count * 2,
                     len(m.store.get_executions()))

  # Runs pipeline the third time.
  BeamDagRunner().run(
      iris_pipeline_sklearn_gcp._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          ai_platform_training_args=self._ai_platform_training_args,
          ai_platform_serving_args=self._ai_platform_serving_args,
          beam_pipeline_args=[]))

  # Asserts cache execution.
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is unchanged.
    self.assertEqual(artifact_count, len(m.store.get_artifacts()))
    self.assertEqual(expected_execution_count * 3,
                     len(m.store.get_executions()))

def testExecution(self):
  with metadata.Metadata(connection_config=self._connection_config) as m:
    context_id = m.register_run_context_if_not_exists(self._pipeline_info)

    # Test prepare_execution.
    exec_properties = {'arg_one': 1}
    eid = m.register_execution(
        exec_properties=exec_properties,
        pipeline_info=self._pipeline_info,
        component_info=self._component_info,
        run_context_id=context_id)
    [execution] = m.store.get_executions_by_context(context_id)
    self.assertProtoEquals(
        """
        id: 1
        type_id: 2
        properties {
          key: "state"
          value { string_value: "new" }
        }
        properties {
          key: "pipeline_name"
          value { string_value: "my_pipeline" }
        }
        properties {
          key: "pipeline_root"
          value { string_value: "/tmp" }
        }
        properties {
          key: "run_id"
          value { string_value: "my_run_id" }
        }
        properties {
          key: "component_id"
          value { string_value: "my_component" }
        }
        properties {
          key: "arg_one"
          value { string_value: "1" }
        }""", execution)

    # Test publish_execution.
    input_artifact = standard_artifacts.Examples()
    m.publish_artifacts([input_artifact])
    output_artifact = standard_artifacts.Examples()
    input_dict = {'input': [input_artifact]}
    output_dict = {'output': [output_artifact]}
    m.publish_execution(eid, input_dict, output_dict)
    # Make sure artifacts in output_dict are published.
    self.assertEqual(ArtifactState.PUBLISHED, output_artifact.state)
    # Make sure the execution state is changed.
    [execution] = m.store.get_executions_by_id([eid])
    self.assertEqual(metadata.EXECUTION_STATE_COMPLETE,
                     execution.properties['state'].string_value)
    # Make sure events are published.
    events = m.store.get_events_by_execution_ids([eid])
    self.assertEqual(2, len(events))
    self.assertEqual(input_artifact.id, events[0].artifact_id)
    self.assertEqual(metadata_store_pb2.Event.INPUT, events[0].type)
    self.assertProtoEquals(
        """
        steps {
          key: "input"
        }
        steps {
          index: 0
        }""", events[0].path)
    self.assertEqual(output_artifact.id, events[1].artifact_id)
    self.assertEqual(metadata_store_pb2.Event.OUTPUT, events[1].type)
    self.assertProtoEquals(
        """
        steps {
          key: "output"
        }
        steps {
          index: 0
        }""", events[1].path)

def run_with_ir(
    self,
    pipeline: pipeline_pb2.Pipeline,
    run_options: Optional[pipeline_pb2.RunOptions] = None,
) -> None:
  """Runs the given pipeline locally.

  Args:
    pipeline: Pipeline IR containing pipeline args and components.
    run_options: Optional args for the run.

  Raises:
    ValueError: If run_options is provided, and run_options.partial_run
      .from_nodes and run_options.partial_run.to_nodes are both empty.
  """
  # Substitute the runtime parameter to be a concrete run_id.
  runtime_parameter_utils.substitute_runtime_parameter(
      pipeline, {
          constants.PIPELINE_RUN_ID_PARAMETER_NAME:
              datetime.datetime.now().isoformat(),
      })

  deployment_config = runner_utils.extract_local_deployment_config(pipeline)
  connection_config = getattr(
      deployment_config.metadata_connection_config,
      deployment_config.metadata_connection_config.WhichOneof(
          'connection_config'))
  logging.info('Using deployment config:\n %s', deployment_config)
  logging.info('Using connection config:\n %s', connection_config)

  if run_options:
    logging.info('Using run_options:\n %s', run_options)
    pr_opts = run_options.partial_run
    partial_run_utils.mark_pipeline(
        pipeline,
        from_nodes=pr_opts.from_nodes or None,
        to_nodes=pr_opts.to_nodes or None,
        snapshot_settings=pr_opts.snapshot_settings)

  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
    # Run each component. Note that the pipeline.nodes list is in
    # topological order.
    #
    # TODO(b/171319478): After IR-based execution is used, use multi-threaded
    # execution so that independent components can be run in parallel.
    for node in pipeline.nodes:
      pipeline_node = node.pipeline_node
      node_id = pipeline_node.node_info.id
      if pipeline_node.execution_options.HasField('skip'):
        logging.info('Skipping component %s.', node_id)
        continue
      executor_spec = runner_utils.extract_executor_spec(
          deployment_config, node_id)
      custom_driver_spec = runner_utils.extract_custom_driver_spec(
          deployment_config, node_id)
      component_launcher = launcher.Launcher(
          pipeline_node=pipeline_node,
          mlmd_connection=metadata.Metadata(connection_config),
          pipeline_info=pipeline.pipeline_info,
          pipeline_runtime_spec=pipeline.runtime_spec,
          executor_spec=executor_spec,
          custom_driver_spec=custom_driver_spec)
      logging.info('Component %s is running.', node_id)
      if pipeline_node.execution_options.run.perform_snapshot:
        with metadata.Metadata(connection_config) as mlmd_handle:
          partial_run_utils.snapshot(mlmd_handle, pipeline)
      component_launcher.launch()
      logging.info('Component %s is finished.', node_id)

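# Illustrative call site for run_with_ir above. This is a minimal sketch, not
# part of the original module: `runner` (the object exposing run_with_ir),
# `pipeline_ir`, and the node ids 'Trainer' and 'Pusher' are assumptions made
# for illustration only.
run_options = pipeline_pb2.RunOptions()
run_options.partial_run.from_nodes.append('Trainer')
run_options.partial_run.to_nodes.append('Pusher')
# Runs only the Trainer..Pusher sub-DAG; nodes outside it are marked skipped.
runner.run_with_ir(pipeline_ir, run_options=run_options)
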
def testStubbedTaxiPipelineBeam(self):
  pipeline_ir = compiler.Compiler().compile(self.taxi_pipeline)

  logging.info('Replacing with test_data_dir:%s', self._recorded_output_dir)
  pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                           self._recorded_output_dir, [])

  BeamDagRunner().run(pipeline_ir)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    artifacts = m.store.get_artifacts()
    artifact_count = len(artifacts)
    executions = m.store.get_executions()
    execution_count = len(executions)
    # Artifact count is greater by 7 due to extra artifacts produced by
    # Evaluator (blessing and evaluation), Trainer (model and model_run) and
    # Transform (example, graph, cache, pre_transform_statistics,
    # pre_transform_schema, post_transform_statistics, post_transform_schema,
    # post_transform_anomalies), minus Resolver which doesn't generate a
    # new artifact.
    self.assertEqual(artifact_count, execution_count + 7)
    self.assertLen(self.taxi_pipeline.components, execution_count)

    for execution in executions:
      component_id = pipeline_recorder_utils.get_component_id_from_execution(
          m, execution)
      if component_id.startswith('Resolver'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  BeamDagRunner().run(self.taxi_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.taxi_pipeline.metadata_connection_config, self._pipeline_name)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation,
      # A subdirectory of updated_analyzer_cache has a changing name.
      'updated_analyzer_cache': self._veryify_root_dir,
  }

  # List of components to verify. Resolver is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.taxi_pipeline.components
      if not component.id.startswith('Resolver')
  ]

  for component_id in verify_component_ids:
    logging.info('Verifying %s', component_id)
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)

def metadata(self):
  # An empty sqlite config makes MLMD use a transient in-memory database.
  connection_config = metadata_store_pb2.ConnectionConfig()
  connection_config.sqlite.SetInParent()
  return metadata.Metadata(connection_config=connection_config)

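# A minimal, self-contained sketch of the in-memory MLMD pattern used by the
# fixture above: each connection built this way yields a fresh, isolated
# store, which is why tests can rely on it starting out empty.
from ml_metadata.proto import metadata_store_pb2

connection_config = metadata_store_pb2.ConnectionConfig()
connection_config.sqlite.SetInParent()
with metadata.Metadata(connection_config=connection_config) as m:
  assert not m.store.get_artifacts()  # A fresh in-memory store is empty.
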
def testStubbedTaxiPipelineBeam(self):
  # Run pipeline with stub executors.
  stub_component_launcher.StubComponentLauncher.initialize(
      test_data_dir=self._recorded_output_dir, test_component_ids=[])
  stub_pipeline_config = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          stub_component_launcher.StubComponentLauncher,
      ])
  BeamDagRunner(config=stub_pipeline_config).run(self.taxi_pipeline)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    artifacts = m.store.get_artifacts()
    artifact_count = len(artifacts)
    executions = m.store.get_executions()
    execution_count = len(executions)
    # Artifact count is greater by 3 due to extra artifacts produced by
    # Evaluator (blessing and evaluation), Trainer (model and model_run) and
    # Transform (example, graph, cache), minus Resolver which doesn't
    # generate a new artifact.
    self.assertEqual(artifact_count, execution_count + 3)
    self.assertLen(self.taxi_pipeline.components, execution_count)

    for execution in executions:
      component_id = execution.properties[
          metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
      if component_id.startswith('ResolverNode'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  BeamDagRunner().run(self.taxi_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.taxi_pipeline.metadata_connection_config,
      self.taxi_pipeline.pipeline_info)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation
  }

  # List of components to verify. ResolverNode is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.taxi_pipeline.components
      if not component.id.startswith('ResolverNode')
  ]

  for component_id in verify_component_ids:
    logging.info('Verifying %s', component_id)
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)

def _publish_execution_to_metadata(self):
  with metadata.Metadata(self._metadata_connection_config, self._logger) as m:
    return m.publish_execution(self._execution_id, self._input_dict,
                               self._output_dict)

def testPenguinPipelineSklearnLocal(self):
  LocalDagRunner().run(
      penguin_pipeline_sklearn_local._create_pipeline(
          pipeline_name=self._pipeline_name,
          pipeline_root=self._pipeline_root,
          data_root=self._data_root,
          trainer_module_file=self._trainer_module_file,
          evaluator_module_file=self._evaluator_module_file,
          serving_model_dir=self._serving_model_dir,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  expected_execution_count = 8  # 7 components + 1 resolver
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)

  self.assertPipelineExecution()

  # Runs pipeline the second time.
  LocalDagRunner().run(
      penguin_pipeline_sklearn_local._create_pipeline(
          pipeline_name=self._pipeline_name,
          pipeline_root=self._pipeline_root,
          data_root=self._data_root,
          trainer_module_file=self._trainer_module_file,
          evaluator_module_file=self._evaluator_module_file,
          serving_model_dir=self._serving_model_dir,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  with metadata.Metadata(metadata_config) as m:
    # Artifact count is increased by 3, caused by Evaluator and Pusher.
    self.assertEqual(artifact_count + 3, len(m.store.get_artifacts()))
    artifact_count = len(m.store.get_artifacts())
    self.assertEqual(expected_execution_count * 2,
                     len(m.store.get_executions()))

  # Runs pipeline the third time.
  LocalDagRunner().run(
      penguin_pipeline_sklearn_local._create_pipeline(
          pipeline_name=self._pipeline_name,
          pipeline_root=self._pipeline_root,
          data_root=self._data_root,
          trainer_module_file=self._trainer_module_file,
          evaluator_module_file=self._evaluator_module_file,
          serving_model_dir=self._serving_model_dir,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  # Asserts cache execution.
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is unchanged.
    self.assertEqual(artifact_count, len(m.store.get_artifacts()))
    self.assertEqual(expected_execution_count * 3,
                     len(m.store.get_executions()))

def testStubbedImdbPipelineBeam(self):
  # Runs the pipeline and records to self._recorded_output_dir.
  stub_component_launcher.StubComponentLauncher.initialize(
      test_data_dir=self._recorded_output_dir, test_component_ids=[])
  stub_pipeline_config = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          stub_component_launcher.StubComponentLauncher,
      ])
  local_dag_runner.LocalDagRunner(config=stub_pipeline_config).run(
      self.imdb_pipeline)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    for execution in m.store.get_executions():
      component_id = execution.properties[
          metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
      if component_id.startswith('ResolverNode'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  local_dag_runner.LocalDagRunner().run(self.imdb_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.imdb_pipeline.metadata_connection_config,
      self.imdb_pipeline.pipeline_info)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation
  }

  # List of components to verify. ResolverNode is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.imdb_pipeline.components
      if not component.id.startswith('ResolverNode')
  ]

  for component_id in verify_component_ids:
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        logging.info('Verifying %s', component_id)
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)

def testRegisterExecutionBackwardCompatibility(self):
  with metadata.Metadata(connection_config=self._connection_config) as m:
    contexts = m.register_pipeline_contexts_if_not_exists(self._pipeline_info)
    # Puts in execution with more columns needed in MLMD schema first and
    # puts in execution with less columns needed next. Verifies the schema
    # update will not affect backward compatibility.
    exec_properties_one = {'arg_one': 1, 'arg_two': 2}
    exec_properties_two = {'arg_one': 1}
    execution_one = m.register_execution(
        input_artifacts={},
        exec_properties=exec_properties_one,
        pipeline_info=self._pipeline_info,
        component_info=self._component_info,
        contexts=contexts)
    execution_two = m.register_execution(
        input_artifacts={},
        exec_properties=exec_properties_two,
        pipeline_info=self._pipeline_info,
        component_info=self._component_info3,
        contexts=contexts)
    [execution_one, execution_two] = m.store.get_executions_by_id(
        [execution_one.id, execution_two.id])
    # Skip verifying time sensitive fields.
    execution_one.ClearField('create_time_since_epoch')
    execution_one.ClearField('last_update_time_since_epoch')
    self.assertProtoEquals(
        """
        id: 1
        type_id: 3
        properties {
          key: "state"
          value { string_value: "new" }
        }
        properties {
          key: "pipeline_name"
          value { string_value: "my_pipeline" }
        }
        properties {
          key: "pipeline_root"
          value { string_value: "/tmp" }
        }
        properties {
          key: "run_id"
          value { string_value: "my_run_id" }
        }
        properties {
          key: "component_id"
          value { string_value: "my_component" }
        }
        properties {
          key: "arg_one"
          value { string_value: "1" }
        }
        properties {
          key: "arg_two"
          value { string_value: "2" }
        }""", execution_one)
    # Skip verifying time sensitive fields.
    execution_two.ClearField('create_time_since_epoch')
    execution_two.ClearField('last_update_time_since_epoch')
    self.assertProtoEquals(
        """
        id: 2
        type_id: 3
        properties {
          key: "state"
          value { string_value: "new" }
        }
        properties {
          key: "pipeline_name"
          value { string_value: "my_pipeline" }
        }
        properties {
          key: "pipeline_root"
          value { string_value: "/tmp" }
        }
        properties {
          key: "run_id"
          value { string_value: "my_run_id" }
        }
        properties {
          key: "component_id"
          value { string_value: "my_component" }
        }
        properties {
          key: "arg_one"
          value { string_value: "1" }
        }""", execution_two)

def testTaxiPipelineBeam(self):
  num_components = 10

  BeamDagRunner().run(
      taxi_pipeline_infraval_beam._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(num_components, execution_count)

  self.assertPipelineExecution()
  self.assertInfraValidatorPassed()

  # Runs pipeline the second time.
  BeamDagRunner().run(
      taxi_pipeline_infraval_beam._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  # All executions but Evaluator and Pusher are cached.
  # Note that Resolver will always execute.
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is increased by 3, caused by Evaluator and Pusher.
    self.assertLen(m.store.get_artifacts(), artifact_count + 3)
    artifact_count = len(m.store.get_artifacts())
    # 10 more cached executions.
    self.assertLen(m.store.get_executions(), num_components * 2)

  # Runs pipeline the third time.
  BeamDagRunner().run(
      taxi_pipeline_infraval_beam._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  # Asserts cache execution.
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is unchanged.
    self.assertLen(m.store.get_artifacts(), artifact_count)
    # 10 more cached executions.
    self.assertLen(m.store.get_executions(), num_components * 3)

def testTaxiPipelineWithImporter(self):
  BeamDagRunner().run(
      taxi_pipeline_importer._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          user_schema_path=self._user_schema_path,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(10, execution_count)

  self.assertPipelineExecution()

  # Runs the pipeline again.
  BeamDagRunner().run(
      taxi_pipeline_importer._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          user_schema_path=self._user_schema_path,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  # All executions but Evaluator and Pusher are cached.
  # Note that Resolver will always execute.
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is increased by 3, caused by Evaluator and Pusher.
    self.assertEqual(artifact_count + 3, len(m.store.get_artifacts()))
    artifact_count = len(m.store.get_artifacts())
    self.assertEqual(20, len(m.store.get_executions()))

  # Runs the pipeline the third time.
  BeamDagRunner().run(
      taxi_pipeline_importer._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          user_schema_path=self._user_schema_path,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  # Asserts cache execution.
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is unchanged.
    self.assertEqual(artifact_count, len(m.store.get_artifacts()))
    self.assertEqual(30, len(m.store.get_executions()))

def testRegisterExecutionUpdatedExecutionType(self):
  with metadata.Metadata(connection_config=self._connection_config) as m:
    contexts_one = m.register_pipeline_contexts_if_not_exists(
        self._pipeline_info)
    contexts_two = m.register_pipeline_contexts_if_not_exists(
        self._pipeline_info3)
    # Puts in execution with fewer columns needed in MLMD schema first and
    # puts in execution with more columns needed next. Verifies the schema
    # update will not be a breaking change.
    exec_properties_one = {'arg_one': 1}
    exec_properties_two = {'arg_one': 1, 'arg_two': 2}
    execution_one = m.register_execution(
        input_artifacts={},
        exec_properties=exec_properties_one,
        pipeline_info=self._pipeline_info,
        component_info=self._component_info,
        contexts=contexts_one)
    execution_two = m.register_execution(
        input_artifacts={},
        exec_properties=exec_properties_two,
        pipeline_info=self._pipeline_info3,
        component_info=self._component_info3,
        contexts=contexts_two)
    [execution_one, execution_two] = m.store.get_executions_by_id(
        [execution_one.id, execution_two.id])
    self.assertProtoEquals(
        """
        id: 1
        type_id: 3
        properties {
          key: "state"
          value { string_value: "new" }
        }
        properties {
          key: "pipeline_name"
          value { string_value: "my_pipeline" }
        }
        properties {
          key: "pipeline_root"
          value { string_value: "/tmp" }
        }
        properties {
          key: "run_id"
          value { string_value: "my_run_id" }
        }
        properties {
          key: "component_id"
          value { string_value: "my_component" }
        }
        properties {
          key: "arg_one"
          value { string_value: "1" }
        }""", execution_one)
    self.assertProtoEquals(
        """
        id: 2
        type_id: 3
        properties {
          key: "state"
          value { string_value: "new" }
        }
        properties {
          key: "pipeline_name"
          value { string_value: "my_pipeline2" }
        }
        properties {
          key: "pipeline_root"
          value { string_value: "/tmp" }
        }
        properties {
          key: "run_id"
          value { string_value: "my_run_id" }
        }
        properties {
          key: "component_id"
          value { string_value: "my_component" }
        }
        properties {
          key: "arg_one"
          value { string_value: "1" }
        }
        properties {
          key: "arg_two"
          value { string_value: "2" }
        }""", execution_two)

def testPublishSuccessfulExecution(self):
  with metadata.Metadata(connection_config=self._connection_config) as m:
    contexts = self._generate_contexts(m)
    execution_id = execution_publish_utils.register_execution(
        m, self._execution_type, contexts).id
    output_key = 'examples'
    output_example = standard_artifacts.Examples()
    output_example.uri = '/examples_uri'
    executor_output = execution_result_pb2.ExecutorOutput()
    text_format.Parse(
        """
        uri: '/examples_uri'
        custom_properties {
          key: 'prop'
          value { int_value: 1 }
        }
        """, executor_output.output_artifacts[output_key].artifacts.add())
    output_dict = execution_publish_utils.publish_succeeded_execution(
        m, execution_id, contexts, {output_key: [output_example]},
        executor_output)
    [execution] = m.store.get_executions()
    self.assertProtoPartiallyEquals(
        """
        id: 1
        type_id: 3
        last_known_state: COMPLETE
        """,
        execution,
        ignored_fields=[
            'create_time_since_epoch', 'last_update_time_since_epoch'
        ])
    [artifact] = m.store.get_artifacts()
    self.assertProtoPartiallyEquals(
        """
        id: 1
        type_id: 4
        state: LIVE
        uri: '/examples_uri'
        custom_properties {
          key: 'prop'
          value { int_value: 1 }
        }""",
        artifact,
        ignored_fields=[
            'create_time_since_epoch', 'last_update_time_since_epoch'
        ])
    [event] = m.store.get_events_by_execution_ids([execution.id])
    self.assertProtoPartiallyEquals(
        """
        artifact_id: 1
        execution_id: 1
        path {
          steps {
            key: 'examples'
          }
          steps {
            index: 0
          }
        }
        type: OUTPUT
        """,
        event,
        ignored_fields=['milliseconds_since_epoch'])
    # Verifies the context-execution edges are set up.
    self.assertCountEqual(
        [c.id for c in contexts],
        [c.id for c in m.store.get_contexts_by_execution(execution.id)])
    for artifact_list in output_dict.values():
      for output_example in artifact_list:
        self.assertCountEqual(
            [c.id for c in contexts],
            [c.id
             for c in m.store.get_contexts_by_artifact(output_example.id)])

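# A condensed sketch of the register-then-publish flow exercised above,
# assuming `m` is an open metadata.Metadata handle and `execution_type` and
# `contexts` were created beforehand; names are illustrative, and the
# trailing executor_output argument is left out here on the assumption that
# it is optional.
execution = execution_publish_utils.register_execution(
    m, execution_type, contexts)
examples = standard_artifacts.Examples()
examples.uri = '/tmp/examples'  # Illustrative output location.
execution_publish_utils.publish_succeeded_execution(
    m, execution.id, contexts, {'examples': [examples]})
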
def testIrisPipelineNativeKeras(self):
  BeamDagRunner().run(
      iris_pipeline_native_keras_infraval._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  self.assertTrue(tf.io.gfile.exists(self._serving_model_dir))
  self.assertTrue(tf.io.gfile.exists(self._metadata_path))
  expected_execution_count = 10  # 9 components + 1 resolver
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)

  self.assertPipelineExecution()
  self.assertInfraValidatorPassed()

  # Runs pipeline the second time.
  BeamDagRunner().run(
      iris_pipeline_native_keras_infraval._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  # All executions but Evaluator and Pusher are cached.
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is increased by 3, caused by Evaluator and Pusher.
    self.assertEqual(artifact_count + 3, len(m.store.get_artifacts()))
    artifact_count = len(m.store.get_artifacts())
    self.assertEqual(expected_execution_count * 2,
                     len(m.store.get_executions()))

  # Runs pipeline the third time.
  BeamDagRunner().run(
      iris_pipeline_native_keras_infraval._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=self._module_file,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          beam_pipeline_args=[]))

  # Asserts cache execution.
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is unchanged.
    self.assertEqual(artifact_count, len(m.store.get_artifacts()))
    self.assertEqual(expected_execution_count * 3,
                     len(m.store.get_executions()))

def testPublishSuccessExecutionExecutorEditedOutputDict(self):
  # There is one artifact in the system-provided output_dict, while there are
  # two artifacts in the executor output. We expect that two artifacts are
  # published.
  with metadata.Metadata(connection_config=self._connection_config) as m:
    contexts = self._generate_contexts(m)
    execution_id = execution_publish_utils.register_execution(
        m, self._execution_type, contexts).id

    output_example = standard_artifacts.Examples()
    output_example.uri = '/original_path'

    executor_output = execution_result_pb2.ExecutorOutput()
    output_key = 'examples'
    text_format.Parse(
        """
        uri: '/original_path/subdir_1'
        custom_properties {
          key: 'prop'
          value { int_value: 1 }
        }
        """, executor_output.output_artifacts[output_key].artifacts.add())
    text_format.Parse(
        """
        uri: '/original_path/subdir_2'
        custom_properties {
          key: 'prop'
          value { int_value: 2 }
        }
        """, executor_output.output_artifacts[output_key].artifacts.add())

    output_dict = execution_publish_utils.publish_succeeded_execution(
        m, execution_id, contexts, {output_key: [output_example]},
        executor_output)
    [execution] = m.store.get_executions()
    self.assertProtoPartiallyEquals(
        """
        id: 1
        type_id: 3
        last_known_state: COMPLETE
        """,
        execution,
        ignored_fields=[
            'create_time_since_epoch', 'last_update_time_since_epoch'
        ])
    artifacts = m.store.get_artifacts()
    self.assertLen(artifacts, 2)
    self.assertProtoPartiallyEquals(
        """
        id: 1
        type_id: 4
        state: LIVE
        uri: '/original_path/subdir_1'
        custom_properties {
          key: 'prop'
          value { int_value: 1 }
        }""",
        artifacts[0],
        ignored_fields=[
            'create_time_since_epoch', 'last_update_time_since_epoch'
        ])
    self.assertProtoPartiallyEquals(
        """
        id: 2
        type_id: 4
        state: LIVE
        uri: '/original_path/subdir_2'
        custom_properties {
          key: 'prop'
          value { int_value: 2 }
        }""",
        artifacts[1],
        ignored_fields=[
            'create_time_since_epoch', 'last_update_time_since_epoch'
        ])
    events = m.store.get_events_by_execution_ids([execution.id])
    self.assertLen(events, 2)
    self.assertProtoPartiallyEquals(
        """
        artifact_id: 1
        execution_id: 1
        path {
          steps {
            key: 'examples'
          }
          steps {
            index: 0
          }
        }
        type: OUTPUT
        """,
        events[0],
        ignored_fields=['milliseconds_since_epoch'])
    self.assertProtoPartiallyEquals(
        """
        artifact_id: 2
        execution_id: 1
        path {
          steps {
            key: 'examples'
          }
          steps {
            index: 1
          }
        }
        type: OUTPUT
        """,
        events[1],
        ignored_fields=['milliseconds_since_epoch'])
    # Verifies the context-execution edges are set up.
    self.assertCountEqual(
        [c.id for c in contexts],
        [c.id for c in m.store.get_contexts_by_execution(execution.id)])
    for artifact_list in output_dict.values():
      for output_example in artifact_list:
        self.assertCountEqual(
            [c.id for c in contexts],
            [c.id
             for c in m.store.get_contexts_by_artifact(output_example.id)])

def setUp(self):
  super(TaskManagerE2ETest, self).setUp()
  pipeline_root = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self.id())

  # Makes sure multiple connections within a test always connect to the same
  # MLMD instance.
  metadata_path = os.path.join(pipeline_root, 'metadata', 'metadata.db')
  self._metadata_path = metadata_path
  connection_config = metadata.sqlite_metadata_connection_config(
      metadata_path)
  connection_config.sqlite.SetInParent()
  self._mlmd_connection = metadata.Metadata(
      connection_config=connection_config)

  # Sets up the pipeline.
  pipeline = pipeline_pb2.Pipeline()
  self.load_proto_from_text(
      os.path.join(
          os.path.dirname(__file__), 'testdata', 'async_pipeline.pbtxt'),
      pipeline)

  # Extracts components.
  self._example_gen = pipeline.nodes[0].pipeline_node
  self._transform = pipeline.nodes[1].pipeline_node
  self._trainer = pipeline.nodes[2].pipeline_node

  # Packs deployment config for testing.
  deployment_config = pipeline_pb2.IntermediateDeploymentConfig()
  executor_spec = pipeline_pb2.ExecutorSpec.PythonClassExecutorSpec(
      class_path='fake.ClassPath')
  deployment_config.executor_specs[self._trainer.node_info.id].Pack(
      executor_spec)
  deployment_config.executor_specs[self._transform.node_info.id].Pack(
      executor_spec)
  self._type_url = deployment_config.executor_specs[
      self._trainer.node_info.id].type_url
  pipeline.deployment_config.Pack(deployment_config)
  self._pipeline = pipeline
  self._pipeline_info = pipeline.pipeline_info
  self._pipeline_runtime_spec = pipeline.runtime_spec
  self._pipeline_runtime_spec.pipeline_root.field_value.string_value = (
      pipeline_root)

  ts.TaskSchedulerRegistry.clear()
  self._task_queue = tq.TaskQueue()

  # Runs fake example-gen to prepare downstream component triggers.
  test_utils.fake_example_gen_run(self._mlmd_connection, self._example_gen,
                                  1, 1)

  # Task generator should produce a task to run transform.
  with self._mlmd_connection as m:
    tasks = asptg.AsyncPipelineTaskGenerator(
        m, self._pipeline, self._task_queue.contains_task_id).generate()
  self.assertLen(tasks, 1)
  self._task = tasks[0]
  self.assertEqual('my_transform', self._task.node_uid.node_id)
  self._task_queue.enqueue(self._task)

  # There should be 1 active execution in MLMD.
  with self._mlmd_connection as m:
    executions = m.store.get_executions()
  active_executions = [
      e for e in executions
      if e.last_known_state == metadata_store_pb2.Execution.RUNNING
  ]
  self.assertLen(active_executions, 1)

  # Active execution id.
  self._execution_id = active_executions[0].id