def testTaxiPipelineBeam(self):
  """Runs the pipeline three times and verifies caching across runs."""
  beam_pipeline_args = self._make_beam_pipeline_args()

  def _build_pipeline():
    # Every run uses an identical pipeline definition so cache hits apply.
    return taxi_pipeline_beam._create_pipeline(
        pipeline_name=self._pipeline_name,
        data_root=self._data_root,
        module_file=self._module_file,
        serving_model_dir=self._serving_model_dir,
        pipeline_root=self._pipeline_root,
        metadata_path=self._metadata_path,
        beam_pipeline_args=beam_pipeline_args)

  # First run: all components execute.
  BeamDagRunner().run(_build_pipeline())

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(9, execution_count)
  self.assertPipelineExecution()

  # Second run: all executions but Evaluator and Pusher are cached
  # (Resolver always executes).
  BeamDagRunner().run(_build_pipeline())
  with metadata.Metadata(metadata_config) as m:
    # Evaluator and Pusher contribute 3 new artifacts.
    self.assertLen(m.store.get_artifacts(), artifact_count + 3)
    artifact_count = len(m.store.get_artifacts())
    self.assertLen(m.store.get_executions(), 18)

  # Third run: fully cached, so the artifact count must not change.
  BeamDagRunner().run(_build_pipeline())
  with metadata.Metadata(metadata_config) as m:
    self.assertLen(m.store.get_artifacts(), artifact_count)
    self.assertLen(m.store.get_executions(), 27)
def testTaxiPipelineBeam(self):
  """Runs the pipeline three times and verifies caching across runs."""

  def _build_pipeline():
    # The same pipeline definition is reused for every run.
    return taxi_pipeline_beam._create_pipeline(
        pipeline_name=self._pipeline_name,
        data_root=self._data_root,
        module_file=self._module_file,
        serving_model_dir=self._serving_model_dir,
        pipeline_root=self._pipeline_root,
        metadata_path=self._metadata_path,
        direct_num_workers=1)

  # First run: all components execute.
  BeamDagRunner().run(_build_pipeline())

  self.assertTrue(tf.io.gfile.exists(self._serving_model_dir))
  self.assertTrue(tf.io.gfile.exists(self._metadata_path))
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(9, execution_count)
  self.assertPipelineExecution()

  # Second run: all executions but ModelValidator and Pusher are cached.
  BeamDagRunner().run(_build_pipeline())
  with metadata.Metadata(metadata_config) as m:
    # ModelValidator and Pusher together add 2 artifacts.
    self.assertEqual(artifact_count + 2, len(m.store.get_artifacts()))
    artifact_count = len(m.store.get_artifacts())
    # 9 more cached executions.
    self.assertEqual(18, len(m.store.get_executions()))

  # Third run: everything is served from the cache.
  BeamDagRunner().run(_build_pipeline())
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is unchanged.
    self.assertEqual(artifact_count, len(m.store.get_artifacts()))
    # 9 more cached executions.
    self.assertEqual(27, len(m.store.get_executions()))
def setUp(self):
  """Records a golden pipeline run and prepares the pipeline under test."""
  super(TaxiPipelineRegressionEndToEndTest, self).setUp()
  # Root all test output under the declared outputs dir when running under
  # the test harness, otherwise under a per-test temp dir.
  self._test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  self._pipeline_name = 'beam_stub_test'
  # This example assumes that the taxi data and taxi utility function are
  # stored in tfx/examples/chicago_taxi_pipeline. Feel free to customize
  # this as needed.
  taxi_root = os.path.dirname(taxi_pipeline_beam.__file__)
  self._data_root = os.path.join(taxi_root, 'data', 'simple')
  self._module_file = os.path.join(taxi_root, 'taxi_utils.py')
  self._serving_model_dir = os.path.join(self._test_dir, 'serving_model')
  self._pipeline_root = os.path.join(self._test_dir, 'tfx', 'pipelines',
                                     self._pipeline_name)
  # Metadata path for recording successful pipeline run.
  self._recorded_mlmd_path = os.path.join(self._test_dir, 'tfx', 'record',
                                          'metadata.db')
  # Metadata path for stub pipeline runs.
  self._metadata_path = os.path.join(self._test_dir, 'tfx', 'metadata',
                                     self._pipeline_name, 'metadata.db')
  self._recorded_output_dir = os.path.join(self._test_dir, 'testdata')

  # Execute the pipeline once for real and record its outputs into
  # self._recorded_output_dir for later playback.
  recording_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=self._module_file,
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root,
      metadata_path=self._recorded_mlmd_path,
      beam_pipeline_args=[])
  local_dag_runner.LocalDagRunner().run(recording_pipeline)
  pipeline_recorder_utils.record_pipeline(
      output_dir=self._recorded_output_dir,
      metadata_db_uri=self._recorded_mlmd_path,
      pipeline_name=self._pipeline_name)

  # Pipeline under test; its stub runs write to self._metadata_path.
  self.taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=self._module_file,
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root,
      metadata_path=self._metadata_path,
      beam_pipeline_args=[])
def testTaxiPipelineCheckDagConstruction(self):
  """Verifies the pipeline DAG constructs with the expected component count."""
  logical_pipeline = taxi_pipeline_beam._create_pipeline(
      pipeline_name='Test',
      pipeline_root=self._test_dir,
      data_root=self._test_dir,
      module_file=self._test_dir,
      serving_model_dir=self._test_dir,
      metadata_path=self._test_dir)
  # assertLen is the idiomatic absltest assertion (used elsewhere in this
  # file) and reports the actual length on failure, unlike
  # assertEqual(9, len(...)).
  self.assertLen(logical_pipeline.components, 9)
def testTaxiPipelineBeam(self):
  """Runs the pipeline twice; the second run must be fully cached."""

  def _build_pipeline():
    # Both runs use an identical pipeline definition.
    return taxi_pipeline_beam._create_pipeline(
        pipeline_name=self._pipeline_name,
        data_root=self._data_root,
        module_file=self._module_file,
        serving_model_dir=self._serving_model_dir,
        pipeline_root=self._pipeline_root,
        metadata_path=self._metadata_path,
        direct_num_workers=1)

  # First run: all components execute.
  BeamDagRunner().run(_build_pipeline())

  self.assertTrue(tf.io.gfile.exists(self._serving_model_dir))
  self.assertTrue(tf.io.gfile.exists(self._metadata_path))
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(9, execution_count)
  self.assertPipelineExecution()

  # Second run should be served entirely from the cache.
  BeamDagRunner().run(_build_pipeline())
  with metadata.Metadata(metadata_config) as m:
    # Artifact count is unchanged.
    self.assertEqual(artifact_count, len(m.store.get_artifacts()))
    # 9 more cached executions.
    self.assertEqual(18, len(m.store.get_executions()))
  self.assertPipelineExecution()
def testTaxiPipelineBeam(self):
  """Records a real pipeline run, then replays it through stub executors.

  First executes the full pipeline and records its output artifacts; then
  reruns the same pipeline with every component except the model resolver
  routed through a stub launcher backed by the recorded test data, and
  verifies each output artifact uri matches the recorded layout.
  """
  # Runs the pipeline and record to self._recorded_output_dir
  record_taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=self._module_file,
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root,
      metadata_path=self._recorded_mlmd_path,
      beam_pipeline_args=[])
  BeamDagRunner().run(record_taxi_pipeline)
  # host/port/run_id are None: record from the local sqlite MLMD db above.
  pipeline_recorder_utils.record_pipeline(
      output_dir=self._recorded_output_dir,
      metadata_db_uri=self._recorded_mlmd_path,
      host=None,
      port=None,
      pipeline_name=self._pipeline_name,
      run_id=None)

  # Run pipeline with stub executors.
  taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=self._module_file,
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root,
      metadata_path=self._metadata_path,
      beam_pipeline_args=[])
  # The resolver node is left un-stubbed; every other component id is
  # handled by the stub launcher.
  model_resolver_id = 'ResolverNode.latest_blessed_model_resolver'
  stubbed_component_ids = [
      component.id
      for component in taxi_pipeline.components
      if component.id != model_resolver_id
  ]
  stub_launcher = stub_component_launcher.get_stub_launcher_class(
      test_data_dir=self._recorded_output_dir,
      stubbed_component_ids=stubbed_component_ids,
      stubbed_component_map={})
  stub_pipeline_config = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          stub_launcher,
      ])
  BeamDagRunner(config=stub_pipeline_config).run(taxi_pipeline)

  self.assertTrue(tf.io.gfile.exists(self._metadata_path))
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    artifacts = m.store.get_artifacts()
    artifact_count = len(artifacts)
    executions = m.store.get_executions()
    execution_count = len(executions)
    # Artifact count is greater by 3 due to extra artifacts produced by
    # Evaluator(blessing and evaluation), Trainer(model and model_run) and
    # Transform(example, graph, cache) minus Resolver which doesn't generate
    # new artifact.
    self.assertEqual(artifact_count, execution_count + 3)
    self.assertLen(taxi_pipeline.components, execution_count)
    for execution in executions:
      component_id = execution.properties[
          metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
      if component_id == 'ResolverNode.latest_blessed_model_resolver':
        # The resolver ran for real, so its outputs are not recorded copies.
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      # Only OUTPUT events point at artifacts this execution produced.
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          # Recorded test data is laid out as
          # <recorded_output_dir>/<component_id>/<output_key>/<index>.
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))
def testTaxiPipelineBeam(self):
  """Runs the 10-component pipeline (incl. InfraValidator) three times."""
  num_components = 10

  def _build_pipeline():
    # Every run uses an identical pipeline definition so cache hits apply.
    return taxi_pipeline_beam._create_pipeline(
        pipeline_name=self._pipeline_name,
        data_root=self._data_root,
        module_file=self._module_file,
        serving_model_dir=self._serving_model_dir,
        pipeline_root=self._pipeline_root,
        metadata_path=self._metadata_path,
        direct_num_workers=1)

  # First run: all components execute.
  BeamDagRunner().run(_build_pipeline())

  self.assertTrue(tf.io.gfile.exists(self._serving_model_dir))
  self.assertTrue(tf.io.gfile.exists(self._metadata_path))
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    execution_count = len(m.store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(num_components, execution_count)
  self.assertPipelineExecution()
  self.assertInfraValidatorPassed()

  # Second run: all executions but Evaluator and Pusher are cached
  # (Resolver always executes).
  BeamDagRunner().run(_build_pipeline())
  with metadata.Metadata(metadata_config) as m:
    # Evaluator and Pusher contribute 3 new artifacts.
    self.assertEqual(artifact_count + 3, len(m.store.get_artifacts()))
    artifact_count = len(m.store.get_artifacts())
    # 10 more cached executions.
    self.assertEqual(num_components * 2, len(m.store.get_executions()))

  # Third run: fully cached, so the artifact count must not change.
  BeamDagRunner().run(_build_pipeline())
  with metadata.Metadata(metadata_config) as m:
    self.assertEqual(artifact_count, len(m.store.get_artifacts()))
    # 10 more cached executions.
    self.assertEqual(num_components * 3, len(m.store.get_executions()))