def run(self, pipeline: pipeline_py.Pipeline) -> None:
  """Runs given logical pipeline locally.

  Args:
    pipeline: Logical pipeline containing pipeline args and components.
  """
  # For CLI, while creating or updating pipeline, pipeline_args are extracted
  # and hence we avoid executing the pipeline.
  if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
    return

  for component in pipeline.components:
    # TODO(b/187122662): Pass through pip dependencies as a first-class
    # component flag.
    if isinstance(component, base_component.BaseComponent):
      component._resolve_pip_dependencies(  # pylint: disable=protected-access
          pipeline.pipeline_info.pipeline_root)

  c = compiler.Compiler()
  pipeline = c.compile(pipeline)

  # Substitute the runtime parameter to be a concrete run_id
  runtime_parameter_utils.substitute_runtime_parameter(
      pipeline, {
          constants.PIPELINE_RUN_ID_PARAMETER_NAME:
              datetime.datetime.now().isoformat(),
      })

  deployment_config = runner_utils.extract_local_deployment_config(pipeline)
  connection_config = deployment_config.metadata_connection_config

  logging.info('Running pipeline:\n %s', pipeline)
  logging.info('Using deployment config:\n %s', deployment_config)
  logging.info('Using connection config:\n %s', connection_config)

  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
    # Run each component. Note that the pipeline.components list is in
    # topological order.
    #
    # TODO(b/171319478): After IR-based execution is used, use multi-threaded
    # execution so that independent components can be run in parallel.
    for node in pipeline.nodes:
      pipeline_node = node.pipeline_node
      node_id = pipeline_node.node_info.id
      executor_spec = runner_utils.extract_executor_spec(
          deployment_config, node_id)
      custom_driver_spec = runner_utils.extract_custom_driver_spec(
          deployment_config, node_id)

      component_launcher = launcher.Launcher(
          pipeline_node=pipeline_node,
          mlmd_connection=metadata.Metadata(connection_config),
          pipeline_info=pipeline.pipeline_info,
          pipeline_runtime_spec=pipeline.runtime_spec,
          executor_spec=executor_spec,
          custom_driver_spec=custom_driver_spec)
      logging.info('Component %s is running.', node_id)
      component_launcher.launch()
      logging.info('Component %s is finished.', node_id)
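# Illustrative usage sketch (not part of the original source): assuming the
# run() method above belongs to TFX's LocalDagRunner, a caller passes the
# *logical* pipeline directly; compilation to IR and run-id substitution
# happen inside run() itself. `build_pipeline` is a hypothetical helper that
# returns a pipeline_py.Pipeline.
#
#   from tfx.orchestration.local.local_dag_runner import LocalDagRunner
#
#   LocalDagRunner().run(build_pipeline())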
def create_pipeline() -> pipeline_pb2.Pipeline: """Creates an async pipeline for testing.""" # pylint: disable=no-value-for-parameter example_gen = _example_gen().with_id('my_example_gen') transform = _transform(examples=example_gen.outputs['examples'], a_param=10).with_id('my_transform') trainer = _trainer( examples=example_gen.outputs['examples'], transform_graph=transform.outputs['transform_graph']).with_id( 'my_trainer') # pylint: enable=no-value-for-parameter pipeline = pipeline_lib.Pipeline( pipeline_name='my_pipeline', pipeline_root='/path/to/root', components=[ example_gen, transform, trainer, ], execution_mode=pipeline_lib.ExecutionMode.ASYNC) dsl_compiler = compiler.Compiler() compiled_pipeline: pipeline_pb2.Pipeline = dsl_compiler.compile(pipeline) # Compiler does not support setting min_count yet, so we mutate the proto # explicitly for testing. trainer = compiled_pipeline.nodes[2].pipeline_node assert trainer.node_info.id == 'my_trainer' for value in trainer.inputs.inputs.values(): value.min_count = 1 return compiled_pipeline
def setUp(self):
  super().setUp()
  temp_dir = self.get_temp_dir()
  self.pipeline_root = os.path.join(temp_dir, 'pipeline')
  self.metadata_conn_config = metadata.sqlite_metadata_connection_config(
      os.path.join(temp_dir, 'metadata', 'metadata.db'))
  self.compiler = compiler.Compiler()
def testCompile(self, pipeline_module, expected_result_path):
  """Tests compiling the whole pipeline."""
  dsl_compiler = compiler.Compiler()
  compiled_pb = dsl_compiler.compile(
      self._get_test_pipeline_definition(pipeline_module))
  expected_pb = self._get_test_pipeline_pb(expected_result_path)
  self.assertProtoEquals(expected_pb, compiled_pb)
def testCompileImporterAdditionalPropertyTypeError(self):
  dsl_compiler = compiler.Compiler()
  test_pipeline = self._get_test_pipeline_definition(iris_pipeline_async)
  impt = next(c for c in test_pipeline.components
              if compiler_utils.is_importer(c))
  impt.exec_properties[importer.PROPERTIES_KEY]["split_names"] = 2.1
  with self.assertRaisesRegex(TypeError, "Expected STRING but given DOUBLE"):
    dsl_compiler.compile(test_pipeline)
def create_pipeline() -> pipeline_pb2.Pipeline: """Builds a test pipeline with only manual node.""" manual = manual_node.ManualNode(description='Do something.') pipeline = pipeline_lib.Pipeline(pipeline_name='my_pipeline', pipeline_root='/path/to/root', components=[manual], enable_cache=True) dsl_compiler = compiler.Compiler() return dsl_compiler.compile(pipeline)
def testCompileAdditionalPropertyTypeError(self):
  dsl_compiler = compiler.Compiler()
  test_pipeline = self._get_test_pipeline_definition(
      additional_properties_test_pipeline_async)
  custom_producer = next(
      c for c in test_pipeline.components if isinstance(
          c, additional_properties_test_pipeline_async.CustomProducer))
  custom_producer.outputs["stats"].additional_properties[
      "span"] = "wrong_type"
  with self.assertRaisesRegex(TypeError, "Expected INT but given STRING"):
    dsl_compiler.compile(test_pipeline)
def create_pipeline() -> pipeline_pb2.Pipeline: """Builds a test pipeline.""" # pylint: disable=no-value-for-parameter example_gen = _example_gen().with_id('my_example_gen') stats_gen = _statistics_gen( examples=example_gen.outputs['examples']).with_id('my_statistics_gen') schema_gen = _schema_gen( statistics=stats_gen.outputs['statistics']).with_id('my_schema_gen') example_validator = _example_validator( statistics=stats_gen.outputs['statistics'], schema=schema_gen.outputs['schema']).with_id('my_example_validator') transform = _transform( examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema']).with_id('my_transform') trainer = _trainer( examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], transform_graph=transform.outputs['transform_graph']).with_id( 'my_trainer') # Nodes with no input or output specs for testing task only dependencies. chore_a = _chore().with_id('chore_a') chore_a.add_upstream_node(trainer) chore_b = _chore().with_id('chore_b') chore_b.add_upstream_node(chore_a) with conditional.Cond( trainer.outputs['model'].future()[0].custom_property('evaluate') == 1): evaluator = _evaluator( model=trainer.outputs['model']).with_id('my_evaluator') # pylint: enable=no-value-for-parameter pipeline = pipeline_lib.Pipeline( pipeline_name='my_pipeline', pipeline_root='/path/to/root', components=[ example_gen, stats_gen, schema_gen, example_validator, transform, trainer, evaluator, chore_a, chore_b, ], enable_cache=True) dsl_compiler = compiler.Compiler() return dsl_compiler.compile(pipeline)
def _make_pipeline_proto(
    pipeline: pipeline_py.Pipeline) -> pipeline_pb2.Pipeline:
  """Resolves pip dependencies and compiles the Pipeline object."""
  if isinstance(pipeline, pipeline_pb2.Pipeline):
    raise ValueError(
        'The "run" method, which is only meant for running Pipeline objects, '
        'was called with a Pipeline IR. Did you mean to call the '
        '"run_with_ir" method instead?')

  for component in pipeline.components:
    # TODO(b/187122662): Pass through pip dependencies as a first-class
    # component flag.
    if isinstance(component, base_component.BaseComponent):
      component._resolve_pip_dependencies(  # pylint: disable=protected-access
          pipeline.pipeline_info.pipeline_root)

  return compiler.Compiler().compile(pipeline)
def testPatcher(self, use_pipeline_proto, mock_run):
  patcher = _DummyDagRunnerPatcher(self)
  pipeline = tfx_pipeline.Pipeline(_PIPELINE_NAME, 'dummy_root')
  if use_pipeline_proto:
    pipeline = compiler.Compiler().compile(pipeline)
  runner = _DummyDagRunner()

  with patcher.patch() as context:
    self.assertNotIn('foo', context)
    self.assertFalse(patcher.run_called)
    runner.run(pipeline)
    print(context)
    self.assertEqual(context['foo'], 24)
    self.assertTrue(patcher.run_called)

  mock_run.assert_called_once()
def testCompileDynamicExecPropTypeError(self):
  dsl_compiler = compiler.Compiler()
  test_pipeline = self._get_test_pipeline_definition(
      dynamic_exec_properties_pipeline)
  downstream_component = next(
      c for c in test_pipeline.components
      if isinstance(c, dynamic_exec_properties_pipeline.DownstreamComponent))
  instance_a = _MyType()
  instance_b = _MyType()
  test_wrong_type_channel = channel.Channel(_MyType).set_artifacts(
      [instance_a, instance_b]).future()
  downstream_component.exec_properties["input_num"] = test_wrong_type_channel
  with self.assertRaisesRegex(
      ValueError,
      "output channel to dynamic exec properties is not ValueArtifact"):
    dsl_compiler.compile(test_pipeline)
def create_pipeline() -> pipeline_pb2.Pipeline: """Creates a pipeline with an importer node for testing.""" inode = importer.Importer( source_uri='my_url', reimport=True, custom_properties={ 'int_custom_property': 123, 'str_custom_property': 'abc', }, artifact_type=standard_artifacts.Schema).with_id('my_importer') pipeline = pipeline_lib.Pipeline( pipeline_name='my_pipeline', pipeline_root='/path/to/root', components=[inode], execution_mode=pipeline_lib.ExecutionMode.SYNC) dsl_compiler = compiler.Compiler() return dsl_compiler.compile(pipeline)
def __init__(self,
             output_dir: Optional[Text] = None,
             output_filename: Optional[Text] = None,
             config: Optional[KubeflowDagRunnerConfig] = None,
             pod_labels_to_attach: Optional[Dict[Text, Text]] = None):
  """Initializes KubeflowDagRunner for compiling a Kubeflow Pipeline.

  Args:
    output_dir: An optional output directory into which to output the
      pipeline definition files. Defaults to the current working directory.
    output_filename: An optional output file name for the pipeline definition
      file. Defaults to pipeline_name.tar.gz when compiling a TFX pipeline.
      Currently supports .tar.gz, .tgz, .zip, .yaml, .yml formats. See
      https://github.com/kubeflow/pipelines/blob/181de66cf9fa87bcd0fe9291926790c400140783/sdk/python/kfp/compiler/compiler.py#L851
      for format restrictions.
    config: An optional KubeflowDagRunnerConfig object to specify runtime
      configuration when running the pipeline under Kubeflow.
    pod_labels_to_attach: Optional set of pod labels to attach to the GKE
      pods spun up for this pipeline. Defaults to the following 3 labels:
      1. add-pod-env: true,
      2. pipeline SDK type,
      3. pipeline unique ID,
      where 2 and 3 are instrumentation for usage tracking.
  """
  if config and not isinstance(config, KubeflowDagRunnerConfig):
    raise TypeError('config must be type of KubeflowDagRunnerConfig.')
  super(KubeflowDagRunner, self).__init__(config or KubeflowDagRunnerConfig())
  self._config = cast(KubeflowDagRunnerConfig, self._config)
  self._output_dir = output_dir or os.getcwd()
  self._output_filename = output_filename
  self._compiler = compiler.Compiler()
  self._tfx_compiler = tfx_compiler.Compiler()
  self._params = []  # List of dsl.PipelineParam used in this pipeline.
  self._deduped_parameter_names = set()  # Set of unique param names used.
  if pod_labels_to_attach is None:
    self._pod_labels_to_attach = get_default_pod_labels()
  else:
    self._pod_labels_to_attach = pod_labels_to_attach
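# Illustrative usage sketch (not part of the original source): a minimal
# construction of the runner defined above, assuming the standard TFX
# Kubeflow imports. `create_pipeline` is a hypothetical helper returning a
# logical TFX pipeline, and the image name is a placeholder.
#
#   from tfx.orchestration.kubeflow import kubeflow_dag_runner
#
#   runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig(
#       kubeflow_metadata_config=(
#           kubeflow_dag_runner.get_default_kubeflow_metadata_config()),
#       tfx_image='gcr.io/my-project/my-tfx-image:latest')
#   kubeflow_dag_runner.KubeflowDagRunner(
#       config=runner_config,
#       output_filename='my_pipeline.tar.gz').run(create_pipeline())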
def create_pipeline() -> pipeline_pb2.Pipeline: """Creates a pipeline with a resolver node for testing.""" trainer = _trainer().with_id('my_trainer') # pylint: disable=no-value-for-parameter rnode = resolver.Resolver( strategy_class=latest_artifact_strategy.LatestArtifactStrategy, config={ 'desired_num_of_artifacts': 1 }, resolved_model=types.Channel( type=standard_artifacts.Model)).with_id('my_resolver') rnode.add_upstream_node(trainer) consumer = _consumer( resolved_model=rnode.outputs['resolved_model']).with_id('my_consumer') pipeline = pipeline_lib.Pipeline( pipeline_name='my_pipeline', pipeline_root='/path/to/root', components=[ trainer, rnode, consumer, ], execution_mode=pipeline_lib.ExecutionMode.SYNC) dsl_compiler = compiler.Compiler() return dsl_compiler.compile(pipeline)
def run(self,
        pipeline: Union[pipeline_pb2.Pipeline, pipeline_py.Pipeline]) -> None:
  """Deploys given logical pipeline on Beam.

  Args:
    pipeline: Logical pipeline object or pipeline proto in IR format.
  """
  # For CLI, while creating or updating pipeline, pipeline_args are extracted
  # and hence we avoid deploying the pipeline.
  if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
    return

  if isinstance(pipeline, pipeline_py.Pipeline):
    c = compiler.Compiler()
    pipeline = c.compile(pipeline)

  run_id = datetime.datetime.now().strftime('%Y%m%d-%H%M%S.%f')
  # Substitute the runtime parameter to be a concrete run_id
  runtime_parameter_utils.substitute_runtime_parameter(
      pipeline, {
          constants.PIPELINE_RUN_ID_PARAMETER_NAME: run_id,
      })

  deployment_config = self._extract_deployment_config(pipeline)
  connection_config = self._connection_config_from_deployment_config(
      deployment_config)

  logging.info('Running pipeline:\n %s', pipeline)
  logging.info('Using deployment config:\n %s', deployment_config)
  logging.info('Using connection config:\n %s', connection_config)

  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}):
    with beam.Pipeline() as p:
      # Used for triggering the node DoFns.
      root = p | 'CreateRoot' >> beam.Create([None])

      # Stores mapping of node to its signal.
      signal_map = {}
      # pipeline.nodes are in topological order.
      for node in pipeline.nodes:
        # TODO(b/160882349): Support subpipeline
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id
        executor_spec = self._extract_executor_spec(
            deployment_config, node_id)
        custom_driver_spec = self._extract_custom_driver_spec(
            deployment_config, node_id)

        # Signals from upstream nodes.
        signals_to_wait = []
        for upstream_node in pipeline_node.upstream_nodes:
          assert upstream_node in signal_map, ('Nodes are not in '
                                               'topological order')
          signals_to_wait.append(signal_map[upstream_node])
        logging.info('Node %s depends on %s.', node_id,
                     [s.producer.full_label for s in signals_to_wait])

        # Each signal is an empty PCollection. AsIter ensures a node will
        # be triggered after upstream nodes are finished.
        signal_map[node_id] = (
            root
            | 'Run[%s]' % node_id >> beam.ParDo(
                self._PIPELINE_NODE_DO_FN_CLS(
                    pipeline_node=pipeline_node,
                    mlmd_connection_config=connection_config,
                    pipeline_info=pipeline.pipeline_info,
                    pipeline_runtime_spec=pipeline.runtime_spec,
                    executor_spec=executor_spec,
                    custom_driver_spec=custom_driver_spec,
                    deployment_config=deployment_config),
                *[beam.pvalue.AsIter(s) for s in signals_to_wait]))
        logging.info('Node %s is scheduled.', node_id)
def testCompile(self): """Test compiling the whole pipeline.""" c = compiler.Compiler() compiled_pb = c.compile(self._pipeline) self.assertProtoEquals(self._pipeline_pb, compiled_pb)
def run(benchmarks: List[Benchmark],
        tfx_runner: Optional[tfx_runner_lib.TfxRunner] = None,
        pipeline_name: Optional[str] = None,
        pipeline_root: Optional[str] = None,
        metadata_connection_config: Optional[
            metadata_store_pb2.ConnectionConfig] = None,
        enable_cache: Optional[bool] = False,
        beam_pipeline_args: Optional[List[str]] = None,
        **kwargs) -> BenchmarkPipeline:
  """Runs the given benchmarks as part of a single pipeline DAG.

  First it concatenates all the benchmark pipelines into a single DAG
  benchmark pipeline. Next it executes the workflow via tfx_runner.run().

  When the `match` flag is set, matched benchmarks are filtered by name.

  When the `runs_per_benchmark` flag is set, each benchmark is run the number
  of times specified.

  Args:
    benchmarks: List of Benchmark instances to include in the suite.
    tfx_runner: The TfxRunner instance that defines the platform where
      benchmarks are run.
    pipeline_name: Name of the benchmark pipeline.
    pipeline_root: Path to root directory of the pipeline.
    metadata_connection_config: The config to connect to ML metadata.
    enable_cache: Whether or not cache is enabled for this run.
    beam_pipeline_args: Beam pipeline args for beam jobs within executor.
      Executor will use beam DirectRunner as default.
    **kwargs: Additional kwargs forwarded as kwargs to benchmarks.

  Returns:
    Returns the BenchmarkPipeline that was passed to the tfx_runner.

  Raises:
    ValueError: If the given tfx_runner is not supported.
  """
  if "compile_pipeline" in kwargs:
    kwargs.pop("compile_pipeline")
    logging.warning(
        "The `compile_pipeline` argument is DEPRECATED and ignored. "
        "Pipelines are now automatically compiled.")

  runs_per_benchmark = FLAGS.runs_per_benchmark
  if runs_per_benchmark is None:
    runs_per_benchmark = int(os.environ.get("NITROML_RUNS_PER_BENCHMARK", 1))

  if not tfx_runner:
    logging.info("Setting TFX runner to OSS default: BeamDagRunner.")
    tfx_runner = beam_dag_runner.BeamDagRunner()

  if runs_per_benchmark <= 0:
    raise ValueError(
        "runs_per_benchmark must be strictly positive; "
        f"got runs_per_benchmark={runs_per_benchmark} instead.")

  benchmark_subpipelines = []
  for b in benchmarks:
    for benchmark_run in range(runs_per_benchmark):
      # Call benchmarks with pipeline args.
      spec = b(
          benchmark_run=benchmark_run + 1,
          runs_per_benchmark=runs_per_benchmark,
          **kwargs)
      for benchmark_subpipeline in spec.benchmark_subpipelines:
        if re.match(FLAGS.match, benchmark_subpipeline.id):
          benchmark_subpipelines.append(benchmark_subpipeline)

  if FLAGS.match and not benchmark_subpipelines:
    if spec.components_to_always_add:
      logging.info(
          "No benchmarks matched the pattern '%s'.\n"
          "Running components passed to self.add(..., always=True) only.",
          FLAGS.match)
    else:
      raise ValueError(f"No benchmarks matched the pattern '{FLAGS.match}'")

  benchmark_pipeline = BenchmarkPipeline(
      components_to_always_add=spec.components_to_always_add,
      benchmark_subpipelines=benchmark_subpipelines,
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      metadata_connection_config=metadata_connection_config,
      enable_cache=enable_cache,
      beam_pipeline_args=beam_pipeline_args,
      **kwargs)

  logging.info("NitroML benchmarks:")
  for benchmark_name in benchmark_pipeline.benchmark_names:
    logging.info("\t%s", benchmark_name)
    logging.info("\t\tRUNNING")

  dsl_compiler = compiler.Compiler()
  pipeline_to_run = dsl_compiler.compile(benchmark_pipeline)

  if spec.requested_partial_run:
    logging.info("Only running the following nodes:\n%s",
                 "\n".join(spec.nodes_to_partial_run))
    pipeline_to_run = pipeline_filtering.filter_pipeline(
        input_pipeline=pipeline_to_run,
        pipeline_run_id_fn=(
            pipeline_filtering.make_latest_resolver_pipeline_run_id_fn(
                benchmark_pipeline.metadata_connection_config)),
        skip_nodes=lambda x: x not in set(spec.nodes_to_partial_run))

  tfx_runner.run(pipeline_to_run)

  return benchmark_pipeline
def run(
    self, pipeline: tfx_pipeline.Pipeline, run_name: Optional[str] = None
) -> None:
    """Runs given logical pipeline locally.

    Args:
        pipeline: Logical pipeline containing pipeline args and components.
        run_name: Optional name for the run.
    """
    for component in pipeline.components:
        if isinstance(component, base_component.BaseComponent):
            component._resolve_pip_dependencies(
                pipeline.pipeline_info.pipeline_root
            )

    c = compiler.Compiler()
    pipeline = c.compile(pipeline)

    run_name = run_name or datetime.now().strftime("%d_%h_%y-%H_%M_%S_%f")
    # Substitute the runtime parameter to be a concrete run_id
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline,
        {
            PIPELINE_RUN_ID_PARAMETER_NAME: run_name,
        },
    )

    deployment_config = runner_utils.extract_local_deployment_config(
        pipeline
    )
    connection_config = deployment_config.metadata_connection_config  # type: ignore[attr-defined] # noqa

    logger.debug(f"Using deployment config:\n {deployment_config}")
    logger.debug(f"Using connection config:\n {connection_config}")

    # Run each component. Note that the pipeline.components list is in
    # topological order.
    for node in pipeline.nodes:
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id
        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id
        )
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id
        )

        component_launcher = launcher.Launcher(
            pipeline_node=pipeline_node,
            mlmd_connection=metadata.Metadata(connection_config),
            pipeline_info=pipeline.pipeline_info,
            pipeline_runtime_spec=pipeline.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec,
        )

        start = time.time()
        logger.info(f"Step `{node_id}` has started.")
        component_launcher.launch()
        end = time.time()
        logger.info(
            f"Step `{node_id}` has finished"
            f" in {format_timedelta_pretty(end - start)}."
        )
def run(
    self, pipeline: tfx_pipeline.Pipeline, run_name: Optional[str] = None
) -> "airflow.DAG":
    """Deploys given logical pipeline on Airflow.

    Args:
        pipeline: Logical pipeline containing pipeline args and components.
        run_name: Optional name for the run.

    Returns:
        An Airflow DAG.
    """
    # Only import these when needed.
    import airflow  # noqa

    from zenml.integrations.airflow.orchestrators import airflow_component

    # Merge airflow-specific configs with pipeline args
    airflow_dag = airflow.DAG(
        dag_id=pipeline.pipeline_info.pipeline_name,
        **(typing.cast(
            AirflowPipelineConfig, self._config
        ).airflow_dag_config),
        is_paused_upon_creation=False,
        catchup=False,  # no backfill
    )

    if "tmp_dir" not in pipeline.additional_pipeline_args:
        tmp_dir = os.path.join(
            pipeline.pipeline_info.pipeline_root, ".temp", ""
        )
        pipeline.additional_pipeline_args["tmp_dir"] = tmp_dir

    for component in pipeline.components:
        if isinstance(component, base_component.BaseComponent):
            component._resolve_pip_dependencies(
                pipeline.pipeline_info.pipeline_root
            )
        self._replace_runtime_params(component)

    c = compiler.Compiler()
    pipeline = c.compile(pipeline)

    run_name = run_name or datetime.now().strftime("%d_%h_%y-%H_%M_%S_%f")
    # Substitute the runtime parameter to be a concrete run_id
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline,
        {
            "pipeline-run-id": run_name,
        },
    )

    deployment_config = runner_utils.extract_local_deployment_config(
        pipeline
    )
    connection_config = deployment_config.metadata_connection_config  # type: ignore[attr-defined] # noqa

    component_impl_map = {}

    for node in pipeline.nodes:
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id
        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id
        )
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id
        )

        current_airflow_component = airflow_component.AirflowComponent(
            parent_dag=airflow_dag,
            pipeline_node=pipeline_node,
            mlmd_connection=connection_config,
            pipeline_info=pipeline.pipeline_info,
            pipeline_runtime_spec=pipeline.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec,
        )
        component_impl_map[node_id] = current_airflow_component

        for upstream_node in node.pipeline_node.upstream_nodes:
            assert (
                upstream_node in component_impl_map
            ), "Components are not in topological order"
            current_airflow_component.set_upstream(
                component_impl_map[upstream_node]
            )

    return airflow_dag
def testStubbedImdbPipelineBeam(self):
  pipeline_ir = compiler.Compiler().compile(self.imdb_pipeline)

  pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                           self._recorded_output_dir, [])

  BeamDagRunner().run(pipeline_ir)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    for execution in m.store.get_executions():
      component_id = pipeline_recorder_utils.get_component_id_from_execution(
          m, execution)
      if component_id.startswith('ResolverNode'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        assert steps[0].HasField('key')
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  BeamDagRunner().run(self.imdb_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.imdb_pipeline.metadata_connection_config, self._pipeline_name)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation,
      # A subdirectory of updated_analyzer_cache has changing name.
      'updated_analyzer_cache': self._veryify_root_dir,
  }

  # List of components to verify. ResolverNode is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.imdb_pipeline.components
      if not component.id.startswith('ResolverNode')
  ]

  for component_id in verify_component_ids:
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        logging.info('Verifying %s', component_id)
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)
def _getTestPipelineIR(self) -> pipeline_pb2.Pipeline:
  test_pipeline = self._getTestPipeline()
  c = compiler.Compiler()
  return c.compile(test_pipeline)
def testStubbedTaxiPipelineBeam(self):
  pipeline_ir = compiler.Compiler().compile(self.taxi_pipeline)

  logging.info('Replacing with test_data_dir:%s', self._recorded_output_dir)
  pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                           self._recorded_output_dir, [])

  BeamDagRunner().run_with_ir(pipeline_ir)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    artifacts = m.store.get_artifacts()
    artifact_count = len(artifacts)
    executions = m.store.get_executions()
    execution_count = len(executions)
    # Artifact count is greater by 7 due to extra artifacts produced by
    # Evaluator(blessing and evaluation), Trainer(model and model_run) and
    # Transform(example, graph, cache, pre_transform_statistics,
    # pre_transform_schema, post_transform_statistics, post_transform_schema,
    # post_transform_anomalies) minus Resolver which doesn't generate
    # new artifacts.
    self.assertEqual(artifact_count, execution_count + 7)
    self.assertLen(self.taxi_pipeline.components, execution_count)

    for execution in executions:
      component_id = pipeline_recorder_utils.get_component_id_from_execution(
          m, execution)
      if component_id.startswith('Resolver'):
        continue
      eid = [execution.id]
      events = m.store.get_events_by_execution_ids(eid)
      output_events = [
          x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
      ]
      for event in output_events:
        steps = event.path.steps
        self.assertTrue(steps[0].HasField('key'))
        name = steps[0].key
        artifacts = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(artifacts):
          self.assertDirectoryEqual(
              artifact.uri,
              os.path.join(self._recorded_output_dir, component_id, name,
                           str(idx)))

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  BeamDagRunner().run(self.taxi_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.taxi_pipeline.metadata_connection_config, self._pipeline_name)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation,
      # A subdirectory of updated_analyzer_cache has changing name.
      'updated_analyzer_cache': self._veryify_root_dir,
  }

  # List of components to verify. Resolver is ignored because it
  # doesn't have an executor.
  verify_component_ids = [
      component.id
      for component in self.taxi_pipeline.components
      if not component.id.startswith('Resolver')
  ]

  for component_id in verify_component_ids:
    logging.info('Verifying %s', component_id)
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                      recorded_uri)
def _getTestPipelineIR(self) -> pipeline_pb2.Pipeline:  # pylint: disable=invalid-name
  test_pipeline = self._getTestPipeline()
  c = compiler.Compiler()
  return c.compile(test_pipeline)