def testLegacyBeamDagRunnerConstruction(self):
  self.assertIsInstance(beam_dag_runner.BeamDagRunner(),
                        beam_dag_runner.BeamDagRunner)

  # Test that the legacy Beam DAG runner is used when a PipelineConfig is
  # specified.
  config = pipeline_config.PipelineConfig()
  runner = beam_dag_runner.BeamDagRunner(config=config)
  self.assertIs(runner.__class__, legacy_beam_dag_runner.BeamDagRunner)
  self.assertIs(runner._config, config)

  # Test that the legacy Beam DAG runner is used when beam_orchestrator_args
  # is specified.
  beam_orchestrator_args = ['--my-beam-option']
  runner = beam_dag_runner.BeamDagRunner(
      beam_orchestrator_args=beam_orchestrator_args)
  self.assertIs(runner.__class__, legacy_beam_dag_runner.BeamDagRunner)
  self.assertIs(runner._beam_orchestrator_args, beam_orchestrator_args)

  # Test that the legacy Beam DAG runner is used when both a PipelineConfig
  # and beam_orchestrator_args are specified.
  config = pipeline_config.PipelineConfig()
  beam_orchestrator_args = ['--my-beam-option']
  runner = beam_dag_runner.BeamDagRunner(
      config=config, beam_orchestrator_args=beam_orchestrator_args)
  self.assertIs(runner.__class__, legacy_beam_dag_runner.BeamDagRunner)
  self.assertIs(runner._config, config)
  self.assertIs(runner._beam_orchestrator_args, beam_orchestrator_args)
def testBeamExecutionNonNullableReturnError(self):
  """Test failure when None used for non-optional primitive return value."""
  instance_1 = _injector_3()  # pylint: disable=no-value-for-parameter
  self.assertEqual(1, len(instance_1.outputs['examples'].get()))
  instance_2 = _optionalarg_component(  # pylint: disable=assignment-from-no-return
      foo=9,
      bar='secret',
      examples=instance_1.outputs['examples'],
      a=instance_1.outputs['a'],
      b=instance_1.outputs['b'],
      c=instance_1.outputs['c'],
      d=instance_1.outputs['d'],
      e1=instance_1.outputs['e'],
      e2=instance_1.outputs['e'],
      g=999.0,
      optional_examples_1=instance_1.outputs['examples'])

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  test_pipeline = pipeline.Pipeline(
      pipeline_name='test_pipeline_1',
      pipeline_root=self._test_dir,
      metadata_connection_config=metadata_config,
      components=[instance_1, instance_2])

  with self.assertRaisesRegex(
      ValueError, 'Non-nullable output \'e\' received None return value'):
    beam_dag_runner.BeamDagRunner().run(test_pipeline)
def testBeamExecutionFailure(self):
  """Test execution with return values; failure case."""
  instance_1 = _injector_1(foo=9, bar='secret')
  instance_2 = _simple_component(
      a=instance_1.outputs['a'],
      b=instance_1.outputs['b'],
      c=instance_1.outputs['c'],
      d=instance_1.outputs['d'])
  # Swapped 'e' and 'f'.
  instance_3 = _verify(
      e=instance_2.outputs['f'],
      f=instance_2.outputs['e'],
      g=instance_2.outputs['g'],
      h=instance_2.outputs['h'])  # pylint: disable=assignment-from-no-return

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  test_pipeline = pipeline.Pipeline(
      pipeline_name='test_pipeline_1',
      pipeline_root=self._test_dir,
      metadata_connection_config=metadata_config,
      components=[instance_1, instance_2, instance_3])

  with self.assertRaisesRegex(
      RuntimeError, r'AssertionError: \(220.0, 32.0, \'OK\', None\)'):
    beam_dag_runner.BeamDagRunner().run(test_pipeline)
def testRun(self):
  component_a = _FakeComponent(
      _FakeComponentSpecA(output=types.Channel(type=_ArtifactTypeA)))
  component_b = _FakeComponent(
      _FakeComponentSpecB(
          a=component_a.outputs['output'],
          output=types.Channel(type=_ArtifactTypeB)))
  component_c = _FakeComponent(
      _FakeComponentSpecC(
          a=component_a.outputs['output'],
          output=types.Channel(type=_ArtifactTypeC)))
  component_c.add_upstream_node(component_b)
  component_d = _FakeComponent(
      _FakeComponentSpecD(
          b=component_b.outputs['output'],
          c=component_c.outputs['output'],
          output=types.Channel(type=_ArtifactTypeD)))
  component_e = _FakeComponent(
      _FakeComponentSpecE(
          a=component_a.outputs['output'],
          b=component_b.outputs['output'],
          d=component_d.outputs['output'],
          output=types.Channel(type=_ArtifactTypeE)))

  test_pipeline = pipeline.Pipeline(
      pipeline_name='x',
      pipeline_root='y',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[
          component_d, component_c, component_a, component_b, component_e
      ])

  beam_dag_runner.BeamDagRunner().run(test_pipeline)
  self.assertEqual(_executed_components, [
      '_FakeComponent.a', '_FakeComponent.b', '_FakeComponent.c',
      '_FakeComponent.d', '_FakeComponent.e'
  ])
def testBeamExecutionOptionalInputsAndParameters(self):
  """Test execution with optional inputs and parameters."""
  instance_1 = _injector_2()  # pylint: disable=no-value-for-parameter
  self.assertEqual(1, len(instance_1.outputs['examples'].get()))
  instance_2 = _optionalarg_component(  # pylint: disable=assignment-from-no-return
      foo=9,
      bar='secret',
      examples=instance_1.outputs['examples'],
      a=instance_1.outputs['a'],
      b=instance_1.outputs['b'],
      c=instance_1.outputs['c'],
      d=instance_1.outputs['d'],
      e1=instance_1.outputs['e'],
      e2=instance_1.outputs['e'],
      g=999.0,
      optional_examples_1=instance_1.outputs['examples'])

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  test_pipeline = pipeline.Pipeline(
      pipeline_name='test_pipeline_1',
      pipeline_root=self._test_dir,
      metadata_connection_config=metadata_config,
      components=[instance_1, instance_2])

  beam_dag_runner.BeamDagRunner().run(test_pipeline)
def testRun(self):
  component_a = _FakeComponent(
      _FakeComponentSpecA(output=types.Channel(type_name='a')))
  component_b = _FakeComponent(
      _FakeComponentSpecB(
          a=component_a.outputs.output,
          output=types.Channel(type_name='b')))
  component_c = _FakeComponent(
      _FakeComponentSpecC(
          a=component_a.outputs.output,
          output=types.Channel(type_name='c')))
  component_d = _FakeComponent(
      _FakeComponentSpecD(
          b=component_b.outputs.output,
          c=component_c.outputs.output,
          output=types.Channel(type_name='d')))
  component_e = _FakeComponent(
      _FakeComponentSpecE(
          a=component_a.outputs.output,
          b=component_b.outputs.output,
          d=component_d.outputs.output,
          output=types.Channel(type_name='e')))

  test_pipeline = pipeline.Pipeline(
      pipeline_name='x',
      pipeline_root='y',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[
          component_d, component_c, component_a, component_b, component_e
      ])

  beam_dag_runner.BeamDagRunner().run(test_pipeline)
  self.assertCountEqual(_executed_components, [
      '_FakeComponent.a', '_FakeComponent.b', '_FakeComponent.c',
      '_FakeComponent.d', '_FakeComponent.e'
  ])
  self.assertEqual(_executed_components[0], '_FakeComponent.a')
  self.assertEqual(_executed_components[3], '_FakeComponent.d')
  self.assertEqual(_executed_components[4], '_FakeComponent.e')
def testPatcher(self, mock_run):
  patcher = beam_dag_runner_patcher.BeamDagRunnerPatcher()
  with patcher.patch() as context:
    beam_dag_runner.BeamDagRunner().run(
        tfx_pipeline.Pipeline(_PIPELINE_NAME, ''))
    mock_run.assert_not_called()
    self.assertEqual(context[patcher.PIPELINE_NAME], _PIPELINE_NAME)
def testPartialRunWithIntermediateDeploymentConfig(self):
  self._pipeline.deployment_config.Pack(_INTERMEDIATE_DEPLOYMENT_CONFIG)
  pr_opts = pipeline_pb2.PartialRun()
  pr_opts.from_nodes.append('my_trainer')
  pr_opts.to_nodes.append('my_trainer')
  pr_opts.snapshot_settings.latest_pipeline_run_strategy.SetInParent()
  beam_dag_runner.BeamDagRunner().run_with_ir(
      self._pipeline,
      run_options=pipeline_pb2.RunOptions(partial_run=pr_opts))
  self.assertEqual(_executed_components, ['my_trainer'])
def run_pipeline(self,
                 components: List[base_component.BaseComponent]) -> None:
  """Creates and runs a pipeline with the given components."""
  runner = beam_dag_runner.BeamDagRunner()
  runner.run(
      pipeline.Pipeline(
          pipeline_name=self.pipeline_name,
          pipeline_root=self.pipeline_root,
          metadata_connection_config=self.metadata_config,
          beam_pipeline_args=[],
          components=components))
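# Usage sketch (hypothetical; assumes the enclosing test case sets up
# pipeline_name, pipeline_root, and metadata_config, and that
# `my_component` is a TFX component instance built elsewhere):
#
#   self.run_pipeline(components=[my_component])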
def testDockerComponentLauncherInBeam(self):
  beam_dag_runner.BeamDagRunner().run(
      _create_pipeline(
          pipeline_name=self._pipeline_name,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          name='docker_e2e_test_in_beam'))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    self.assertEqual(1, len(m.store.get_executions()))
def testRunWithIntermediateDeploymentConfig(self):
  self._pipeline.deployment_config.Pack(_INTERMEDIATE_DEPLOYMENT_CONFIG)
  beam_dag_runner.BeamDagRunner().run(self._pipeline)
  self.assertEqual(
      _component_executors, {
          'my_example_gen':
              text_format.Parse(
                  'class_path: "tfx.components.example_gen_executor"',
                  _PythonClassExecutableSpec()),
          'my_transform':
              text_format.Parse(
                  'class_path: "tfx.components.transform_executor"',
                  _PythonClassExecutableSpec()),
          'my_trainer':
              text_format.Parse('image: "path/to/docker/image"',
                                _ContainerExecutableSpec()),
          'my_importer':
              None,
      })
  self.assertEqual(
      _component_drivers, {
          'my_example_gen':
              text_format.Parse(
                  'class_path: "tfx.components.example_gen_driver"',
                  _PythonClassExecutableSpec()),
          'my_transform':
              None,
          'my_trainer':
              None,
          'my_importer':
              None,
      })
  self.assertEqual(
      _component_platform_configs, {
          'my_example_gen':
              None,
          'my_transform':
              None,
          'my_trainer':
              text_format.Parse('docker_server_url: "docker/server/url"',
                                _DockerPlatformConfig()),
          'my_importer':
              None,
      })
  # 'my_importer' has no upstream and can be executed in any order.
  self.assertIn('my_importer', _executed_components)
  _executed_components.remove('my_importer')
  self.assertEqual(_executed_components,
                   ['my_example_gen', 'my_transform', 'my_trainer'])
  # Verifies that every component gets a not-None pipeline_run.
  self.assertTrue(all(_conponent_to_pipeline_run.values()))
def testRun(self):
  component_a = _FakeComponent(
      _FakeComponentSpecA(output=types.Channel(type=_ArtifactTypeA)),
      enable_cache=True)
  component_b = _FakeComponent(
      _FakeComponentSpecB(
          a=component_a.outputs['output'],
          output=types.Channel(type=_ArtifactTypeB)),
      enable_cache=False)
  component_c = _FakeComponent(
      _FakeComponentSpecC(
          a=component_a.outputs['output'],
          output=types.Channel(type=_ArtifactTypeC)), True)
  component_d = _FakeComponent(
      _FakeComponentSpecD(
          b=component_b.outputs['output'],
          c=component_c.outputs['output'],
          output=types.Channel(type=_ArtifactTypeD)), False)
  component_e = _FakeComponent(
      _FakeComponentSpecE(
          a=component_a.outputs['output'],
          b=component_b.outputs['output'],
          d=component_d.outputs['output'],
          output=types.Channel(type=_ArtifactTypeE)))

  test_pipeline = pipeline.Pipeline(
      pipeline_name='x',
      pipeline_root='y',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[
          component_d, component_c, component_a, component_b, component_e
      ])

  beam_dag_runner.BeamDagRunner().run(test_pipeline)
  self.assertCountEqual(_executed_components, [
      '_FakeComponent.a', '_FakeComponent.b', '_FakeComponent.c',
      '_FakeComponent.d', '_FakeComponent.e'
  ])
  self.assertEqual(_executed_components[0], '_FakeComponent.a')
  self.assertEqual(_executed_components[3], '_FakeComponent.d')
  self.assertEqual(_executed_components[4], '_FakeComponent.e')
  self.assertDictEqual(
      {
          '_FakeComponent.a': True,
          '_FakeComponent.b': False,
          '_FakeComponent.c': True,
          '_FakeComponent.d': False,
          '_FakeComponent.e': False,
      }, _executed_components_cached)
def testDockerComponentLauncherInBeam(self):
  beam_dag_runner.BeamDagRunner(
      config=pipeline_config.PipelineConfig(
          supported_launcher_classes=[
              docker_component_launcher.DockerComponentLauncher
          ],
          default_component_configs=[
              docker_component_config.DockerComponentConfig()
          ])).run(
              _create_pipeline(
                  pipeline_name=self._pipeline_name,
                  pipeline_root=self._pipeline_root,
                  metadata_path=self._metadata_path,
                  name='docker_e2e_test_in_beam'))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  with metadata.Metadata(metadata_config) as m:
    self.assertEqual(1, len(m.store.get_executions()))
def testBeamExecutionSuccess(self):
  """Test execution with return values; success case."""
  instance_1 = _injector_1(foo=9, bar='secret')
  instance_2 = _simple_component(
      a=instance_1.outputs['a'],
      b=instance_1.outputs['b'],
      c=instance_1.outputs['c'],
      d=instance_1.outputs['d'])
  instance_3 = _verify(
      e=instance_2.outputs['e'],
      f=instance_2.outputs['f'])  # pylint: disable=assignment-from-no-return

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  test_pipeline = pipeline.Pipeline(
      pipeline_name='test_pipeline_1',
      pipeline_root=self._test_dir,
      metadata_connection_config=metadata_config,
      components=[instance_1, instance_2, instance_3])

  beam_dag_runner.BeamDagRunner().run(test_pipeline)
def run_benchmarks(self, benchmarks: List[nitroml.Benchmark],
                   **kwargs) -> None:
  """Runs the given benchmarks with nitroml using a BeamDagRunner.

  Args:
    benchmarks: List of `nitroml.Benchmark` to run.
    **kwargs: Keyword args to pass to `nitroml#run`.
  """
  nitroml.run(
      benchmarks,
      pipeline_name=kwargs.pop("pipeline_name", self.pipeline_name),
      pipeline_root=kwargs.pop("pipeline_root", self.pipeline_root),
      metadata_connection_config=kwargs.pop("metadata_connection_config",
                                            self.metadata_config),
      tfx_runner=kwargs.pop("tfx_runner", beam_dag_runner.BeamDagRunner()),
      beam_pipeline_args=kwargs.pop("beam_pipeline_args", []),
      **kwargs)
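# Usage sketch (hypothetical; `MyBenchmark` is a `nitroml.Benchmark`
# subclass assumed to be defined elsewhere). Keywords not popped above,
# e.g. `enable_cache`, are forwarded to `nitroml.run` through **kwargs:
#
#   self.run_benchmarks([MyBenchmark()], enable_cache=False)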
def testComponentAnnotation(self):
  """Test component annotation parsed from decorator param."""
  instance_1 = _injector_1_with_annotation(foo=9, bar='secret')
  instance_2 = _simple_component_with_annotation(
      a=instance_1.outputs['a'],
      b=instance_1.outputs['b'],
      c=instance_1.outputs['c'],
      d=instance_1.outputs['d'])
  instance_3 = _verify_with_annotation(
      e=instance_2.outputs['e'],
      f=instance_2.outputs['f'],
      g=instance_2.outputs['g'],
      h=instance_2.outputs['h'])  # pylint: disable=assignment-from-no-return

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  test_pipeline = pipeline.Pipeline(
      pipeline_name='test_pipeline_1',
      pipeline_root=self._test_dir,
      metadata_connection_config=metadata_config,
      components=[instance_1, instance_2, instance_3])

  beam_dag_runner.BeamDagRunner().run(test_pipeline)

  # Verify base_type annotation parsed from component decorator is correct.
  self.assertEqual(test_pipeline.components[0].type,
                   '__main__._injector_1_with_annotation')
  self.assertEqual(
      test_pipeline.components[0].type_annotation.MLMD_SYSTEM_BASE_TYPE, 1)
  self.assertEqual(test_pipeline.components[1].type,
                   '__main__._simple_component_with_annotation')
  self.assertEqual(
      test_pipeline.components[1].type_annotation.MLMD_SYSTEM_BASE_TYPE, 2)
  self.assertEqual(test_pipeline.components[2].type,
                   '__main__._verify_with_annotation')
  self.assertEqual(
      test_pipeline.components[2].type_annotation.MLMD_SYSTEM_BASE_TYPE, 3)
def run(benchmarks: List[Benchmark],
        tfx_runner: Optional[tfx_runner_lib.TfxRunner] = None,
        pipeline_name: Optional[str] = None,
        pipeline_root: Optional[str] = None,
        metadata_connection_config: Optional[
            metadata_store_pb2.ConnectionConfig] = None,
        enable_cache: Optional[bool] = False,
        beam_pipeline_args: Optional[List[str]] = None,
        **kwargs) -> BenchmarkPipeline:
  """Runs the given benchmarks as part of a single pipeline DAG.

  First it concatenates all the benchmark pipelines into a single DAG
  benchmark pipeline. Next it executes the workflow via tfx_runner.run().

  When the `match` flag is set, matched benchmarks are filtered by name.

  When the `runs_per_benchmark` flag is set, each benchmark is run the number
  of times specified.

  Args:
    benchmarks: List of Benchmark instances to include in the suite.
    tfx_runner: The TfxRunner instance that defines the platform where
      benchmarks are run.
    pipeline_name: Name of the benchmark pipeline.
    pipeline_root: Path to root directory of the pipeline.
    metadata_connection_config: The config to connect to ML metadata.
    enable_cache: Whether or not cache is enabled for this run.
    beam_pipeline_args: Beam pipeline args for beam jobs within executor.
      Executor will use beam DirectRunner as default.
    **kwargs: Additional kwargs forwarded as kwargs to benchmarks.

  Returns:
    Returns the BenchmarkPipeline that was passed to the tfx_runner.

  Raises:
    ValueError: If the given tfx_runner is not supported.
  """
  if "compile_pipeline" in kwargs:
    kwargs.pop("compile_pipeline")
    logging.warning(
        "The `compile_pipeline` argument is DEPRECATED and ignored. "
        "Pipelines are now automatically compiled.")

  runs_per_benchmark = FLAGS.runs_per_benchmark
  if runs_per_benchmark is None:
    runs_per_benchmark = int(os.environ.get("NITROML_RUNS_PER_BENCHMARK", 1))

  if not tfx_runner:
    logging.info("Setting TFX runner to OSS default: BeamDagRunner.")
    tfx_runner = beam_dag_runner.BeamDagRunner()

  if runs_per_benchmark <= 0:
    raise ValueError("runs_per_benchmark must be strictly positive; "
                     f"got runs_per_benchmark={runs_per_benchmark} instead.")

  benchmark_subpipelines = []
  for b in benchmarks:
    for benchmark_run in range(runs_per_benchmark):
      # Call benchmarks with pipeline args.
      spec = b(
          benchmark_run=benchmark_run + 1,
          runs_per_benchmark=runs_per_benchmark,
          **kwargs)
      for benchmark_subpipeline in spec.benchmark_subpipelines:
        if re.match(FLAGS.match, benchmark_subpipeline.id):
          benchmark_subpipelines.append(benchmark_subpipeline)

  if FLAGS.match and not benchmark_subpipelines:
    if spec.components_to_always_add:
      logging.info(
          "No benchmarks matched the pattern '%s'.\n"
          "Running components passed to self.add(..., always=True) only.",
          FLAGS.match)
    else:
      raise ValueError(f"No benchmarks matched the pattern '{FLAGS.match}'")

  benchmark_pipeline = BenchmarkPipeline(
      components_to_always_add=spec.components_to_always_add,
      benchmark_subpipelines=benchmark_subpipelines,
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      metadata_connection_config=metadata_connection_config,
      enable_cache=enable_cache,
      beam_pipeline_args=beam_pipeline_args,
      **kwargs)

  logging.info("NitroML benchmarks:")
  for benchmark_name in benchmark_pipeline.benchmark_names:
    logging.info("\t%s", benchmark_name)
    logging.info("\t\tRUNNING")

  dsl_compiler = compiler.Compiler()
  pipeline_to_run = dsl_compiler.compile(benchmark_pipeline)
  if spec.requested_partial_run:
    logging.info("Only running the following nodes:\n%s",
                 "\n".join(spec.nodes_to_partial_run))
    pipeline_to_run = pipeline_filtering.filter_pipeline(
        input_pipeline=pipeline_to_run,
        pipeline_run_id_fn=(
            pipeline_filtering.make_latest_resolver_pipeline_run_id_fn(
                benchmark_pipeline.metadata_connection_config)),
        skip_nodes=lambda x: x not in set(spec.nodes_to_partial_run))
  tfx_runner.run(pipeline_to_run)

  return benchmark_pipeline
def run(benchmarks: List[Benchmark],
        tfx_runner: Optional[tfx_runner_lib.TfxRunner] = None,
        pipeline_name: Optional[Text] = None,
        pipeline_root: Optional[Text] = None,
        metadata_connection_config: Optional[
            metadata_store_pb2.ConnectionConfig] = None,
        enable_cache: Optional[bool] = False,
        beam_pipeline_args: Optional[List[Text]] = None,
        **kwargs) -> List[Text]:
  """Runs the given benchmarks as part of a single pipeline DAG.

  First it concatenates all the benchmark pipelines into a single DAG
  benchmark pipeline. Next it executes the workflow via tfx_runner.run().

  When the `match` flag is set, matched benchmarks are filtered by name.

  When the `runs_per_benchmark` flag is set, each benchmark is run the number
  of times specified.

  Args:
    benchmarks: List of Benchmark instances to include in the suite.
    tfx_runner: The TfxRunner instance that defines the platform where
      benchmarks are run.
    pipeline_name: Name of the benchmark pipeline.
    pipeline_root: Path to root directory of the pipeline.
    metadata_connection_config: The config to connect to ML metadata.
    enable_cache: Whether or not cache is enabled for this run.
    beam_pipeline_args: Beam pipeline args for beam jobs within executor.
      Executor will use beam DirectRunner as default.
    **kwargs: Additional kwargs forwarded as kwargs to benchmarks.

  Returns:
    The string list of benchmark names that were included in this run.

  Raises:
    ValueError: If the given tfx_runner is not supported.
  """
  runs_per_benchmark = FLAGS.runs_per_benchmark
  if runs_per_benchmark is None:
    runs_per_benchmark = int(os.environ.get("NITROML_RUNS_PER_BENCHMARK", 1))

  if not tfx_runner:
    logging.info("Setting TFX runner to OSS default: BeamDagRunner.")
    tfx_runner = beam_dag_runner.BeamDagRunner()

  if runs_per_benchmark <= 0:
    raise ValueError("runs_per_benchmark must be strictly positive; "
                     f"got runs_per_benchmark={runs_per_benchmark} instead.")

  pipelines = []
  for b in benchmarks:
    for benchmark_run in range(runs_per_benchmark):
      # Call benchmarks with pipeline args.
      result = b(**kwargs)
      for pipeline in result.pipelines:
        if re.match(FLAGS.match, pipeline.benchmark_name):
          pipelines.append(
              _RepeatablePipeline(
                  pipeline,
                  repetition=benchmark_run + 1,  # One-index runs.
                  num_repetitions=runs_per_benchmark,
                  add_publisher=pipeline.evaluator is not None))
  pipeline_builder = _ConcatenatedPipelineBuilder(pipelines)
  benchmark_pipeline = pipeline_builder.build(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      metadata_connection_config=metadata_connection_config,
      enable_cache=enable_cache,
      beam_pipeline_args=beam_pipeline_args,
      **kwargs)
  tfx_runner.run(benchmark_pipeline)
  return pipeline_builder.benchmark_names
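# Usage sketch for this legacy variant (hypothetical `MyBenchmark` as above);
# unlike the newer API, it returns the list of benchmark names that ran:
#
#   names = run(
#       [MyBenchmark()],
#       pipeline_name="nitroml_demo",
#       pipeline_root="/tmp/nitroml")
#   for name in names:
#     logging.info("Ran benchmark: %s", name)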