Ejemplo n.º 1
0
    def testLegacyBeamDagRunnerConstruction(self):
        """Checks which class BeamDagRunner() produces for each argument mix.

        With no arguments the result must be an instance of
        beam_dag_runner.BeamDagRunner. When `config`, `beam_orchestrator_args`,
        or both are passed, the result's class must be the legacy runner and
        the passed objects must be stored on it by identity.
        """
        default_runner = beam_dag_runner.BeamDagRunner()
        self.assertIsInstance(default_runner, beam_dag_runner.BeamDagRunner)

        legacy_cls = legacy_beam_dag_runner.BeamDagRunner

        # Case 1: only a PipelineConfig is given.
        cfg = pipeline_config.PipelineConfig()
        legacy_runner = beam_dag_runner.BeamDagRunner(config=cfg)
        self.assertIs(legacy_runner.__class__, legacy_cls)
        self.assertIs(legacy_runner._config, cfg)

        # Case 2: only beam_orchestrator_args is given.
        beam_args = ['--my-beam-option']
        legacy_runner = beam_dag_runner.BeamDagRunner(
            beam_orchestrator_args=beam_args)
        self.assertIs(legacy_runner.__class__, legacy_cls)
        self.assertIs(legacy_runner._beam_orchestrator_args, beam_args)

        # Case 3: both a PipelineConfig and beam_orchestrator_args are given.
        cfg = pipeline_config.PipelineConfig()
        beam_args = ['--my-beam-option']
        legacy_runner = beam_dag_runner.BeamDagRunner(
            config=cfg, beam_orchestrator_args=beam_args)
        self.assertIs(legacy_runner.__class__, legacy_cls)
        self.assertIs(legacy_runner._config, cfg)
        self.assertIs(legacy_runner._beam_orchestrator_args, beam_args)
Ejemplo n.º 2
0
 def testInitSucceed(self):
     """Verifies PipelineConfig accepts both default and custom arguments."""
     # Init with default parameters
     pipeline_config.PipelineConfig()
     # Init with custom parameters
     pipeline_config.PipelineConfig(
         supported_launcher_classes=[
             in_process_component_launcher.InProcessComponentLauncher
         ],
         default_component_configs=[
             docker_component_config.DockerComponentConfig()
         ],
         # Must be a dict literal ('key': value). With a comma instead of a
         # colon this silently becomes a set holding the string and the config.
         component_config_overrides={
             'comp-1': docker_component_config.DockerComponentConfig()
         })
Ejemplo n.º 3
0
    def testFindComponentLaunchInfoReturnConfigOverride(self):
        """The override config registered for the component id is returned.

        Builds a container-spec component, registers both a default Docker
        config and an override keyed by '_FakeComponent.FakeComponent', and
        expects the lookup to return the Docker launcher with the override.
        """
        artifact = test_utils._InputArtifact()
        channel = channel_utils.as_channel([artifact])
        container_spec = executor_spec.ExecutorContainerSpec(
            image='gcr://test', args=['{{input_dict["input"][0].uri}}'])
        component = test_utils._FakeComponent(
            name='FakeComponent',
            input_channel=channel,
            custom_executor_spec=container_spec)

        fallback_cfg = docker_component_config.DockerComponentConfig()
        override_cfg = docker_component_config.DockerComponentConfig(
            name='test')
        p_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[
                docker_component_launcher.DockerComponentLauncher
            ],
            default_component_configs=[fallback_cfg],
            component_config_overrides={
                '_FakeComponent.FakeComponent': override_cfg
            })

        launch_info = config_utils.find_component_launch_info(
            p_config, component)
        launcher_class, c_config = launch_info

        self.assertEqual(docker_component_launcher.DockerComponentLauncher,
                         launcher_class)
        self.assertEqual(override_cfg, c_config)
Ejemplo n.º 4
0
 def testNoSupportedLaunchers(self):
   """Running with only the Docker launcher configured raises RuntimeError."""
   docker_only = [docker_component_launcher.DockerComponentLauncher]
   restricted_config = pipeline_config.PipelineConfig(
       supported_launcher_classes=docker_only)
   runner = local_dag_runner.LocalDagRunner(config=restricted_config)
   expected_message = 'No launcher info can be found'
   with self.assertRaisesRegex(RuntimeError, expected_message):
     test_pipeline = self._getTestPipeline()
     runner.run(test_pipeline)
Ejemplo n.º 5
0
  def __init__(self, config: Optional[pipeline_config.PipelineConfig] = None):
    """Creates a TfxRunner instance and stores its pipeline config.

    Args:
      config: Optional pipeline config for customizing the launching of each
        component. When omitted (or otherwise falsy), a default-constructed
        PipelineConfig is used instead.
    """
    if not config:
      config = pipeline_config.PipelineConfig()
    self._config = config
Ejemplo n.º 6
0
    def testFindComponentLaunchInfoFailWithNoLauncherClassFound(self):
        """Lookup raises RuntimeError when only the Docker launcher is supported."""
        artifact = test_utils._InputArtifact()
        component = test_utils._FakeComponent(
            name='FakeComponent',
            input_channel=channel_utils.as_channel([artifact]))
        docker_only = [docker_component_launcher.DockerComponentLauncher]
        p_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=docker_only)

        # DockerComponentLauncher cannot launch class executor.
        self.assertRaises(RuntimeError,
                          config_utils.find_component_launch_info,
                          p_config, component)
Ejemplo n.º 7
0
  def testFindComponentLaunchInfoReturnDefaultLaunchInfo(self):
    """A default PipelineConfig yields the in-process launcher and no config."""
    artifact = types.Artifact(type_name='InputPath')
    channel = channel_utils.as_channel([artifact])
    component = test_utils._FakeComponent(
        name='FakeComponent', input_channel=channel)
    p_config = pipeline_config.PipelineConfig()

    launch_info = config_utils.find_component_launch_info(p_config, component)
    launcher_class, c_config = launch_info

    self.assertEqual(in_process_component_launcher.InProcessComponentLauncher,
                     launcher_class)
    self.assertIsNone(c_config)
Ejemplo n.º 8
0
    def __init__(self,
                 config: Optional[pipeline_config.PipelineConfig] = None):
        """Sets up the local TFX orchestrator.

        Args:
          config: Optional pipeline config for customizing the launching of
            each component. When None, a pipeline config supporting
            InProcessComponentLauncher and DockerComponentLauncher is built.
        """
        if config is None:
            default_launchers = [
                in_process_component_launcher.InProcessComponentLauncher,
                docker_component_launcher.DockerComponentLauncher,
            ]
            config = pipeline_config.PipelineConfig(
                supported_launcher_classes=default_launchers)
        super(LocalDagRunner, self).__init__(config)
Ejemplo n.º 9
0
    def testDockerComponentLauncherInBeam(self):
        """Runs a pipeline through BeamDagRunner with the Docker launcher.

        After the run, the metadata store at self._metadata_path is expected
        to contain exactly one execution.
        """
        docker_pipeline_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[
                docker_component_launcher.DockerComponentLauncher
            ],
            default_component_configs=[
                docker_component_config.DockerComponentConfig()
            ])
        runner = beam_dag_runner.BeamDagRunner(config=docker_pipeline_config)
        pipeline_under_test = _create_pipeline(
            pipeline_name=self._pipeline_name,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            name='docker_e2e_test_in_beam')
        runner.run(pipeline_under_test)

        connection_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        with metadata.Metadata(connection_config) as m:
            recorded_executions = m.store.get_executions()
            self.assertEqual(1, len(recorded_executions))
Ejemplo n.º 10
0
    def __init__(self,
                 beam_orchestrator_args: Optional[List[Text]] = None,
                 config: Optional[pipeline_config.PipelineConfig] = None):
        """Sets up BeamDagRunner as a TFX orchestrator.

        Args:
          beam_orchestrator_args: Beam args for the beam orchestrator. Note
            that this is different from the beam_pipeline_args within
            additional_pipeline_args, which is for beam pipelines in
            components.
          config: Optional pipeline config for customizing the launching of
            each component. When None, a pipeline config supporting
            InProcessComponentLauncher and DockerComponentLauncher is built.
        """
        if config is None:
            default_launchers = [
                in_process_component_launcher.InProcessComponentLauncher,
                docker_component_launcher.DockerComponentLauncher,
            ]
            config = pipeline_config.PipelineConfig(
                supported_launcher_classes=default_launchers)
        super(BeamDagRunner, self).__init__(config)
        self._beam_orchestrator_args = beam_orchestrator_args
Ejemplo n.º 11
0
    def testStubbedTaxiPipelineBeam(self):
        """Runs the taxi pipeline with stub launchers and checks its outputs.

        First compares every output artifact of the stubbed run against the
        recorded directory for its component id / output key / index. Then
        reruns the pipeline with a default LocalDagRunner and applies the
        per-output-key verifiers to the pipeline outputs.
        """
        # Run pipeline with stub executors.
        stub_component_launcher.StubComponentLauncher.initialize(
            test_data_dir=self._recorded_output_dir, test_component_ids=[])

        stub_launchers = [stub_component_launcher.StubComponentLauncher]
        stub_pipeline_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=stub_launchers)
        stub_runner = local_dag_runner.LocalDagRunner(
            config=stub_pipeline_config)
        stub_runner.run(self.taxi_pipeline)

        self.assertTrue(fileio.exists(self._metadata_path))

        connection_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(connection_config) as m:
            artifact_count = len(m.store.get_artifacts())
            executions = m.store.get_executions()
            execution_count = len(executions)
            # Artifact count is greater by 3 due to extra artifacts produced by
            # Evaluator(blessing and evaluation), Trainer(model and model_run) and
            # Transform(example, graph, cache) minus Resolver which doesn't generate
            # new artifact.
            self.assertEqual(artifact_count, execution_count + 3)
            self.assertLen(self.taxi_pipeline.components, execution_count)

            for execution in executions:
                component_id = execution.properties[
                    metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
                if component_id.startswith('ResolverNode'):
                    continue
                events = m.store.get_events_by_execution_ids([execution.id])
                for event in events:
                    # Only output events carry artifacts to compare.
                    if event.type != metadata_store_pb2.Event.OUTPUT:
                        continue
                    first_step = event.path.steps[0]
                    self.assertTrue(first_step.HasField('key'))
                    output_key = first_step.key
                    output_artifacts = m.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(output_artifacts):
                        expected_dir = os.path.join(
                            self._recorded_output_dir, component_id,
                            output_key, str(idx))
                        self.assertDirectoryEqual(artifact.uri, expected_dir)

        # Calls verifier for pipeline output artifacts, excluding the resolver node.
        local_dag_runner.LocalDagRunner().run(self.taxi_pipeline)
        pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
            self.taxi_pipeline.metadata_connection_config,
            self.taxi_pipeline.pipeline_info)

        verifier_map = {
            'model': self._verify_model,
            'model_run': self._verify_model,
            'examples': self._verify_examples,
            'schema': self._verify_schema,
            'anomalies': self._verify_anomalies,
            'evaluation': self._verify_evaluation
        }

        for component in self.taxi_pipeline.components:
            component_id = component.id
            # ResolverNode is ignored because it doesn't have an executor.
            if component_id.startswith('ResolverNode'):
                continue
            logging.info('Verifying %s', component_id)
            for key, artifact_dict in pipeline_outputs[component_id].items():
                for idx, artifact in artifact_dict.items():
                    recorded_uri = os.path.join(self._recorded_output_dir,
                                                component_id, key, str(idx))
                    # Output keys without a dedicated verifier fall back to
                    # the file-path verifier.
                    verify = verifier_map.get(key, self._verify_file_path)
                    verify(artifact.uri, recorded_uri)
    def testStubbedImdbPipelineBeam(self):
        """Runs the IMDB pipeline with stub launchers and checks its outputs.

        First compares every output artifact of the stubbed run against the
        recorded directory for its component id / output key / index. Then
        reruns the pipeline with a default BeamDagRunner and applies the
        per-output-key verifiers to the pipeline outputs.
        """
        # Run pipeline with stub executors.
        stub_component_launcher.StubComponentLauncher.initialize(
            test_data_dir=self._recorded_output_dir, test_component_ids=[])

        stub_pipeline_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[
                stub_component_launcher.StubComponentLauncher,
            ])
        BeamDagRunner(config=stub_pipeline_config).run(self.imdb_pipeline)

        self.assertTrue(fileio.exists(self._metadata_path))

        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(metadata_config) as m:
            for execution in m.store.get_executions():
                component_id = execution.properties[
                    metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
                if component_id.startswith('ResolverNode'):
                    continue
                eid = [execution.id]
                events = m.store.get_events_by_execution_ids(eid)
                output_events = [
                    x for x in events
                    if x.type == metadata_store_pb2.Event.OUTPUT
                ]
                for event in output_events:
                    steps = event.path.steps
                    # Use a TestCase assertion rather than a bare `assert`,
                    # which is stripped when Python runs with -O.
                    self.assertTrue(steps[0].HasField('key'))
                    name = steps[0].key
                    artifacts = m.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(artifacts):
                        self.assertDirectoryEqual(
                            artifact.uri,
                            os.path.join(self._recorded_output_dir,
                                         component_id, name, str(idx)))

        # Calls verifier for pipeline output artifacts, excluding the resolver node.
        BeamDagRunner().run(self.imdb_pipeline)
        pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
            self.imdb_pipeline.metadata_connection_config,
            self.imdb_pipeline.pipeline_info)

        verifier_map = {
            'model': self._verify_model,
            'model_run': self._verify_model,
            'examples': self._verify_examples,
            'schema': self._verify_schema,
            'anomalies': self._verify_anomalies,
            'evaluation': self._verify_evaluation
        }

        # List of components to verify. ResolverNode is ignored because it
        # doesn't have an executor.
        verify_component_ids = [
            component.id for component in self.imdb_pipeline.components
            if not component.id.startswith('ResolverNode')
        ]

        for component_id in verify_component_ids:
            # Log once per component instead of once per verified artifact.
            logging.info('Verifying %s', component_id)
            for key, artifact_dict in pipeline_outputs[component_id].items():
                for idx, artifact in artifact_dict.items():
                    recorded_uri = os.path.join(self._recorded_output_dir,
                                                component_id, key, str(idx))
                    # Output keys without a dedicated verifier fall back to
                    # the file-path verifier.
                    verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                                  recorded_uri)
    def testTaxiPipelineBeam(self):
        """Records a taxi pipeline run, replays it with stubs, and compares.

        The pipeline is first run for real and recorded into
        self._recorded_output_dir. It is then rebuilt against
        self._metadata_path and run with a stub launcher for every component
        except the model resolver. Finally each output artifact of the stubbed
        run is compared with its recorded directory.
        """

        def build_pipeline(metadata_path):
            # Both runs share every argument except the metadata path.
            return taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=metadata_path,
                beam_pipeline_args=[])

        # Runs the pipeline and record to self._recorded_output_dir
        record_taxi_pipeline = build_pipeline(self._recorded_mlmd_path)
        BeamDagRunner().run(record_taxi_pipeline)
        pipeline_recorder_utils.record_pipeline(
            output_dir=self._recorded_output_dir,
            metadata_db_uri=self._recorded_mlmd_path,
            host=None,
            port=None,
            pipeline_name=self._pipeline_name,
            run_id=None)

        # Run pipeline with stub executors.
        taxi_pipeline = build_pipeline(self._metadata_path)

        model_resolver_id = 'ResolverNode.latest_blessed_model_resolver'
        stubbed_component_ids = [
            node.id for node in taxi_pipeline.components
            if node.id != model_resolver_id
        ]

        stub_launcher = stub_component_launcher.get_stub_launcher_class(
            test_data_dir=self._recorded_output_dir,
            stubbed_component_ids=stubbed_component_ids,
            stubbed_component_map={})
        stub_pipeline_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[stub_launcher])
        stub_runner = BeamDagRunner(config=stub_pipeline_config)
        stub_runner.run(taxi_pipeline)

        self.assertTrue(tf.io.gfile.exists(self._metadata_path))

        connection_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(connection_config) as m:
            artifact_count = len(m.store.get_artifacts())
            executions = m.store.get_executions()
            execution_count = len(executions)
            # Artifact count is greater by 3 due to extra artifacts produced by
            # Evaluator(blessing and evaluation), Trainer(model and model_run) and
            # Transform(example, graph, cache) minus Resolver which doesn't generate
            # new artifact.
            self.assertEqual(artifact_count, execution_count + 3)
            self.assertLen(taxi_pipeline.components, execution_count)

            for execution in executions:
                component_id = execution.properties[
                    metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
                # The model resolver was not stubbed, so it is skipped here.
                if component_id == model_resolver_id:
                    continue
                events = m.store.get_events_by_execution_ids([execution.id])
                for event in events:
                    # Only output events carry artifacts to compare.
                    if event.type != metadata_store_pb2.Event.OUTPUT:
                        continue
                    first_step = event.path.steps[0]
                    self.assertTrue(first_step.HasField('key'))
                    output_key = first_step.key
                    output_artifacts = m.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(output_artifacts):
                        expected_dir = os.path.join(
                            self._recorded_output_dir, component_id,
                            output_key, str(idx))
                        self.assertDirectoryEqual(artifact.uri, expected_dir)
Ejemplo n.º 14
0
 def testInitFailWithDupDefaultComponentConfigClasses(self):
     """Two default configs of the same class make PipelineConfig raise."""
     with self.assertRaises(ValueError):
         # Two separate instances of the same config class.
         duplicated_configs = [
             docker_component_config.DockerComponentConfig()
             for _ in range(2)
         ]
         pipeline_config.PipelineConfig(
             default_component_configs=duplicated_configs)
Ejemplo n.º 15
0
 def testInitFailWithDupLauncherClasses(self):
     """Listing the same launcher class twice makes PipelineConfig raise."""
     with self.assertRaises(ValueError):
         duplicated_launchers = [
             in_process_component_launcher.InProcessComponentLauncher
         ] * 2
         pipeline_config.PipelineConfig(
             supported_launcher_classes=duplicated_launchers)