Beispiel #1
0
  def testSimplePipelineRun(self):
    """Runs the Load -> Train -> Validate dummy pipeline with LocalDagRunner.

    The pipeline root and the SQLite metadata file are placed in a fresh
    temporary directory, which is removed again once the test finishes.
    Afterwards `RAN_COMPONENTS` must read ['Load', 'Train', 'Validate'].
    """
    import shutil  # Local import: only needed for the temp-dir cleanup below.

    self.assertEqual(self.RAN_COMPONENTS, [])

    # Construct component instances; each one is wired to the outputs of the
    # previous component.
    dummy_load_component = LoadDummyDatasetComponent()
    dummy_train_component = DummyTrainComponent(
        training_data=dummy_load_component.outputs['dataset'], num_iterations=5)
    dummy_validate_component = DummyValidateComponent(
        model=dummy_train_component.outputs['model'],
        loss=dummy_train_component.outputs['loss'],
        accuracy=dummy_train_component.outputs['accuracy'])

    # Construct and run pipeline. The cleanup is registered immediately after
    # the directory is created so it is removed even when the run or a later
    # assertion fails (mkdtemp never deletes the directory on its own).
    temp_path = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, temp_path, ignore_errors=True)
    pipeline_root_path = os.path.join(temp_path, 'pipeline_root')
    metadata_path = os.path.join(temp_path, 'metadata.db')
    test_pipeline = pipeline.Pipeline(
        pipeline_name='test_pipeline',
        pipeline_root=pipeline_root_path,
        metadata_connection_config=sqlite_metadata_connection_config(
            metadata_path),
        components=[
            dummy_load_component,
            dummy_train_component,
            dummy_validate_component,
        ])
    local_dag_runner.LocalDagRunner().run(test_pipeline)

    self.assertEqual(self.RAN_COMPONENTS, ['Load', 'Train', 'Validate'])
 def testPatcher(self, mock_run):
     """Checks that a patched LocalDagRunner run skips `mock_run`.

     While the patcher is active, running a pipeline must not call the
     injected `mock_run` mock, and the patch context must expose the
     pipeline name under the patcher's PIPELINE_NAME key.
     """
     dag_runner_patcher = local_dag_runner_patcher.LocalDagRunnerPatcher()
     with dag_runner_patcher.patch() as patch_context:
         runner = local_dag_runner.LocalDagRunner()
         runner.run(tfx_pipeline.Pipeline(_PIPELINE_NAME, ''))
         mock_run.assert_not_called()
         recorded_name = patch_context[dag_runner_patcher.PIPELINE_NAME]
         self.assertEqual(recorded_name, _PIPELINE_NAME)
Beispiel #3
0
 def testRunWithIR(self):
     """Runs the test pipeline IR and expects components a..e, in order."""
     expected_order = [
         '_FakeComponent.a',
         '_FakeComponent.b',
         '_FakeComponent.c',
         '_FakeComponent.d',
         '_FakeComponent.e',
     ]
     runner = local_dag_runner.LocalDagRunner()
     runner.run_with_ir(self._getTestPipelineIR())
     self.assertEqual(_executed_components, expected_order)
Beispiel #4
0
 def testPartialRun(self):
     """Runs the test pipeline with to_nodes=['c'] and expects a, b, c."""
     runner = local_dag_runner.LocalDagRunner()
     test_pipeline = self._getTestPipeline()
     # Ask the runner to stop at node 'c'.
     stop_at_c = pipeline_py.RunOptions(to_nodes=['c'])
     runner.run(test_pipeline, run_options=stop_at_c)

     expected_order = [
         '_FakeComponent.a',
         '_FakeComponent.b',
         '_FakeComponent.c',
     ]
     self.assertEqual(_executed_components, expected_order)
Beispiel #5
0
 def testNoSupportedLaunchers(self):
     """Expects a RuntimeError when only the Docker launcher is configured."""
     docker_only = [docker_component_launcher.DockerComponentLauncher]
     launcher_config = pipeline_config.PipelineConfig(
         supported_launcher_classes=docker_only)
     runner = local_dag_runner.LocalDagRunner(config=launcher_config)

     expected_message = 'No launcher info can be found'
     with self.assertRaisesRegex(RuntimeError, expected_message):
         runner.run(self._getTestPipeline())
Beispiel #6
0
  def testSimplePipelinePartialRun(self):
    """Runs the test pipeline up to 'Train' and expects only Load and Train."""
    self.assertEqual(self.RAN_COMPONENTS, [])

    runner = local_dag_runner.LocalDagRunner()
    test_pipeline = self._getTestPipeline()
    # Ask the runner to stop at the 'Train' node.
    stop_at_train = pipeline_py.RunOptions(to_nodes=['Train'])
    runner.run(test_pipeline, run_options=stop_at_train)

    self.assertEqual(self.RAN_COMPONENTS, ['Load', 'Train'])
Beispiel #7
0
 def testPartialRunWithIR(self):
     """Runs the pipeline IR as a partial run ending at 'c'; expects a, b, c."""
     # Partial-run request: stop at node 'c' and mark
     # `latest_pipeline_run_strategy` as set in the snapshot settings.
     partial_run = pipeline_pb2.PartialRun()
     partial_run.to_nodes.append('c')
     partial_run.snapshot_settings.latest_pipeline_run_strategy.SetInParent()

     runner = local_dag_runner.LocalDagRunner()
     pipeline_ir = self._getTestPipelineIR()
     ir_run_options = pipeline_pb2.RunOptions(partial_run=partial_run)
     runner.run_with_ir(pipeline_ir, run_options=ir_run_options)

     expected_order = [
         '_FakeComponent.a',
         '_FakeComponent.b',
         '_FakeComponent.c',
     ]
     self.assertEqual(_executed_components, expected_order)
Beispiel #8
0
  def testSimplePipelinePartialRunWithIR(self):
    """Runs the pipeline IR as a partial run ending at 'Train'.

    Only 'Load' and 'Train' are expected in `RAN_COMPONENTS` afterwards.
    """
    self.assertEqual(self.RAN_COMPONENTS, [])

    # Partial-run request: stop at 'Train' and mark
    # `latest_pipeline_run_strategy` as set in the snapshot settings.
    partial_run = pipeline_pb2.PartialRun()
    partial_run.to_nodes.append('Train')
    partial_run.snapshot_settings.latest_pipeline_run_strategy.SetInParent()

    runner = local_dag_runner.LocalDagRunner()
    pipeline_ir = self._getTestPipelineIR()
    ir_run_options = pipeline_pb2.RunOptions(partial_run=partial_run)
    runner.run_with_ir(pipeline_ir, run_options=ir_run_options)

    self.assertEqual(self.RAN_COMPONENTS, ['Load', 'Train'])
Beispiel #9
0
    def setUp(self):
        """Records one taxi pipeline run and builds the pipeline under test.

        A pipeline writing to `_recorded_mlmd_path` is run with
        LocalDagRunner and its outputs are recorded into
        `_recorded_output_dir`. A second pipeline with the same settings but
        writing to `_metadata_path` is stored on `self.taxi_pipeline`; it is
        not run here.
        """
        super(TaxiPipelineRegressionEndToEndTest, self).setUp()

        outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                      self.get_temp_dir())
        self._test_dir = os.path.join(outputs_root, self._testMethodName)
        self._pipeline_name = 'beam_stub_test'

        # The taxi data and the taxi utility module are looked up next to the
        # taxi_pipeline_beam module; adjust these paths if they live elsewhere.
        example_dir = os.path.dirname(taxi_pipeline_beam.__file__)
        self._data_root = os.path.join(example_dir, 'data', 'simple')
        self._module_file = os.path.join(example_dir, 'taxi_utils.py')

        # Everything produced by the test lives below the per-test directory.
        tfx_dir = os.path.join(self._test_dir, 'tfx')
        self._serving_model_dir = os.path.join(self._test_dir, 'serving_model')
        self._pipeline_root = os.path.join(tfx_dir, 'pipelines',
                                           self._pipeline_name)
        # MLMD database written by the recorded (real) run.
        self._recorded_mlmd_path = os.path.join(tfx_dir, 'record',
                                                'metadata.db')
        # MLMD database used by the pipeline stored on self.taxi_pipeline.
        self._metadata_path = os.path.join(tfx_dir, 'metadata',
                                           self._pipeline_name, 'metadata.db')
        self._recorded_output_dir = os.path.join(self._test_dir, 'testdata')

        def build_pipeline(metadata_path):
            # Both pipelines differ only in their metadata path. A fresh
            # beam_pipeline_args list is passed on every call.
            return taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=metadata_path,
                beam_pipeline_args=[])

        # Run the pipeline once and record it into self._recorded_output_dir.
        recording_pipeline = build_pipeline(self._recorded_mlmd_path)
        local_dag_runner.LocalDagRunner().run(recording_pipeline)
        pipeline_recorder_utils.record_pipeline(
            output_dir=self._recorded_output_dir,
            metadata_db_uri=self._recorded_mlmd_path,
            pipeline_name=self._pipeline_name)

        self.taxi_pipeline = build_pipeline(self._metadata_path)
Beispiel #10
0
    def setUp(self):
        """Records one IMDB pipeline run and builds the pipeline under test.

        A pipeline writing to `_recorded_mlmd_path` is run with
        LocalDagRunner and its outputs are recorded into
        `_recorded_output_dir`. A second pipeline with the same settings but
        writing to `_metadata_path` is stored on `self.imdb_pipeline`; it is
        not run here.
        """
        super(ImdbStubPipelineRegressionEndToEndTest, self).setUp()

        outputs_root = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR',
                                      self.get_temp_dir())
        self._test_dir = os.path.join(outputs_root, self._testMethodName)
        self._pipeline_name = 'imdb_stub_test'

        # The imdb data and the imdb utility module are looked up next to the
        # imdb_pipeline_native_keras module; adjust if they live elsewhere.
        example_dir = os.path.dirname(imdb_pipeline_native_keras.__file__)
        self._data_root = os.path.join(example_dir, 'data')
        self._module_file = os.path.join(example_dir,
                                         'imdb_utils_native_keras.py')

        self._serving_model_dir = os.path.join(self._test_dir, 'serving_model')
        self._pipeline_root = os.path.join(self._test_dir, 'pipelines',
                                           self._pipeline_name)
        # MLMD database written by the recorded (real) run.
        self._recorded_mlmd_path = os.path.join(self._test_dir, 'record',
                                                'metadata.db')
        # MLMD database used by the pipeline stored on self.imdb_pipeline.
        self._metadata_path = os.path.join(self._test_dir, 'metadata',
                                           self._pipeline_name, 'metadata.db')
        self._recorded_output_dir = os.path.join(self._test_dir, 'testdata')

        def build_pipeline(metadata_path):
            # Both pipelines differ only in their metadata path. A fresh
            # beam_pipeline_args list is passed on every call.
            return imdb_pipeline_native_keras._create_pipeline(  # pylint:disable=protected-access
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=metadata_path,
                beam_pipeline_args=[])

        # Run the pipeline once and record it into self._recorded_output_dir.
        recording_pipeline = build_pipeline(self._recorded_mlmd_path)
        local_dag_runner.LocalDagRunner().run(recording_pipeline)
        pipeline_recorder_utils.record_pipeline(
            output_dir=self._recorded_output_dir,
            metadata_db_uri=self._recorded_mlmd_path,
            pipeline_name=self._pipeline_name)

        # Built with the separate metadata path; the tests decide how to run it.
        self.imdb_pipeline = build_pipeline(self._metadata_path)
Beispiel #11
0
def run():
    """Create the pipeline from the module-level settings and run it locally.

    The pipeline is built by `pipeline.create_pipeline` from the values in
    `configs` and the module-level path constants, then executed with
    `LocalDagRunner`.
    """
    runner = local_dag_runner.LocalDagRunner()

    train_args = trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS)
    eval_args = trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS)
    connection_config = metadata.sqlite_metadata_connection_config(
        METADATA_PATH)

    # NOTE: Use `query` instead of `data_path` to use BigQueryExampleGen:
    #   query=configs.BIG_QUERY_QUERY
    # NOTE: Provide GCP configs to use BigQuery with Beam DirectRunner:
    #   beam_pipeline_args=configs.
    #       BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS
    local_pipeline = pipeline.create_pipeline(
        pipeline_name=configs.PIPELINE_NAME,
        pipeline_root=PIPELINE_ROOT,
        data_path=DATA_PATH,
        preprocessing_fn=configs.PREPROCESSING_FN,
        run_fn=configs.RUN_FN,
        train_args=train_args,
        eval_args=eval_args,
        eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD,
        serving_model_dir=SERVING_MODEL_DIR,
        metadata_connection_config=connection_config,
    )
    runner.run(local_pipeline)
Beispiel #12
0
    def testStubbedTaxiPipelineBeam(self):
        """Runs the taxi pipeline with stub launchers and verifies its outputs.

        First the pipeline is run with `StubComponentLauncher` and every
        output artifact is compared against the recorded output directory.
        Then the pipeline is run again with the default launchers and each
        component's outputs are checked with the per-key verifier functions.
        """
        # Run pipeline with stub executors.
        stub_component_launcher.StubComponentLauncher.initialize(
            test_data_dir=self._recorded_output_dir, test_component_ids=[])
        stub_only_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[
                stub_component_launcher.StubComponentLauncher,
            ])
        stub_runner = local_dag_runner.LocalDagRunner(config=stub_only_config)
        stub_runner.run(self.taxi_pipeline)

        self.assertTrue(fileio.exists(self._metadata_path))

        connection_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(connection_config) as mlmd:
            all_artifacts = mlmd.store.get_artifacts()
            num_artifacts = len(all_artifacts)
            all_executions = mlmd.store.get_executions()
            num_executions = len(all_executions)
            # Artifact count is greater by 3 due to extra artifacts produced by
            # Evaluator(blessing and evaluation), Trainer(model and model_run) and
            # Transform(example, graph, cache) minus Resolver which doesn't generate
            # new artifact.
            self.assertEqual(num_artifacts, num_executions + 3)
            self.assertLen(self.taxi_pipeline.components, num_executions)

            for execution in all_executions:
                component_id = execution.properties[
                    metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
                if component_id.startswith('ResolverNode'):
                    continue
                execution_events = mlmd.store.get_events_by_execution_ids(
                    [execution.id])
                for event in execution_events:
                    # Only OUTPUT events are compared.
                    if event.type != metadata_store_pb2.Event.OUTPUT:
                        continue
                    first_step = event.path.steps[0]
                    self.assertTrue(first_step.HasField('key'))
                    output_key = first_step.key
                    event_artifacts = mlmd.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(event_artifacts):
                        recorded_dir = os.path.join(
                            self._recorded_output_dir, component_id,
                            output_key, str(idx))
                        self.assertDirectoryEqual(artifact.uri, recorded_dir)

        # Calls verifier for pipeline output artifacts, excluding the resolver node.
        local_dag_runner.LocalDagRunner().run(self.taxi_pipeline)
        pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
            self.taxi_pipeline.metadata_connection_config,
            self.taxi_pipeline.pipeline_info)

        # Output key -> verifier; keys not listed here fall back to
        # self._verify_file_path.
        verifiers_by_key = {
            'model': self._verify_model,
            'model_run': self._verify_model,
            'examples': self._verify_examples,
            'schema': self._verify_schema,
            'anomalies': self._verify_anomalies,
            'evaluation': self._verify_evaluation
        }

        # List of components to verify. ResolverNode is ignored because it
        # doesn't have an executor.
        component_ids_to_verify = [
            node.id for node in self.taxi_pipeline.components
            if not node.id.startswith('ResolverNode')
        ]

        for component_id in component_ids_to_verify:
            logging.info('Verifying %s', component_id)
            component_outputs = pipeline_outputs[component_id]
            for key, artifact_dict in component_outputs.items():
                for idx, artifact in artifact_dict.items():
                    recorded_uri = os.path.join(self._recorded_output_dir,
                                                component_id, key, str(idx))
                    verify = verifiers_by_key.get(key, self._verify_file_path)
                    verify(artifact.uri, recorded_uri)
        f"--direct_num_workers={direct_num_workers}",
        f"--direct_running_mode=multi_processing",
    ]
    tfx_pipeline = pipeline.Pipeline(
        pipeline_name=config.PIPELINE_NAME,
        pipeline_root=config.PIPELINE_ROOT,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            config.METADATA_PATH),
        beam_pipeline_args=beam_arg,
    )
    return tfx_pipeline


# %%
if __name__ == "__main__":
    # Initialise the components from the configured data, module-file and
    # serving-model paths.
    tfx_components = init_components(
        config.DATA_DIR_PATH, config.MODULE_FILE_PATH, config.SERVING_MODEL_DIR)
    # %%
    # NOTE(review): the literal 4 is presumably the number of Beam direct
    # workers - confirm against init_pipeline's signature.
    tfx_pipeline = init_pipeline(tfx_components, config.PIPELINE_ROOT, 4)

    # %%
    # LocalDagRunner() does not work inside an ipykernel session: either run
    # this file from a terminal, or call context.run(component) when working
    # inside ipykernel.
    local_dag_runner.LocalDagRunner().run(tfx_pipeline)
Beispiel #14
0
# SQLite file passed to the pipeline as its metadata path; it lives under
# <_tfx_root>/metadata/<_pipeline_name>/.
_metadata_path = os.path.join(
    _tfx_root, 'metadata', _pipeline_name, 'metadata.db')


def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Builds a one-component TFX pipeline that ingests CSV data.

    Args:
      pipeline_name: Name given to the pipeline.
      pipeline_root: Root directory passed to the pipeline.
      data_root: Location handed to `external_input` as the CSV source.
      metadata_path: Path used for the SQLite metadata connection config.

    Returns:
      A `pipeline.Pipeline` whose only component is a `CsvExampleGen`, with
      caching enabled.
    """
    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=external_input(data_root))

    connection_config = metadata.sqlite_metadata_connection_config(
        metadata_path)
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen],
        enable_cache=True,
        metadata_connection_config=connection_config,
        additional_pipeline_args={},
    )


if __name__ == '__main__':
    # Log at INFO level while the pipeline runs.
    absl.logging.set_verbosity(absl.logging.INFO)
    # Build the pipeline from the module-level settings and run it with the
    # local DAG runner.
    local_dag_runner.LocalDagRunner().run(
        _create_pipeline(
            pipeline_name=_pipeline_name,
            pipeline_root=_pipeline_root,
            data_root=_data_root,
            metadata_path=_metadata_path,
        ))
Beispiel #15
0
# Path to a SQLite DB file to use as an MLMD storage.
METADATA_PATH = os.path.join('metadata', PIPELINE_NAME, 'metadata.db')
# Output directory where created models from the pipeline will be exported.
SERVING_MODEL_DIR = os.path.join('serving_model', PIPELINE_NAME)

# NOTE(review): the imports in this script sit between statements rather than
# at the top of the file; the statement order is kept as-is because the
# download below runs before the pipeline module is imported.
from absl import logging

logging.set_verbosity(logging.INFO)  # Set default logging level.

import urllib.request
import tempfile

# Download the example CSV into a fresh temporary directory. The directory is
# not removed by this script.
DATA_ROOT = tempfile.mkdtemp(
    prefix='tfx-data')  # Create a temporary directory.
_data_url = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/penguin/data/penguins_processed.csv'
_data_filepath = os.path.join(DATA_ROOT, "data.csv")
# Network access happens here at import/run time; urlretrieve raises on failure.
urllib.request.urlretrieve(_data_url, _data_filepath)

# File name passed to the pipeline as its `module_file` (a relative path).
_trainer_module_file = 'penguin_trainer.py'

from tfx.orchestration.local import local_dag_runner
from pipeline import _create_pipeline

# Build the pipeline from the settings above and run it with the local DAG
# runner.
local_dag_runner.LocalDagRunner().run(
    _create_pipeline(pipeline_name=PIPELINE_NAME,
                     pipeline_root=PIPELINE_ROOT,
                     data_root=DATA_ROOT,
                     module_file=_trainer_module_file,
                     serving_model_dir=SERVING_MODEL_DIR,
                     metadata_path=METADATA_PATH))
Beispiel #16
0
  def testSimplePipelineRunWithIR(self):
    """Runs the test pipeline IR and expects Load, Train, Validate to run."""
    self.assertEqual(self.RAN_COMPONENTS, [])

    runner = local_dag_runner.LocalDagRunner()
    pipeline_ir = self._getTestPipelineIR()
    runner.run_with_ir(pipeline_ir)

    expected_order = ['Load', 'Train', 'Validate']
    self.assertEqual(self.RAN_COMPONENTS, expected_order)