Example #1
 def testAIPlatformDistributedTunerPipeline(self):
     """Tuner-only pipeline for distributed Tuner flock on AIP Training."""
     pipeline_name = 'kubeflow-aip-dist-tuner-test-{}'.format(
         test_utils.random_id())
     pipeline = self._create_pipeline(
         pipeline_name,
         [
             self.iris_examples_importer,
             self.iris_schema_importer,
             ai_platform_tuner_component.Tuner(
                 examples=self.iris_examples_importer.outputs['result'],
                 module_file=self._iris_tuner_module,
                 schema=self.iris_schema_importer.outputs['result'],
                 train_args=trainer_pb2.TrainArgs(num_steps=10),
                 eval_args=trainer_pb2.EvalArgs(num_steps=5),
                 # 3 worker parallel tuning.
                 tune_args=tuner_pb2.TuneArgs(num_parallel_trials=3),
                 custom_config={
                     ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                     self._getCaipTrainingArgs(pipeline_name)
                 })
         ])
     self._compile_and_run_pipeline(pipeline)
     self._assertHyperparametersAreWritten(pipeline_name)
Example #2
 def testAIPlatformTrainerPipeline(self):
     """Trainer-only test pipeline on AI Platform Training."""
     pipeline_name = 'kubeflow-aip-trainer-test-{}'.format(
         test_utils.random_id())
     pipeline = self._create_pipeline(pipeline_name, [
         self.schema_importer, self.transformed_examples_importer,
         self.transform_graph_importer,
         Trainer(custom_executor_spec=executor_spec.ExecutorClassSpec(
             ai_platform_trainer_executor.Executor),
                 module_file=self._trainer_module,
                 transformed_examples=self.transformed_examples_importer.
                 outputs['result'],
                 schema=self.schema_importer.outputs['result'],
                 transform_graph=self.transform_graph_importer.
                 outputs['result'],
                 train_args=trainer_pb2.TrainArgs(num_steps=10),
                 eval_args=trainer_pb2.EvalArgs(num_steps=5),
                 custom_config={
                     ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                      self._getCaipTrainingArgsForDistributed(pipeline_name)
                 })
     ])
     self._compile_and_run_pipeline(pipeline)
     self._assertNumberOfTrainerOutputIsOne(pipeline_name)
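
Both tests above hand a dict of AI Platform job settings to the component via TRAINING_ARGS_KEY. As a rough sketch of what a helper like _getCaipTrainingArgs might return; the project, region, bucket, and image values below are placeholders, not taken from the original suite:

 def _getCaipTrainingArgs(self, pipeline_name):
     """Hypothetical helper: job settings forwarded to AI Platform Training."""
     return {
         'project': 'my-gcp-project',  # placeholder GCP project ID
         'region': 'us-central1',  # placeholder region
         'jobDir': 'gs://my-bucket/{}/job'.format(pipeline_name),
         'masterConfig': {
             # Custom container with this TFX version installed.
             'imageUri': 'gcr.io/my-gcp-project/tfx-image:latest',
         },
     }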
Example #3
 def testAIPlatformGenericTrainerPipeline(self):
     """Trainer-only pipeline on AI Platform Training with GenericTrainer."""
     pipeline_name = self._make_unique_pipeline_name(
         'kubeflow-aip-generic-trainer')
     pipeline = self._create_pipeline(pipeline_name, [
         self.schema_importer, self.transformed_examples_importer,
         self.transform_graph_importer,
         Trainer(custom_executor_spec=executor_spec.ExecutorClassSpec(
             ai_platform_trainer_executor.GenericExecutor),
                 module_file=self._trainer_module,
                 transformed_examples=self.transformed_examples_importer.
                 outputs['result'],
                 schema=self.schema_importer.outputs['result'],
                 transform_graph=self.transform_graph_importer.
                 outputs['result'],
                 train_args=trainer_pb2.TrainArgs(num_steps=10),
                 eval_args=trainer_pb2.EvalArgs(num_steps=5),
                 custom_config={
                     ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                     self._getCaipTrainingArgs(pipeline_name)
                 })
     ])
     self._compile_and_run_pipeline(pipeline)
     self._assertNumberOfTrainerOutputIsOne(pipeline_name)
Example #4
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model_exports=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={
            # LINT.IfChange
            'beam_pipeline_args': [
                # ----- Beam Args -----.
                '--runner=PortableRunner',
                # Points to the job server started in
                # setup_beam_on_(flink|spark).sh
                '--job_endpoint=localhost:8099',
                '--environment_type=LOOPBACK',
                # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment.  # pylint: disable=g-bad-todo
                # TODO(BEAM-5167): Use concurrency information from SDK Harness.  # pylint: disable=g-bad-todo
                # Note: We use 100 worker threads to mitigate the issue with
                # scheduling work between the Beam runner and SDK harness. Flink
                # and Spark can process unlimited work items concurrently, while
                # SdkHarness can only process one work item per worker thread.
                # Having 100 threads lets 100 tasks execute concurrently,
                # avoiding scheduling issues in most cases. If the threads are
                # exhausted, Beam prints a relevant message in the log.
                '--experiments=worker_threads=100',
                # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
                '--experiments=pre_optimize=all',
                # ----- Flink runner-specific Args -----.
                # TODO(b/126725506): Set the task parallelism based on cpu cores.
                # TODO(FLINK-10672): Obviate setting BATCH_FORCED.
                '--execution_mode_for_batch=BATCH_FORCED',
            ],
            # LINT.ThenChange(setup/setup_beam_on_spark.sh)
            # LINT.ThenChange(../chicago_taxi/setup_beam_on_flink.sh)
        })
Example #5
    def setUp(self):
        super(ExecutorTest, self).setUp()
        self._source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        self._output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create input dict.
        e1 = standard_artifacts.Examples()
        e1.uri = os.path.join(self._source_data_dir,
                              'transform/transformed_examples')
        e1.split_names = artifact_utils.encode_split_names(['train', 'eval'])

        e2 = copy.deepcopy(e1)

        self._single_artifact = [e1]
        self._multiple_artifacts = [e1, e2]

        transform_graph = standard_artifacts.TransformGraph()
        transform_graph.uri = os.path.join(self._source_data_dir,
                                           'transform/transform_graph')

        schema = standard_artifacts.Schema()
        schema.uri = os.path.join(self._source_data_dir, 'schema_gen')
        previous_model = standard_artifacts.Model()
        previous_model.uri = os.path.join(self._source_data_dir,
                                          'trainer/previous')

        self._input_dict = {
            standard_component_specs.EXAMPLES_KEY: self._single_artifact,
            standard_component_specs.TRANSFORM_GRAPH_KEY: [transform_graph],
            standard_component_specs.SCHEMA_KEY: [schema],
            standard_component_specs.BASE_MODEL_KEY: [previous_model]
        }

        # Create output dict.
        self._model_exports = standard_artifacts.Model()
        self._model_exports.uri = os.path.join(self._output_data_dir,
                                               'model_export_path')
        self._model_run_exports = standard_artifacts.ModelRun()
        self._model_run_exports.uri = os.path.join(self._output_data_dir,
                                                   'model_run_path')
        self._output_dict = {
            standard_component_specs.MODEL_KEY: [self._model_exports],
            standard_component_specs.MODEL_RUN_KEY: [self._model_run_exports]
        }

        # Create exec properties skeleton.
        self._exec_properties = {
            standard_component_specs.TRAIN_ARGS_KEY:
            proto_utils.proto_to_json(trainer_pb2.TrainArgs(num_steps=1000)),
            standard_component_specs.EVAL_ARGS_KEY:
            proto_utils.proto_to_json(trainer_pb2.EvalArgs(num_steps=500)),
            'warm_starting':
            False,
        }

        self._module_file = os.path.join(
            self._source_data_dir, standard_component_specs.MODULE_FILE_KEY,
            'trainer_module.py')
        self._trainer_fn = '%s.%s' % (trainer_module.trainer_fn.__module__,
                                      trainer_module.trainer_fn.__name__)

        # Executors for test.
        self._trainer_executor = executor.Executor()
        self._generic_trainer_executor = executor.GenericExecutor()
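
A typical test method then feeds these fixtures through the executor's Do() call; a minimal sketch, assuming the module file is wired in via exec_properties and treating a bare existence check as the success criterion:

    def testDoWithModuleFile(self):
        # Point the trainer at the module file prepared in setUp().
        self._exec_properties[
            standard_component_specs.MODULE_FILE_KEY] = self._module_file
        self._trainer_executor.Do(self._input_dict, self._output_dict,
                                  self._exec_properties)
        # The executor should have exported a model under the output URI.
        self.assertTrue(os.path.exists(self._model_exports.uri))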
Example #6
    def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
        """
        Builds the training pipeline as a series of TFX components.

        Args:
            config: A ZenML configuration in dictionary format.

        Returns:
            A chronological list of TFX components making up the training
            pipeline.
        """
        steps = config[keys.GlobalKeys.STEPS]

        component_list = []

        ############
        # RAW DATA #
        ############
        data_config = steps[keys.TrainingSteps.DATA]
        data = DataGen(source=data_config[keys.StepKeys.SOURCE],
                       source_args=data_config[keys.StepKeys.ARGS]).with_id(
                           GDPComponent.DataGen.name)

        statistics_data = StatisticsGen(
            examples=data.outputs.examples).with_id(
                GDPComponent.DataStatistics.name)

        schema_data = SchemaGen(
            statistics=statistics_data.outputs.output, ).with_id(
                GDPComponent.DataSchema.name)

        component_list.extend([data, statistics_data, schema_data])

        datapoints = data.outputs.examples

        #############
        # SPLITTING #
        #############
        # Block to read the data from the corresponding BQ table
        split_config = steps[keys.TrainingSteps.SPLIT]
        splits = SplitGen(
            input_examples=datapoints,
            source=split_config[keys.StepKeys.SOURCE],
            source_args=split_config[keys.StepKeys.ARGS],
            schema=schema_data.outputs.schema,
            statistics=statistics_data.outputs.output,
        ).with_id(GDPComponent.SplitGen.name)

        datapoints = splits.outputs.examples

        statistics_split = StatisticsGen(examples=datapoints).with_id(
            GDPComponent.SplitStatistics.name)

        schema_split = SchemaGen(
            statistics=statistics_split.outputs.output, ).with_id(
                GDPComponent.SplitSchema.name)

        schema = schema_split.outputs.schema

        component_list.extend([splits, statistics_split, schema_split])

        ##############
        # SEQUENCING #
        ##############
        if keys.TrainingSteps.SEQUENCER in steps:
            sequencer_config = steps[keys.TrainingSteps.SEQUENCER]
            sequencer = Sequencer(
                input_examples=datapoints,
                schema=schema,
                statistics=statistics_split.outputs.statistics,
                source=sequencer_config[keys.StepKeys.SOURCE],
                source_args=sequencer_config[keys.StepKeys.ARGS]).with_id(
                    GDPComponent.Sequencer.name)

            sequencer_statistics = StatisticsGen(
                examples=sequencer.outputs.output_examples).with_id(
                    GDPComponent.SequencerStatistics.name)

            sequencer_schema = SchemaGen(
                statistics=sequencer_statistics.outputs.output,
                infer_feature_shape=True,
            ).with_id(GDPComponent.SequencerSchema.name)

            datapoints = sequencer.outputs.output_examples
            schema = sequencer_schema.outputs.schema

            component_list.extend(
                [sequencer, sequencer_statistics, sequencer_schema])

        #################
        # PREPROCESSING #
        #################
        transform = Transform(
            preprocessing_fn=constants.PREPROCESSING_FN,
            examples=datapoints,
            schema=schema,
            custom_config=steps[keys.TrainingSteps.PREPROCESSER]).with_id(
                GDPComponent.Transform.name)

        component_list.extend([transform])

        ############
        # TRAINING #
        ############
        training_backend: TrainingLocalBackend = \
            self.backends_dict[TrainingLocalBackend.BACKEND_KEY]
        training_kwargs = {
            'custom_executor_spec': training_backend.get_executor_spec(),
            'custom_config': steps[keys.TrainingSteps.TRAINER]
        }
        training_kwargs['custom_config'].update(
            training_backend.get_custom_config())

        trainer = Trainer(
            transformed_examples=transform.outputs.transformed_examples,
            transform_graph=transform.outputs.transform_graph,
            run_fn=constants.TRAINER_FN,
            schema=schema,
            train_args=trainer_pb2.TrainArgs(),
            eval_args=trainer_pb2.EvalArgs(),
            **training_kwargs).with_id(GDPComponent.Trainer.name)

        component_list.extend([trainer])

        #############
        # EVALUATOR #
        #############
        if keys.TrainingSteps.EVALUATOR in steps:
            from zenml.utils import source_utils
            eval_module = '.'.join(
                constants.EVALUATOR_MODULE_FN.split('.')[:-1])
            eval_module_file = constants.EVALUATOR_MODULE_FN.split('.')[-1]
            abs_path = source_utils.get_absolute_path_from_module(eval_module)
            custom_extractor_path = os.path.join(abs_path,
                                                 eval_module_file) + '.py'
            eval_step: TFMAEvaluator = TFMAEvaluator.from_config(
                steps[keys.TrainingSteps.EVALUATOR])
            eval_config = eval_step.build_eval_config()
            evaluator = Evaluator(
                examples=transform.outputs.transformed_examples,
                model=trainer.outputs.model,
                eval_config=eval_config,
                module_file=custom_extractor_path,
            ).with_id(GDPComponent.Evaluator.name)
            component_list.append(evaluator)

        ###########
        # SERVING #
        ###########
        if keys.TrainingSteps.DEPLOYER in steps:
            serving_args = steps[keys.TrainingSteps.DEPLOYER]['args']

            project_id = serving_args['project_id']
            output_base_dir = self.artifact_store.path
            if 'model_name' in serving_args:
                model_name = serving_args['model_name']
            else:
                model_name = self.pipeline_name().replace('-', '_')

            gcaip_deployer = GCAIPDeployer(output_base_dir=output_base_dir,
                                           project_id=project_id,
                                           model_name=model_name)

            pusher_config = gcaip_deployer.build_pusher_config()
            pusher_executor_spec = gcaip_deployer.get_executor_spec()

            pusher = Pusher(model_export=trainer.outputs.output,
                            custom_executor_spec=pusher_executor_spec,
                            **pusher_config).with_id(
                                GDPComponent.Deployer.name)

            component_list.append(pusher)

        return component_list
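
The returned list is meant to be handed to a TFX pipeline object by the caller; a minimal sketch, assuming the surrounding orchestrator supplies the pipeline name, root, and metadata path:

from tfx.orchestration import metadata, pipeline

def build_pipeline(component_list, pipeline_name, pipeline_root, metadata_path):
    # Assemble the generated components into a logical TFX pipeline.
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=component_list,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path))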
Example #7
def generate_pipeline(pipeline_name, pipeline_root, train_data, test_data,
                      train_steps, eval_steps, pusher_target, runner):
    module_file = 'util.py'  # util.py is a file in the same folder

    # RuntimeParameter is only supported on KubeflowDagRunner currently
    if runner == 'kubeflow':
        pipeline_root_param = os.path.join('gs://{{kfp-default-bucket}}',
                                           pipeline_name, '{{workflow.uid}}')
        train_data_param = data_types.RuntimeParameter(
            name='train-data',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/train',
            ptype=Text)
        test_data_param = data_types.RuntimeParameter(
            name='test-data',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/test',
            ptype=Text)
        pusher_target_param = data_types.RuntimeParameter(
            name='pusher-destination',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/serving',
            ptype=Text)
    else:
        pipeline_root_param = pipeline_root
        train_data_param = train_data
        test_data_param = test_data
        pusher_target_param = pusher_target

    examples = external_input(train_data_param)
    example_gen = CsvExampleGen(input=examples, instance_name="train")

    test_examples = external_input(test_data_param)
    test_example_gen = CsvExampleGen(input=test_examples,
                                     output_config={
                                         'split_config': {
                                             'splits': [{
                                                 'name': 'test',
                                                 'hash_buckets': 1
                                             }]
                                         }
                                     },
                                     instance_name="test")

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True
                           )  # infer_feature_shape controls sparse or dense

    # Transform is too slow on my side.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        module_file=module_file,
        train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
        eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps),
        instance_name="train",
        enable_cache=False)

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='target')],
        # Adding more slicing specs, e.g.
        # tfma.SlicingSpec(feature_keys=['var_0', 'var_1']), prevents Evaluator
        # from outputting a BLESSED status; this seems to be a bug in TFMA.
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'binary_accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.4}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        # baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config,
        instance_name="eval5")

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination={
                        'filesystem': {
                            'base_directory': pusher_target_param
                        }
                    })

    bulk_inferrer = BulkInferrer(
        examples=test_example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        # model_blessing=evaluator.outputs['blessing'],
        data_spec=bulk_inferrer_pb2.DataSpec(),
        model_spec=bulk_inferrer_pb2.ModelSpec(),
        instance_name="bulkInferrer")

    hello = component.HelloComponent(
        input_data=bulk_inferrer.outputs['inference_result'],
        instance_name='csvGen')

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root_param,
        components=[
            example_gen, statistics_gen, schema_gen, transform, trainer,
            model_resolver, evaluator, pusher, hello, test_example_gen,
            bulk_inferrer
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            os.path.join(pipeline_root, 'metadata.sqlite')),
        beam_pipeline_args=['--direct_num_workers=0'])
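
The runner flag suggests the same pipeline is compiled differently per orchestrator; a hedged usage sketch in which all names and paths are placeholders:

from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner
from tfx.orchestration.kubeflow import kubeflow_dag_runner

def run(runner='beam'):
    tfx_pipeline = generate_pipeline(
        pipeline_name='santander',  # placeholder name
        pipeline_root='/tmp/santander',  # placeholder root
        train_data='/tmp/data/train',  # placeholder data paths
        test_data='/tmp/data/test',
        train_steps=1000,
        eval_steps=100,
        pusher_target='/tmp/serving',
        runner=runner)
    if runner == 'kubeflow':
        # Compiles the pipeline into a KFP package rather than running it here.
        kubeflow_dag_runner.KubeflowDagRunner().run(tfx_pipeline)
    else:
        BeamDagRunner().run(tfx_pipeline)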
Example #8
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    # Brings training data into the pipeline or otherwise joins/converts
    # training data.
    training_example_gen = CsvExampleGen(input_base=training_data_root,
                                         instance_name='training_example_gen')

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(
        input_data=training_example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=training_example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        # Change threshold will be ignored if there is no
                        # baseline model resolved from MLMD (first run).
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(examples=training_example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    # Brings inference data into the pipeline.
    inference_example_gen = CsvExampleGen(
        input_base=inference_data_root,
        output_config=example_gen_pb2.Output(
            split_config=example_gen_pb2.SplitConfig(splits=[
                example_gen_pb2.SplitConfig.Split(name='unlabelled',
                                                  hash_buckets=100)
            ])),
        instance_name='inference_example_gen')

    # Performs offline batch inference over inference examples.
    bulk_inferrer = BulkInferrer(
        examples=inference_example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        # Empty data_spec.example_splits will result in using all splits.
        data_spec=bulk_inferrer_pb2.DataSpec(),
        model_spec=bulk_inferrer_pb2.ModelSpec())

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            training_example_gen, inference_example_gen, statistics_gen,
            schema_gen, example_validator, transform, trainer, model_resolver,
            evaluator, bulk_inferrer
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
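
If inference should cover only particular splits instead, the comment on data_spec above points at populating its example_splits field; a one-line sketch, assuming that field of bulk_inferrer_pb2.DataSpec:

# Restrict offline inference to the 'unlabelled' split only.
data_spec = bulk_inferrer_pb2.DataSpec(example_splits=['unlabelled'])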
Example #9
# TODO(step 4,step 8): Specify your GCS bucket name here.
#                      You have to use GCS to store output files when running a
#                      pipeline with Kubeflow Pipeline on GCP or when running a
#                      job using Dataflow.
GCS_BUCKET_NAME = 'my-test-project-249117-bank_risk'

# TODO(step 7,step 8,step 9): (Optional) Set your project ID and region to use
#                             GCP services including BigQuery, Dataflow and
#                             Cloud AI Platform.
# GCP_PROJECT_ID = 'YOUR_GCP_PROJECT_ID'
# GCP_REGION = 'YOUR_GCP_REGION'  # ex) 'us-central1'

PREPROCESSING_FN = 'preprocessing.preprocessing_fn'
TRAINER_FN = 'model.trainer_fn'

TRAIN_ARGS = trainer_pb2.TrainArgs(num_steps=100)
EVAL_ARGS = trainer_pb2.EvalArgs(num_steps=100)

# Beam args to use BigQueryExampleGen.
# TODO(step 7): (Optional) Uncomment here to provide GCP related configs for
#               BigQuery.
# BIG_QUERY_BEAM_PIPELINE_ARGS = [
#    '--project=' + GCP_PROJECT_ID,
#    ]

# The rate at which to sample rows from the Chicago Taxi dataset using BigQuery.
# The full taxi dataset is > 120M records.  In the interest of resource
# savings and time, we've set the default for this example to be much smaller.
# Feel free to crank it up and process the full dataset!
_query_sample_rate = 0.0001  # Generate a 0.01% random sample.
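
These constants are typically interpolated into the query handed to BigQueryExampleGen; a sketch modeled on the TFX taxi template's pseudo-random sampling (the table name and fingerprint expression follow that template and are assumptions here):

_query = """
    SELECT * FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
    WHERE (ABS(FARM_FINGERPRINT(unique_key)) / 0x7FFFFFFFFFFFFFFF)
      < {query_sample_rate}""".format(query_sample_rate=_query_sample_rate)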
Example #10
def init_components(data_dir,
                    module_file,
                    serving_model_dir=None,
                    ai_platform_training_args=None,
                    ai_platform_serving_args=None,
                    training_steps=1000,
                    eval_steps=200):
    """
    This function is to initialize tfx components
    """

    if serving_model_dir and ai_platform_serving_args:
        raise NotImplementedError(
            "Can't set ai_platform_serving_args and serving_model_dir at "
            "the same time. Choose one deployment option.")

    output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=99),
            example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=1),
        ]))

    example_gen = CsvExampleGen(input_base=data_dir, output_config=output)

    statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])

    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs["statistics"],
        infer_feature_shape=False,
    )

    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs["statistics"],
        schema=schema_gen.outputs["schema"],
    )

    transform = Transform(
        examples=example_gen.outputs["examples"],
        schema=schema_gen.outputs["schema"],
        module_file=module_file,
    )

    training_kwargs = {
        "module_file": module_file,
        "examples": transform.outputs["transformed_examples"],
        "schema": schema_gen.outputs["schema"],
        "transform_graph": transform.outputs['transform_graph'],
        "train_args": trainer_pb2.TrainArgs(num_steps=training_steps),
        "eval_args": trainer_pb2.EvalArgs(num_steps=eval_steps),
    }

    if ai_platform_training_args:

        training_kwargs.update({
            "custom_executor_spec":
            executor_spec.ExecutorClassSpec(
                aip_trainer_executor.GenericExecutor),
            "custom_config": {
                aip_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args  # noqa
            },
        })
    else:
        training_kwargs.update({
            "custom_executor_spec":
            executor_spec.ExecutorClassSpec(GenericExecutor)
        })

    trainer = Trainer(**training_kwargs)

    model_resolver = ResolverNode(
        instance_name="latest_blessed_model_resolver",
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing),
    )

    # model_resolver for tfx==0.30.0
    # model_resolver = tfx.dsl.Resolver(
    #   strategy_class=tfx.dsl.experimental.LatestBlessedModelStrategy,
    #   model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model),
    #   model_blessing=tfx.dsl.Channel(
    #       type=tfx.types.standard_artifacts.ModelBlessing)).with_id(
    #           'latest_blessed_model_resolver')

    # The book's eval_config might be wrong: the threshold has to be set
    # within tfma.MetricConfig() for each metric. This seems to have caused
    # the models not to be blessed.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key="consumer_disputed")],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=["product"]),
        ],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(class_name='ExampleCount'),
                tfma.MetricConfig(
                    class_name='BinaryAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={"value": 0.01},
                        ),
                    )),
                # tfma.MetricConfig(
                #     class_name='AUC',
                #     threshold=tfma.MetricThreshold(
                #         value_threshold=tfma.GenericValueThreshold(
                #             lower_bound={'value': 0.5}
                #             ),
                #         change_threshold=tfma.GenericChangeThreshold(
                #             direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                #             absolute={"value": 0.01},
                #         ),
                #         )
                #     ),
            ])
        ],
    )

    evaluator = Evaluator(
        examples=example_gen.outputs["examples"],
        model=trainer.outputs["model"],
        # baseline_model=model_resolver.outputs["model"],
        eval_config=eval_config,
    )

    pusher_kwargs = {
        "model": trainer.outputs["model"],
        "model_blessing": evaluator.outputs["blessing"],
    }

    if ai_platform_serving_args:

        pusher_kwargs.update({
            "custom_executor_spec":
            executor_spec.ExecutorClassSpec(aip_pusher_executor.Executor),
            "custom_config": {
                aip_pusher_executor.SERVING_ARGS_KEY:
                ai_platform_serving_args  # noqa
            },
        })
    elif serving_model_dir:
        pusher_kwargs.update({
            "push_destination":
            pusher_pb2.PushDestination(
                filesystem=pusher_pb2.PushDestination.Filesystem(
                    base_directory=serving_model_dir))
        })
    else:
        raise NotImplementedError(
            "Provide ai_platform_serving_args or serving_model_dir.")

    pusher = Pusher(**pusher_kwargs)

    # Compile all components into a list.
    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        model_resolver,
        evaluator,
        pusher,
    ]
    return components
Example #11
def create_pipeline_components(
    pipeline_root: Text,
    transform_module: Text,
    trainer_module: Text,
    bigquery_query: Text = '',
    csv_input_location: Text = '',
) -> List[base_node.BaseNode]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.
    bigquery_query: The query to get input data from BigQuery. If not empty,
      BigQueryExampleGen will be used.
    csv_input_location: The location of the input data directory.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """

    if bool(bigquery_query) == bool(csv_input_location):
        raise ValueError(
            'Exactly one example gen is expected. Please provide either '
            'bigquery_query or csv_input_location.')

    if bigquery_query:
        example_gen = big_query_example_gen_component.BigQueryExampleGen(
            query=bigquery_query)
    else:
        example_gen = components.CsvExampleGen(input_base=csv_input_location)

    statistics_gen = components.StatisticsGen(
        examples=example_gen.outputs['examples'])
    schema_gen = components.SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False)
    example_validator = components.ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    transform = components.Transform(examples=example_gen.outputs['examples'],
                                     schema=schema_gen.outputs['schema'],
                                     module_file=transform_module)
    latest_model_resolver = resolver.Resolver(
        strategy_class=latest_artifacts_resolver.LatestArtifactsResolver,
        model=channel.Channel(type=standard_artifacts.Model)).with_id(
            'Resolver.latest_model_resolver')
    trainer = components.Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(Executor),
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        base_model=latest_model_resolver.outputs['model'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module,
    )
    # Get the latest blessed model for model validation.
    model_resolver = resolver.Resolver(
        strategy_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=channel.Channel(type=standard_artifacts.Model),
        model_blessing=channel.Channel(
            type=standard_artifacts.ModelBlessing)).with_id(
                'Resolver.latest_blessed_model_resolver')
    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        metrics_specs=[
            tfma.MetricsSpec(
                metrics=[tfma.MetricConfig(class_name='ExampleCount')],
                thresholds={
                    'binary_accuracy':
                    tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ])
    evaluator = components.Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    pusher = components.Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return [
        example_gen, statistics_gen, schema_gen, example_validator, transform,
        latest_model_resolver, trainer, model_resolver, evaluator, pusher
    ]
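
In an end-to-end test, the component list is then wrapped into a pipeline object; a minimal sketch in which the underscore-prefixed constants are assumed to be defined by the test module:

from tfx.orchestration import pipeline

test_pipeline = pipeline.Pipeline(
    pipeline_name='chicago-taxi-test',  # placeholder name
    pipeline_root=_pipeline_root,  # assumed test constants, not shown above
    components=create_pipeline_components(
        pipeline_root=_pipeline_root,
        transform_module=_transform_module,
        trainer_module=_trainer_module,
        csv_input_location=_csv_input_location))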
Example #12
def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
    examples = external_input(csv_input_location)

    example_gen = CsvExampleGen(input=examples)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False,
    )
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'],
    )
    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=infer_schema.outputs['schema'],
        module_file=taxi_module_file,
    )
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
    )
    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[
            # Using signature 'eval' implies the use of an EvalSavedModel. To use
            # a serving model instead, remove the signature so it defaults to
            # 'serving_default', and add a label_key.
            tfma.ModelSpec(signature_name='eval')
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                # The metrics added here are in addition to those saved with the
                # model (assuming either a keras model or EvalSavedModel is used).
                # Any metrics added into the saved model (for example using
                # model.compile(..., metrics=[...]), etc) will be computed
                # automatically.
                metrics=[tfma.MetricConfig(class_name='ExampleCount')],
                # To add validation thresholds for metrics saved with the model,
                # add them keyed by metric name to the thresholds map.
                thresholds={
                    'binary_accuracy':
                    tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ],
        slicing_specs=[
            # An empty slice spec means the overall slice, i.e. the whole dataset.
            tfma.SlicingSpec(),
            # Data can be sliced along a feature column. In this case, data is
            # sliced along feature column trip_start_hour.
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ])

    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        eval_config=eval_config,
    )

    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=model_analyzer.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(str(pipeline.ROOT_PARAMETER),
                                            'model_serving'))),
    )

    return pipeline.Pipeline(
        pipeline_name='parameterized_tfx_oss',
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, pusher
        ],
        enable_cache=enable_cache,
    )
Example #13
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines."""
    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=data_root)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using TF-Learn
    # to train a model on Google Cloud AI Platform.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(Executor),
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )

    # Get the latest blessed model for model validation.
    model_resolver = resolver.Resolver(
        strategy_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(
            type=ModelBlessing)).with_id('latest_blessed_model_resolver')

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        # Change threshold will be ignored if there is no
                        # baseline model resolved from MLMD (first run).
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    # Performs infra validation of a candidate model to prevent an unservable
    # model from being pushed. To use the InfraValidator component, the
    # persistent volume and its claim used by the pipeline must have the
    # ReadWriteMany access mode.
    infra_validator = InfraValidator(
        model=trainer.outputs['model'],
        examples=example_gen.outputs['examples'],
        serving_spec=infra_validator_pb2.ServingSpec(
            tensorflow_serving=infra_validator_pb2.TensorFlowServing(
                tags=['latest']),
            kubernetes=infra_validator_pb2.KubernetesConfig()),
        request_spec=infra_validator_pb2.RequestSpec(
            tensorflow_serving=infra_validator_pb2.
            TensorFlowServingRequestSpec()))

    # Checks whether the model passed the validation steps and pushes the model
    # to Google Cloud AI Platform if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    infra_blessing=infra_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=[
                                 example_gen,
                                 statistics_gen,
                                 schema_gen,
                                 example_validator,
                                 transform,
                                 trainer,
                                 model_resolver,
                                 evaluator,
                                 infra_validator,
                                 pusher,
                             ],
                             beam_pipeline_args=beam_pipeline_args)
Example #14
def _create_pipeline():
  """Implements the chicago taxi pipeline with TFX."""
  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=_taxi_module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=_taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # This custom component serves as a bridge between the pipeline and human
  # model reviewers to enable a review-and-push workflow in the model
  # development cycle. It utilizes the Slack API to send a message to a
  # user-defined Slack channel with the model URI and waits for a go / no-go
  # decision from the same channel:
  #   * To approve the model, users reply to the thread sent out by the bot
  #     started by SlackComponent with 'lgtm' or 'approve'.
  #   * To reject the model, users reply to the thread sent out by the bot
  #     started by SlackComponent with 'decline' or 'reject'.
  slack_validator = SlackComponent(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      slack_token=_slack_token,
      channel_id=_channel_id,
      timeout_sec=3600,
  )

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if the check passed.
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=slack_validator.outputs.slack_blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, slack_validator, pusher
  ]
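
Since this variant returns a bare component list, the caller supplies the pipeline wrapper and runner; a sketch assuming local execution with BeamDagRunner (the name and root constants are placeholders):

from tfx.orchestration import pipeline as pipeline_module
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

# Wrap the component list into a logical pipeline and run it locally.
BeamDagRunner().run(
    pipeline_module.Pipeline(
        pipeline_name='chicago_taxi_slack',  # placeholder name
        pipeline_root=_pipeline_root,  # assumed module constant
        components=_create_pipeline()))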
Example #15
def create_pipeline(pipeline_name, pipeline_root, input_path,
                    tf_transform_file, tf_trainer_file, serving_model_basedir,
                    **kwargs):

    examples = tfrecord_input(input_path)

    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='tfrecord',
                                    pattern='data_tfrecord-*.gz'),
    ])  # todo add as airflow var

    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(
                name='train', hash_buckets=2),  # todo add as airflow var
            example_gen_pb2.SplitConfig.Split(
                name='eval', hash_buckets=1)  # todo add as airflow var
        ]))
    example_gen = ImportExampleGen(input_base=examples,
                                   input_config=input_config,
                                   output_config=output_config)

    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)

    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=tf_transform_file)

    trainer = Trainer(
        module_file=tf_trainer_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=[])  # todo add your slicing column
        ]))

    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)

    pusher = Pusher(model_export=trainer.outputs.output,
                    model_blessing=model_validator.outputs.blessing,
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_basedir)))

    pipeline = Pipeline(pipeline_name=pipeline_name,
                        pipeline_root=pipeline_root,
                        **kwargs)
    pipeline.components = [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]

    return pipeline
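For reference, a minimal sketch (not part of the original example) of handing a pipeline like the one above to Airflow, following the pattern of the TFX Airflow examples; the DAG settings and every path below are hypothetical placeholders.

import datetime
from tfx.orchestration.airflow.airflow_dag_runner import AirflowDagRunner
from tfx.orchestration.airflow.airflow_dag_runner import AirflowPipelineConfig

# Hypothetical Airflow DAG settings.
_airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2020, 1, 1),
}

# Module-level DAG object that the Airflow scheduler picks up.
DAG = AirflowDagRunner(AirflowPipelineConfig(_airflow_config)).run(
    create_pipeline(pipeline_name='tfrecord_pipeline',      # placeholder
                    pipeline_root='/tmp/pipeline_root',     # placeholder
                    input_path='/tmp/data',                 # placeholder
                    tf_transform_file='/tmp/transform.py',  # placeholder
                    tf_trainer_file='/tmp/trainer.py',      # placeholder
                    serving_model_basedir='/tmp/serving'))  # placeholder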
Example #16
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the Iris flowers pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that trains a model.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=2000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5))

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='variety')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10})))
            ])
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Performs infra validation of a candidate model to prevent an unservable
    # model from being pushed. This config will launch a model server of the
    # latest TensorFlow Serving image in a local docker engine.
    infra_validator = InfraValidator(
        model=trainer.outputs['model'],
        examples=example_gen.outputs['examples'],
        serving_spec=infra_validator_pb2.ServingSpec(
            tensorflow_serving=infra_validator_pb2.TensorFlowServing(
                tags=['latest']),
            local_docker=infra_validator_pb2.LocalDockerConfig()),
        request_spec=infra_validator_pb2.RequestSpec(
            tensorflow_serving=infra_validator_pb2.
            TensorFlowServingRequestSpec()))

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    infra_blessing=infra_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            trainer,
            model_resolver,
            evaluator,
            infra_validator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
Example #17
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the imdb sentiment analysis pipline with TFX."""
    output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=9),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))

    # Brings data into the pipeline.
    example_gen = CsvExampleGen(input_base=data_root, output_config=output)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that trains a model.
    trainer = Trainer(module_file=module_file,
                      examples=transform.outputs['transformed_examples'],
                      transform_graph=transform.outputs['transform_graph'],
                      schema=schema_gen.outputs['schema'],
                      train_args=trainer_pb2.TrainArgs(num_steps=500),
                      eval_args=trainer_pb2.EvalArgs(num_steps=200))

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='label')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='BinaryAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            # Increase this threshold when training on the
                            # complete dataset.
                            lower_bound={'value': 0.01}),
                        # Change threshold will be ignored if there is no
                        # baseline model resolved from MLMD (first run).
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-2})))
            ])
        ])

    evaluator = Evaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        model_resolver,
        evaluator,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        enable_cache=True,
        beam_pipeline_args=beam_pipeline_args)
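A minimal sketch (not from the original source) of invoking the pipeline above with the local Beam runner; every path and the worker count are hypothetical placeholders.

from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

BeamDagRunner().run(
    _create_pipeline(
        pipeline_name='imdb_sentiment',                  # placeholder
        pipeline_root='/tmp/pipeline_root',              # placeholder
        data_root='/tmp/imdb_data',                      # placeholder
        module_file='/tmp/imdb_utils.py',                # placeholder
        serving_model_dir='/tmp/serving_model',          # placeholder
        metadata_path='/tmp/metadata.db',                # placeholder
        beam_pipeline_args=['--direct_num_workers=2']))  # placeholder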
Example #18
0
def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
    examples = external_input(csv_input_location)

    example_gen = CsvExampleGen(input=examples)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False,
    )
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'],
    )
    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=infer_schema.outputs['schema'],
        module_file=taxi_module_file,
    )
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
    )
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]),
    )
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(str(pipeline.ROOT_PARAMETER),
                                            'model_serving'))),
    )

    return pipeline.Pipeline(
        pipeline_name='parameterized_tfx_oss',
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=enable_cache,
    )
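A hedged sketch (not part of the original example) of constructing the RuntimeParameter arguments above and compiling with the Kubeflow runner, loosely following the parameterized TFX sample; the parameter names, defaults, and paths are hypothetical.

from tfx.orchestration import data_types
from tfx.orchestration.kubeflow import kubeflow_dag_runner

# Hypothetical runtime parameters matching the function signature above.
_data_root_param = data_types.RuntimeParameter(
    name='data-root',
    default='gs://my-bucket/data',           # placeholder
    ptype=str)
_taxi_module_file_param = data_types.RuntimeParameter(
    name='module-file',
    default='gs://my-bucket/taxi_utils.py',  # placeholder
    ptype=str)

kubeflow_dag_runner.KubeflowDagRunner(
    config=kubeflow_dag_runner.KubeflowDagRunnerConfig()).run(
        _create_pipeline(
            pipeline_root='gs://my-bucket/pipeline_root',  # placeholder
            csv_input_location=_data_root_param,
            taxi_module_file=_taxi_module_file_param,
            enable_cache=True))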
Example #19
0
def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX."""

    query = """
          SELECT
            pickup_community_area,
            fare,
            EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
            EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
            EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
            UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
            pickup_latitude,
            pickup_longitude,
            dropoff_latitude,
            dropoff_longitude,
            trip_miles,
            pickup_census_tract,
            dropoff_census_tract,
            payment_type,
            company,
            trip_seconds,
            dropoff_community_area,
            tips
          FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
          WHERE RAND() < {}""".format(_query_sample_rate)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=_taxi_utils)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=_taxi_utils,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'cmle_training_args': _cmle_training_args})

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model_export=trainer.outputs.output,
                    model_blessing=model_validator.outputs.blessing,
                    custom_config={'cmle_serving_args': _cmle_serving_args},
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=_serving_model_dir)))

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
Example #20
0
def build_pipeline(timestamp: str) -> pipeline:
    """
    Gather tfx components and produce the output pipeline
    """

    conf['beam']['serving_model_dir'] = f"{conf['beam']['serving_model_dir']}/beam/OL{653374}/{timestamp}"
    conf['beam']['pipeline_root_dir'] = f"{conf['beam']['pipeline_root_dir']}/beam/OL{653374}/{timestamp}"
    conf['beam']['metadata_path'] = f"{conf['beam']['metadata_path']}/beam/OL{653374}"

    logging.info("Serving model dir is now %s", conf['beam']['serving_model_dir'])

    example_gen = ImportExampleGen(input_base=conf['train_data'])

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False
    )
    
    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        module_file=conf['trainer_module_file']
    )

    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema']
    )

    trainer = Trainer(
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        module_file=conf['trainer_module_file'],
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor), # define this to use run_fn instead of trainer_fn
        train_args=trainer_pb2.TrainArgs(num_steps=conf['train_args_steps']),
        eval_args=trainer_pb2.EvalArgs(num_steps=50)
    )

    metrics = [
        tfma.metrics.ExampleCount(name='example_count'),
        tfma.metrics.WeightedExampleCount(name='weighted_example_count'),
        tf.keras.metrics.BinaryCrossentropy(name='binary_crossentropy'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc', num_thresholds=10),
        tf.keras.metrics.AUC(
            name='auc_precision_recall', curve='PR', num_thresholds=100),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tfma.metrics.MeanLabel(name='mean_label'),
        tfma.metrics.MeanPrediction(name='mean_prediction'),
        tfma.metrics.Calibration(name='calibration'),
        tfma.metrics.ConfusionMatrixPlot(name='confusion_matrix_plot'),
        tfma.metrics.CalibrationPlot(name='calibration_plot')
    ]
    my_metrics_specs = tfma.metrics.specs_from_metrics(metrics)

    eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(label_key='label')
        ],
        # A tfma.MetricsSpec with per-metric thresholds (e.g. BinaryAccuracy
        # with value/change thresholds) could be used here instead of the
        # plain metric list above.
        metrics_specs=my_metrics_specs,
        slicing_specs=[
            tfma.SlicingSpec(),
        ])

    model_resolver = ResolverNode(
          instance_name='latest_blessed_model_resolver',
          resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
          model=Channel(type=Model),
          model_blessing=Channel(type=ModelBlessing))

    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=conf['beam']['serving_model_dir'])))

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        transform,
        example_validator,
        trainer,
        model_resolver,
        evaluator,
        pusher
    ]


    tfx_pipeline = pipeline.Pipeline(
        pipeline_name=conf['beam']['pipeline_name'],
        pipeline_root=conf['beam']['pipeline_root_dir'],
        components=components,
        enable_cache=False,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            conf['beam']['metadata_path']))

    return tfx_pipeline
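Since the Trainer above opts into GenericExecutor, its module file must expose a run_fn. A minimal sketch of that contract follows; _input_fn and _build_keras_model are hypothetical helpers, and only the FnArgs fields shown are assumed.

def run_fn(fn_args):
    """Sketch of the run_fn contract that GenericExecutor invokes."""
    train_ds = _input_fn(fn_args.train_files)  # hypothetical helper
    eval_ds = _input_fn(fn_args.eval_files)    # hypothetical helper
    model = _build_keras_model()               # hypothetical helper
    model.fit(
        train_ds,
        steps_per_epoch=fn_args.train_steps,
        validation_data=eval_ds,
        validation_steps=fn_args.eval_steps)
    # The essential requirement: export the trained model for downstream
    # components such as Evaluator and Pusher.
    model.save(fn_args.serving_model_dir, save_format='tf')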
Example #21
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  training_examples = external_input(training_data_root)

  # Brings training data into the pipeline or otherwise joins/converts
  # training data.
  training_example_gen = CsvExampleGen(
      input_base=training_examples, instance_name='training_example_gen')

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(
      input_data=training_example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=training_example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=training_example_gen.outputs['examples'],
      model_exports=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=training_example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  inference_examples = external_input(inference_data_root)

  # Brings inference data into the pipeline.
  inference_example_gen = CsvExampleGen(
      input_base=inference_examples,
      output_config=example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(
              splits=[example_gen_pb2.SplitConfig.Split(
                  name='unlabelled', hash_buckets=100)])),
      instance_name='inference_example_gen')

  # Performs offline batch inference over inference examples.
  bulk_inferrer = BulkInferrer(
      examples=inference_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      # Empty data_spec.example_splits will result in using all splits.
      data_spec=bulk_inferrer_pb2.DataSpec(),
      model_spec=bulk_inferrer_pb2.ModelSpec())

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          training_example_gen, inference_example_gen, statistics_gen,
          infer_schema, validate_stats, transform, trainer, model_analyzer,
          model_validator, bulk_inferrer
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
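For contrast with the empty DataSpec above, a hedged one-liner restricting inference to a single split, assuming DataSpec's repeated example_splits field; the split name is taken from the inference_example_gen config above.

# Restrict batch inference to the 'unlabelled' split; an empty
# example_splits list (as above) means every split is used.
data_spec = bulk_inferrer_pb2.DataSpec(example_splits=['unlabelled'])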
Example #22
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     trainer_module_file: Text, evaluator_module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the Penguin pipeline with TFX."""
    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=data_root)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # TODO(humichael): Handle applying transformation component in Milestone 3.

    # Uses user-provided Python function that trains a scikit-learn model.
    # Num_steps is not provided during evaluation because the scikit-learn model
    # loads and evaluates the entire test set at once.
    # TODO(b/159470716): Make schema optional in Trainer.
    trainer = Trainer(
        module_file=trainer_module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=2000),
        eval_args=trainer_pb2.EvalArgs())

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='species')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='Accuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10})))
            ])
        ])
    evaluator = Evaluator(module_file=evaluator_module_file,
                          examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            trainer,
            model_resolver,
            evaluator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args,
    )
Example #23
0
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

    The Trainer Executor invokes a trainer_fn callback function provided by
    the user via the module_file parameter. It then trains and exports a
    TensorFlow model with the tf.estimator returned by that function.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - examples: Examples used for training, must include 'train' and 'eval'
          splits.
        - transform_output: Optional input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.

    Returns:
      None

    Raises:
      ValueError: When neither or both of 'module_file' and 'trainer_fn'
        are present in 'exec_properties'.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # TODO(zhitaoli): Deprecate this in a future version.
    if exec_properties.get('custom_config', None):
      cmle_args = exec_properties.get('custom_config',
                                      {}).get('cmle_training_args')
      if cmle_args:
        executor_class_path = '.'.join([Executor.__module__, Executor.__name__])
        absl.logging.warn(
            'Passing \'cmle_training_args\' to trainer directly is deprecated, '
            'please use extension executor at '
            'tfx.extensions.google_cloud_ai_platform.trainer.executor instead')

        return runner.start_cmle_training(input_dict, output_dict,
                                          exec_properties, executor_class_path,
                                          cmle_args)

    trainer_fn = self._GetTrainerFn(exec_properties)

    # Set up training parameters
    train_files = [
        _all_files_pattern(
            artifact_utils.get_split_uri(input_dict['examples'], 'train'))
    ]
    transform_output = artifact_utils.get_single_uri(
        input_dict['transform_output']) if input_dict.get(
            'transform_output', None) else None
    eval_files = [
        _all_files_pattern(
            artifact_utils.get_split_uri(input_dict['examples'], 'eval'))
    ]
    schema_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict['schema']))

    train_args = trainer_pb2.TrainArgs()
    eval_args = trainer_pb2.EvalArgs()
    json_format.Parse(exec_properties['train_args'], train_args)
    json_format.Parse(exec_properties['eval_args'], eval_args)

    # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
    # num_steps=None.  Conversion of the proto to python will set the default
    # value of an int as 0 so modify the value here.  Tensorflow will raise an
    # error if num_steps <= 0.
    train_steps = train_args.num_steps or None
    eval_steps = eval_args.num_steps or None

    output_path = artifact_utils.get_single_uri(output_dict['output'])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    # Assemble warm start path if needed.
    warm_start_from = None
    if exec_properties.get('warm_starting') and exec_properties.get(
        'warm_start_from'):
      previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                        path_utils.SERVING_MODEL_DIR)
      if previous_model_dir and tf.io.gfile.exists(
          os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
        warm_start_from = previous_model_dir

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    hparams = _HParamWrapper(
        # A list of uris for train files.
        train_files=train_files,
        # An optional single uri for transform graph produced by TFT. Will be
        # None if not specified.
        transform_output=transform_output,
        # A single uri for the output directory of the serving model.
        serving_model_dir=serving_model_dir,
        # A list of uris for eval files.
        eval_files=eval_files,
        # A single uri for schema file.
        schema_file=schema_file,
        # Number of train steps.
        train_steps=train_steps,
        # Number of eval steps.
        eval_steps=eval_steps,
        # A single uri for the model directory to warm start from.
        warm_start_from=warm_start_from)

    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

    training_spec = trainer_fn(hparams, schema)

    # Train the model
    absl.logging.info('Training model.')
    tf.estimator.train_and_evaluate(training_spec['estimator'],
                                    training_spec['train_spec'],
                                    training_spec['eval_spec'])
    absl.logging.info('Training complete.  Model written to %s',
                      serving_model_dir)

    # Export an eval savedmodel for TFMA
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=training_spec['estimator'],
        export_dir_base=eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

    absl.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
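A minimal sketch of the trainer_fn contract this executor consumes; the returned keys are taken from the code above, while _build_estimator, _input_fn, and _eval_input_receiver_fn are hypothetical helpers.

def trainer_fn(hparams, schema):
    """Sketch of the trainer_fn contract used by the executor above."""
    estimator = _build_estimator(hparams)  # hypothetical helper
    return {
        'estimator': estimator,
        'train_spec': tf.estimator.TrainSpec(
            input_fn=lambda: _input_fn(hparams.train_files, schema),
            max_steps=hparams.train_steps),
        'eval_spec': tf.estimator.EvalSpec(
            input_fn=lambda: _input_fn(hparams.eval_files, schema),
            steps=hparams.eval_steps),
        # Used above to export the eval savedmodel for TFMA.
        'eval_input_receiver_fn': lambda: _eval_input_receiver_fn(schema),
    }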
Example #24
0
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
    example_gen = CsvExampleGen(input_base=csv_input_location)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'])
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=transform_module)
    latest_model_resolver = resolver.Resolver(
        strategy_class=latest_artifact_strategy.LatestArtifactStrategy,
        latest_model=Channel(type=Model)).with_id('latest_model_resolver')
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        base_model=latest_model_resolver.outputs['latest_model'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module,
    )
    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        metrics_specs=[
            tfma.MetricsSpec(
                metrics=[tfma.MetricConfig(class_name='ExampleCount')],
                thresholds={
                    'accuracy':
                    tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ])
    evaluator = Evaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          eval_config=eval_config)

    infra_validator = InfraValidator(
        model=trainer.outputs['model'],
        examples=example_gen.outputs['examples'],
        serving_spec=infra_validator_pb2.ServingSpec(
            tensorflow_serving=infra_validator_pb2.TensorFlowServing(
                tags=['latest']),
            kubernetes=infra_validator_pb2.KubernetesConfig()),
        request_spec=infra_validator_pb2.RequestSpec(
            tensorflow_serving=infra_validator_pb2.
            TensorFlowServingRequestSpec()))

    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        latest_model_resolver,
        trainer,
        evaluator,
        infra_validator,
        pusher,
    ]
Example #25
0
def _create_pipeline():
  """Implements the chicago taxi pipeline with TFX."""
  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=_taxi_module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=_taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if the check passed.
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name='chicago_taxi_simple',
      pipeline_root=_pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_db_root=_metadata_db_root,
      additional_pipeline_args={'logger_args': logger_overrides},
  )
Example #26
0
 def testTaxiPipelineNewStyleCompatibility(self):
     examples = external_input('/tmp/fake/path')
     example_gen = CsvExampleGen(input=examples)
     self.assertIs(example_gen.inputs['input'],
                   example_gen.inputs['input_base'])
     statistics_gen = StatisticsGen(
         examples=example_gen.outputs['examples'])
     self.assertIs(statistics_gen.inputs['examples'],
                   statistics_gen.inputs['input_data'])
     infer_schema = SchemaGen(
         statistics=statistics_gen.outputs['statistics'])
     self.assertIs(infer_schema.inputs['statistics'],
                   infer_schema.inputs['stats'])
     self.assertIs(infer_schema.outputs['schema'],
                   infer_schema.outputs['output'])
     validate_examples = ExampleValidator(
         statistics=statistics_gen.outputs['statistics'],
         schema=infer_schema.outputs['schema'])
     self.assertIs(validate_examples.inputs['statistics'],
                   validate_examples.inputs['stats'])
     self.assertIs(validate_examples.outputs['anomalies'],
                   validate_examples.outputs['output'])
     transform = Transform(examples=example_gen.outputs['examples'],
                           schema=infer_schema.outputs['schema'],
                           module_file='/tmp/fake/module/file')
     self.assertIs(transform.inputs['examples'],
                   transform.inputs['input_data'])
     self.assertIs(transform.outputs['transform_graph'],
                   transform.outputs['transform_output'])
     trainer = Trainer(
         module_file='/tmp/fake/module/file',
         transformed_examples=transform.outputs['transformed_examples'],
         schema=infer_schema.outputs['schema'],
         transform_graph=transform.outputs['transform_graph'],
         train_args=trainer_pb2.TrainArgs(num_steps=10000),
         eval_args=trainer_pb2.EvalArgs(num_steps=5000))
     self.assertIs(trainer.inputs['transform_graph'],
                   trainer.inputs['transform_output'])
     self.assertIs(trainer.outputs['model'], trainer.outputs['output'])
     evaluator = Evaluator(
         examples=example_gen.outputs['examples'],
         model=trainer.outputs['model'],
         feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
             evaluator_pb2.SingleSlicingSpec(
                 column_for_slicing=['trip_start_hour'])
         ]))
     self.assertIs(evaluator.inputs['model'],
                   evaluator.inputs['model_exports'])
     self.assertIs(evaluator.outputs['evaluation'],
                   evaluator.outputs['output'])
     model_validator = ModelValidator(
         examples=example_gen.outputs['examples'],
         model=trainer.outputs['model'])
     pusher = Pusher(model=trainer.outputs['output'],
                     model_blessing=model_validator.outputs['blessing'],
                     push_destination=pusher_pb2.PushDestination(
                         filesystem=pusher_pb2.PushDestination.Filesystem(
                             base_directory='/fake/serving/dir')))
     self.assertIs(pusher.inputs['model'], pusher.inputs['model_export'])
     self.assertIs(pusher.outputs['pushed_model'],
                   pusher.outputs['model_push'])
Example #27
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  #examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  #example_gen = CsvExampleGen(input=examples)
  example_gen = CsvExampleGen(input_base=data_root)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  schema_gen = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  example_validator = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Get the latest blessed model for model validation.
  model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # Uses TFMA to compute evaluation statistics over features of a model and
  # perform quality validation of a candidate model (compared to a baseline).
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ],
      metrics_specs=[
          tfma.MetricsSpec(
              thresholds={
                  'accuracy':
                      tfma.config.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={'value': 0.6}),
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': -1e-10}))
              })
      ])
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      baseline_model=model_resolver.outputs['model'],
      # Change threshold will be ignored if there is no baseline (first run).
      eval_config=eval_config)

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if the check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, example_validator, transform,
          trainer, model_resolver, evaluator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args)
Example #28
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using
    # TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
    )

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model_exports=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
        additional_pipeline_args={},
    )
Example #29
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     module_file: Text,
                     presto_config: presto_config_pb2.PrestoConnConfig,
                     query: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    # Brings data into the pipeline or otherwise joins/converts training data
    example_gen = PrestoExampleGen(presto_config, query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs['output'])

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(stats=statistics_gen.outputs['output'],
                                      schema=infer_schema.outputs['output'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(input_data=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['output'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['output'],
        transform_output=transform.outputs['transform_output'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model_exports=trainer.outputs['output'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['output'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model_export=trainer.outputs['output'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        additional_pipeline_args={},
    )
Example #30
0
def _create_pipeline():
    """Implements the Chicago Taxi pipeline with TFX and Kubeflow Pipelines."""

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=_query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=_taxi_utils)

    # Uses user-provided Python function that implements a model using TF-Learn
    # to train a model on Google Cloud AI Platform.
    try:
        from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
        # Train using a custom executor. This requires TFX >= 0.14.
        trainer = Trainer(
            executor_class=ai_platform_trainer_executor.Executor,
            module_file=_taxi_utils,
            transformed_examples=transform.outputs.transformed_examples,
            schema=infer_schema.outputs.output,
            transform_output=transform.outputs.transform_output,
            train_args=trainer_pb2.TrainArgs(num_steps=10000),
            eval_args=trainer_pb2.EvalArgs(num_steps=5000),
            custom_config={
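                # This literal key is what newer TFX releases expose as
                # ai_platform_trainer_executor.TRAINING_ARGS_KEY.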
                'ai_platform_training_args': _ai_platform_training_args
            })
    except ImportError:
        # Train using a deprecated flag.
        trainer = Trainer(
            module_file=_taxi_utils,
            transformed_examples=transform.outputs.transformed_examples,
            schema=infer_schema.outputs.output,
            transform_output=transform.outputs.transform_output,
            train_args=trainer_pb2.TrainArgs(num_steps=10000),
            eval_args=trainer_pb2.EvalArgs(num_steps=5000),
            custom_config={'cmle_training_args': _ai_platform_training_args})

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)

    # Checks whether the model passed the validation steps and pushes the model
    # to a destination if the check passed.
    try:
        from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
        # Deploy the model on Google Cloud AI Platform. This requires TFX >= 0.14.
        pusher = Pusher(executor_class=ai_platform_pusher_executor.Executor,
                        model_export=trainer.outputs.output,
                        model_blessing=model_validator.outputs.blessing,
                        custom_config={
                            'ai_platform_serving_args':
                            _ai_platform_serving_args
                        },
                        push_destination=pusher_pb2.PushDestination(
                            filesystem=pusher_pb2.PushDestination.Filesystem(
                                base_directory=_serving_model_dir)))
    except ImportError:
        # Deploy the model on Google Cloud AI Platform, using a deprecated flag.
        pusher = Pusher(
            model_export=trainer.outputs.output,
            model_blessing=model_validator.outputs.blessing,
            custom_config={'cmle_serving_args': _ai_platform_serving_args},
            push_destination=pusher_pb2.PushDestination(
                filesystem=pusher_pb2.PushDestination.Filesystem(
                    base_directory=_serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_pipeline_kubeflow',
        pipeline_root=_pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            'beam_pipeline_args': [
                '--runner=DataflowRunner',
                '--experiments=shuffle_mode=auto',
                '--project=' + _project_id,
                '--temp_location=' + os.path.join(_output_bucket, 'tmp'),
                '--region=' + _gcp_region,
            ],
            # Optional args:
            # 'tfx_image': custom docker image to use for components.
            # This is needed if the TFX package is not installed from an RC
            # or released version.
        },
        log_root='/var/tmp/tfx/logs',
    )
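
# A minimal usage sketch, not part of the original example: compiling the
# Kubeflow variant above into a Kubeflow Pipelines package could look like
# this. KubeflowDagRunner emits a workload archive to upload to a KFP
# cluster; the import path assumes a TFX release in the 0.14+ range.
if __name__ == '__main__':
    from tfx.orchestration.kubeflow import kubeflow_dag_runner  # pylint: disable=g-import-not-at-top

    kubeflow_dag_runner.KubeflowDagRunner().run(_create_pipeline())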