Example #1
0
    def setUp(self):
        """Create the importer nodes used as test inputs.

        Runs the parent fixture first, then stores three ``ImporterNode``
        instances on the test case: raw examples, a schema and a model. Each
        one points at a directory below ``self._testdata_root`` and is
        created with ``reimport=True``.
        """
        super(KubeflowDataflowIntegrationTest, self).setUp()

        testdata_root = self._testdata_root

        # Raw examples, annotated with the names of their splits.
        raw_examples_uri = os.path.join(testdata_root, 'csv_example_gen')
        self.raw_examples_importer = ImporterNode(
            instance_name='raw_examples',
            source_uri=raw_examples_uri,
            artifact_type=standard_artifacts.Examples,
            reimport=True,
            properties={'split_names': '["train", "eval"]'},
        )

        # Schema.
        schema_uri = os.path.join(testdata_root, 'schema_gen')
        self.schema_importer = ImporterNode(
            instance_name='schema',
            source_uri=schema_uri,
            artifact_type=standard_artifacts.Schema,
            reimport=True,
        )

        # Model, read from the 'previous' trainer output directory.
        model_uri = os.path.join(testdata_root, 'trainer', 'previous')
        self.model_1_importer = ImporterNode(
            instance_name='model_1',
            source_uri=model_uri,
            artifact_type=standard_artifacts.Model,
            reimport=True,
        )
Example #2
0
    def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
        """
        Builds the TFX components that push an existing model.

        The model is imported from `self.model_uri` and passed to a Pusher
        whose arguments and executor spec are supplied by the deployer step
        stored under `keys.TrainingSteps.DEPLOYER` in `self.steps_dict`.

        Args:
            config: Dict. A ZenML pipeline configuration. It is accepted for
             interface compatibility and is not read by this method.

        Returns:
            A list holding the model importer followed by the pusher.
        """
        # Bring the model located at model_uri into the pipeline.
        model_importer = ImporterNode(
            instance_name=GDPComponent.Trainer.name,
            source_uri=self.model_uri,
            artifact_type=standard_artifacts.Model)
        imported_model = model_importer.outputs.result

        # The deployer step decides how and where the model is pushed.
        deployer_step: BaseDeployerStep = self.steps_dict[
            keys.TrainingSteps.DEPLOYER]
        pusher_kwargs = deployer_step._build_pusher_args()
        pusher_spec = deployer_step._get_executor_spec()

        pusher_component = Pusher(
            model_export=imported_model,
            custom_executor_spec=pusher_spec,
            **pusher_kwargs)
        pusher_component = pusher_component.with_id(
            GDPComponent.Deployer.name)

        return [model_importer, pusher_component]
Example #3
0
    def setUp(self):
        """Create the importer nodes that supply test artifacts.

        Runs the parent fixture first. Every importer reads from a directory
        below ``self._testdata_root`` and is created with ``reimport=True``;
        the resulting nodes are stored as attributes on the test case.
        """
        super(KubeflowGCPIntegrationTest, self).setUp()

        def make_importer(name, artifact_type, *path_parts, **extra_kwargs):
            # Every importer shares the root directory and the reimport flag.
            return ImporterNode(
                instance_name=name,
                source_uri=os.path.join(self._testdata_root, *path_parts),
                artifact_type=artifact_type,
                reimport=True,
                **extra_kwargs)

        # Raw examples.
        self.raw_examples_importer = make_importer(
            'raw_examples', standard_artifacts.Examples, 'csv_example_gen',
            properties={'split_names': '["train", "eval"]'})

        # Transformed examples.
        self.transformed_examples_importer = make_importer(
            'transformed_examples', standard_artifacts.Examples,
            'transform', 'transformed_examples',
            properties={'split_names': '["train", "eval"]'})

        # Schema.
        self.schema_importer = make_importer(
            'schema', standard_artifacts.Schema, 'schema_gen')

        # Transform graph.
        self.transform_graph_importer = make_importer(
            'transform_graph', standard_artifacts.TransformGraph,
            'transform', 'transform_output')

        # Two models: the 'previous' and the 'current' trainer output.
        self.model_1_importer = make_importer(
            'model_1', standard_artifacts.Model, 'trainer', 'previous')
        self.model_2_importer = make_importer(
            'model_2', standard_artifacts.Model, 'trainer', 'current')

        # Model blessing, carrying a custom 'blessed' property set to 1.
        self.model_blessing_importer = make_importer(
            'model_blessing', standard_artifacts.ModelBlessing,
            'model_validator', 'blessed',
            custom_properties={'blessed': 1})
    def setUp(self):
        """Create importer nodes for artifacts of an earlier pipeline run.

        Runs the parent fixture first. All source URIs are located below
        ``self._intermediate_data_root``. The two example importers receive
        a list of URIs together with a matching ``split`` list; every
        importer is created with ``reimport=True``.
        """
        super(KubeflowGCPIntegrationTest, self).setUp()

        def under_root(relative_path):
            # Resolve a path relative to the intermediate data directory.
            return os.path.join(self._intermediate_data_root, relative_path)

        # Raw examples: one URI per split.
        raw_example_uris = [
            under_root('csv_example_gen/examples/test-pipeline/train'),
            under_root('csv_example_gen/examples/test-pipeline/eval'),
        ]
        self.raw_examples_importer = ImporterNode(
            instance_name='raw_examples',
            source_uri=raw_example_uris,
            artifact_type=standard_artifacts.Examples,
            reimport=True,
            split=['train', 'eval'])

        # Transformed examples: one URI per split.
        transformed_example_uris = [
            under_root('transform/transformed_examples/test-pipeline/train'),
            under_root('transform/transformed_examples/test-pipeline/eval'),
        ]
        self.transformed_examples_importer = ImporterNode(
            instance_name='transformed_examples',
            source_uri=transformed_example_uris,
            artifact_type=standard_artifacts.Examples,
            reimport=True,
            split=['train', 'eval'])

        # Schema.
        self.schema_importer = ImporterNode(
            instance_name='schema',
            source_uri=under_root('schema_gen/output/test-pipeline'),
            artifact_type=standard_artifacts.Schema,
            reimport=True)

        # Transform graph.
        self.transform_graph_importer = ImporterNode(
            instance_name='transform_graph',
            source_uri=under_root('transform/transform_output/test-pipeline'),
            artifact_type=standard_artifacts.TransformGraph,
            reimport=True)

        # Two models, taken from trainer output directories 1 and 2.
        self.model_1_importer = ImporterNode(
            instance_name='model_1',
            source_uri=under_root('trainer/output/test-pipeline/1'),
            artifact_type=standard_artifacts.Model,
            reimport=True)
        self.model_2_importer = ImporterNode(
            instance_name='model_2',
            source_uri=under_root('trainer/output/test-pipeline/2'),
            artifact_type=standard_artifacts.Model,
            reimport=True)

        # Model blessing.
        self.model_blessing_importer = ImporterNode(
            instance_name='model_blessing',
            source_uri=under_root('model_validator/blessing/test-pipeline'),
            artifact_type=standard_artifacts.ModelBlessing,
            reimport=True)
Example #5
0
    def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
        """
        Builds the TFX components of a batch inference pipeline.

        Data is produced by a DataGen component, the model is imported from
        `self.model_uri`, and both are fed to a BulkInferrer. Statistics and
        a schema are then generated for the predictions.

        Args:
            config: Dict. A ZenML configuration. The data and infer step
             settings are read from its pipeline steps section.

        Returns:
            A list of TFX components in this order: data generator, model
            importer, bulk inferrer, statistics generator, schema generator.
        """
        steps_config = config[keys.GlobalKeys.PIPELINE][
            keys.PipelineKeys.STEPS]

        data_step = steps_config[keys.InferSteps.DATA]
        data_gen = DataGen(
            name=self.datasource.name,
            source=data_step[StepKeys.SOURCE],
            source_args=data_step[StepKeys.ARGS])
        data_gen = data_gen.with_id(GDPComponent.DataGen.name)

        # TODO: [LOW] Handle timeseries. Disabled sketch of the intended
        # sequence transform:
        # if GlobalKeys. in train_config:
        #     schema = ImporterNode(instance_name='Schema',
        #                           source_uri=spec['schema_uri'],
        #                           artifact_type=standard_artifacts.Schema)
        #
        #     sequence_transform = SequenceTransform(
        #         examples=data.outputs.examples,
        #         schema=schema,
        #         config=train_config,
        #         instance_name=GDPComponent.SequenceTransform.name)
        #     datapoints = sequence_transform.outputs.output
        #     component_list.extend([schema, sequence_transform])

        # Bring the model located at model_uri into the pipeline.
        model_importer = ImporterNode(
            instance_name=GDPComponent.Trainer.name,
            source_uri=self.model_uri,
            artifact_type=standard_artifacts.Model)
        imported_model = model_importer.outputs.result

        infer_step = steps_config[keys.InferSteps.INFER]
        inferrer = BulkInferrer(
            source=infer_step[StepKeys.SOURCE],
            source_args=infer_step[StepKeys.ARGS],
            model=imported_model,
            examples=data_gen.outputs.examples,
            instance_name=GDPComponent.Inferrer.name)

        # Describe the predictions with statistics and a derived schema.
        stats_gen = StatisticsGen(examples=inferrer.outputs.predictions)
        stats_gen = stats_gen.with_id(GDPComponent.DataStatistics.name)

        schema_gen = SchemaGen(statistics=stats_gen.outputs.output)
        schema_gen = schema_gen.with_id(GDPComponent.DataSchema.name)

        return [data_gen, model_importer, inferrer, stats_gen, schema_gen]
Example #6
0
    def setUp(self):
        """Create importer nodes and module paths used as test inputs.

        Runs the parent fixture first. Every importer reads from a directory
        below ``self._testdata_root`` and is created with ``reimport=True``.
        The tuner module path is resolved below ``self._MODULE_ROOT``.
        """
        super(KubeflowGCPIntegrationTest, self).setUp()

        def make_importer(name, artifact_type, *path_parts, **extra_kwargs):
            # Every importer shares the root directory and the reimport flag.
            return ImporterNode(
                instance_name=name,
                source_uri=os.path.join(self._testdata_root, *path_parts),
                artifact_type=artifact_type,
                reimport=True,
                **extra_kwargs)

        # Transformed examples.
        self.transformed_examples_importer = make_importer(
            'transformed_examples', standard_artifacts.Examples,
            'transform', 'transformed_examples',
            properties={'split_names': '["train", "eval"]'})

        # Schema.
        self.schema_importer = make_importer(
            'schema', standard_artifacts.Schema, 'schema_gen')

        # Transform graph.
        self.transform_graph_importer = make_importer(
            'transform_graph', standard_artifacts.TransformGraph,
            'transform', 'transform_graph')

        # Two models: the 'previous' and the 'current' trainer output.
        self.model_1_importer = make_importer(
            'model_1', standard_artifacts.Model, 'trainer', 'previous')
        self.model_2_importer = make_importer(
            'model_2', standard_artifacts.Model, 'trainer', 'current')

        # Model blessing, carrying a custom 'blessed' property set to 1.
        self.model_blessing_importer = make_importer(
            'model_blessing', standard_artifacts.ModelBlessing,
            'model_validator', 'blessed',
            custom_properties={'blessed': 1})

        ### Test data and modules for native Keras trainer and tuner.
        self._penguin_tuner_module = os.path.join(
            self._MODULE_ROOT, 'tuner_module.py')
        self.penguin_examples_importer = make_importer(
            'penguin_examples', standard_artifacts.Examples,
            'penguin', 'data',
            properties={'split_names': '["train", "eval"]'})
        self.penguin_schema_importer = make_importer(
            'penguin_schema', standard_artifacts.Schema,
            'penguin', 'schema')
Example #7
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     user_schema_path: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Assembles the chicago taxi pipeline using a user-provided schema.

    Args:
      pipeline_name: Name given to the resulting pipeline.
      pipeline_root: Root location passed on to the pipeline.
      data_root: Location of the CSV input handed to the example generator.
      user_schema_path: URI from which the user-provided schema is imported.
      module_file: Module file shared by the Transform and Trainer components.
      serving_model_dir: Base directory the pusher writes the model to.
      metadata_path: Path used for the SQLite metadata connection config.
      direct_num_workers: Value of the `--direct_num_workers` Beam argument.

    Returns:
      A `pipeline.Pipeline` containing all components created here.
    """
    csv_input = external_input(data_root)

    # Ingests the CSV data and exposes it as examples.
    example_gen = CsvExampleGen(input=csv_input)
    examples_channel = example_gen.outputs['examples']

    # Statistics over the examples, for visualization and validation.
    statistics_gen = StatisticsGen(examples=examples_channel)
    statistics_channel = statistics_gen.outputs['statistics']

    # The schema supplied by the user is imported instead of being inferred.
    user_schema_importer = ImporterNode(
        instance_name='import_user_schema',
        source_uri=user_schema_path,
        artifact_type=Schema)
    user_schema = user_schema_importer.outputs['result']

    # A schema is still inferred from the newest statistics so the user can
    # compare it against the provided one and update that one if desired.
    # Downstream components keep using the user-provided schema.
    infer_schema = SchemaGen(
        statistics=statistics_channel, infer_feature_shape=False)

    # Anomaly detection of the statistics against the user schema.
    validate_stats = ExampleValidator(
        statistics=statistics_channel, schema=user_schema)

    # Transformations and feature engineering for training and serving.
    transform = Transform(
        examples=examples_channel,
        schema=user_schema,
        module_file=module_file)

    # Trains the model defined by the user-provided module file.
    train_args = trainer_pb2.TrainArgs(num_steps=10000)
    eval_args = trainer_pb2.EvalArgs(num_steps=5000)
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=user_schema,
        transform_graph=transform.outputs['transform_graph'],
        train_args=train_args,
        eval_args=eval_args)
    trained_model = trainer.outputs['model']

    # Evaluation statistics of the model, sliced by trip start hour.
    slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
    ])
    model_analyzer = Evaluator(
        examples=examples_channel,
        model_exports=trained_model,
        feature_slicing_spec=slicing_spec)

    # Quality validation of the candidate model.
    model_validator = ModelValidator(
        examples=examples_channel, model=trained_model)

    # Pushes the model to the serving directory, gated by the blessing.
    push_destination = pusher_pb2.PushDestination(
        filesystem=pusher_pb2.PushDestination.Filesystem(
            base_directory=serving_model_dir))
    pusher = Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs['blessing'],
        push_destination=push_destination)

    all_components = [
        example_gen, statistics_gen, user_schema_importer, infer_schema,
        validate_stats, transform, trainer, model_analyzer,
        model_validator, pusher
    ]
    connection_config = metadata.sqlite_metadata_connection_config(
        metadata_path)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=all_components,
        enable_cache=True,
        metadata_connection_config=connection_config,
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
        additional_pipeline_args={},
    )
Example #8
0
def create_pipeline(pipeline_name: Text,
                    pipeline_root: Text,
                    dataset_name: Text,
                    train_steps: data_types.RuntimeParameter,
                    eval_steps: data_types.RuntimeParameter,
                    accuracy_threshold: data_types.RuntimeParameter,
                    ai_platform_training_args: Dict[Text, Text],
                    ai_platform_serving_args: Dict[Text, Text],
                    beam_pipeline_args: List[Text],
                    model_regisrty_uri: Text,
                    enable_cache: Optional[bool] = False) -> pipeline.Pipeline:
    """Assembles the online news pipeline with TFX.

    Args:
      pipeline_name: Name of the pipeline; also the sub-directory of the
        model registry the model is pushed to.
      pipeline_root: Root location passed on to the pipeline.
      dataset_name: Dataset name used to generate the BigQuery source query.
      train_steps: Number of training steps given to the trainer.
      eval_steps: Number of evaluation steps given to the trainer.
      accuracy_threshold: Threshold passed to the accuracy model validator.
      ai_platform_training_args: Forwarded to the trainer in `custom_config`.
      ai_platform_serving_args: Only referenced by the disabled AI Platform
        pusher below; currently unused.
      beam_pipeline_args: Beam arguments passed on to the pipeline.
      model_regisrty_uri: Base directory of the model registry. The
        misspelled name is part of the existing interface.
      enable_cache: Passed on to the pipeline.

    Returns:
      A `pipeline.Pipeline` containing all components created here.
    """

    # Dataset, table and/or 'where conditions' can be passed as pipeline args.
    source_query = sql_utils.generate_source_query(dataset_name=dataset_name)

    # Reads the data from BigQuery.
    example_gen = tfx.components.BigQueryExampleGen(query=source_query)
    raw_examples = example_gen.outputs.examples

    # Statistics over the data, for visualization and example validation.
    statistics_gen = tfx.components.StatisticsGen(input_data=raw_examples)

    # The raw schema is imported from a local directory.
    schema_importer = ImporterNode(
        instance_name='RawSchemaImporter',
        source_uri=RAW_SCHEMA_DIR,
        artifact_type=Schema,
    )
    raw_schema = schema_importer.outputs.result

    # Anomaly detection of the statistics against the schema.
    validate_stats = tfx.components.ExampleValidator(
        stats=statistics_gen.outputs.output, schema=raw_schema)

    # Transformations and feature engineering for training and serving.
    transform = tfx.components.Transform(
        input_data=raw_examples,
        schema=raw_schema,
        module_file=TRANSFORM_MODULE_FILE)

    # Resolves the latest blessed model; it serves as base and baseline.
    resolver_class = latest_blessed_model_resolver.LatestBlessedModelResolver
    latest_model_resolver = tfx.components.ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=resolver_class,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))
    latest_blessed_model = latest_model_resolver.outputs.model

    # Trains and saves the model for evaluation and serving. The generic
    # trainer executor is used; the AI Platform alternative is disabled:
    #         custom_executor_spec=executor_spec.ExecutorClassSpec(
    #             ai_platform_trainer_executor.GenericExecutor),
    trainer_spec = executor_spec.ExecutorClassSpec(
        trainer_executor.GenericExecutor)
    trainer = tfx.components.Trainer(
        custom_executor_spec=trainer_spec,
        module_file=TRAIN_MODULE_FILE,
        transformed_examples=transform.outputs.transformed_examples,
        schema=raw_schema,
        transform_output=transform.outputs.transform_output,
        base_model=latest_blessed_model,
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config={'ai_platform_training_args': ai_platform_training_args})
    trained_model = trainer.outputs.model

    # Evaluation statistics of the model, compared with the baseline model.
    model_evaluator = tfx.components.Evaluator(
        examples=raw_examples,
        model=trained_model,
        baseline_model=latest_blessed_model,
        eval_config=helper.get_eval_config())

    # Validates the model with the custom AccuracyModelValidator component.
    model_validator = AccuracyModelValidator(
        eval_results=model_evaluator.outputs.output,
        model=trained_model,
        accuracy_threshold=accuracy_threshold,
        slice_accuracy_tolerance=0.15,
    )

    # Disabled AI Platform pusher, kept for reference:
    #     pusher = tfx.components.Pusher(
    #         custom_executor_spec=executor_spec.ExecutorClassSpec(
    #             ai_platform_pusher_executor.Executor),
    #         model_export=trainer.outputs.output,
    #         model_blessing=model_evaluator.outputs.blessing,
    #         #model_blessing=model_validator.outputs.blessing,
    #         custom_config={'ai_platform_serving_args': ai_platform_serving_args}
    #     )

    # Registers the model on the filesystem under <registry>/<pipeline_name>,
    # using the blessing of the accuracy validator rather than the
    # evaluator's blessing.
    registry_dir = os.path.join(model_regisrty_uri, pipeline_name)
    registry_destination = tfx.proto.pusher_pb2.PushDestination(
        filesystem=tfx.proto.pusher_pb2.PushDestination.Filesystem(
            base_directory=registry_dir))
    register = tfx.components.Pusher(
        model=trained_model,
        model_blessing=model_validator.outputs.blessing,
        push_destination=registry_destination)

    all_components = [
        example_gen,
        statistics_gen,
        schema_importer,
        validate_stats,
        latest_model_resolver,
        transform,
        trainer,
        model_evaluator,
        model_validator,
        register,
    ]

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=all_components,
        enable_cache=enable_cache,
        beam_pipeline_args=beam_pipeline_args)