Exemple #1
0
    def testIsImporter(self):
        importer = ImporterNode(instance_name="import_schema",
                                source_uri="uri/to/schema",
                                artifact_type=standard_artifacts.Schema)
        self.assertTrue(compiler_utils.is_importer(importer))

        example_gen = CsvExampleGen(input=external_input("data_path"))
        self.assertFalse(compiler_utils.is_importer(example_gen))
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     user_schema_path: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Import user-provided schema.
    user_schema_importer = ImporterNode(instance_name='import_user_schema',
                                        source_uri=user_schema_path,
                                        artifact_type=Schema)

    # Generates schema based on statistics files. Even we use user-provided schema
    # in downstream components, we still want to generate the schema of the newest
    # data so that user can compare and optionally update the schema to use.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=user_schema_importer.outputs['result'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=user_schema_importer.outputs['result'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=user_schema_importer.outputs['result'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute a evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, user_schema_importer, schema_gen,
            example_validator, transform, trainer, model_resolver, evaluator,
            pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
Exemple #3
0
def create_test_pipeline():
    """Builds an Iris example pipeline with slight changes."""
    pipeline_name = "iris"
    iris_root = "iris_root"
    serving_model_dir = os.path.join(iris_root, "serving_model", pipeline_name)
    tfx_root = "tfx_root"
    data_path = os.path.join(tfx_root, "data_path")
    pipeline_root = os.path.join(tfx_root, "pipelines", pipeline_name)

    example_gen = CsvExampleGen(input_base=data_path)

    statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])

    importer = ImporterNode(instance_name="my_importer",
                            source_uri="m/y/u/r/i",
                            properties={
                                "split_names": "['train', 'eval']",
                            },
                            custom_properties={
                                "int_custom_property": 42,
                                "str_custom_property": "42",
                            },
                            artifact_type=standard_artifacts.Examples)

    schema_gen = SchemaGen(statistics=statistics_gen.outputs["statistics"],
                           infer_feature_shape=True)

    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs["statistics"],
        schema=schema_gen.outputs["schema"])

    trainer = Trainer(
        # Use RuntimeParameter as module_file to test out RuntimeParameter in
        # compiler.
        module_file=data_types.RuntimeParameter(name="module_file",
                                                default=os.path.join(
                                                    iris_root,
                                                    "iris_utils.py"),
                                                ptype=str),
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=example_gen.outputs["examples"],
        schema=schema_gen.outputs["schema"],
        train_args=trainer_pb2.TrainArgs(num_steps=2000),
        # Attaching `TrainerArgs` as platform config is not sensible practice,
        # but is only for testing purpose.
        eval_args=trainer_pb2.EvalArgs(num_steps=5)).with_platform_config(
            config=trainer_pb2.TrainArgs(num_steps=2000))

    model_resolver = ResolverNode(
        instance_name="latest_blessed_model_resolver",
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        baseline_model=Channel(type=standard_artifacts.Model,
                               producer_component_id="Trainer"),
        # Cannot add producer_component_id="Evaluator" for model_blessing as it
        # raises "producer component should have already been compiled" error.
        model_blessing=Channel(type=standard_artifacts.ModelBlessing))

    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name="eval")],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    "sparse_categorical_accuracy":
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={"value": 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={"value": -1e-10}))
                })
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs["examples"],
        model=trainer.outputs["model"],
        baseline_model=model_resolver.outputs["baseline_model"],
        eval_config=eval_config)

    pusher = Pusher(model=trainer.outputs["model"],
                    model_blessing=evaluator.outputs["blessing"],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            importer,
            schema_gen,
            example_validator,
            trainer,
            model_resolver,
            evaluator,
            pusher,
        ],
        enable_cache=False,
        beam_pipeline_args=["--my_testing_beam_pipeline_args=bar"],
        # Attaching `TrainerArgs` as platform config is not sensible practice,
        # but is only for testing purpose.
        platform_config=trainer_pb2.TrainArgs(num_steps=2000),
        execution_mode=pipeline.ExecutionMode.ASYNC)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     user_schema_path: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Import user-provided schema.
    user_schema_importer = ImporterNode(instance_name='import_user_schema',
                                        source_uri=user_schema_path,
                                        artifact_type=Schema)

    # Generates schema based on statistics files. Even we use user-provided schema
    # in downstream components, we still want to generate the schema of the newest
    # data so that user can compare and optionally update the schema to use.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=user_schema_importer.outputs['result'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=user_schema_importer.outputs['result'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=user_schema_importer.outputs['result'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Uses TFMA to compute a evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model_exports=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, user_schema_importer, infer_schema,
            validate_stats, transform, trainer, model_analyzer,
            model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
Exemple #5
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    serving_model_uri: Text,
    data_root_uri: Union[Text, data_types.RuntimeParameter],
    schema_folder_uri: Union[Text, data_types.RuntimeParameter],
    train_steps: Union[int, data_types.RuntimeParameter],
    eval_steps: Union[int, data_types.RuntimeParameter],
    beam_pipeline_args: List[Text],
    trainer_custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
    trainer_custom_config: Optional[Dict[Text, Any]] = None,
    enable_tuning: Optional[bool] = False,
    enable_cache: Optional[bool] = False,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None
) -> pipeline.Pipeline:
    """Trains and deploys the Keras Covertype Classifier with TFX and AI Platform Pipelines."""

    # Brings data into the pipeline and splits the data into training and eval splits
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=4),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))

    examplegen = CsvExampleGen(input_base=data_root_uri)

    # Computes statistics over data for visualization and example validation.
    statisticsgen = StatisticsGen(examples=examplegen.outputs.examples)

    # Generates schema based on statistics files. Even though, we use user-provided schema
    # we still want to generate the schema of the newest data for tracking and comparison
    schemagen = SchemaGen(statistics=statisticsgen.outputs.statistics)

    # Import a user-provided schema
    import_schema = ImporterNode(
        instance_name='import_user_schema',
        #source_uri=SCHEMA_FOLDER,
        source_uri=schema_folder_uri,
        artifact_type=Schema)

    # Performs anomaly detection based on statistics and data schema.
    examplevalidator = ExampleValidator(
        statistics=statisticsgen.outputs.statistics,
        schema=import_schema.outputs.result)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=examplegen.outputs.examples,
                          schema=import_schema.outputs.result,
                          module_file=TRANSFORM_MODULE_FILE)

    # Tunes the hyperparameters for model training based on user-provided Python
    # function. Note that once the hyperparameters are tuned, you can drop the
    # Tuner component from pipeline and feed Trainer with tuned hyperparameters.
    if enable_tuning:
        # The Tuner component launches 1 AI Platform Training job for flock management.
        # For example, 3 workers (defined by num_parallel_trials) in the flock
        # management AI Platform Training job, each runs Tuner.Executor.
        tuner = Tuner(
            module_file=TRAIN_MODULE_FILE,
            examples=transform.outputs.transformed_examples,
            transform_graph=transform.outputs.transform_graph,
            train_args={'num_steps': train_steps},
            eval_args={'num_steps': eval_steps},
            tune_args=tuner_pb2.TuneArgs(
                # num_parallel_trials=3 means that 3 search loops are running in parallel.
                num_parallel_trials=3),
            custom_config=custom_config)

    # Trains the model using a user provided trainer function.
    trainer = Trainer(
        custom_executor_spec=trainer_custom_executor_spec,
        module_file=TRAIN_MODULE_FILE,
        transformed_examples=transform.outputs.transformed_examples,
        schema=import_schema.outputs.result,
        transform_graph=transform.outputs.transform_graph,
        hyperparameters=(tuner.outputs.best_hyperparameters
                         if enable_tuning else None),
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config=trainer_custom_config)

    # Get the latest blessed model for model validation.
    resolver = ResolverNode(instance_name='latest_blessed_model_resolver',
                            resolver_class=latest_blessed_model_resolver.
                            LatestBlessedModelResolver,
                            model=Channel(type=Model),
                            model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute a evaluation statistics over features of a model.
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.5},
                                                   upper_bound={'value':
                                                                0.99}), )

    metrics_specs = tfma.MetricsSpec(metrics=[
        tfma.MetricConfig(class_name='SparseCategoricalAccuracy',
                          threshold=accuracy_threshold),
        tfma.MetricConfig(class_name='ExampleCount')
    ])

    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='Cover_Type')],
        metrics_specs=[metrics_specs],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['Wilderness_Area'])
        ])

    evaluator = Evaluator(examples=examplegen.outputs.examples,
                          model=trainer.outputs.model,
                          baseline_model=resolver.outputs.model,
                          eval_config=eval_config)

    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_uri)))

    components = [
        examplegen, statisticsgen, schemagen, import_schema, examplevalidator,
        transform, trainer, resolver, evaluator, pusher
    ]

    if enable_tuning:
        components.append(tuner)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=enable_cache,
        beam_pipeline_args=beam_pipeline_args,
        metadata_connection_config=metadata_connection_config)
Exemple #6
0
def create_pipeline(pipeline_name: Text,
                    pipeline_root: Text,
                    data_root_uri: data_types.RuntimeParameter,
                    train_steps: data_types.RuntimeParameter,
                    eval_steps: data_types.RuntimeParameter,
                    enable_tuning: bool,
                    ai_platform_training_args: Dict[Text, Text],
                    ai_platform_serving_args: Dict[Text, Text],
                    beam_pipeline_args: List[Text],
                    enable_cache: Optional[bool] = False) -> pipeline.Pipeline:
    """Trains and deploys the Keras Covertype Classifier with TFX and Kubeflow Pipeline on Google Cloud.
  Args:
    pipeline_name: name of the TFX pipeline being created.
    pipeline_root: root directory of the pipeline. Should be a valid GCS path.
    data_root_uri: uri of the dataset.
    train_steps: runtime parameter for number of model training steps for the Trainer component.
    eval_steps: runtime parameter for number of model evaluation steps for the Trainer component.
    enable_tuning: If True, the hyperparameter tuning through CloudTuner is
      enabled.    
    ai_platform_training_args: Args of CAIP training job. Please refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job
      for detailed description.
    ai_platform_serving_args: Args of CAIP model deployment. Please refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.models
      for detailed description.
    beam_pipeline_args: Optional list of beam pipeline options. Please refer to
      https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options.
      When this argument is not provided, the default is to use GCP
      DataflowRunner with 50GB disk size as specified in this function. If an
      empty list is passed in, default specified by Beam will be used, which can
      be found at
      https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options
    enable_cache: Optional boolean
  Returns:
    A TFX pipeline object.
  """

    # Brings data into the pipeline and splits the data into training and eval splits
    output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=4),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))

    examplegen = CsvExampleGen(input_base=data_root_uri, output_config=output)

    # Computes statistics over data for visualization and example validation.
    statisticsgen = StatisticsGen(examples=examplegen.outputs.examples)

    # Generates schema based on statistics files. Even though, we use user-provided schema
    # we still want to generate the schema of the newest data for tracking and comparison
    schemagen = SchemaGen(statistics=statisticsgen.outputs.statistics)

    # Import a user-provided schema
    import_schema = ImporterNode(instance_name='import_user_schema',
                                 source_uri=SCHEMA_FOLDER,
                                 artifact_type=Schema)

    # Performs anomaly detection based on statistics and data schema.
    examplevalidator = ExampleValidator(
        statistics=statisticsgen.outputs.statistics,
        schema=import_schema.outputs.result)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=examplegen.outputs.examples,
                          schema=import_schema.outputs.result,
                          module_file=TRANSFORM_MODULE_FILE)

    # Tunes the hyperparameters for model training based on user-provided Python
    # function. Note that once the hyperparameters are tuned, you can drop the
    # Tuner component from pipeline and feed Trainer with tuned hyperparameters.
    if enable_tuning:
        # The Tuner component launches 1 AI Platform Training job for flock management.
        # For example, 3 workers (defined by num_parallel_trials) in the flock
        # management AI Platform Training job, each runs Tuner.Executor.
        tuner = Tuner(
            module_file=TRAIN_MODULE_FILE,
            examples=transform.outputs.transformed_examples,
            transform_graph=transform.outputs.transform_graph,
            train_args={'num_steps': train_steps},
            eval_args={'num_steps': eval_steps},
            tune_args=tuner_pb2.TuneArgs(
                # num_parallel_trials=3 means that 3 search loops are running in parallel.
                num_parallel_trials=3),
            custom_config={
                # Configures Cloud AI Platform-specific configs. For details, see
                # https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#traininginput.
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args
            })

    # Trains the model using a user provided trainer function.
    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.GenericExecutor),
        module_file=TRAIN_MODULE_FILE,
        transformed_examples=transform.outputs.transformed_examples,
        schema=import_schema.outputs.result,
        transform_graph=transform.outputs.transform_graph,
        hyperparameters=(tuner.outputs.best_hyperparameters
                         if enable_tuning else None),
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config={'ai_platform_training_args': ai_platform_training_args})

    # Get the latest blessed model for model validation.
    resolver = ResolverNode(instance_name='latest_blessed_model_resolver',
                            resolver_class=latest_blessed_model_resolver.
                            LatestBlessedModelResolver,
                            model=Channel(type=Model),
                            model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute a evaluation statistics over features of a model.
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.5},
                                                   upper_bound={'value':
                                                                0.99}), )

    metrics_specs = tfma.MetricsSpec(metrics=[
        tfma.MetricConfig(class_name='SparseCategoricalAccuracy',
                          threshold=accuracy_threshold),
        tfma.MetricConfig(class_name='ExampleCount')
    ])

    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='Cover_Type')],
        metrics_specs=[metrics_specs],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['Wilderness_Area'])
        ])

    evaluator = Evaluator(examples=examplegen.outputs.examples,
                          model=trainer.outputs.model,
                          baseline_model=resolver.outputs.model,
                          eval_config=eval_config)

    # Validate model can be loaded and queried in sand-boxed environment
    # mirroring production.
    serving_config = infra_validator_pb2.ServingSpec(
        tensorflow_serving=infra_validator_pb2.TensorFlowServing(
            tags=['latest']),
        kubernetes=infra_validator_pb2.KubernetesConfig(),
    )

    validation_config = infra_validator_pb2.ValidationSpec(
        max_loading_time_seconds=60,
        num_tries=3,
    )

    request_config = infra_validator_pb2.RequestSpec(
        tensorflow_serving=infra_validator_pb2.TensorFlowServingRequestSpec(),
        num_examples=3,
    )

    infravalidator = InfraValidator(
        model=trainer.outputs.model,
        examples=examplegen.outputs.examples,
        serving_spec=serving_config,
        validation_spec=validation_config,
        request_spec=request_config,
    )

    # Checks whether the model passed the validation steps and pushes the model
    # to CAIP Prediction if checks are passed.
    pusher = Pusher(custom_executor_spec=executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor),
                    model=trainer.outputs.model,
                    model_blessing=evaluator.outputs.blessing,
                    infra_blessing=infravalidator.outputs.blessing,
                    custom_config={
                        ai_platform_pusher_executor.SERVING_ARGS_KEY:
                        ai_platform_serving_args
                    })

    components = [
        examplegen, statisticsgen, schemagen, import_schema, examplevalidator,
        transform, trainer, resolver, evaluator, infravalidator, pusher
    ]

    if enable_tuning:
        components.append(tuner)

    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=components,
                             enable_cache=enable_cache,
                             beam_pipeline_args=beam_pipeline_args)
Exemple #7
0
def create_pipeline(pipeline_name: Text,
                    pipeline_root: Text,
                    data_root_uri: data_types.RuntimeParameter,
                    train_steps: data_types.RuntimeParameter,
                    eval_steps: data_types.RuntimeParameter,
                    ai_platform_training_args: Dict[Text, Text],
                    ai_platform_serving_args: Dict[Text, Text],
                    beam_pipeline_args: List[Text],
                    enable_cache: Optional[bool] = False) -> pipeline.Pipeline:
    """Trains and deploys the Covertype classifier."""

    # Brings data into the pipeline and splits the data into training and eval splits
    examples = external_input(data_root_uri)
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=4),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))
    generate_examples = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    generate_statistics = StatisticsGen(
        examples=generate_examples.outputs.examples)

    # Import a user-provided schema
    import_schema = ImporterNode(instance_name='import_user_schema',
                                 source_uri=SCHEMA_FOLDER,
                                 artifact_type=Schema)

    # Generates schema based on statistics files.Even though, we use user-provided schema
    # we still want to generate the schema of the newest data for tracking and comparison
    infer_schema = SchemaGen(statistics=generate_statistics.outputs.statistics)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=generate_statistics.outputs.statistics,
        schema=import_schema.outputs.result)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=generate_examples.outputs.examples,
                          schema=import_schema.outputs.result,
                          module_file=TRANSFORM_MODULE_FILE)

    # Trains the model using a user provided trainer function.
    train = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.GenericExecutor),
        #      custom_executor_spec=executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
        module_file=TRAIN_MODULE_FILE,
        transformed_examples=transform.outputs.transformed_examples,
        schema=import_schema.outputs.result,
        transform_graph=transform.outputs.transform_graph,
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config={'ai_platform_training_args': ai_platform_training_args})

    # Get the latest blessed model for model validation.
    resolve = ResolverNode(instance_name='latest_blessed_model_resolver',
                           resolver_class=latest_blessed_model_resolver.
                           LatestBlessedModelResolver,
                           model=Channel(type=Model),
                           model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute a evaluation statistics over features of a model.
    accuracy_threshold = tfma.MetricThreshold(
        value_threshold=tfma.GenericValueThreshold(lower_bound={'value': 0.5},
                                                   upper_bound={'value':
                                                                0.99}),
        change_threshold=tfma.GenericChangeThreshold(
            absolute={'value': 0.0001},
            direction=tfma.MetricDirection.HIGHER_IS_BETTER),
    )

    metrics_specs = tfma.MetricsSpec(metrics=[
        tfma.MetricConfig(class_name='SparseCategoricalAccuracy',
                          threshold=accuracy_threshold),
        tfma.MetricConfig(class_name='ExampleCount')
    ])

    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='Cover_Type')],
        metrics_specs=[metrics_specs],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['Wilderness_Area'])
        ])

    analyze = Evaluator(examples=generate_examples.outputs.examples,
                        model=train.outputs.model,
                        baseline_model=resolve.outputs.model,
                        eval_config=eval_config)

    # Validate model can be loaded and queried in sand-boxed environment
    # mirroring production.
    serving_config = infra_validator_pb2.ServingSpec(
        tensorflow_serving=infra_validator_pb2.TensorFlowServing(
            tags=['latest']),
        kubernetes=infra_validator_pb2.KubernetesConfig(),
    )

    validation_config = infra_validator_pb2.ValidationSpec(
        max_loading_time_seconds=60,
        num_tries=3,
    )

    request_config = infra_validator_pb2.RequestSpec(
        tensorflow_serving=infra_validator_pb2.TensorFlowServingRequestSpec(),
        num_examples=3,
    )

    infra_validate = InfraValidator(
        model=train.outputs['model'],
        examples=generate_examples.outputs['examples'],
        serving_spec=serving_config,
        validation_spec=validation_config,
        request_spec=request_config,
    )

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    deploy = Pusher(custom_executor_spec=executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor),
                    model=train.outputs['model'],
                    model_blessing=analyze.outputs['blessing'],
                    infra_blessing=infra_validate.outputs['blessing'],
                    custom_config={
                        ai_platform_pusher_executor.SERVING_ARGS_KEY:
                        ai_platform_serving_args
                    })

    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=[
                                 generate_examples, generate_statistics,
                                 import_schema, infer_schema, validate_stats,
                                 transform, train, resolve, analyze,
                                 infra_validate, deploy
                             ],
                             enable_cache=enable_cache,
                             beam_pipeline_args=beam_pipeline_args)
Exemple #8
0
def create_pipeline(pipeline_name: Text,
                    pipeline_root: Text,
                    pipeline_mod: Text,
                    schema_uri: Text,
                    transform_graph_uri: Text,
                    model_uri: Text,
                    query: Text,
                    num_train_steps: int,
                    num_eval_steps: int,
                    beam_pipeline_args: Optional[List[Text]] = None,
                    ai_platform_training_args: Optional[Dict[Text,
                                                             Text]] = None,
                    metadata_path: Optional[Text] = None) -> pipeline.Pipeline:
    """Implements the incremental pipeline.."""

    example_gen = BigQueryExampleGen(
        query=query,
        output_config=example_gen_pb2.Output(
            split_config=example_gen_pb2.SplitConfig(splits=[
                example_gen_pb2.SplitConfig.Split(name='train',
                                                  hash_buckets=20),
                example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
            ])))

    importer = ImporterNode(instance_name='import_schema',
                            source_uri=schema_uri,
                            artifact_type=standard_artifacts.Schema,
                            reimport=False)
    graph_importer = ImporterNode(
        instance_name='import_transform_graph',
        source_uri=transform_graph_uri,
        artifact_type=standard_artifacts.TransformGraph,
        reimport=False)
    model_importer = ImporterNode(instance_name='import_model',
                                  source_uri=model_uri,
                                  artifact_type=standard_artifacts.Model,
                                  reimport=False)

    # Performs transformations and feature engineering in training and serving.
    transform = TransformWithGraph(
        examples=example_gen.outputs['examples'],
        schema=importer.outputs['result'],
        transform_graph=graph_importer.outputs['result'])

    trainer_kwargs = {}
    if ai_platform_training_args is not None:
        trainer_kwargs = {
            'custom_executor_spec':
            executor_spec.ExecutorClassSpec(
                ai_platform_trainer_executor.Executor),
            'custom_config': {
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args
            }
        }

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=importer.outputs['result'],
        transform_graph=graph_importer.outputs['result'],
        train_args=trainer_pb2.TrainArgs(num_steps=num_train_steps),
        eval_args=trainer_pb2.EvalArgs(num_steps=num_eval_steps),
        trainer_fn='{}.trainer_fn'.format(pipeline_mod),
        base_model=model_importer.outputs['result'],
        **trainer_kwargs)

    # Not depdent on blessing. Always pushes regardless of quality.
    pusher = AlwaysPusher(model=trainer.outputs['model'],
                          push_destination=pusher_pb2.PushDestination(
                              filesystem=pusher_pb2.PushDestination.Filesystem(
                                  base_directory=model_uri)))

    pipeline_kwargs = {}
    if metadata_path is not None:
        pipeline_kwargs = {
            'metadata_connection_config':
            metadata.sqlite_metadata_connection_config(metadata_path),
        }

    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=pipeline_root,
                             components=[
                                 example_gen, importer, graph_importer,
                                 model_importer, transform, trainer, pusher
                             ],
                             enable_cache=True,
                             beam_pipeline_args=beam_pipeline_args,
                             **pipeline_kwargs)
Exemple #9
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # query: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Implements the Centernet pipeline with TFX."""
    components = []

    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.
        SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=3),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ],
                    partition_feature_name='image/filename'))

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = ImportExampleGen(input=external_input(data_path),
                                   output_config=output_config)
    components.append(example_gen)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'],
                                   stats_options=STATS_OPTIONS)
    components.append(statistics_gen)

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)
    components.append(schema_gen)

    # Import manually crafted schema
    importer_node = ImporterNode(
        instance_name='import_user_schema',
        source_uri="gs://raw_data_layer/schema/",
        artifact_type=tfx.types.standard_artifacts.Schema)
    components.append(importer_node)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(  # pylint: disable=unused-variable
        statistics=statistics_gen.outputs['statistics'],
        schema=importer_node.outputs['result'])
    components.append(example_validator)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=importer_node.outputs['result'],
                          preprocessing_fn=preprocessing_fn)
    components.append(transform)

    # update training_args per once use.
    trainer_args = {
        'run_fn':
        run_fn,
        'transformed_examples':
        transform.outputs['transformed_examples'],
        'schema':
        importer_node.outputs['result'],
        'transform_graph':
        transform.outputs['transform_graph'],
        'train_args':
        train_args,
        'eval_args':
        eval_args,
        'custom_executor_spec':
        executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
    }
    if ai_platform_training_args is not None:
        trainer_args.update({
            'custom_executor_spec':
            executor_spec.ExecutorClassSpec(
                ai_platform_trainer_executor.GenericExecutor),
            'custom_config': {
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args,
            }
        })
    trainer = Trainer(**trainer_args)
    components.append(trainer)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # Change this value to control caching of execution results. Default value
        # is `False`.
        enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
Exemple #10
0
def create_pipeline(
        prev_run_root: Text,
        run_root: Text,
        pipeline_name: Text,
        pipeline_mod: Text,
        query: Text,
        beam_pipeline_args: Optional[List[Text]] = None,
        metadata_path: Optional[Text] = None,
        custom_config: Optional[Dict[Text, Any]] = None) -> pipeline.Pipeline:
    """Implements the incremental pipeline.."""

    example_gen = BigQueryExampleGen(
        query=query,
        output_config=example_gen_pb2.Output(
            split_config=example_gen_pb2.SplitConfig(splits=[
                example_gen_pb2.SplitConfig.Split(name='train',
                                                  hash_buckets=20),
                example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
            ])))

    schema_importer = ImporterNode(instance_name='import_schema',
                                   source_uri=os.path.join(
                                       prev_run_root, 'serving/schema'),
                                   artifact_type=standard_artifacts.Schema,
                                   reimport=False)
    graph_importer = ImporterNode(
        instance_name='import_transform_graph',
        source_uri=os.path.join(prev_run_root, 'serving/transform_graph'),
        artifact_type=standard_artifacts.TransformGraph,
        reimport=False)
    model_importer = ImporterNode(instance_name='import_model',
                                  source_uri=os.path.join(
                                      prev_run_root, 'serving/model'),
                                  artifact_type=standard_artifacts.Model,
                                  reimport=False)

    # Performs transformations and feature engineering in training and serving.
    transform = TransformWithGraph(
        examples=example_gen.outputs['examples'],
        schema=schema_importer.outputs['result'],
        transform_graph=graph_importer.outputs['result'])

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_importer.outputs['result'],
        transform_graph=graph_importer.outputs['result'],
        train_args=trainer_pb2.TrainArgs(),
        eval_args=trainer_pb2.EvalArgs(),
        trainer_fn='{}.trainer_fn'.format(pipeline_mod),
        base_model=model_importer.outputs['result'],
        custom_config=custom_config)

    # Not depdent on blessing. Always pushes regardless of quality.
    pusher = AlwaysPusher(
        model=trainer.outputs['model'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(run_root, 'serving', 'model'))))

    schema_pusher = SchemaPusher(
        artifact=schema_importer.outputs['result'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(run_root, 'serving', 'schema'))),
        instance_name='schema_pusher')

    transform_graph_pusher = TransformGraphPusher(
        artifact=graph_importer.outputs['result'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(run_root, 'serving',
                                            'transform_graph'))),
        instance_name='transform_graph_pusher')

    pipeline_kwargs = {}
    if metadata_path is not None:
        pipeline_kwargs = {
            'metadata_connection_config':
            metadata.sqlite_metadata_connection_config(metadata_path),
        }

    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=os.path.join(run_root, 'data'),
                             components=[
                                 example_gen, schema_importer, graph_importer,
                                 model_importer, transform, trainer, pusher,
                                 schema_pusher, transform_graph_pusher
                             ],
                             enable_cache=True,
                             beam_pipeline_args=beam_pipeline_args,
                             **pipeline_kwargs)
Exemple #11
0
def create_pipeline(
        run_root: Text,
        pipeline_name: Text,
        pipeline_mod: Text,
        examples_uri: Text,
        schema_uri: Text,
        transform_graph_uri: Text,
        model_uri: Text,
        beam_pipeline_args: Optional[List[Text]] = None,
        metadata_path: Optional[Text] = None,
        custom_config: Optional[Dict[Text, Any]] = None) -> pipeline.Pipeline:
    """Implements the incremental pipeline.."""

    schema_importer = ImporterNode(instance_name='schema_importer',
                                   source_uri=schema_uri,
                                   artifact_type=standard_artifacts.Schema,
                                   reimport=False)
    transform_graph_importer = ImporterNode(
        instance_name='transform_graph_importer',
        source_uri=transform_graph_uri,
        artifact_type=standard_artifacts.TransformGraph,
        reimport=False)
    examples_importer = ImporterNode(instance_name='examples_importer',
                                     source_uri=examples_uri,
                                     artifact_type=standard_artifacts.Examples,
                                     properties={
                                         'split_names':
                                         artifact_utils.encode_split_names(
                                             artifact.DEFAULT_EXAMPLE_SPLITS)
                                     },
                                     reimport=False)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        transformed_examples=examples_importer.outputs['result'],
        schema=schema_importer.outputs['result'],
        transform_graph=transform_graph_importer.outputs['result'],
        train_args=trainer_pb2.TrainArgs(),
        eval_args=trainer_pb2.EvalArgs(),
        trainer_fn='{}.trainer_fn'.format(pipeline_mod),
        custom_config=custom_config)

    # Not depdent on blessing. Always pushes regardless of quality.
    pusher = AlwaysPusher(model=trainer.outputs['model'],
                          push_destination=pusher_pb2.PushDestination(
                              filesystem=pusher_pb2.PushDestination.Filesystem(
                                  base_directory=model_uri)))

    pipeline_kwargs = {}
    if metadata_path is not None:
        pipeline_kwargs = {
            'metadata_connection_config':
            metadata.sqlite_metadata_connection_config(metadata_path),
        }

    return pipeline.Pipeline(pipeline_name=pipeline_name,
                             pipeline_root=os.path.join(run_root, 'data'),
                             components=[
                                 schema_importer, transform_graph_importer,
                                 examples_importer, trainer, pusher
                             ],
                             enable_cache=True,
                             beam_pipeline_args=beam_pipeline_args,
                             **pipeline_kwargs)