Beispiel #1
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the Iris flowers pipeline with TFX."""
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute a evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, trainer,
          model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/142684737): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
  )
Beispiel #2
0
def _create_pipeline(pipeline_name: str, pipeline_root: str, data_root: str,
                     module_file: str, serving_model_dir: str,
                     metadata_path: str) -> pipeline.Pipeline:
    """Creates a three component penguin pipeline with TFX."""
    # Brings data into the pipeline.
    example_gen = CsvExampleGen(input_base=data_root)

    # Uses user-provided Python function that trains a model.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=example_gen.outputs['examples'],
        train_args=trainer_pb2.TrainArgs(num_steps=100),
        eval_args=trainer_pb2.EvalArgs(num_steps=5))

    # Pushes the model to a filesystem destination.
    pusher = Pusher(model=trainer.outputs['model'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    # Following three components will be included in the pipeline.
    components = [
        example_gen,
        trainer,
        pusher,
    ]

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        components=components)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Implements the Penguin pipeline with TFX."""
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=data_root)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  schema_gen = SchemaGen(
      statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True)

  # Performs anomaly detection based on statistics and data schema.
  example_validator = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])

  # TODO(humichael): Handle applying transformation component in Milestone 3.

  # Uses user-provided Python function that trains a model using TF-Learn.
  # Num_steps is not provided during evaluation because the scikit-learn model
  # loads and evaluates the entire test set at once.
  # TODO(b/159470716): Make schema optional in Trainer.
  trainer = Trainer(
      module_file=module_file,
      custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      train_args=trainer_pb2.TrainArgs(num_steps=2000),
      eval_args=trainer_pb2.EvalArgs())

  # TODO(humichael): Add Evaluator once it's decided how to proceed with
  # Milestone 2.

  pusher = Pusher(
      model=trainer.outputs['model'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen,
          statistics_gen,
          schema_gen,
          example_validator,
          trainer,
          pusher,
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args,
  )
Beispiel #4
0
def create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                    module_file: Text, serving_model_dir: Text,
                    metadata_path: Text,
                    direct_num_workers: int) -> pipeline.Pipeline:
    output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=3),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))
    examples = tfrecord_input(data_root)
    example_gen = ImportExampleGen(input=examples, output_config=output)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=True)
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=module_file)
    trainer = Trainer(module_file=module_file,
                      examples=transform.outputs['transformed_examples'],
                      transform_graph=transform.outputs['transform_graph'],
                      schema=infer_schema.outputs['schema'],
                      train_args=trainer_pb2.TrainArgs(num_steps=100),
                      eval_args=trainer_pb2.EvalArgs(num_steps=50))

    eval_config = tfma.EvalConfig(slicing_specs=[tfma.SlicingSpec()])

    model_analyzer = Evaluator(examples=example_gen.outputs['examples'],
                               model=trainer.outputs['model'],
                               eval_config=eval_config)

    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_analyzer.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            infer_schema,
            validate_stats,
            transform,
            trainer,
            model_analyzer,
            model_validator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
Beispiel #5
0
 def _create_trainer(module_file, instance_name):
     return Trainer(module_file=module_file,
                    examples=transform.outputs['transformed_examples'],
                    transform_graph=transform.outputs['transform_graph'],
                    schema=schema_gen.outputs['schema'],
                    train_args=trainer_pb2.TrainArgs(num_steps=5000),
                    eval_args=trainer_pb2.EvalArgs(num_steps=100),
                    instance_name=instance_name)
Beispiel #6
0
 def _create_trainer(module_file, component_id):
     return Trainer(module_file=module_file,
                    examples=transform.outputs['transformed_examples'],
                    transform_graph=transform.outputs['transform_graph'],
                    schema=schema_gen.outputs['schema'],
                    train_args=trainer_pb2.TrainArgs(num_steps=5000),
                    eval_args=trainer_pb2.EvalArgs(
                        num_steps=100)).with_id(component_id)
 def _create_trainer(module_file, instance_name):
     return Trainer(module_file=module_file,
                    custom_executor_spec=executor_spec.ExecutorClassSpec(
                        GenericExecutor),
                    examples=transform.outputs['transformed_examples'],
                    transform_graph=transform.outputs['transform_graph'],
                    schema=schema_gen.outputs['schema'],
                    train_args=trainer_pb2.TrainArgs(num_steps=5000),
                    eval_args=trainer_pb2.EvalArgs(num_steps=100),
                    instance_name=instance_name)
Beispiel #8
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the imdb sentiment analysis pipline with TFX."""
    examples = tfrecord_input(data_root)
    # Brings data in to the pipline
    example_gen = ImportExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that trains a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=200),
        eval_args=trainer_pb2.EvalArgs(num_steps=100))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            trainer,
            #model_resolver,
            #valuator,
            #pusher,
        ],
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        enable_cache=True,
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
def create_pipeline():
    """Implements the chicago taxi pipeline with TFX."""
    examples = csv_inputs(os.path.join(base_dir, 'no_split/span_1'))

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_data=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=taxi_pipeline_utils)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=taxi_pipeline_utils,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_steps=10000,
        eval_steps=5000,
        warm_starting=True)

    # Uses TFMA to compute a evaluation statistics over features of a model.
    model_analyzer = Evaluator(examples=example_gen.outputs.examples,
                               model_exports=trainer.outputs.output)

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model_export=trainer.outputs.output,
                    model_blessing=model_validator.outputs.blessing,
                    serving_model_dir=serving_model_dir)

    return [
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
    ]
Beispiel #10
0
def create_test_pipeline():
    """Creates a sample pipeline with ForEach context."""

    example_gen = CsvExampleGen(input_base='/data/mydummy_dataset')

    with for_each.ForEach(example_gen.outputs['examples']) as each_example:
        statistics_gen = StatisticsGen(examples=each_example)

    latest_stats_resolver = resolver.Resolver(
        statistics=statistics_gen.outputs['statistics'],
        strategy_class=latest_artifact_strategy.LatestArtifactStrategy,
    ).with_id('latest_stats_resolver')

    schema_gen = SchemaGen(
        statistics=latest_stats_resolver.outputs['statistics'])

    with for_each.ForEach(example_gen.outputs['examples']) as each_example:
        trainer = Trainer(
            module_file='/src/train.py',
            examples=each_example,
            schema=schema_gen.outputs['schema'],
            train_args=trainer_pb2.TrainArgs(num_steps=2000),
        )

    with for_each.ForEach(trainer.outputs['model']) as each_model:
        pusher = Pusher(
            model=each_model,
            push_destination=pusher_pb2.PushDestination(
                filesystem=pusher_pb2.PushDestination.Filesystem(
                    base_directory='/models')),
        )

    return pipeline.Pipeline(
        pipeline_name='foreach',
        pipeline_root='/tfx/pipelines/foreach',
        components=[
            example_gen,
            statistics_gen,
            latest_stats_resolver,
            schema_gen,
            trainer,
            pusher,
        ],
        enable_cache=True,
        execution_mode=pipeline.ExecutionMode.SYNC,
    )
Beispiel #11
0
def create_pipe():

    instance = ExternalArtifact()
    instance.uri = str(CSV_PATH)

    channels = channel_utils.as_channel([instance])

    example_gen = CsvExampleGen(input=channels)

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])

    example_val = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file='/root/airflow/dags/infer.py')

    trainer = Trainer(
        module_file='/root/airflow/dags/infer.py',
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=200),
        eval_args=trainer_pb2.EvalArgs(num_steps=100))

    # Database config
    metadata_config = metadata.sqlite_metadata_connection_config('teste_1.db')

    pipe = pipeline.Pipeline(
        pipeline_name='teste_1',
        pipeline_root='/root/',
        components=[
            example_gen, statistics_gen, infer_schema, example_val, transform,
            trainer
        ],
        metadata_connection_config=metadata_config,
    )

    return pipe
Beispiel #12
0
    def build(self, context: Context) -> BaseNode:
        from tfx.components import Trainer

        if context.has(self._config.inputs.transform_graph):
            transform_graph = context.get(self._config.inputs.transform_graph)
        else:
            transform_graph = None

        component = Trainer(
            examples=context.get(self._config.inputs.examples),
            schema=context.get(self._config.inputs.schema),
            transform_graph=transform_graph,
            train_args=serialize_dict(self._config.params.train_args),
            eval_args=serialize_dict(self._config.params.eval_args),
            module_file=self._config.params.module_file,
            instance_name=context.abs_current_url_friendly)

        put_outputs_to_context(context, self._config.outputs, component)
        return component
Beispiel #13
0
def generate_pipeline(pipeline_name, pipeline_root, data_root, train_steps,
                      eval_steps):
    examples = external_input(data_root)
    example_gen = CsvExampleGen(input=examples)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)
    trainer = Trainer(
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        module_file='util.py',  # util.py is a file in the same folder
        train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
        eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[example_gen, statistics_gen, schema_gen, trainer],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            os.path.join(pipeline_root, 'metadata.sqlite')))
Beispiel #14
0
def _create_pipeline(pipeline_name: str, pipeline_root: str, module_file: str,
                     presto_config: presto_config_pb2.PrestoConnConfig,
                     query: str, serving_model_dir: str,
                     metadata_path: str) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    # Brings data into the pipeline or otherwise joins/converts training data
    example_gen = PrestoExampleGen(presto_config, query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'])

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Uses TFMA to compute a evaluation statistics over features of a model.
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, evaluator, model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
    )
Beispiel #15
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # query: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""

    components = []

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=external_input(data_path))
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # example_gen = BigQueryExampleGen(query=query)
    components.append(example_gen)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    # TODO(step 5): Uncomment here to add StatisticsGen to the pipeline.
    # components.append(statistics_gen)

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)
    # TODO(step 5): Uncomment here to add SchemaGen to the pipeline.
    # components.append(schema_gen)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(  # pylint: disable=unused-variable
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    # TODO(step 5): Uncomment here to add ExampleValidator to the pipeline.
    # components.append(example_validator)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          preprocessing_fn=preprocessing_fn)
    # TODO(step 6): Uncomment here to add Transform to the pipeline.
    # components.append(transform)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer_args = {
        'run_fn':
        run_fn,
        'transformed_examples':
        transform.outputs['transformed_examples'],
        'schema':
        schema_gen.outputs['schema'],
        'transform_graph':
        transform.outputs['transform_graph'],
        'train_args':
        train_args,
        'eval_args':
        eval_args,
        'custom_executor_spec':
        executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
    }
    if ai_platform_training_args is not None:
        trainer_args.update({
            'custom_executor_spec':
            executor_spec.ExecutorClassSpec(
                ai_platform_trainer_executor.GenericExecutor),
            'custom_config': {
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args,
            }
        })
    trainer = Trainer(**trainer_args)
    # TODO(step 6): Uncomment here to add Trainer to the pipeline.
    # components.append(trainer)

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))
    # TODO(step 6): Uncomment here to add ResolverNode to the pipeline.
    # components.append(model_resolver)

    # Uses TFMA to compute a evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='tips')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='BinaryAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': eval_accuracy_threshold}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10})))
            ])
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)
    # TODO(step 6): Uncomment here to add Evaluator to the pipeline.
    # components.append(evaluator)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher_args = {
        'model':
        trainer.outputs['model'],
        'model_blessing':
        evaluator.outputs['blessing'],
        'push_destination':
        pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)),
    }
    if ai_platform_serving_args is not None:
        pusher_args.update({
            'custom_executor_spec':
            executor_spec.ExecutorClassSpec(
                ai_platform_pusher_executor.Executor),
            'custom_config': {
                ai_platform_pusher_executor.SERVING_ARGS_KEY:
                ai_platform_serving_args
            },
        })
    pusher = Pusher(**pusher_args)  # pylint: disable=unused-variable
    # TODO(step 6): Uncomment here to add Pusher to the pipeline.
    # components.append(pusher)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # TODO(step 8): Change this value to control caching of execution results.
        enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
Beispiel #16
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  training_examples = external_input(training_data_root)

  # Brings training data into the pipeline or otherwise joins/converts
  # training data.
  training_example_gen = CsvExampleGen(
      input_base=training_examples, instance_name='training_example_gen')

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(
      input_data=training_example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=training_example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute a evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=training_example_gen.outputs['examples'],
      model_exports=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=training_example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  inference_examples = external_input(inference_data_root)

  # Brings inference data into the pipeline.
  inference_example_gen = CsvExampleGen(
      input_base=inference_examples,
      output_config=example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(
              splits=[example_gen_pb2.SplitConfig.Split(
                  name='unlabelled', hash_buckets=100)])),
      instance_name='inference_example_gen')

  # Performs offline batch inference over inference examples.
  bulk_inferrer = BulkInferrer(
      examples=inference_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      # Empty data_spec.example_splits will result in using all splits.
      data_spec=bulk_inferrer_pb2.DataSpec(),
      model_spec=bulk_inferrer_pb2.ModelSpec())

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          training_example_gen, inference_example_gen, statistics_gen,
          infer_schema, validate_stats, transform, trainer, model_analyzer,
          model_validator, bulk_inferrer
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
Beispiel #17
0
def generate_pipeline(pipeline_name, pipeline_root, train_data, test_data,
                      train_steps, eval_steps, pusher_target, runner):
    module_file = 'util.py'  # util.py is a file in the same folder

    # RuntimeParameter is only supported on KubeflowDagRunner currently
    if runner == 'kubeflow':
        pipeline_root_param = os.path.join('gs://{{kfp-default-bucket}}',
                                           pipeline_name, '{{workflow.uid}}')
        train_data_param = data_types.RuntimeParameter(
            name='train-data',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/train',
            ptype=Text)
        test_data_param = data_types.RuntimeParameter(
            name='test-data',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/test',
            ptype=Text)
        pusher_target_param = data_types.RuntimeParameter(
            name='pusher-destination',
            default=
            'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/serving',
            ptype=Text)
    else:
        pipeline_root_param = pipeline_root
        train_data_param = train_data
        test_data_param = test_data
        pusher_target_param = pusher_target

    examples = external_input(train_data_param)
    example_gen = CsvExampleGen(input=examples, instance_name="train")

    test_examples = external_input(test_data_param)
    test_example_gen = CsvExampleGen(input=test_examples,
                                     output_config={
                                         'split_config': {
                                             'splits': [{
                                                 'name': 'test',
                                                 'hash_buckets': 1
                                             }]
                                         }
                                     },
                                     instance_name="test")

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True
                           )  # infer_feature_shape controls sparse or dense

    # Transform is too slow in my side.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        module_file=module_file,
        train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
        eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps),
        instance_name="train",
        enable_cache=False)

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute a evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='target')],
        # tfma.SlicingSpec(feature_keys=['var_0', 'var_1']) when add more, Evaluator can't ouptput BLESSED status. It should be a bug in TFMA.
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'binary_accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.4}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        # baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config,
        instance_name="eval5")

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination={
                        'filesystem': {
                            'base_directory': pusher_target_param
                        }
                    })

    bulk_inferrer = BulkInferrer(
        examples=test_example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        # model_blessing=evaluator.outputs['blessing'],
        data_spec=bulk_inferrer_pb2.DataSpec(),
        model_spec=bulk_inferrer_pb2.ModelSpec(),
        instance_name="bulkInferrer")

    hello = component.HelloComponent(
        input_data=bulk_inferrer.outputs['inference_result'],
        instance_name='csvGen')

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root_param,
        components=[
            example_gen, statistics_gen, schema_gen, transform, trainer,
            model_resolver, evaluator, pusher, hello, test_example_gen,
            bulk_inferrer
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            os.path.join(pipeline_root, 'metadata.sqlite')),
        beam_pipeline_args=['--direct_num_workers=0'])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the imdb sentiment analysis pipline with TFX."""
    output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=9),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
        ]))

    # Brings data in to the pipline
    example_gen = CsvExampleGen(input_base=data_root, output_config=output)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that trains a model.
    trainer = Trainer(module_file=module_file,
                      examples=transform.outputs['transformed_examples'],
                      transform_graph=transform.outputs['transform_graph'],
                      schema=schema_gen.outputs['schema'],
                      train_args=trainer_pb2.TrainArgs(num_steps=500),
                      eval_args=trainer_pb2.EvalArgs(num_steps=200))

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='label')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='BinaryAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            # Increase this threshold when training on complete
                            # dataset.
                            lower_bound={'value': 0.01}),
                        # Change threshold will be ignored if there is no
                        # baseline model resolved from MLMD (first run).
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-2})))
            ])
        ])

    evaluator = Evaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        model_resolver,
        evaluator,
        pusher,
    ]
    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        enable_cache=True,
        beam_pipeline_args=beam_pipeline_args)
Beispiel #19
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    # Brings training data into the pipeline or otherwise joins/converts
    # training data.
    training_example_gen = CsvExampleGen(input_base=training_data_root,
                                         instance_name='training_example_gen')

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(
        input_data=training_example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=training_example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute a evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        # Change threshold will be ignored if there is no
                        # baseline model resolved from MLMD (first run).
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(examples=training_example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    # Brings inference data into the pipeline.
    inference_example_gen = CsvExampleGen(
        input_base=inference_data_root,
        output_config=example_gen_pb2.Output(
            split_config=example_gen_pb2.SplitConfig(splits=[
                example_gen_pb2.SplitConfig.Split(name='unlabelled',
                                                  hash_buckets=100)
            ])),
        instance_name='inference_example_gen')

    # Performs offline batch inference over inference examples.
    bulk_inferrer = BulkInferrer(
        examples=inference_example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        # Empty data_spec.example_splits will result in using all splits.
        data_spec=bulk_inferrer_pb2.DataSpec(),
        model_spec=bulk_inferrer_pb2.ModelSpec())

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            training_example_gen, inference_example_gen, statistics_gen,
            schema_gen, example_validator, transform, trainer, model_resolver,
            evaluator, bulk_inferrer
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
Beispiel #20
0
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
) -> pipeline.Pipeline:
    """Implements the penguin pipeline with TFX."""

    components = []

    # Brings data into the pipeline or otherwise joins/converts training data.
    # TODO(step 2): Might use another ExampleGen class for your data.
    example_gen = CsvExampleGen(input_base=data_path)
    components.append(example_gen)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    components.append(statistics_gen)

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)
    components.append(schema_gen)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(  # pylint: disable=unused-variable
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    components.append(example_validator)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(  # pylint: disable=unused-variable
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        preprocessing_fn=preprocessing_fn)
    # TODO(step 3): Uncomment here to add Transform to the pipeline.
    # components.append(transform)

    # Uses user-provided Python function that implements a model using Tensorflow.
    trainer = Trainer(
        run_fn=run_fn,
        examples=example_gen.outputs['examples'],
        # Use outputs of Transform as training inputs if Transform is used.
        # examples=transform.outputs['transformed_examples'],
        # transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        train_args=train_args,
        eval_args=eval_args)
    # TODO(step 4): Uncomment here to add Trainer to the pipeline.
    # components.append(trainer)

    # Get the latest blessed model for model validation.
    model_resolver = resolver.Resolver(
        strategy_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(
            type=ModelBlessing)).with_id('latest_blessed_model_resolver')
    # TODO(step 5): Uncomment here to add Resolver to the pipeline.
    # components.append(model_resolver)

    # Uses TFMA to compute a evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key=features.LABEL_KEY)],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': eval_accuracy_threshold}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10})))
            ])
        ])
    evaluator = Evaluator(  # pylint: disable=unused-variable
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)
    # TODO(step 5): Uncomment here to add Evaluator to the pipeline.
    # components.append(evaluator)

    # Pushes the model to a file destination if check passed.
    pusher = Pusher(  # pylint: disable=unused-variable
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)))
    # TODO(step 5): Uncomment here to add Pusher to the pipeline.
    # components.append(pusher)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # Change this value to control caching of execution results. Default value
        # is `False`.
        # enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
Beispiel #21
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=infer_schema.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Uses TFMA to compute a evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model_exports=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # LINT.IfChange
        beam_pipeline_args=[
            # ----- Beam Args -----.
            '--runner=PortableRunner',
            # Points to the job server started in
            # setup_beam_on_(flink|spark).sh
            '--job_endpoint=localhost:8099',
            '--environment_type=LOOPBACK',
            # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment.  # pylint: disable=g-bad-todo
            # TODO(BEAM-5167): Use concurrency information from SDK Harness.  # pylint: disable=g-bad-todo
            # Note; We use 100 worker threads to mitigate the issue with
            # scheduling work between the Beam runner and SDK harness. Flink
            # and Spark can process unlimited work items concurrently while
            # SdkHarness can only process 1 work item per worker thread.
            # Having 100 threads will let 100 tasks execute concurrently
            # avoiding scheduling issue in most cases. In case the threads are
            # exhausted, beam print the relevant message in the log.
            '--experiments=worker_threads=100',
            # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
            '--experiments=pre_optimize=all',
            # ----- Flink runner-specific Args -----.
            # TODO(b/126725506): Set the task parallelism based on cpu cores.
            # TODO(FLINK-10672): Obviate setting BATCH_FORCED.
            '--execution_mode_for_batch=BATCH_FORCED',
        ],
        # LINT.ThenChange(setup/setup_beam_on_spark.sh)
        # LINT.ThenChange(setup/setup_beam_on_flink.sh)
    )
Beispiel #22
0
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
    example_gen = CsvExampleGen(input_base=csv_input_location)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'])
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=transform_module)
    latest_model_resolver = resolver.Resolver(
        strategy_class=latest_artifact_strategy.LatestArtifactStrategy,
        latest_model=Channel(type=Model)).with_id('latest_model_resolver')
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        base_model=latest_model_resolver.outputs['latest_model'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module,
    )
    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        metrics_specs=[
            tfma.MetricsSpec(
                metrics=[tfma.MetricConfig(class_name='ExampleCount')],
                thresholds={
                    'accuracy':
                    tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ])
    evaluator = Evaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          eval_config=eval_config)

    infra_validator = InfraValidator(
        model=trainer.outputs['model'],
        examples=example_gen.outputs['examples'],
        serving_spec=infra_validator_pb2.ServingSpec(
            tensorflow_serving=infra_validator_pb2.TensorFlowServing(
                tags=['latest']),
            kubernetes=infra_validator_pb2.KubernetesConfig()),
        request_spec=infra_validator_pb2.RequestSpec(
            tensorflow_serving=infra_validator_pb2.
            TensorFlowServingRequestSpec()))

    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        latest_model_resolver,
        trainer,
        evaluator,
        infra_validator,
        pusher,
    ]
 def testTaxiPipelineNewStyleCompatibility(self):
     examples = external_input('/tmp/fake/path')
     example_gen = CsvExampleGen(input=examples)
     self.assertIs(example_gen.inputs['input'],
                   example_gen.inputs['input_base'])
     statistics_gen = StatisticsGen(
         examples=example_gen.outputs['examples'])
     self.assertIs(statistics_gen.inputs['examples'],
                   statistics_gen.inputs['input_data'])
     infer_schema = SchemaGen(
         statistics=statistics_gen.outputs['statistics'])
     self.assertIs(infer_schema.inputs['statistics'],
                   infer_schema.inputs['stats'])
     self.assertIs(infer_schema.outputs['schema'],
                   infer_schema.outputs['output'])
     validate_examples = ExampleValidator(
         statistics=statistics_gen.outputs['statistics'],
         schema=infer_schema.outputs['schema'])
     self.assertIs(validate_examples.inputs['statistics'],
                   validate_examples.inputs['stats'])
     self.assertIs(validate_examples.outputs['anomalies'],
                   validate_examples.outputs['output'])
     transform = Transform(examples=example_gen.outputs['examples'],
                           schema=infer_schema.outputs['schema'],
                           module_file='/tmp/fake/module/file')
     self.assertIs(transform.inputs['examples'],
                   transform.inputs['input_data'])
     self.assertIs(transform.outputs['transform_graph'],
                   transform.outputs['transform_output'])
     trainer = Trainer(
         module_file='/tmp/fake/module/file',
         transformed_examples=transform.outputs['transformed_examples'],
         schema=infer_schema.outputs['schema'],
         transform_graph=transform.outputs['transform_graph'],
         train_args=trainer_pb2.TrainArgs(num_steps=10000),
         eval_args=trainer_pb2.EvalArgs(num_steps=5000))
     self.assertIs(trainer.inputs['transform_graph'],
                   trainer.inputs['transform_output'])
     self.assertIs(trainer.outputs['model'], trainer.outputs['output'])
     evaluator = Evaluator(
         examples=example_gen.outputs['examples'],
         model=trainer.outputs['model'],
         feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
             evaluator_pb2.SingleSlicingSpec(
                 column_for_slicing=['trip_start_hour'])
         ]))
     self.assertIs(evaluator.inputs['model'],
                   evaluator.inputs['model_exports'])
     self.assertIs(evaluator.outputs['evaluation'],
                   evaluator.outputs['output'])
     model_validator = ModelValidator(
         examples=example_gen.outputs['examples'],
         model=trainer.outputs['model'])
     pusher = Pusher(model=trainer.outputs['output'],
                     model_blessing=model_validator.outputs['blessing'],
                     push_destination=pusher_pb2.PushDestination(
                         filesystem=pusher_pb2.PushDestination.Filesystem(
                             base_directory='/fake/serving/dir')))
     self.assertIs(pusher.inputs['model'], pusher.inputs['model_export'])
     self.assertIs(pusher.outputs['pushed_model'],
                   pusher.outputs['model_push'])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     trainer_module_file: Text, evaluator_module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the Penguin pipeline with TFX."""
    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=data_root)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # TODO(humichael): Handle applying transformation component in Milestone 3.

    # Uses user-provided Python function that trains a model using TF-Learn.
    # Num_steps is not provided during evaluation because the scikit-learn model
    # loads and evaluates the entire test set at once.
    # TODO(b/159470716): Make schema optional in Trainer.
    trainer = Trainer(
        module_file=trainer_module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=2000),
        eval_args=trainer_pb2.EvalArgs())

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='species')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='Accuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10})))
            ])
        ])
    evaluator = Evaluator(module_file=evaluator_module_file,
                          examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            trainer,
            model_resolver,
            evaluator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args,
    )
Beispiel #25
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  hello = HelloComponent(
      input_data=example_gen.outputs['examples'], name='HelloWorld')

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=hello.outputs['output_data'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=hello.outputs['output_data'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute a evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=hello.outputs['output_data'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=hello.outputs['output_data'], model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, hello, statistics_gen, infer_schema, validate_stats,
          transform, trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path))
def init_components(data_dir,
                    module_file,
                    serving_model_dir=None,
                    ai_platform_training_args=None,
                    ai_platform_serving_args=None,
                    training_steps=1000,
                    eval_steps=200):
    """
    This function is to initialize tfx components
    """

    if serving_model_dir and ai_platform_serving_args:
        raise NotImplementedError(
            "Can't set ai_platform_serving_args and serving_model_dir at "
            "the same time. Choose one deployment option.")

    output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig(
        splits=[
            example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=99),
            example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=1),
        ]))

    example_gen = CsvExampleGen(input_base=data_dir, output_config=output)

    statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])

    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs["statistics"],
        infer_feature_shape=False,
    )

    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs["statistics"],
        schema=schema_gen.outputs["schema"],
    )

    transform = Transform(
        examples=example_gen.outputs["examples"],
        schema=schema_gen.outputs["schema"],
        module_file=module_file,
    )

    training_kwargs = {
        "module_file": module_file,
        "examples": transform.outputs["transformed_examples"],
        "schema": schema_gen.outputs["schema"],
        "transform_graph": transform.outputs['transform_graph'],
        "train_args": trainer_pb2.TrainArgs(num_steps=training_steps),
        "eval_args": trainer_pb2.EvalArgs(num_steps=eval_steps),
    }

    if ai_platform_training_args:

        training_kwargs.update({
            "custom_executor_spec":
            executor_spec.ExecutorClassSpec(
                aip_trainer_executor.GenericExecutor),
            "custom_config": {
                aip_trainer_executor.TRAINING_ARGS_KEY:
                ai_platform_training_args  # noqa
            },
        })
    else:
        training_kwargs.update({
            "custom_executor_spec":
            executor_spec.ExecutorClassSpec(GenericExecutor)
        })

    trainer = Trainer(**training_kwargs)

    model_resolver = ResolverNode(
        instance_name="latest_blessed_model_resolver",
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing),
    )

    #model_resolver for tfx==0.30.0
    # model_resolver = tfx.dsl.Resolver(
    #   strategy_class=tfx.dsl.experimental.LatestBlessedModelStrategy,
    #   model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model),
    #   model_blessing=tfx.dsl.Channel(
    #       type=tfx.types.standard_artifacts.ModelBlessing)).with_id(
    #           'latest_blessed_model_resolver')

    #the book's eval_config might be wrong,
    #threshold has to be set within the tfma.MetricConfig() with each metric
    #this seems to have caused the models not be blessed
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key="consumer_disputed")],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=["product"]),
        ],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(class_name='ExampleCount'),
                tfma.MetricConfig(
                    class_name='BinaryAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={"value": 0.01},
                        ),
                    )),
                # tfma.MetricConfig(
                #     class_name='AUC',
                #     threshold=tfma.MetricThreshold(
                #         value_threshold=tfma.GenericValueThreshold(
                #             lower_bound={'value': 0.5}
                #             ),
                #         change_threshold=tfma.GenericChangeThreshold(
                #             direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                #             absolute={"value": 0.01},
                #         ),
                #         )
                #     ),
            ])
        ],
    )

    evaluator = Evaluator(
        examples=example_gen.outputs["examples"],
        model=trainer.outputs["model"],
        # baseline_model=model_resolver.outputs["model"],
        eval_config=eval_config,
    )

    pusher_kwargs = {
        "model": trainer.outputs["model"],
        "model_blessing": evaluator.outputs["blessing"],
    }

    if ai_platform_serving_args:

        pusher_kwargs.update({
            "custom_executor_spec":
            executor_spec.ExecutorClassSpec(aip_pusher_executor.Executor),
            "custom_config": {
                aip_pusher_executor.SERVING_ARGS_KEY:
                ai_platform_serving_args  # noqa
            },
        })
    elif serving_model_dir:
        pusher_kwargs.update({
            "push_destination":
            pusher_pb2.PushDestination(
                filesystem=pusher_pb2.PushDestination.Filesystem(
                    base_directory=serving_model_dir))
        })
    else:
        raise NotImplementedError(
            "Provide ai_platform_serving_args or serving_model_dir.")

    pusher = Pusher(**pusher_kwargs)

    #compile all components in a list
    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        trainer,
        model_resolver,
        evaluator,
        pusher,
    ]
    return components
Beispiel #27
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  #examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  #example_gen = CsvExampleGen(input=examples)
  example_gen = CsvExampleGen(input_base=data_root)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  schema_gen = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  example_validator = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Get the latest blessed model for model validation.
  model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # Uses TFMA to compute a evaluation statistics over features of a model and
  # perform quality validation of a candidate model (compared to a baseline).
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ],
      metrics_specs=[
          tfma.MetricsSpec(
              thresholds={
                  'accuracy':
                      tfma.config.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={'value': 0.6}),
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': -1e-10}))
              })
      ])
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      baseline_model=model_resolver.outputs['model'],
      # Change threshold will be ignored if there is no baseline (first run).
      eval_config=eval_config)

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, example_validator, transform,
          trainer, model_resolver, evaluator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args)
Beispiel #28
0
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the Iris flowers pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that trains a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=2000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5))

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute an evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='variety')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10})))
            ])
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Performs infra validation of a candidate model to prevent unservable model
    # from being pushed. This config will launch a model server of the latest
    # TensorFlow Serving image in a local docker engine.
    infra_validator = InfraValidator(
        model=trainer.outputs['model'],
        examples=example_gen.outputs['examples'],
        serving_spec=infra_validator_pb2.ServingSpec(
            tensorflow_serving=infra_validator_pb2.TensorFlowServing(
                tags=['latest']),
            local_docker=infra_validator_pb2.LocalDockerConfig()),
        request_spec=infra_validator_pb2.RequestSpec(
            tensorflow_serving=infra_validator_pb2.
            TensorFlowServingRequestSpec()))

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    infra_blessing=infra_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            trainer,
            model_resolver,
            evaluator,
            infra_validator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
def create_pipeline(
        pipeline_name: Text,
        pipeline_root: Text,
        module_file: Text,
        ai_platform_training_args: Dict[Text, Text],
        ai_platform_serving_args: Dict[Text, Text],
        beam_pipeline_args: Optional[List[Text]] = None) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.

  Args:
    pipeline_name: name of the TFX pipeline being created.
    pipeline_root: root directory of the pipeline. Should be a valid GCS path.
    module_file: uri of the module files used in Trainer and Transform
      components.
    ai_platform_training_args: Args of CAIP training job. Please refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job
      for detailed description.
    ai_platform_serving_args: Args of CAIP model deployment. Please refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.models
      for detailed description.
    beam_pipeline_args: Optional list of beam pipeline options. Please refer to
      https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options.
      When this argument is not provided, the default is to use GCP
      DataflowRunner with 50GB disk size as specified in this function. If an
      empty list is passed in, default specified by Beam will be used, which can
      be found at
      https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options

  Returns:
    A TFX pipeline object.
  """

    # The rate at which to sample rows from the Taxi dataset using BigQuery.
    # The full taxi dataset is > 200M record.  In the interest of resource
    # savings and time, we've set the default for this example to be much smaller.
    # Feel free to crank it up and process the full dataset!
    # By default it generates a 0.1% random sample.
    query_sample_rate = data_types.RuntimeParameter(name='query_sample_rate',
                                                    ptype=float,
                                                    default=0.001)

    # This is the upper bound of FARM_FINGERPRINT in Bigquery (ie the max value of
    # signed int64).
    max_int64 = '0x7FFFFFFFFFFFFFFF'

    # The query that extracts the examples from BigQuery. The Chicago Taxi dataset
    # used for this example is a public dataset available on Google AI Platform.
    # https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips
    query = """
          SELECT
            pickup_community_area,
            fare,
            EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
            EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
            EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
            UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
            pickup_latitude,
            pickup_longitude,
            dropoff_latitude,
            dropoff_longitude,
            trip_miles,
            pickup_census_tract,
            dropoff_census_tract,
            payment_type,
            company,
            trip_seconds,
            dropoff_community_area,
            tips
          FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
          WHERE (ABS(FARM_FINGERPRINT(unique_key)) / {max_int64})
            < {query_sample_rate}""".format(
        max_int64=max_int64, query_sample_rate=str(query_sample_rate))

    # Beam args to run data processing on DataflowRunner.
    # TODO(b/151114974): Remove `disk_size_gb` flag after default is increased.
    # TODO(b/151116587): Remove `shuffle_mode` flag after default is changed.
    if beam_pipeline_args is None:
        beam_pipeline_args = [
            '--runner=DataflowRunner',
            '--experiments=shuffle_mode=auto',
            '--project=' + _project_id,
            '--temp_location=' + os.path.join(_output_bucket, 'tmp'),
            '--region=' + _gcp_region,
            '--disk_size_gb=50',
        ]

    # Number of epochs in training.
    train_steps = data_types.RuntimeParameter(
        name='train_steps',
        default=10000,
        ptype=int,
    )

    # Number of epochs in evaluation.
    eval_steps = data_types.RuntimeParameter(
        name='eval_steps',
        default=5000,
        ptype=int,
    )

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = BigQueryExampleGen(query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Update ai_platform_training_args if distributed training was enabled.
    # Number of worker machines used in distributed training.
    worker_count = data_types.RuntimeParameter(
        name='worker_count',
        default=2,
        ptype=int,
    )

    # Type of worker machines used in distributed training.
    worker_type = data_types.RuntimeParameter(
        name='worker_type',
        default='standard',
        ptype=str,
    )

    local_training_args = copy.deepcopy(ai_platform_training_args)

    if FLAGS.distributed_training:
        local_training_args.update({
            # You can specify the machine types, the number of replicas for workers
            # and parameter servers.
            # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#ScaleTier
            'scaleTier': 'CUSTOM',
            'masterType': 'large_model',
            'workerType': worker_type,
            'parameterServerType': 'standard',
            'workerCount': worker_count,
            'parameterServerCount': 1
        })

    # Uses user-provided Python function that implements a model using TF-Learn
    # to train a model on Google Cloud AI Platform.
    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.Executor),
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config={
            ai_platform_trainer_executor.TRAINING_ARGS_KEY: local_training_args
        })

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute a evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'binary_accuracy':
                    tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)

    # Checks whether the model passed the validation steps and pushes the model
    # to  Google Cloud AI Platform if check passed.
    pusher = Pusher(custom_executor_spec=executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor),
                    model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    custom_config={
                        ai_platform_pusher_executor.SERVING_ARGS_KEY:
                        ai_platform_serving_args
                    })

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, model_resolver, evaluator, pusher
        ],
        beam_pipeline_args=beam_pipeline_args,
    )
def build_pipeline(timestamp: str) -> pipeline:
    """
    Gather tfx components and produce the output pipeline
    """

    conf['beam']['serving_model_dir'] = f"{conf['beam']['serving_model_dir']}/beam/OL{653374}/{timestamp}"
    conf['beam']['pipeline_root_dir'] = f"{conf['beam']['pipeline_root_dir']}/beam/OL{653374}/{timestamp}"
    conf['beam']['metadata_path'] = f"{conf['beam']['metadata_path']}/beam/OL{653374}"

    logging.info("Serving model dir is now %s",conf['beam']['serving_model_dir'])

    example_gen = ImportExampleGen(input_base=conf['train_data'])

    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False
    )
    
    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        module_file=conf['trainer_module_file']
    )

    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema']
    )

    trainer = Trainer(
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        module_file=conf['trainer_module_file'],
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor), # define this to use run_fn instead of trainer_fn
        train_args=trainer_pb2.TrainArgs(num_steps=conf['train_args_steps']),
        eval_args=trainer_pb2.EvalArgs(num_steps=50)
    )

    metrics = [
        tfma.metrics.ExampleCount(name='example_count'),
        tfma.metrics.WeightedExampleCount(name='weighted_example_count'),
        tf.keras.metrics.BinaryCrossentropy(name='binary_crossentropy'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc', num_thresholds=10),
        tf.keras.metrics.AUC(
            name='auc_precision_recall', curve='PR', num_thresholds=100),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tfma.metrics.MeanLabel(name='mean_label'),
        tfma.metrics.MeanPrediction(name='mean_prediction'),
        tfma.metrics.Calibration(name='calibration'),
        tfma.metrics.ConfusionMatrixPlot(name='confusion_matrix_plot'),
        tfma.metrics.CalibrationPlot(name='calibration_plot')
    ]
    my_metrics_specs = tfma.metrics.specs_from_metrics(metrics)

    eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(label_key='label')
        ],
        metrics_specs=my_metrics_specs
        # [
            # tfma.MetricsSpec(
                # metrics=[
                #     # tfma.MetricConfig(class_name='ExampleCount'),
                #     tfma.MetricConfig(class_name='BinaryAccuracy',
                #       threshold=tfma.MetricThreshold(
                #           value_threshold=tfma.GenericValueThreshold(
                #               lower_bound={'value': 0.5}),
                #           change_threshold=tfma.GenericChangeThreshold(
                #               direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                #               absolute={'value': -1e-10})))
                # ]
            # )
        # ],
        ,
        slicing_specs=[
            tfma.SlicingSpec(),
        ])

    model_resolver = ResolverNode(
          instance_name='latest_blessed_model_resolver',
          resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
          model=Channel(type=Model),
          model_blessing=Channel(type=ModelBlessing))

    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=conf['beam']['serving_model_dir'])))

    components = [
        example_gen,
        statistics_gen,
        schema_gen,
        transform,
        example_validator,
        trainer,
        model_resolver,
        evaluator,
        pusher
    ]


    tfx_pipeline = pipeline.Pipeline(
        pipeline_name=conf['beam']['pipeline_name'],
        pipeline_root=conf['beam']['pipeline_root_dir'],
        components=components,
        enable_cache=False,
        metadata_connection_config=(
            metadata.sqlite_metadata_connection_config(conf['beam']['metadata_path'])

        )
    )

    return tfx_pipeline