def _create_pipeline():
  """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines."""

  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])

  return pipeline.Pipeline(
      pipeline_name='chicago_taxi_pipeline_kubeflow',
      pipeline_root=_pipeline_root,
      components=[example_gen, statistics_gen, infer_schema],
      additional_pipeline_args={
          'beam_pipeline_args': [
              '--runner=DataflowRunner',
              '--experiments=shuffle_mode=auto',
              '--project=' + _project_id,
              '--temp_location=' + os.path.join(_output_dir, 'tmp'),
              '--region=' + _gcp_region,
          ],
      },
      log_root='/var/tmp/tfx/logs',
  )
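A pipeline object like this still needs an orchestrator entry point. As a minimal sketch, assuming a TFX release that ships KubeflowDagRunner (its module path has moved between versions, so check it against your installed TFX), compiling the pipeline into a Kubeflow Pipelines package could look like:

from tfx.orchestration.kubeflow import kubeflow_dag_runner

if __name__ == '__main__':
  # Emits a pipeline package that can be uploaded to the Kubeflow
  # Pipelines UI or submitted through its SDK.
  kubeflow_dag_runner.KubeflowDagRunner().run(_create_pipeline())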
Example #2
def create_pipeline():

    # Read data in; can split data here
    examples = csv_input(DATA_DIR)
    example_gen = CsvExampleGen(input_base=examples, name='iris_example')

    # Generate feature statistics
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

    # Infer schema for data
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

    # Identify anomalies in training and serving data
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)

    # Performs feature engineering; emits a SavedModel that does preprocessing
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=TRANSFORM_MODULE_FILE)

    # Trains a model
    trainer = Trainer(
        module_file=MODEL_MODULE_FILE,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Evaluates the model on different slices of the data (useful for bias detection)
    model_analyzer = Evaluator(examples=example_gen.outputs.examples,
                               model_exports=trainer.outputs.output)

    # Compares new model against a baseline; both models evaluated on a dataset
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)

    # Pushes a blessed model to a deployment target (tfserving)
    pusher = Pusher(model_export=trainer.outputs.output,
                    model_blessing=model_validator.outputs.blessing,
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=SERVING_DIR)))

    return pipeline.Pipeline(pipeline_name=PIPELINE_NAME,
                             pipeline_root=DAGS_DIR,
                             components=[
                                 example_gen, statistics_gen, infer_schema,
                                 validate_stats, transform, trainer,
                                 model_analyzer, model_validator, pusher
                             ],
                             enable_cache=True,
                             metadata_db_root=METADATA_DIR,
                             additional_pipeline_args={
                                 'logger_args': {
                                     'log_root': LOGS_DIR,
                                     'log_level': logging.INFO
                                 }
                             })
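A pipeline configured with logger_args like this is typically handed to an orchestrator such as Airflow. A hedged sketch follows; the AirflowDagRunner import path and its config-dict signature vary across TFX versions, so treat both as assumptions:

import datetime

from tfx.orchestration.airflow.airflow_dag_runner import AirflowDagRunner

# Airflow discovers the module-level DAG object; the config keys mirror
# Airflow's own DAG arguments.
_airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2019, 1, 1),
}
DAG = AirflowDagRunner(_airflow_config).run(create_pipeline())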
Example #3
def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX."""
    examples = csv_input(_data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_simple',
        pipeline_root=_pipeline_root,
        components=[example_gen, statistics_gen, infer_schema, validate_stats],
        enable_cache=True,
        metadata_db_root=_metadata_db_root,
    )
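For a local run, a sketch using the Beam orchestrator (assuming BeamDagRunner from tfx.orchestration.beam.beam_dag_runner; newer TFX releases recommend LocalDagRunner instead):

from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

if __name__ == '__main__':
    # Executes the whole pipeline synchronously on the local machine.
    BeamDagRunner().run(_create_pipeline())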
Example #4
def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
                          taxi_module_file: Text, output_bucket: Text,
                          enable_cache: bool):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    output_bucket: The bucket to which the serving model is pushed.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """

    examples = csv_input(csv_input_location)

    example_gen = CsvExampleGen(input_base=examples)
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output,
                             infer_feature_shape=False)
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=taxi_module_file)
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)
    pusher = Pusher(
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(output_bucket, 'model_serving'))))

    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_pipeline_simple',
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        enable_cache=enable_cache,
    )
Example #5
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
  """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
  examples = dsl_utils.csv_input(csv_input_location)

  example_gen = CsvExampleGen(input=examples)
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=transform_module)
  trainer = Trainer(
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
      module_file=trainer_module,
  )
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(pipeline_root, 'model_serving'))))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
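Because this helper returns bare components, the caller assembles the pipeline itself. A hedged sketch of that wrapping step (every path and the pipeline name below are illustrative placeholders, not values from the source):

from tfx.orchestration import metadata, pipeline

components = create_e2e_components(
    pipeline_root='/tmp/tfx/pipeline_root',     # placeholder path
    csv_input_location='/tmp/tfx/data',         # placeholder path
    transform_module='/tmp/tfx/taxi_utils.py',  # placeholder module file
    trainer_module='/tmp/tfx/taxi_utils.py',    # placeholder module file
)
e2e_pipeline = pipeline.Pipeline(
    pipeline_name='chicago_taxi_e2e_test',      # placeholder name
    pipeline_root='/tmp/tfx/pipeline_root',
    components=components,
    metadata_connection_config=metadata.sqlite_metadata_connection_config(
        '/tmp/tfx/metadata.db'),
)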
  def testCsvExampleGenOnDataflowRunner(self):
    """CsvExampleGen-only test pipeline on DataflowRunner invocation."""
    pipeline_name = 'kubeflow-csv-example-gen-dataflow-test-{}'.format(
        self._random_id())
    pipeline = self._create_dataflow_pipeline(pipeline_name, [
        CsvExampleGen(input=dsl_utils.csv_input(self._data_root)),
    ])
    self._compile_and_run_pipeline(pipeline)
def _create_pipeline():
  """Implements the chicago taxi pipeline with TFX."""
  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=_taxi_module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=_taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
def create_pipeline():
    """Implements the chicago taxi pipeline with TFX."""
    examples = csv_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=examples)

    # Computes statistics over data for visualization and example validation.
    #   statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Step 3

    # Generates schema based on statistics files.
    #   schema_gen = SchemaGen(stats=statistics_gen.outputs.output) # Step 3

    # Performs anomaly detection based on statistics and data schema.
    #   validate_stats = ExampleValidator( # Step 3
    #       stats=statistics_gen.outputs.output, # Step 3
    #       schema=schema_gen.outputs.output) # Step 3

    # Performs transformations and feature engineering in training and serving.
    #   transform = Transform( # Step 4
    #       input_data=example_gen.outputs.examples, # Step 4
    #       schema=schema_gen.outputs.output, # Step 4
    #       module_file=taxi_module_file) # Step 4

    # Uses user-provided Python function that implements a model using TF-Learn.
    #   trainer = Trainer( # Step 5
    #       module_file=taxi_module_file, # Step 5
    #       transformed_examples=transform.outputs.transformed_examples, # Step 5
    #       schema=schema_gen.outputs.output, # Step 5
    #       transform_output=transform.outputs.transform_output, # Step 5
    #       train_steps=10000, # Step 5
    #       eval_steps=5000, # Step 5
    #       warm_starting=True) # Step 5

    # Uses TFMA to compute evaluation statistics over features of a model.
    #   model_analyzer = Evaluator( # Step 6
    #       examples=example_gen.outputs.examples, # Step 6
    #       model_exports=trainer.outputs.output) # Step 6

    # Performs quality validation of a candidate model (compared to a baseline).
    #   model_validator = ModelValidator( # Step 7
    #       examples=example_gen.outputs.examples, model=trainer.outputs.output) # Step 7

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    #   pusher = Pusher( # Step 7
    #       model_export=trainer.outputs.output, # Step 7
    #       model_blessing=model_validator.outputs.blessing, # Step 7
    #       serving_model_dir=serving_model_dir) # Step 7

    return [
        example_gen,
        #       statistics_gen, schema_gen, validate_stats, # Step 3
        #       transform, # Step 4
        #       trainer, # Step 5
        #       model_analyzer, # Step 6
        #       model_validator, pusher # Step 7
    ]
def create_pipeline():
    """Implements the titanic taxi pipeline with TFX."""

    examples = csv_input(data_dir)

    # Brings data into the pipeline
    example_gen = CsvExampleGen(input_base=examples)

    return [example_gen]
def create_pipeline():
  """Implements the chicago taxi pipeline with TFX."""
  examples = csv_input(os.path.join(data_root, 'simple'))

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output,
      schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=taxi_module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_steps=10000,
      eval_steps=5000,
      warm_starting=True)

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output)

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      serving_model_dir=serving_model_dir)

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
Example #11
  def __init__(self, base_dir, csvname):
    self.base_dir = base_dir
    self.csvname = csvname
    self.components = []
    examples = csv_input(os.path.join(self.base_dir, self.csvname))
    self.example_gen = tfx.components.example_gen.csv_example_gen.component.CsvExampleGen(
        input=examples)
    self.statistics_gen = StatisticsGen(
        self.example_gen.outputs['examples'],
        instance_name=self.csvname + '_statistics_gen')
    self.schema_gen = SchemaGen(
        statistics=self.statistics_gen.outputs['statistics'])
    self.valid_stats = ExampleValidator(
        statistics=self.statistics_gen.outputs['statistics'],
        schema=self.schema_gen.outputs['schema'])
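The constructor gathers each component as an attribute but never fills self.components. A plausible continuation at the end of __init__ (an assumption, not shown in the source) collects them so an orchestrator can consume the list:

    # Sketch (assumed continuation of __init__): collect the constructed
    # components into the list initialized above, so callers can hand them
    # to a pipeline. The original code leaves self.components empty.
    self.components = [
        self.example_gen, self.statistics_gen, self.schema_gen,
        self.valid_stats
    ]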
def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX."""
    examples = csv_input(_data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

    return pipeline.Pipeline(
        pipeline_name='chicago_taxi_simple',
        pipeline_root=_pipeline_root,
        components=[example_gen, statistics_gen, infer_schema],
        enable_cache=True,
        metadata_db_root=_metadata_db_root,
    )
Example #13
  def testCsvInput(self):
    [csv] = dsl_utils.csv_input(uri='path')
    self.assertEqual('ExternalPath', csv.type_name)
    self.assertEqual('path', csv.uri)
def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX."""
    examples = csv_input(_data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(stats=statistics_gen.outputs.output,
                                      schema=infer_schema.outputs.output)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=_taxi_module_file)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=_taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model_export=trainer.outputs.output,
                    model_blessing=model_validator.outputs.blessing,
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=_serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            _metadata_db_root),
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        additional_pipeline_args={
            # LINT.IfChange
            'beam_pipeline_args': [
                # ----- Beam Args -----.
                '--runner=PortableRunner',
                # Points to the job server started in
                # setup_beam_on_(flink|spark).sh
                '--job_endpoint=localhost:8099',
                '--environment_type=LOOPBACK',
                # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment.  # pylint: disable=g-bad-todo
                # TODO(BEAM-5167): Use concurrency information from SDK Harness.  # pylint: disable=g-bad-todo
                # Note: We use 100 worker threads to mitigate the issue with
                # scheduling work between the Beam runner and SDK harness. Flink
                # and Spark can process unlimited work items concurrently while
                # SdkHarness can only process 1 work item per worker thread.
                # Having 100 threads will let 100 tasks execute concurrently
                # avoiding scheduling issue in most cases. In case the threads are
                # exhausted, Beam prints the relevant message in the log.
                '--experiments=worker_threads=100',
                # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
                '--experiments=pre_optimize=all',
                # ----- Flink runner-specific Args -----.
                # TODO(b/126725506): Set the task parallelism based on cpu cores.
                # TODO(FLINK-10672): Obviate setting BATCH_FORCED.
                '--execution_mode_for_batch=BATCH_FORCED',
            ],
            # LINT.ThenChange(tfx/examples/chicago_taxi/setup_beam_on_portable_beam.sh)
        },
    )
Example #15
def _create_pipeline():
  """Implements the chicago taxi pipeline with TFX."""
  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=_taxi_module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=_taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # This custom component serves as a bridge between the pipeline and human
  # model reviewers to enable a review-and-push workflow in the model
  # development cycle. It uses the Slack API to send a message with the model
  # URI to a user-defined Slack channel and waits for a go / no-go decision
  # from that same channel:
  #   * To approve the model, users reply to the thread sent out by the bot
  #     started by SlackComponent with 'lgtm' or 'approve'.
  #   * To reject the model, users reply to the thread with 'decline' or
  #     'reject'.
  slack_validator = SlackComponent(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      slack_token=_slack_token,
      channel_id=_channel_id,
      timeout_sec=3600,
  )

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=slack_validator.outputs.slack_blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, slack_validator, pusher
  ]
Example #16
def _create_pipeline():
    """Implements the chicago taxi pipeline with TFX."""
    examples = csv_input(_data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input_base=examples)

    # Computes statistics over data for visualization and example validation.
    # pylint: disable=line-too-long
    # statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Step 3
    # pylint: enable=line-too-long

    # Generates schema based on statistics files.
    # infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Step 3

    # Performs anomaly detection based on statistics and data schema.
    # validate_stats = ExampleValidator( # Step 3
    #     stats=statistics_gen.outputs.output, # Step 3
    #           schema=infer_schema.outputs.output) # Step 3

    # Performs transformations and feature engineering in training and serving.
    # transform = Transform( # Step 4
    #     input_data=example_gen.outputs.examples, # Step 4
    #     schema=infer_schema.outputs.output, # Step 4
    #     module_file=_taxi_module_file) # Step 4

    # Uses user-provided Python function that implements a model using TF-Learn.
    # trainer = Trainer( # Step 5
    #     module_file=_taxi_module_file, # Step 5
    #     transformed_examples=transform.outputs.transformed_examples, # Step 5
    #     schema=infer_schema.outputs.output, # Step 5
    #     transform_output=transform.outputs.transform_output, # Step 5
    #     train_args=trainer_pb2.TrainArgs(num_steps=10000), # Step 5
    #     eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Step 5

    # Uses TFMA to compute evaluation statistics over features of a model.
    # model_analyzer = Evaluator( # Step 6
    #     examples=example_gen.outputs.examples, # Step 6
    #     model_exports=trainer.outputs.output, # Step 6
    #     feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ # Step 6
    #         evaluator_pb2.SingleSlicingSpec( # Step 6
    #             column_for_slicing=['trip_start_hour']) # Step 6
    #     ])) # Step 6

    # Performs quality validation of a candidate model (compared to a baseline).
    # model_validator = ModelValidator( # Step 7
    #     examples=example_gen.outputs.examples, # Step 7
    #              model=trainer.outputs.output) # Step 7

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    # pusher = Pusher( # Step 7
    #     model_export=trainer.outputs.output, # Step 7
    #     model_blessing=model_validator.outputs.blessing, # Step 7
    #     push_destination=pusher_pb2.PushDestination( # Step 7
    #         filesystem=pusher_pb2.PushDestination.Filesystem( # Step 7
    #             base_directory=_serving_model_dir))) # Step 7

    return pipeline.Pipeline(
        pipeline_name='taxi',
        pipeline_root=_pipeline_root,
        components=[
            example_gen,
            # statistics_gen, infer_schema, validate_stats, # Step 3
            # transform, # Step 4
            # trainer, # Step 5
            # model_analyzer, # Step 6
            # model_validator, pusher # Step 7
        ],
        enable_cache=True,
        metadata_db_root=_metadata_db_root,
        additional_pipeline_args={'logger_args': logger_overrides},
    )
Example #17
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
    examples = dsl_utils.csv_input(csv_input_location)

    example_gen = CsvExampleGen(input=examples)
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=transform_module)
    latest_model_resolver = ResolverNode(
        instance_name='latest_model_resolver',
        resolver_class=latest_artifacts_resolver.LatestArtifactsResolver,
        latest_model=Channel(type=Model))
    trainer = Trainer(
        transformed_examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        base_model=latest_model_resolver.outputs['latest_model'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10),
        eval_args=trainer_pb2.EvalArgs(num_steps=5),
        module_file=trainer_module,
    )
    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        metrics_specs=[
            tfma.MetricsSpec(
                metrics=[tfma.MetricConfig(class_name='ExampleCount')],
                thresholds={
                    'binary_accuracy':
                    tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ])
    evaluator = Evaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          eval_config=eval_config)

    infra_validator = InfraValidator(
        model=trainer.outputs['model'],
        examples=example_gen.outputs['examples'],
        serving_spec=infra_validator_pb2.ServingSpec(
            tensorflow_serving=infra_validator_pb2.TensorFlowServing(
                tags=['latest']),
            kubernetes=infra_validator_pb2.KubernetesConfig()),
        request_spec=infra_validator_pb2.RequestSpec(
            tensorflow_serving=infra_validator_pb2.
            TensorFlowServingRequestSpec()))

    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return [
        example_gen,
        statistics_gen,
        schema_gen,
        example_validator,
        transform,
        latest_model_resolver,
        trainer,
        evaluator,
        infra_validator,
        pusher,
    ]
Example #18
def _create_test_pipeline(pipeline_name: Text, pipeline_root: Text,
                          csv_input_location: Text, taxi_module_file: Text,
                          container_image: Text):
    """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_name: The name of the pipeline.
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    container_image: The container image to use.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
    examples = dsl_utils.csv_input(csv_input_location)

    example_gen = CsvExampleGen(input_base=examples)
    statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
    infer_schema = SchemaGen(stats=statistics_gen.outputs.output)
    validate_stats = ExampleValidator(  # pylint: disable=unused-variable
        stats=statistics_gen.outputs.output,
        schema=infer_schema.outputs.output)
    transform = Transform(input_data=example_gen.outputs.examples,
                          schema=infer_schema.outputs.output,
                          module_file=taxi_module_file)
    trainer = Trainer(
        module_file=taxi_module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))
    model_analyzer = Evaluator(  # pylint: disable=unused-variable
        examples=example_gen.outputs.examples,
        model_exports=trainer.outputs.output,
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))
    model_validator = ModelValidator(examples=example_gen.outputs.examples,
                                     model=trainer.outputs.output)
    pusher = Pusher(  # pylint: disable=unused-variable
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return tfx_pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, infer_schema, validate_stats,
            transform, trainer, model_analyzer, model_validator, pusher
        ],
        log_root='/var/tmp/tfx/logs',
        additional_pipeline_args={
            'tfx_image': container_image,
        },
    )
def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
                          taxi_module_file: Text, enable_cache: bool):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  examples = csv_input(csv_input_location)

  example_gen = CsvExampleGen(input_base=examples)
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
  infer_schema = SchemaGen(
      stats=statistics_gen.outputs.output, infer_feature_shape=False)
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5))
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
        evaluator_pb2.SingleSlicingSpec(
            column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Hack: ensuring push_destination can be correctly parameterized and interpreted.
  # pipeline root will be specified as a dsl.PipelineParam with the name
  # pipeline-root, see:
  # https://github.com/tensorflow/tfx/blob/1c670e92143c7856f67a866f721b8a9368ede385/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L226
  _pipeline_root_param = dsl.PipelineParam(name='pipeline-root')
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(str(_pipeline_root_param), 'model_serving'))))

  return pipeline.Pipeline(
      pipeline_name='parameterized_tfx_oss',
      pipeline_root=pipeline_root,
      components=[
        example_gen, statistics_gen, infer_schema, validate_stats, transform,
        trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=enable_cache,
  )
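The dsl.PipelineParam hack above only pays off when the pipeline is compiled with the Kubeflow runner, which substitutes pipeline-root at run-creation time. A sketch, assuming KubeflowDagRunner and placeholder GCS paths:

from tfx.orchestration.kubeflow import kubeflow_dag_runner

# Compiling through the Kubeflow runner lets Kubeflow Pipelines fill in the
# 'pipeline-root' parameter when a run is created. All paths are placeholders.
kubeflow_dag_runner.KubeflowDagRunner().run(
    _create_test_pipeline(
        pipeline_root='gs://my-bucket/pipeline_root',     # placeholder
        csv_input_location='gs://my-bucket/data',         # placeholder
        taxi_module_file='gs://my-bucket/taxi_utils.py',  # placeholder
        enable_cache=True))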