Example #1
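Reusing one predicate across two separate Cond blocks: each node ends up guarded by that single predicate.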
 def testReusePredicate(self):
     pred = _FakePredicate('pred')
     with conditional.Cond(pred):
         node1 = _FakeNode().with_id('node1')
     with conditional.Cond(pred):
         node2 = _FakeNode().with_id('node2')
     self.assertEqual(conditional.get_predicates(node1), (pred, ))
     self.assertEqual(conditional.get_predicates(node2), (pred, ))
Example #2
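Nesting Cond blocks with the same predicate raises a ValueError; a working alternative is sketched after the test.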
 def testNestedConditionWithDuplicatePredicates(self):
     pred = _FakePredicate('pred')
     with self.assertRaisesRegex(
             ValueError, 'Nested conditionals with duplicate predicates'):
         with conditional.Cond(pred):
             unused_node1 = _FakeNode().with_id('node1')
             with conditional.Cond(pred):
                 unused_node2 = _FakeNode().with_id('node2')
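Since duplicate predicates cannot be nested, the usual fix is one Cond block guarding everything that depends on the predicate (as in Example #5 below), or combining genuinely distinct conditions into a single predicate with ph.logical_and (as in Example #9). A minimal sketch, reusing the _FakePredicate and _FakeNode test helpers from these examples:

 pred = _FakePredicate('pred')
 with conditional.Cond(pred):
     # Both nodes are guarded by the same predicate; no nesting is needed.
     node1 = _FakeNode().with_id('node1')
     node2 = _FakeNode().with_id('node2')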
Example #3
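Nesting Cond blocks with distinct predicates: the inner node accumulates both predicates, outermost first.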
 def testNestedCondition(self):
     pred1 = _FakePredicate('pred1')
     pred2 = _FakePredicate('pred2')
     with conditional.Cond(pred1):
         node1 = _FakeNode().with_id('node1')
         with conditional.Cond(pred2):
             node2 = _FakeNode().with_id('node2')
     self.assertEqual(conditional.get_predicates(node1), (pred1, ))
     self.assertEqual(conditional.get_predicates(node2), (pred1, pred2))
Example #4
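A step_builder test: a consumer component under nested conditions is compiled and compared against golden .pbtxt files. The predicates also create an implicit dependency on producer_task_2.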
    def testBuildDummyConsumerWithCondition(self):
        producer_task_1 = test_utils.dummy_producer_component(
            output1=channel_utils.as_channel([standard_artifacts.Model()]),
            param1='value1',
        ).with_id('producer_task_1')
        producer_task_2 = test_utils.dummy_producer_component_2(
            output1=channel_utils.as_channel([standard_artifacts.Model()]),
            param1='value2',
        ).with_id('producer_task_2')
        # This test checks two things:
        # 1. Nested conditions. The condition string of consumer_task should
        #    contain both predicates.
        # 2. Implicit channels. consumer_task only takes producer_task_1's
        #    output, but producer_task_2 is used in the condition, so
        #    producer_task_2 must be added as a dependency of consumer_task.
        # See testdata for details.
        with conditional.Cond(
                producer_task_1.outputs['output1'].future()[0].uri != 'uri'):
            with conditional.Cond(producer_task_2.outputs['output1'].future()
                                  [0].property('property') == 'value1'):
                consumer_task = test_utils.dummy_consumer_component(
                    input1=producer_task_1.outputs['output1'],
                    param1=1,
                )
        # Need to construct a pipeline to set producer_component_id.
        unused_pipeline = tfx.dsl.Pipeline(
            pipeline_name='pipeline-with-condition',
            pipeline_root='',
            components=[producer_task_1, producer_task_2, consumer_task],
        )
        deployment_config = pipeline_pb2.PipelineDeploymentConfig()
        component_defs = {}
        my_builder = step_builder.StepBuilder(
            node=consumer_task,
            image='gcr.io/tensorflow/tfx:latest',
            deployment_config=deployment_config,
            component_defs=component_defs)
        actual_step_spec = self._sole(my_builder.build())
        actual_component_def = self._sole(component_defs)

        self.assertProtoEquals(
            test_utils.get_proto_from_test_data(
                'expected_dummy_consumer_with_condition_component.pbtxt',
                pipeline_pb2.ComponentSpec()), actual_component_def)
        self.assertProtoEquals(
            test_utils.get_proto_from_test_data(
                'expected_dummy_consumer_with_condition_task.pbtxt',
                pipeline_pb2.PipelineTaskSpec()), actual_step_spec)
        self.assertProtoEquals(
            test_utils.get_proto_from_test_data(
                'expected_dummy_consumer_with_condition_executor.pbtxt',
                pipeline_pb2.PipelineDeploymentConfig()), deployment_config)
Example #5
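A single Cond block guarding two nodes: both receive the same predicate.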
 def testSingleCondition(self):
   pred = _FakePredicate('pred')
   with conditional.Cond(pred):
     node1 = _FakeNode().with_id('node1')
     node2 = _FakeNode().with_id('node2')
   self.assertSetEqual(conditional.get_predicates(node1), {pred})
   self.assertSetEqual(conditional.get_predicates(node2), {pred})
Example #6
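A test pipeline in which the Evaluator runs only when the 'evaluate' custom property on the Trainer's output model equals 1; the pipeline is then compiled to its proto representation.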
def create_pipeline() -> pipeline_pb2.Pipeline:
  """Builds a test pipeline."""
  # pylint: disable=no-value-for-parameter
  example_gen = _example_gen().with_id('my_example_gen')
  stats_gen = _statistics_gen(
      examples=example_gen.outputs['examples']).with_id('my_statistics_gen')
  schema_gen = _schema_gen(
      statistics=stats_gen.outputs['statistics']).with_id('my_schema_gen')
  example_validator = _example_validator(
      statistics=stats_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema']).with_id('my_example_validator')
  transform = _transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema']).with_id('my_transform')
  trainer = _trainer(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      transform_graph=transform.outputs['transform_graph']).with_id(
          'my_trainer')

  # Nodes with no input or output specs, used to test task-only dependencies.
  chore_a = _chore().with_id('chore_a')
  chore_a.add_upstream_node(trainer)
  chore_b = _chore().with_id('chore_b')
  chore_b.add_upstream_node(chore_a)

  with conditional.Cond(
      trainer.outputs['model'].future()[0].custom_property('evaluate') == 1):
    evaluator = _evaluator(
        model=trainer.outputs['model']).with_id('my_evaluator')
  # pylint: enable=no-value-for-parameter

  pipeline = pipeline_lib.Pipeline(
      pipeline_name='my_pipeline',
      pipeline_root='/path/to/root',
      components=[
          example_gen,
          stats_gen,
          schema_gen,
          example_validator,
          transform,
          trainer,
          evaluator,
          chore_a,
          chore_b,
      ],
      enable_cache=True)
  dsl_compiler = compiler.Compiler()
  return dsl_compiler.compile(pipeline)
Example #7
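The penguin example pipeline: the Pusher is declared inside a Cond so the model is pushed only when the Evaluator sets the 'blessed' custom property to 1.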
def _create_pipeline(
    pipeline_name: str,
    pipeline_root: str,
    data_root: str,
    module_file: str,
    accuracy_threshold: float,
    serving_model_dir: str,
    metadata_path: str,
    user_provided_schema_path: Optional[str],
    enable_tuning: bool,
    enable_bulk_inferrer: bool,
    examplegen_input_config: Optional[tfx.proto.Input],
    examplegen_range_config: Optional[tfx.proto.RangeConfig],
    resolver_range_config: Optional[tfx.proto.RangeConfig],
    beam_pipeline_args: List[str],
    # TODO(b/191634100): Always enable transform cache.
    enable_transform_input_cache: bool
) -> tfx.dsl.Pipeline:
    """Implements the penguin pipeline with TFX.

  Args:
    pipeline_name: name of the TFX pipeline being created.
    pipeline_root: root directory of the pipeline.
    data_root: directory containing the penguin data.
    module_file: path to files used in Trainer and Transform components.
    accuracy_threshold: minimum accuracy to push the model.
    serving_model_dir: filepath to write pipeline SavedModel to.
    metadata_path: path to local pipeline ML Metadata store.
    user_provided_schema_path: path to user provided schema file.
    enable_tuning: If True, the hyperparameter tuning through KerasTuner is
      enabled.
    enable_bulk_inferrer: If True, the generated model will be used for a
      batch inference.
    examplegen_input_config: ExampleGen's input_config.
    examplegen_range_config: ExampleGen's range_config.
    resolver_range_config: SpansResolver's range_config. Specify this will
      enable SpansResolver to get a window of ExampleGen's output Spans for
      transform and training.
    beam_pipeline_args: list of beam pipeline options for LocalDAGRunner. Please
      refer to https://beam.apache.org/documentation/runners/direct/.
    enable_transform_input_cache: Indicates whether input cache should be used
      in Transform if available.

  Returns:
    A TFX pipeline object.
  """

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = tfx.components.CsvExampleGen(
        input_base=os.path.join(data_root, 'labelled'),
        input_config=examplegen_input_config,
        range_config=examplegen_range_config)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = tfx.components.StatisticsGen(
        examples=example_gen.outputs['examples'])

    if user_provided_schema_path:
        # Import user-provided schema.
        schema_gen = tfx.components.ImportSchemaGen(
            schema_file=user_provided_schema_path)
        # Performs anomaly detection based on statistics and data schema.
        example_validator = tfx.components.ExampleValidator(
            statistics=statistics_gen.outputs['statistics'],
            schema=schema_gen.outputs['schema'])
    else:
        # Generates schema based on statistics files.
        schema_gen = tfx.components.SchemaGen(
            statistics=statistics_gen.outputs['statistics'],
            infer_feature_shape=True)

    # Gets multiple Spans for transform and training.
    if resolver_range_config:
        examples_resolver = tfx.dsl.Resolver(
            strategy_class=tfx.dsl.experimental.SpanRangeStrategy,
            config={
                'range_config': resolver_range_config
            },
            examples=tfx.dsl.Channel(
                type=tfx.types.standard_artifacts.Examples,
                producer_component_id=example_gen.id)).with_id('span_resolver')

    # Performs transformations and feature engineering in training and serving.
    if enable_transform_input_cache:
        transform_cache_resolver = tfx.dsl.Resolver(
            strategy_class=tfx.dsl.experimental.LatestArtifactStrategy,
            cache=tfx.dsl.Channel(
                type=tfx.types.standard_artifacts.TransformCache)).with_id(
                    'transform_cache_resolver')
        tft_resolved_cache = transform_cache_resolver.outputs['cache']
    else:
        tft_resolved_cache = None

    transform = tfx.components.Transform(
        examples=(examples_resolver.outputs['examples'] if
                  resolver_range_config else example_gen.outputs['examples']),
        schema=schema_gen.outputs['schema'],
        module_file=module_file,
        analyzer_cache=tft_resolved_cache)

    # Tunes the hyperparameters for model training based on a user-provided
    # Python function. Note that once the hyperparameters are tuned, you can
    # drop the Tuner component from the pipeline and feed Trainer the tuned
    # hyperparameters.
    if enable_tuning:
        tuner = tfx.components.Tuner(
            module_file=module_file,
            examples=transform.outputs['transformed_examples'],
            transform_graph=transform.outputs['transform_graph'],
            train_args=tfx.proto.TrainArgs(num_steps=20),
            eval_args=tfx.proto.EvalArgs(num_steps=5))

    # Uses user-provided Python function that trains a model.
    trainer = tfx.components.Trainer(
        module_file=module_file,
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        # If Tuner is in the pipeline, Trainer can take Tuner's output
        # best_hyperparameters artifact as input and use it in the user module
        # code.
        #
        # If there is no Tuner in the pipeline, either use Importer to import a
        # previous Tuner's output to feed to Trainer, or use the tuned
        # hyperparameters directly in the user module code and set
        # hyperparameters to None here.
        #
        # Example of Importer,
        #   hparams_importer = Importer(
        #     source_uri='path/to/best_hyperparameters.txt',
        #     artifact_type=HyperParameters).with_id('import_hparams')
        #   ...
        #   hyperparameters = hparams_importer.outputs['result'],
        hyperparameters=(tuner.outputs['best_hyperparameters']
                         if enable_tuning else None),
        train_args=tfx.proto.TrainArgs(num_steps=100),
        eval_args=tfx.proto.EvalArgs(num_steps=5))

    # Get the latest blessed model for model validation.
    model_resolver = tfx.dsl.Resolver(
        strategy_class=tfx.dsl.experimental.LatestBlessedModelStrategy,
        model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model),
        model_blessing=tfx.dsl.Channel(
            type=tfx.types.standard_artifacts.ModelBlessing)).with_id(
                'latest_blessed_model_resolver')

    # Uses TFMA to compute evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(signature_name='serving_default',
                           label_key='species_xf',
                           preprocessing_function_names=['transform_features'])
        ],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': accuracy_threshold}),
                        # Change threshold will be ignored if there is no
                        # baseline model resolved from MLMD (first run).
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10})))
            ])
        ])
    evaluator = tfx.components.Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    # Components declared within the conditional block will only be triggered
    # if the Predicate evaluates to True.
    #
    # In the example below,
    # evaluator.outputs['blessing'].future()[0].custom_property('blessed') == 1
    # is a Predicate, which will be evaluated during runtime.
    #
    # - evaluator.outputs['blessing'] is the output Channel 'blessing'.
    # - .future() turns the Channel into a Placeholder.
    # - [0] gets the first artifact from the 'blessing' Channel.
    # - .custom_property('blessed') gets a custom property called 'blessed' from
    #   that artifact.
    # - == 1 compares that property with 1. (An explicit comparison is needed.
    #   There's no automatic boolean conversion based on truthiness.)
    #
    # Note that these operations are just placeholders, similar to mocks; they
    # are not evaluated until runtime. For more details, see tfx/dsl/placeholder/.
    with conditional.Cond(evaluator.outputs['blessing'].future()
                          [0].custom_property('blessed') == 1):
        # Checks whether the model passed the validation steps and pushes the
        # model to a file destination if the check passed.
        pusher = tfx.components.Pusher(
            model=trainer.outputs['model'],
            # No need to pass model_blessing any more, since Pusher is already
            # guarded by a Conditional.
            # model_blessing=evaluator.outputs['blessing'],
            push_destination=tfx.proto.PushDestination(
                filesystem=tfx.proto.PushDestination.Filesystem(
                    base_directory=serving_model_dir)))

    # Showcase for BulkInferrer component.
    if enable_bulk_inferrer:
        # Generates unlabelled examples.
        example_gen_unlabelled = tfx.components.CsvExampleGen(
            input_base=os.path.join(data_root, 'unlabelled')).with_id(
                'CsvExampleGen_Unlabelled')

        # Performs offline batch inference.
        bulk_inferrer = tfx.components.BulkInferrer(
            examples=example_gen_unlabelled.outputs['examples'],
            model=trainer.outputs['model'],
            # Empty data_spec.example_splits will result in using all splits.
            data_spec=tfx.proto.DataSpec(),
            model_spec=tfx.proto.ModelSpec())

    components_list = [
        example_gen,
        statistics_gen,
        schema_gen,
        transform,
        trainer,
        model_resolver,
        evaluator,
        pusher,
    ]
    if resolver_range_config:
        components_list.append(examples_resolver)
    if enable_transform_input_cache:
        components_list.append(transform_cache_resolver)
    if enable_tuning:
        components_list.append(tuner)
    if enable_bulk_inferrer:
        components_list.append(example_gen_unlabelled)
        components_list.append(bulk_inferrer)
    if user_provided_schema_path:
        components_list.append(example_validator)

    return tfx.dsl.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components_list,
        enable_cache=True,
        metadata_connection_config=tfx.orchestration.metadata.
        sqlite_metadata_connection_config(metadata_path),
        beam_pipeline_args=beam_pipeline_args)
Example #8
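Components for a Chicago Taxi test pipeline; as in the previous example, the Pusher is guarded by the Evaluator's blessing.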
def create_pipeline_components(
    pipeline_root: str,
    transform_module: str,
    trainer_module: str,
    bigquery_query: str = '',
    csv_input_location: str = '',
) -> List[base_node.BaseNode]:
    """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.
    bigquery_query: The query to get input data from BigQuery. If not empty,
      BigQueryExampleGen will be used.
    csv_input_location: The location of the input data directory.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """

    if bool(bigquery_query) == bool(csv_input_location):
        raise ValueError(
            'Exactly one example gen is expected. '
            'Please provide either bigquery_query or csv_input_location.')

    if bigquery_query:
        example_gen = tfx.extensions.google_cloud_big_query.BigQueryExampleGen(
            query=bigquery_query)
    else:
        example_gen = tfx.components.CsvExampleGen(
            input_base=csv_input_location)

    statistics_gen = tfx.components.StatisticsGen(
        examples=example_gen.outputs['examples'])
    schema_gen = tfx.components.SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False)
    example_validator = tfx.components.ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    transform = tfx.components.Transform(
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        module_file=transform_module)
    latest_model_resolver = tfx.dsl.Resolver(
        strategy_class=tfx.dsl.experimental.LatestArtifactStrategy,
        model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model
                              )).with_id('Resolver.latest_model_resolver')
    trainer = tfx.components.Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(Executor),
        examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        base_model=latest_model_resolver.outputs['model'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=tfx.proto.TrainArgs(num_steps=10),
        eval_args=tfx.proto.EvalArgs(num_steps=5),
        module_file=trainer_module,
    )
    # Get the latest blessed model for model validation.
    model_resolver = tfx.dsl.Resolver(
        strategy_class=tfx.dsl.experimental.LatestBlessedModelStrategy,
        model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model),
        model_blessing=tfx.dsl.Channel(
            type=tfx.types.standard_artifacts.ModelBlessing)).with_id(
                'Resolver.latest_blessed_model_resolver')
    # Set the TFMA config for Model Evaluation and Validation.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name='eval')],
        metrics_specs=[
            tfma.MetricsSpec(
                metrics=[tfma.MetricConfig(class_name='ExampleCount')],
                thresholds={
                    'binary_accuracy':
                    tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.5}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10}))
                })
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['trip_start_hour'])
        ])
    evaluator = tfx.components.Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)

    with conditional.Cond(evaluator.outputs['blessing'].future()
                          [0].custom_property('blessed') == 1):
        pusher = tfx.components.Pusher(
            model=trainer.outputs['model'],
            push_destination=tfx.proto.
            PushDestination(filesystem=tfx.proto.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root, 'model_serving'))))

    return [
        example_gen, statistics_gen, schema_gen, example_validator, transform,
        latest_model_resolver, trainer, model_resolver, evaluator, pusher
    ]
Example #9
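Nested conditions in a test pipeline: InfraValidator runs only if the Evaluator blesses the model, and the Pusher additionally requires the infra-validation blessing and a non-empty model URI, combined via ph.logical_and.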
def create_test_pipeline():
    """Builds a conditional pipeline."""
    pipeline_name = "cond"
    cond_root = "cond_root"
    serving_model_dir = os.path.join(cond_root, "serving_model", pipeline_name)
    tfx_root = "tfx_root"
    data_path = os.path.join(tfx_root, "data_path")
    pipeline_root = os.path.join(tfx_root, "pipelines", pipeline_name)

    example_gen = CsvExampleGen(input_base=data_path)
    statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs["statistics"])

    trainer = Trainer(
        module_file="module_file",
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=example_gen.outputs["examples"],
        schema=schema_gen.outputs["schema"],
        train_args=trainer_pb2.TrainArgs(num_steps=2000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5))

    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(signature_name="eval")],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    "sparse_categorical_accuracy":
                    tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={"value": 0.6}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={"value": -1e-10}))
                })
        ])

    evaluator = Evaluator(examples=example_gen.outputs["examples"],
                          model=trainer.outputs["model"],
                          eval_config=eval_config)

    with conditional.Cond(
            evaluator.outputs["blessing"].future()[0].value == 1):
        infra_validator = InfraValidator(
            model=trainer.outputs["model"],
            examples=example_gen.outputs["examples"],
            serving_spec=infra_validator_pb2.ServingSpec(
                tensorflow_serving=infra_validator_pb2.TensorFlowServing(
                    tags=["latest"]),
                local_docker=infra_validator_pb2.LocalDockerConfig()),
            request_spec=infra_validator_pb2.RequestSpec(
                tensorflow_serving=infra_validator_pb2.
                TensorFlowServingRequestSpec()))
        with conditional.Cond(
                ph.logical_and(
                    infra_validator.outputs["blessing"].future()[0].value == 1,
                    trainer.outputs["model"].future()[0].uri != "")):  # pylint: disable=g-explicit-bool-comparison
            pusher = Pusher(
                model=trainer.outputs["model"],
                push_destination=pusher_pb2.PushDestination(
                    filesystem=pusher_pb2.PushDestination.Filesystem(
                        base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_gen, trainer, evaluator,
            infra_validator, pusher
        ],
        enable_cache=True,
        beam_pipeline_args=["--my_testing_beam_pipeline_args=foo"],
        execution_mode=pipeline.ExecutionMode.SYNC)