def testReusePredicate(self):
  pred = _FakePredicate('pred')
  with conditional.Cond(pred):
    node1 = _FakeNode().with_id('node1')
  with conditional.Cond(pred):
    node2 = _FakeNode().with_id('node2')
  self.assertEqual(conditional.get_predicates(node1), (pred,))
  self.assertEqual(conditional.get_predicates(node2), (pred,))
def testNestedConditionWithDuplicatePredicates(self):
  pred = _FakePredicate('pred')
  with self.assertRaisesRegex(
      ValueError, 'Nested conditionals with duplicate predicates'):
    with conditional.Cond(pred):
      unused_node1 = _FakeNode().with_id('node1')
      with conditional.Cond(pred):
        unused_node2 = _FakeNode().with_id('node2')
def testNestedCondition(self):
  pred1 = _FakePredicate('pred1')
  pred2 = _FakePredicate('pred2')
  with conditional.Cond(pred1):
    node1 = _FakeNode().with_id('node1')
    with conditional.Cond(pred2):
      node2 = _FakeNode().with_id('node2')
  self.assertEqual(conditional.get_predicates(node1), (pred1,))
  self.assertEqual(conditional.get_predicates(node2), (pred1, pred2))
def testBuildDummyConsumerWithCondition(self):
  producer_task_1 = test_utils.dummy_producer_component(
      output1=channel_utils.as_channel([standard_artifacts.Model()]),
      param1='value1',
  ).with_id('producer_task_1')
  producer_task_2 = test_utils.dummy_producer_component_2(
      output1=channel_utils.as_channel([standard_artifacts.Model()]),
      param1='value2',
  ).with_id('producer_task_2')
  # This test verifies two things:
  # 1. Nested conditions. The condition string of consumer_task should
  #    contain both predicates.
  # 2. Implicit channels. consumer_task only takes producer_task_1's output,
  #    but producer_task_2 is used in a condition, so producer_task_2 should
  #    be added as a dependency of consumer_task.
  # See testdata for detail.
  with conditional.Cond(
      producer_task_1.outputs['output1'].future()[0].uri != 'uri'):
    with conditional.Cond(producer_task_2.outputs['output1'].future()[0]
                          .property('property') == 'value1'):
      consumer_task = test_utils.dummy_consumer_component(
          input1=producer_task_1.outputs['output1'],
          param1=1,
      )
  # Need to construct a pipeline to set producer_component_id.
  unused_pipeline = tfx.dsl.Pipeline(
      pipeline_name='pipeline-with-condition',
      pipeline_root='',
      components=[producer_task_1, producer_task_2, consumer_task],
  )
  deployment_config = pipeline_pb2.PipelineDeploymentConfig()
  component_defs = {}
  my_builder = step_builder.StepBuilder(
      node=consumer_task,
      image='gcr.io/tensorflow/tfx:latest',
      deployment_config=deployment_config,
      component_defs=component_defs)
  actual_step_spec = self._sole(my_builder.build())
  actual_component_def = self._sole(component_defs)
  self.assertProtoEquals(
      test_utils.get_proto_from_test_data(
          'expected_dummy_consumer_with_condition_component.pbtxt',
          pipeline_pb2.ComponentSpec()), actual_component_def)
  self.assertProtoEquals(
      test_utils.get_proto_from_test_data(
          'expected_dummy_consumer_with_condition_task.pbtxt',
          pipeline_pb2.PipelineTaskSpec()), actual_step_spec)
  self.assertProtoEquals(
      test_utils.get_proto_from_test_data(
          'expected_dummy_consumer_with_condition_executor.pbtxt',
          pipeline_pb2.PipelineDeploymentConfig()), deployment_config)
def testSingleCondition(self):
  pred = _FakePredicate('pred')
  with conditional.Cond(pred):
    node1 = _FakeNode().with_id('node1')
    node2 = _FakeNode().with_id('node2')
  # get_predicates returns a tuple (see testReusePredicate above), so compare
  # against a tuple rather than a set.
  self.assertEqual(conditional.get_predicates(node1), (pred,))
  self.assertEqual(conditional.get_predicates(node2), (pred,))
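# For orientation, a minimal illustrative sketch of the behavior these tests
# assert -- NOT the actual TFX implementation. It assumes Cond maintains a
# stack of active predicates, rejects duplicates already on the stack, and
# that each node snapshots the stack at construction time (shown here with an
# explicit `_tag` helper; TFX hooks this into node construction). Thread
# safety is ignored for brevity.
_ACTIVE_PREDICATES = []


class _SketchCond:
  """Context manager tracking which predicates are currently active."""

  def __init__(self, predicate):
    self._predicate = predicate

  def __enter__(self):
    if self._predicate in _ACTIVE_PREDICATES:
      # Matches the error asserted in testNestedConditionWithDuplicatePredicates.
      raise ValueError('Nested conditionals with duplicate predicates')
    _ACTIVE_PREDICATES.append(self._predicate)

  def __exit__(self, exc_type, exc_value, traceback):
    _ACTIVE_PREDICATES.pop()


def _tag(node):
  """Snapshots the active predicates onto a node, outermost first."""
  node.predicates = tuple(_ACTIVE_PREDICATES)
  return node


def _sketch_get_predicates(node):
  return getattr(node, 'predicates', ())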
def create_pipeline() -> pipeline_pb2.Pipeline:
  """Builds a test pipeline."""
  # pylint: disable=no-value-for-parameter
  example_gen = _example_gen().with_id('my_example_gen')
  stats_gen = _statistics_gen(
      examples=example_gen.outputs['examples']).with_id('my_statistics_gen')
  schema_gen = _schema_gen(
      statistics=stats_gen.outputs['statistics']).with_id('my_schema_gen')
  example_validator = _example_validator(
      statistics=stats_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema']).with_id('my_example_validator')
  transform = _transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema']).with_id('my_transform')
  trainer = _trainer(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      transform_graph=transform.outputs['transform_graph']).with_id(
          'my_trainer')

  # Nodes with no input or output specs for testing task-only dependencies.
  chore_a = _chore().with_id('chore_a')
  chore_a.add_upstream_node(trainer)
  chore_b = _chore().with_id('chore_b')
  chore_b.add_upstream_node(chore_a)

  with conditional.Cond(
      trainer.outputs['model'].future()[0].custom_property('evaluate') == 1):
    evaluator = _evaluator(
        model=trainer.outputs['model']).with_id('my_evaluator')
  # pylint: enable=no-value-for-parameter

  pipeline = pipeline_lib.Pipeline(
      pipeline_name='my_pipeline',
      pipeline_root='/path/to/root',
      components=[
          example_gen,
          stats_gen,
          schema_gen,
          example_validator,
          transform,
          trainer,
          evaluator,
          chore_a,
          chore_b,
      ],
      enable_cache=True)
  dsl_compiler = compiler.Compiler()
  return dsl_compiler.compile(pipeline)
def _create_pipeline(
    pipeline_name: str,
    pipeline_root: str,
    data_root: str,
    module_file: str,
    accuracy_threshold: float,
    serving_model_dir: str,
    metadata_path: str,
    user_provided_schema_path: Optional[str],
    enable_tuning: bool,
    enable_bulk_inferrer: bool,
    examplegen_input_config: Optional[tfx.proto.Input],
    examplegen_range_config: Optional[tfx.proto.RangeConfig],
    resolver_range_config: Optional[tfx.proto.RangeConfig],
    beam_pipeline_args: List[str],
    # TODO(b/191634100): Always enable transform cache.
    enable_transform_input_cache: bool
) -> tfx.dsl.Pipeline:
  """Implements the penguin pipeline with TFX.

  Args:
    pipeline_name: name of the TFX pipeline being created.
    pipeline_root: root directory of the pipeline.
    data_root: directory containing the penguin data.
    module_file: path to files used in Trainer and Transform components.
    accuracy_threshold: minimum accuracy to push the model.
    serving_model_dir: filepath to write pipeline SavedModel to.
    metadata_path: path to local pipeline ML Metadata store.
    user_provided_schema_path: path to user-provided schema file.
    enable_tuning: If True, hyperparameter tuning through KerasTuner is
      enabled.
    enable_bulk_inferrer: If True, the generated model will be used for batch
      inference.
    examplegen_input_config: ExampleGen's input_config.
    examplegen_range_config: ExampleGen's range_config.
    resolver_range_config: SpansResolver's range_config. Specifying this
      enables the SpansResolver to get a window of ExampleGen's output Spans
      for transform and training.
    beam_pipeline_args: list of beam pipeline options for LocalDAGRunner.
      Please refer to https://beam.apache.org/documentation/runners/direct/.
    enable_transform_input_cache: Indicates whether input cache should be used
      in Transform if available.

  Returns:
    A TFX pipeline object.
  """
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = tfx.components.CsvExampleGen(
      input_base=os.path.join(data_root, 'labelled'),
      input_config=examplegen_input_config,
      range_config=examplegen_range_config)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = tfx.components.StatisticsGen(
      examples=example_gen.outputs['examples'])

  if user_provided_schema_path:
    # Import user-provided schema.
    schema_gen = tfx.components.ImportSchemaGen(
        schema_file=user_provided_schema_path)
    # Performs anomaly detection based on statistics and data schema.
    example_validator = tfx.components.ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
  else:
    # Generates schema based on statistics files.
    schema_gen = tfx.components.SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=True)

  # Gets multiple Spans for transform and training.
  if resolver_range_config:
    examples_resolver = tfx.dsl.Resolver(
        strategy_class=tfx.dsl.experimental.SpanRangeStrategy,
        config={'range_config': resolver_range_config},
        examples=tfx.dsl.Channel(
            type=tfx.types.standard_artifacts.Examples,
            producer_component_id=example_gen.id)).with_id('span_resolver')

  if enable_transform_input_cache:
    transform_cache_resolver = tfx.dsl.Resolver(
        strategy_class=tfx.dsl.experimental.LatestArtifactStrategy,
        cache=tfx.dsl.Channel(
            type=tfx.types.standard_artifacts.TransformCache)).with_id(
                'transform_cache_resolver')
    tft_resolved_cache = transform_cache_resolver.outputs['cache']
  else:
    tft_resolved_cache = None

  # Performs transformations and feature engineering in training and serving.
  transform = tfx.components.Transform(
      examples=(examples_resolver.outputs['examples']
                if resolver_range_config else example_gen.outputs['examples']),
      schema=schema_gen.outputs['schema'],
      module_file=module_file,
      analyzer_cache=tft_resolved_cache)

  # Tunes the hyperparameters for model training based on a user-provided
  # Python function. Note that once the hyperparameters are tuned, you can
  # drop the Tuner component from the pipeline and feed Trainer with the
  # tuned hyperparameters.
  if enable_tuning:
    tuner = tfx.components.Tuner(
        module_file=module_file,
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=tfx.proto.TrainArgs(num_steps=20),
        eval_args=tfx.proto.EvalArgs(num_steps=5))

  # Uses a user-provided Python function that trains a model.
  trainer = tfx.components.Trainer(
      module_file=module_file,
      examples=transform.outputs['transformed_examples'],
      transform_graph=transform.outputs['transform_graph'],
      schema=schema_gen.outputs['schema'],
      # If Tuner is in the pipeline, Trainer can take Tuner's output
      # best_hyperparameters artifact as input and utilize it in the user
      # module code.
      #
      # If there is no Tuner in the pipeline, either use Importer to import a
      # previous Tuner's output to feed to Trainer, or directly use the tuned
      # hyperparameters in user module code and set hyperparameters to None
      # here.
      #
      # Example of Importer,
      #   hparams_importer = Importer(
      #       source_uri='path/to/best_hyperparameters.txt',
      #       artifact_type=HyperParameters).with_id('import_hparams')
      #   ...
      #   hyperparameters = hparams_importer.outputs['result'],
      hyperparameters=(tuner.outputs['best_hyperparameters']
                       if enable_tuning else None),
      train_args=tfx.proto.TrainArgs(num_steps=100),
      eval_args=tfx.proto.EvalArgs(num_steps=5))

  # Get the latest blessed model for model validation.
  model_resolver = tfx.dsl.Resolver(
      strategy_class=tfx.dsl.experimental.LatestBlessedModelStrategy,
      model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model),
      model_blessing=tfx.dsl.Channel(
          type=tfx.types.standard_artifacts.ModelBlessing)).with_id(
              'latest_blessed_model_resolver')

  # Uses TFMA to compute evaluation statistics over features of a model and
  # perform quality validation of a candidate model (compared to a baseline).
  eval_config = tfma.EvalConfig(
      model_specs=[
          tfma.ModelSpec(
              signature_name='serving_default',
              label_key='species_xf',
              preprocessing_function_names=['transform_features'])
      ],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(metrics=[
              tfma.MetricConfig(
                  class_name='SparseCategoricalAccuracy',
                  threshold=tfma.MetricThreshold(
                      value_threshold=tfma.GenericValueThreshold(
                          lower_bound={'value': accuracy_threshold}),
                      # Change threshold will be ignored if there is no
                      # baseline model resolved from MLMD (first run).
                      change_threshold=tfma.GenericChangeThreshold(
                          direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                          absolute={'value': -1e-10})))
          ])
      ])
  evaluator = tfx.components.Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # Components declared within the conditional block will only be triggered
  # if the Predicate evaluates to True.
  #
  # In the example below,
  #   evaluator.outputs['blessing'].future()[0].custom_property('blessed') == 1
  # is a Predicate, which will be evaluated during runtime.
  #
  # - evaluator.outputs['blessing'] is the output Channel 'blessing'.
  # - .future() turns the Channel into a Placeholder.
  # - [0] gets the first artifact from the 'blessing' Channel.
  # - .custom_property('blessed') gets a custom property called 'blessed' from
  #   that artifact.
  # - == 1 compares that property with 1. (An explicit comparison is needed.
  #   There's no automatic boolean conversion based on truthiness.)
  #
  # Note that these operations are just placeholders, similar to mocks. They
  # are not evaluated until runtime. For more details, see
  # tfx/dsl/placeholder/.
  with conditional.Cond(evaluator.outputs['blessing'].future()
                        [0].custom_property('blessed') == 1):
    # Checks whether the model passed the validation steps and pushes the
    # model to a file destination if the check passed.
    pusher = tfx.components.Pusher(
        model=trainer.outputs['model'],
        # No need to pass model_blessing any more, since Pusher is already
        # guarded by a Conditional.
        # model_blessing=evaluator.outputs['blessing'],
        push_destination=tfx.proto.PushDestination(
            filesystem=tfx.proto.PushDestination.Filesystem(
                base_directory=serving_model_dir)))

  # Showcase for the BulkInferrer component.
  if enable_bulk_inferrer:
    # Generates unlabelled examples.
    example_gen_unlabelled = tfx.components.CsvExampleGen(
        input_base=os.path.join(data_root, 'unlabelled')).with_id(
            'CsvExampleGen_Unlabelled')

    # Performs offline batch inference.
    bulk_inferrer = tfx.components.BulkInferrer(
        examples=example_gen_unlabelled.outputs['examples'],
        model=trainer.outputs['model'],
        # Empty data_spec.example_splits will result in using all splits.
        data_spec=tfx.proto.DataSpec(),
        model_spec=tfx.proto.ModelSpec())

  components_list = [
      example_gen,
      statistics_gen,
      schema_gen,
      transform,
      trainer,
      model_resolver,
      evaluator,
      pusher,
  ]
  if resolver_range_config:
    components_list.append(examples_resolver)
  if enable_transform_input_cache:
    components_list.append(transform_cache_resolver)
  if enable_tuning:
    components_list.append(tuner)
  if enable_bulk_inferrer:
    components_list.append(example_gen_unlabelled)
    components_list.append(bulk_inferrer)
  if user_provided_schema_path:
    components_list.append(example_validator)

  return tfx.dsl.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components_list,
      enable_cache=True,
      metadata_connection_config=tfx.orchestration.metadata
      .sqlite_metadata_connection_config(metadata_path),
      beam_pipeline_args=beam_pipeline_args)
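# A hedged usage sketch: invoking _create_pipeline with LocalDagRunner. All
# paths and flag values below are hypothetical placeholders, not values from
# the original example.
import os

from tfx import v1 as tfx

if __name__ == '__main__':
  tfx.orchestration.LocalDagRunner().run(
      _create_pipeline(
          pipeline_name='penguin_local',
          pipeline_root=os.path.join('pipelines', 'penguin_local'),
          data_root='penguin_data',  # hypothetical; needs a 'labelled' subdir
          module_file='penguin_utils.py',  # hypothetical module file
          accuracy_threshold=0.6,
          serving_model_dir=os.path.join('serving_model', 'penguin_local'),
          metadata_path=os.path.join('metadata', 'metadata.db'),
          user_provided_schema_path=None,
          enable_tuning=False,
          enable_bulk_inferrer=False,
          examplegen_input_config=None,
          examplegen_range_config=None,
          resolver_range_config=None,
          beam_pipeline_args=['--direct_num_workers=1'],
          enable_transform_input_cache=False))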
def create_pipeline_components(
    pipeline_root: str,
    transform_module: str,
    trainer_module: str,
    bigquery_query: str = '',
    csv_input_location: str = '',
) -> List[base_node.BaseNode]:
  """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.
    bigquery_query: The query to get input data from BigQuery. If not empty,
      BigQueryExampleGen will be used.
    csv_input_location: The location of the input data directory.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
  if bool(bigquery_query) == bool(csv_input_location):
    raise ValueError(
        'Exactly one example gen is expected. '
        'Please provide either bigquery_query or csv_input_location.')

  if bigquery_query:
    example_gen = tfx.extensions.google_cloud_big_query.BigQueryExampleGen(
        query=bigquery_query)
  else:
    example_gen = tfx.components.CsvExampleGen(input_base=csv_input_location)

  statistics_gen = tfx.components.StatisticsGen(
      examples=example_gen.outputs['examples'])
  schema_gen = tfx.components.SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)
  example_validator = tfx.components.ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])
  transform = tfx.components.Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=transform_module)
  latest_model_resolver = tfx.dsl.Resolver(
      strategy_class=tfx.dsl.experimental.LatestArtifactStrategy,
      model=tfx.dsl.Channel(
          type=tfx.types.standard_artifacts.Model)).with_id(
              'Resolver.latest_model_resolver')
  trainer = tfx.components.Trainer(
      custom_executor_spec=executor_spec.ExecutorClassSpec(Executor),
      examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      base_model=latest_model_resolver.outputs['model'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=tfx.proto.TrainArgs(num_steps=10),
      eval_args=tfx.proto.EvalArgs(num_steps=5),
      module_file=trainer_module,
  )
  # Get the latest blessed model for model validation.
  model_resolver = tfx.dsl.Resolver(
      strategy_class=tfx.dsl.experimental.LatestBlessedModelStrategy,
      model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model),
      model_blessing=tfx.dsl.Channel(
          type=tfx.types.standard_artifacts.ModelBlessing)).with_id(
              'Resolver.latest_blessed_model_resolver')
  # Set the TFMA config for Model Evaluation and Validation.
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      metrics_specs=[
          tfma.MetricsSpec(
              metrics=[tfma.MetricConfig(class_name='ExampleCount')],
              thresholds={
                  'binary_accuracy':
                      tfma.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={'value': 0.5}),
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': -1e-10}))
              })
      ],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ])
  evaluator = tfx.components.Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  with conditional.Cond(evaluator.outputs['blessing'].future()
                        [0].custom_property('blessed') == 1):
    pusher = tfx.components.Pusher(
        model=trainer.outputs['model'],
        push_destination=tfx.proto.PushDestination(
            filesystem=tfx.proto.PushDestination.Filesystem(
                base_directory=os.path.join(pipeline_root,
                                            'model_serving'))))

  return [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      latest_model_resolver, trainer, model_resolver, evaluator, pusher
  ]
def create_test_pipeline():
  """Builds a conditional pipeline."""
  pipeline_name = "cond"
  cond_root = "cond_root"
  serving_model_dir = os.path.join(cond_root, "serving_model", pipeline_name)
  tfx_root = "tfx_root"
  data_path = os.path.join(tfx_root, "data_path")
  pipeline_root = os.path.join(tfx_root, "pipelines", pipeline_name)

  example_gen = CsvExampleGen(input_base=data_path)
  statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])
  schema_gen = SchemaGen(statistics=statistics_gen.outputs["statistics"])
  trainer = Trainer(
      module_file="module_file",
      custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
      examples=example_gen.outputs["examples"],
      schema=schema_gen.outputs["schema"],
      train_args=trainer_pb2.TrainArgs(num_steps=2000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5))
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name="eval")],
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(
              thresholds={
                  "sparse_categorical_accuracy":
                      tfma.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={"value": 0.6}),
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={"value": -1e-10}))
              })
      ])
  evaluator = Evaluator(
      examples=example_gen.outputs["examples"],
      model=trainer.outputs["model"],
      eval_config=eval_config)

  with conditional.Cond(evaluator.outputs["blessing"].future()[0].value == 1):
    infra_validator = InfraValidator(
        model=trainer.outputs["model"],
        examples=example_gen.outputs["examples"],
        serving_spec=infra_validator_pb2.ServingSpec(
            tensorflow_serving=infra_validator_pb2.TensorFlowServing(
                tags=["latest"]),
            local_docker=infra_validator_pb2.LocalDockerConfig()),
        request_spec=infra_validator_pb2.RequestSpec(
            tensorflow_serving=infra_validator_pb2
            .TensorFlowServingRequestSpec()))
    with conditional.Cond(
        ph.logical_and(
            infra_validator.outputs["blessing"].future()[0].value == 1,
            trainer.outputs["model"].future()[0].uri != "")):  # pylint: disable=g-explicit-bool-comparison
      pusher = Pusher(
          model=trainer.outputs["model"],
          push_destination=pusher_pb2.PushDestination(
              filesystem=pusher_pb2.PushDestination.Filesystem(
                  base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, trainer, evaluator,
          infra_validator, pusher
      ],
      enable_cache=True,
      beam_pipeline_args=["--my_testing_beam_pipeline_args=foo"],
      execution_mode=pipeline.ExecutionMode.SYNC)
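# Hedged aside (not part of the original pipeline): per the predicate-stacking
# semantics exercised in the conditional tests above, nesting Cond blocks ANDs
# their predicates, so the ph.logical_and guard on pusher should behave the
# same at runtime as two nested Cond blocks:
#
#   with conditional.Cond(
#       infra_validator.outputs["blessing"].future()[0].value == 1):
#     with conditional.Cond(trainer.outputs["model"].future()[0].uri != ""):
#       pusher = Pusher(...)
#
# The difference is in the compiled form: the single-predicate version yields
# one conditional with a compound predicate, whereas nesting yields two
# stacked predicates.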