def test_do(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  # Create input dict.
  train_examples = types.TfxArtifact(type_name='ExamplesPath', split='train')
  eval_examples = types.TfxArtifact(type_name='ExamplesPath', split='eval')
  eval_examples.uri = os.path.join(source_data_dir, 'csv_example_gen/eval/')
  model_exports = types.TfxArtifact(type_name='ModelExportPath')
  model_exports.uri = os.path.join(source_data_dir, 'trainer/current/')
  input_dict = {
      'examples': [train_examples, eval_examples],
      'model_exports': [model_exports],
  }

  # Create output dict.
  eval_output = types.TfxArtifact('ModelEvalPath')
  eval_output.uri = os.path.join(output_data_dir, 'eval_output')
  output_dict = {'output': [eval_output]}

  # Create exec properties.
  exec_properties = {
      'feature_slicing_spec':
          json_format.MessageToJson(
              evaluator_pb2.FeatureSlicingSpec(specs=[
                  evaluator_pb2.SingleSlicingSpec(
                      column_for_slicing=['trip_start_hour']),
                  evaluator_pb2.SingleSlicingSpec(
                      column_for_slicing=['trip_start_day', 'trip_miles']),
              ]))
  }

  # Run executor.
  evaluator = executor.Executor()
  evaluator.Do(input_dict, output_dict, exec_properties)

  # Check evaluator outputs.
  self.assertTrue(
      tf.gfile.Exists(os.path.join(eval_output.uri, 'eval_config')))
  self.assertTrue(tf.gfile.Exists(os.path.join(eval_output.uri, 'metrics')))
  self.assertTrue(tf.gfile.Exists(os.path.join(eval_output.uri, 'plots')))
def testDo(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  # Create input dict.
  train_examples = standard_artifacts.Examples(split='train')
  eval_examples = standard_artifacts.Examples(split='eval')
  eval_examples.uri = os.path.join(source_data_dir, 'csv_example_gen/eval/')
  model_exports = standard_artifacts.Model()
  model_exports.uri = os.path.join(source_data_dir, 'trainer/current/')
  input_dict = {
      'examples': [train_examples, eval_examples],
      'model_exports': [model_exports],
  }

  # Create output dict.
  eval_output = standard_artifacts.ModelEvaluation()
  eval_output.uri = os.path.join(output_data_dir, 'eval_output')
  output_dict = {'output': [eval_output]}

  # Create exec properties.
  exec_properties = {
      'feature_slicing_spec':
          json_format.MessageToJson(
              evaluator_pb2.FeatureSlicingSpec(specs=[
                  evaluator_pb2.SingleSlicingSpec(
                      column_for_slicing=['trip_start_hour']),
                  evaluator_pb2.SingleSlicingSpec(
                      column_for_slicing=['trip_start_day', 'trip_miles']),
              ]))
  }

  # Run executor.
  evaluator = executor.Executor()
  evaluator.Do(input_dict, output_dict, exec_properties)

  # Check evaluator outputs.
  self.assertTrue(
      # TODO(b/141490237): Update to only check eval_config.json after TFMA
      # released with corresponding change.
      tf.gfile.Exists(os.path.join(eval_output.uri, 'eval_config')) or
      tf.gfile.Exists(os.path.join(eval_output.uri, 'eval_config.json')))
  self.assertTrue(tf.gfile.Exists(os.path.join(eval_output.uri, 'metrics')))
  self.assertTrue(tf.gfile.Exists(os.path.join(eval_output.uri, 'plots')))
def testTaxiPipelineNewStyleCompatibility(self):
  examples = external_input('/tmp/fake/path')
  example_gen = CsvExampleGen(input=examples)
  self.assertIs(example_gen.inputs['input'], example_gen.inputs['input_base'])
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  self.assertIs(statistics_gen.inputs['examples'],
                statistics_gen.inputs['input_data'])
  infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])
  self.assertIs(infer_schema.inputs['statistics'],
                infer_schema.inputs['stats'])
  self.assertIs(infer_schema.outputs['schema'],
                infer_schema.outputs['output'])
  validate_examples = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])
  self.assertIs(validate_examples.inputs['statistics'],
                validate_examples.inputs['stats'])
  self.assertIs(validate_examples.outputs['anomalies'],
                validate_examples.outputs['output'])
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file='/tmp/fake/module/file')
  self.assertIs(transform.inputs['examples'],
                transform.inputs['input_data'])
  self.assertIs(transform.outputs['transform_graph'],
                transform.outputs['transform_output'])
  trainer = Trainer(
      module_file='/tmp/fake/module/file',
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  self.assertIs(trainer.inputs['transform_graph'],
                trainer.inputs['transform_output'])
  self.assertIs(trainer.outputs['model'], trainer.outputs['output'])
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  self.assertIs(evaluator.inputs['model'], evaluator.inputs['model_exports'])
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])
  pusher = Pusher(
      model=trainer.outputs['output'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory='/fake/serving/dir')))
  self.assertIs(pusher.inputs['model'], pusher.inputs['model_export'])
  self.assertIs(pusher.outputs['pushed_model'],
                pusher.outputs['model_push'])
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
  """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
  examples = dsl_utils.csv_input(csv_input_location)

  example_gen = CsvExampleGen(input=examples)
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=transform_module)
  trainer = Trainer(
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
      module_file=trainer_module,
  )
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(pipeline_root, 'model_serving'))))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
                          taxi_module_file: Text, output_bucket: Text,
                          enable_cache: bool):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    output_bucket: The bucket under which the pushed model is written.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  examples = csv_input(csv_input_location)

  example_gen = CsvExampleGen(input_base=examples)
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
  infer_schema = SchemaGen(
      stats=statistics_gen.outputs.output, infer_feature_shape=False)
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(output_bucket, 'model_serving'))))

  return pipeline.Pipeline(
      pipeline_name='chicago_taxi_pipeline_simple',
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=enable_cache,
  )
def _create_pipeline():
  """Implements the Chicago taxi pipeline with TFX."""
  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=_taxi_module_file)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn.
  trainer = Trainer(
      module_file=_taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
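A minimal sketch (not part of the original snippets) of how a component list like the one returned above might be wired into a runnable pipeline and executed locally. `pipeline.Pipeline`, `BeamDagRunner`, and `metadata.sqlite_metadata_connection_config` are the standard TFX orchestration entry points of this era; the module-level constants `_pipeline_root` and `_metadata_db_root` are assumptions for illustration.

from tfx.orchestration import metadata
from tfx.orchestration import pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner


def _run_pipeline():
  # Wrap the component list in a logical pipeline; the root and metadata
  # paths (_pipeline_root, _metadata_db_root) are assumed module constants.
  taxi_pipeline = pipeline.Pipeline(
      pipeline_name='chicago_taxi_simple',  # hypothetical name
      pipeline_root=_pipeline_root,
      components=_create_pipeline(),
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          _metadata_db_root))
  # Execute the pipeline locally on the Beam orchestrator.
  BeamDagRunner().run(taxi_pipeline)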
def test_construct_with_slice_spec(self):
  examples = types.Artifact(type_name='ExamplesPath')
  model_exports = types.Artifact(type_name='ModelExportPath')
  evaluator = component.Evaluator(
      examples=channel.as_channel([examples]),
      model_exports=channel.as_channel([model_exports]),
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  self.assertEqual('ModelEvalPath', evaluator.outputs.output.type_name)
def testConstructWithSliceSpec(self):
  examples = standard_artifacts.Examples()
  model_exports = standard_artifacts.Model()
  evaluator = component.Evaluator(
      examples=channel_utils.as_channel([examples]),
      model_exports=channel_utils.as_channel([model_exports]),
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  self.assertEqual('ModelEvalPath', evaluator.outputs.output.type_name)
def testConstructWithFairnessThresholds(self):
  examples = standard_artifacts.Examples()
  model_exports = standard_artifacts.Model()
  evaluator = component.Evaluator(
      examples=channel_utils.as_channel([examples]),
      model_exports=channel_utils.as_channel([model_exports]),
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]),
      fairness_indicator_thresholds=[0.1, 0.3, 0.5, 0.9])
  self.assertEqual('ModelEvalPath', evaluator.outputs['output'].type_name)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text,
                     serving_model_dir: Text) -> pipeline.Pipeline:
  examples = external_input(data_root)
  input_split = example_gen_pb2.Input(splits=[
      example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
      example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord')
  ])
  example_gen = ImportExampleGen(input_base=examples, input_config=input_split)
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=module_file)
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=1000),
      eval_args=trainer_pb2.EvalArgs(num_steps=500))
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
          specs=[evaluator_pb2.SingleSlicingSpec()]))
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      additional_pipeline_args={'tfx_image': 'tensorflow/tfx:0.14.0rc1'},
      log_root='/var/tmp/tfx/logs',
  )
def testEvaluatorOnDataflowRunner(self):
  """Test for Evaluator on DataflowRunner invocation."""
  pipeline_name = 'kubeflow-evaluator-dataflow-test-{}'.format(
      self._random_id())
  pipeline = self._create_dataflow_pipeline(pipeline_name, [
      Evaluator(
          examples=self._test_raw_examples,
          model_exports=self._test_model,
          feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
              evaluator_pb2.SingleSlicingSpec(
                  column_for_slicing=['trip_start_hour'])
          ])),
  ])
  self._compile_and_run_pipeline(pipeline)
def testEvaluatorOnDataflowRunner(self):
  """Evaluator-only test pipeline on DataflowRunner."""
  pipeline_name = 'kubeflow-evaluator-dataflow-test-{}'.format(
      self._random_id())
  pipeline = self._create_dataflow_pipeline(pipeline_name, [
      self.raw_examples_importer,
      self.model_1_importer,
      Evaluator(
          examples=self.raw_examples_importer.outputs['result'],
          model=self.model_1_importer.outputs['result'],
          feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
              evaluator_pb2.SingleSlicingSpec(
                  column_for_slicing=['trip_start_hour'])
          ]))
  ])
  self._compile_and_run_pipeline(pipeline)
def testEvaluatorOnDataflowRunner(self):
  """Evaluator-only test pipeline on DataflowRunner."""
  pipeline_name = 'kubeflow-evaluator-dataflow-test-{}'.format(
      self._random_id())
  pipeline = self._create_dataflow_pipeline(pipeline_name, [
      Evaluator(
          examples=self._input_artifacts(pipeline_name,
                                         self._test_raw_examples),
          model_exports=self._input_artifacts(pipeline_name,
                                              self._test_model),
          feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
              evaluator_pb2.SingleSlicingSpec(
                  column_for_slicing=['trip_start_hour'])
          ]))
  ])
  self._compile_and_run_pipeline(pipeline)
def _create_pipeline():
  """Implements the Chicago taxi pipeline with TFX."""
  query = """
      SELECT
        pickup_community_area,
        fare,
        EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month,
        EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour,
        EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day,
        UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp,
        pickup_latitude,
        pickup_longitude,
        dropoff_latitude,
        dropoff_longitude,
        trip_miles,
        pickup_census_tract,
        dropoff_census_tract,
        payment_type,
        company,
        trip_seconds,
        dropoff_community_area,
        tips
      FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
      WHERE RAND() < {}""".format(_query_sample_rate)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = BigQueryExampleGen(query=query)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=_taxi_utils)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn.
  trainer = Trainer(
      module_file=_taxi_utils,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
      custom_config={'cmle_training_args': _cmle_training_args})

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      custom_config={'cmle_serving_args': _cmle_serving_args},
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the Chicago taxi pipeline with TFX."""
  training_examples = external_input(training_data_root)

  # Brings training data into the pipeline or otherwise joins/converts
  # training data.
  training_example_gen = CsvExampleGen(
      input_base=training_examples, instance_name='training_example_gen')

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(
      input_data=training_example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=training_example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=training_example_gen.outputs['examples'],
      model_exports=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=training_example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  inference_examples = external_input(inference_data_root)

  # Brings inference data into the pipeline.
  inference_example_gen = CsvExampleGen(
      input_base=inference_examples,
      output_config=example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(splits=[
              example_gen_pb2.SplitConfig.Split(
                  name='unlabelled', hash_buckets=100)
          ])),
      instance_name='inference_example_gen')

  # Performs offline batch inference over inference examples.
  bulk_inferrer = BulkInferrer(
      examples=inference_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      # Empty data_spec.example_splits will result in using all splits.
      data_spec=bulk_inferrer_pb2.DataSpec(),
      model_spec=bulk_inferrer_pb2.ModelSpec())

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          training_example_gen, inference_example_gen, statistics_gen,
          infer_schema, validate_stats, transform, trainer, model_analyzer,
          model_validator, bulk_inferrer
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def _create_pipeline():
  """Implements the Chicago taxi pipeline with TFX and Kubeflow Pipelines."""
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = BigQueryExampleGen(query=_query)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=_taxi_utils)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn to train a model on Google Cloud AI Platform.
  try:
    from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
    # Train using a custom executor. This requires TFX >= 0.14.
    trainer = Trainer(
        executor_class=ai_platform_trainer_executor.Executor,
        module_file=_taxi_utils,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={
            'ai_platform_training_args': _ai_platform_training_args
        })
  except ImportError:
    # Train using a deprecated flag.
    trainer = Trainer(
        module_file=_taxi_utils,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'cmle_training_args': _ai_platform_training_args})

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Checks whether the model passed the validation steps and pushes the model
  # to a destination if check passed.
  try:
    from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
    # Deploy the model on Google Cloud AI Platform. This requires TFX >= 0.14.
    pusher = Pusher(
        executor_class=ai_platform_pusher_executor.Executor,
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        custom_config={
            'ai_platform_serving_args': _ai_platform_serving_args
        },
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=_serving_model_dir)))
  except ImportError:
    # Deploy the model on Google Cloud AI Platform, using a deprecated flag.
    pusher = Pusher(
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        custom_config={'cmle_serving_args': _ai_platform_serving_args},
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=_serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name='chicago_taxi_pipeline_kubeflow',
      pipeline_root=_pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      additional_pipeline_args={
          'beam_pipeline_args': [
              '--runner=DataflowRunner',
              '--experiments=shuffle_mode=auto',
              '--project=' + _project_id,
              '--temp_location=' + os.path.join(_output_bucket, 'tmp'),
              '--region=' + _gcp_region,
          ],
          # Optional args:
          # 'tfx_image': custom docker image to use for components.
          # This is needed if the TFX package is not installed from an RC
          # or released version.
      },
      log_root='/var/tmp/tfx/logs',
  )
def _create_test_pipeline(pipeline_name: Text, pipeline_root: Text,
                          csv_input_location: Text, taxi_module_file: Text,
                          container_image: Text):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_name: The name of the pipeline.
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    container_image: The container image to use.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  examples = dsl_utils.csv_input(csv_input_location)

  example_gen = CsvExampleGen(input_base=examples)
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)
  validate_stats = ExampleValidator(  # pylint: disable=unused-variable
      stats=statistics_gen.outputs.output,
      schema=infer_schema.outputs.output)
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  model_analyzer = Evaluator(  # pylint: disable=unused-variable
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)
  pusher = Pusher(  # pylint: disable=unused-variable
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(pipeline_root, 'model_serving'))))

  return tfx_pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      log_root='/var/tmp/tfx/logs',
      additional_pipeline_args={
          'tfx_image': container_image,
      },
  )
def _create_pipeline():
  """Implements the Chicago taxi pipeline with TFX."""
  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(
      input_data=example_gen.outputs.examples)  # Step 3

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)  # Step 3

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(  # Step 3
      stats=statistics_gen.outputs.output,  # Step 3
      schema=infer_schema.outputs.output)  # Step 3

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(  # Step 4
      input_data=example_gen.outputs.examples,  # Step 4
      schema=infer_schema.outputs.output,  # Step 4
      module_file=_taxi_module_file)  # Step 4

  # Uses a user-provided Python function that implements a model using
  # TF-Learn.
  trainer = Trainer(  # Step 5
      module_file=_taxi_module_file,  # Step 5
      transformed_examples=transform.outputs.transformed_examples,  # Step 5
      schema=infer_schema.outputs.output,  # Step 5
      transform_output=transform.outputs.transform_output,  # Step 5
      train_args=trainer_pb2.TrainArgs(num_steps=10000),  # Step 5
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))  # Step 5

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(  # Step 6
      examples=example_gen.outputs.examples,  # Step 6
      model_exports=trainer.outputs.output,  # Step 6
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[  # Step 6
          evaluator_pb2.SingleSlicingSpec(  # Step 6
              column_for_slicing=['trip_start_hour'])  # Step 6
      ]))  # Step 6

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(  # Step 7
      examples=example_gen.outputs.examples,  # Step 7
      model=trainer.outputs.output)  # Step 7

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(  # Step 7
      model_export=trainer.outputs.output,  # Step 7
      model_blessing=model_validator.outputs.blessing,  # Step 7
      push_destination=pusher_pb2.PushDestination(  # Step 7
          filesystem=pusher_pb2.PushDestination.Filesystem(  # Step 7
              base_directory=_serving_model_dir)))  # Step 7

  return pipeline.Pipeline(
      pipeline_name='taxi',
      pipeline_root=_pipeline_root,
      components=[
          example_gen,
          statistics_gen, infer_schema, validate_stats,  # Step 3
          transform,  # Step 4
          trainer,  # Step 5
          # model_analyzer,  # Step 6
          # model_validator, pusher  # Step 7
      ],
      enable_cache=True,
      metadata_db_root=_metadata_db_root,
      additional_pipeline_args={'logger_args': logger_overrides},
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the Chicago taxi pipeline with TFX and Kubeflow Pipelines."""
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn to train a model on Google Cloud AI Platform.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
  )

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model_exports=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to Google Cloud AI Platform if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
      additional_pipeline_args={},
  )
def _create_pipeline():
  """Implements the Chicago taxi pipeline with TFX."""
  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input_base=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=_taxi_module_file)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn.
  trainer = Trainer(
      module_file=_taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # This custom component serves as a bridge between the pipeline and human
  # model reviewers to enable a review-and-push workflow in the model
  # development cycle. It uses the Slack API to send a message with model URI
  # info to a user-defined Slack channel and waits for a go / no-go decision
  # from the same channel:
  #   * To approve the model, users need to reply to the thread sent out by
  #     the bot started by SlackComponent with 'lgtm' or 'approve'.
  #   * To reject the model, users need to reply to the thread sent out by
  #     the bot started by SlackComponent with 'decline' or 'reject'.
  slack_validator = SlackComponent(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      slack_token=_slack_token,
      channel_id=_channel_id,
      timeout_sec=3600,
  )

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=slack_validator.outputs.slack_blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, slack_validator, pusher
  ]
def _create_pipeline(pipeline_root: Text,
                     csv_input_location: data_types.RuntimeParameter,
                     taxi_module_file: data_types.RuntimeParameter,
                     enable_cache: bool):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  examples = external_input(csv_input_location)

  example_gen = CsvExampleGen(input=examples)
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False,
  )
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'],
  )
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=taxi_module_file,
  )
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
  )
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]),
  )
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(
                  str(pipeline.ROOT_PARAMETER), 'model_serving'))),
  )

  return pipeline.Pipeline(
      pipeline_name='parameterized_tfx_oss',
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=enable_cache,
  )
def create_pipeline(pipeline_name, pipeline_root, input_path,
                    tf_transform_file, tf_trainer_file, serving_model_basedir,
                    **kwargs):
  examples = tfrecord_input(input_path)

  input_config = example_gen_pb2.Input(splits=[
      example_gen_pb2.Input.Split(
          name='tfrecord', pattern='data_tfrecord-*.gz'),
  ])  # TODO: add as an Airflow var
  output_config = example_gen_pb2.Output(
      split_config=example_gen_pb2.SplitConfig(splits=[
          example_gen_pb2.SplitConfig.Split(
              name='train', hash_buckets=2),  # TODO: add as an Airflow var
          example_gen_pb2.SplitConfig.Split(
              name='eval', hash_buckets=1)  # TODO: add as an Airflow var
      ]))
  example_gen = ImportExampleGen(
      input_base=examples,
      input_config=input_config,
      output_config=output_config)

  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=tf_transform_file)
  trainer = Trainer(
      module_file=tf_trainer_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=[])  # TODO: add your slicing column
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_basedir)))

  pipeline = Pipeline(
      pipeline_name=pipeline_name, pipeline_root=pipeline_root, **kwargs)
  pipeline.components = [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
  return pipeline
def _create_pipeline(
    pipeline_name: Text, pipeline_root: Text, query: Text, module_file: Text,
    serving_model_dir: Text, beam_pipeline_args: List[Text],
    ai_platform_training_args: Dict[Text, Text],
    ai_platform_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
  """Implements the Chicago taxi pipeline with TFX and Kubeflow Pipelines."""
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = BigQueryExampleGen(query=query)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      stats=statistics_gen.outputs.output, infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=module_file)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn to train a model on Google Cloud AI Platform.
  try:
    from tfx.extensions.google_cloud_ai_platform.trainer import executor as ai_platform_trainer_executor  # pylint: disable=g-import-not-at-top
    # Train using a custom executor. This requires TFX >= 0.14.
    trainer = Trainer(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_trainer_executor.Executor),
        module_file=module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={
            'ai_platform_training_args': ai_platform_training_args
        })
  except ImportError:
    # Train using a deprecated flag.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs.transformed_examples,
        schema=infer_schema.outputs.output,
        transform_output=transform.outputs.transform_output,
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000),
        custom_config={'cmle_training_args': ai_platform_training_args})

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Checks whether the model passed the validation steps and pushes the model
  # to a destination if check passed.
  try:
    from tfx.extensions.google_cloud_ai_platform.pusher import executor as ai_platform_pusher_executor  # pylint: disable=g-import-not-at-top
    # Deploy the model on Google Cloud AI Platform. This requires TFX >= 0.14.
    pusher = Pusher(
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            ai_platform_pusher_executor.Executor),
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        custom_config={
            'ai_platform_serving_args': ai_platform_serving_args
        })
  except ImportError:
    # Deploy the model on Google Cloud AI Platform, using a deprecated flag.
    pusher = Pusher(
        model_export=trainer.outputs.output,
        model_blessing=model_validator.outputs.blessing,
        custom_config={'cmle_serving_args': ai_platform_serving_args},
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      additional_pipeline_args={
          'beam_pipeline_args': beam_pipeline_args,
      },
      log_root='/var/tmp/tfx/logs',
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Implements the cifar10 pipeline with TFX."""
  examples = external_input(data_root)

  input_split = example_gen_pb2.Input(splits=[
      example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
      example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord')
  ])
  example_gen = ImportExampleGen(input=examples, input_config=input_split)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=True)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=1000),
      eval_args=trainer_pb2.EvalArgs(num_steps=500))

  # Uses TFMA to compute evaluation statistics over features of a model.
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model_exports=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
          specs=[evaluator_pb2.SingleSlicingSpec()]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, evaluator, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      additional_pipeline_args={},
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Implements the Chicago taxi pipeline with TFX."""
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses a user-provided Python function that implements a model using
  # TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model_exports=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      additional_pipeline_args={
          # LINT.IfChange
          'beam_pipeline_args': [
              # ----- Beam Args -----.
              '--runner=PortableRunner',
              # Points to the job server started in
              # setup_beam_on_(flink|spark).sh
              '--job_endpoint=localhost:8099',
              '--environment_type=LOOPBACK',
              # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment.  # pylint: disable=g-bad-todo
              # TODO(BEAM-5167): Use concurrency information from SDK Harness.  # pylint: disable=g-bad-todo
              # Note: We use 100 worker threads to mitigate the issue with
              # scheduling work between the Beam runner and SDK harness. Flink
              # and Spark can process unlimited work items concurrently, while
              # SdkHarness can only process 1 work item per worker thread.
              # Having 100 threads will let 100 tasks execute concurrently,
              # avoiding scheduling issues in most cases. In case the threads
              # are exhausted, Beam prints the relevant message in the log.
              '--experiments=worker_threads=100',
              # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
              '--experiments=pre_optimize=all',
              # ----- Flink runner-specific Args -----.
              # TODO(b/126725506): Set the task parallelism based on cpu cores.
              # TODO(FLINK-10672): Obviate setting BATCH_FORCED.
              '--execution_mode_for_batch=BATCH_FORCED',
          ],
          # LINT.ThenChange(setup/setup_beam_on_spark.sh)
          # LINT.ThenChange(../chicago_taxi/setup_beam_on_flink.sh)
      })
class ExecutorTest(tf.test.TestCase, absl.testing.parameterized.TestCase): # TODO(jinhuang): add test for eval_saved_model when supported. @absl.testing.parameterized.named_parameters(('eval_config', { 'eval_config': json_format.MessageToJson(tfma.EvalConfig(slicing_specs=[ tfma.SlicingSpec(feature_keys=['trip_start_hour']), tfma.SlicingSpec(feature_keys=['trip_start_day', 'trip_miles']), ]), preserving_proto_field_name=True) }), ('eval_config_w_baseline', { 'eval_config': json_format.MessageToJson(tfma.EvalConfig( model_specs=[ tfma.ModelSpec(name='baseline', is_baseline=True), tfma.ModelSpec(name='candidate'), ], slicing_specs=[ tfma.SlicingSpec(feature_keys=['trip_start_hour']), tfma.SlicingSpec( feature_keys=['trip_start_day', 'trip_miles']), ]), preserving_proto_field_name=True) }), ('legacy_feature_slicing', { 'feature_slicing_spec': json_format.MessageToJson(evaluator_pb2.FeatureSlicingSpec(specs=[ evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_hour']), evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_day', 'trip_miles']), ]), preserving_proto_field_name=True), })) def testDo(self, exec_properties): source_data_dir = os.path.join( os.path.dirname(os.path.dirname(__file__)), 'testdata') output_data_dir = os.path.join( os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()), self._testMethodName) # Create input dict. examples = standard_artifacts.Examples() examples.uri = os.path.join(source_data_dir, 'csv_example_gen') examples.split_names = artifact_utils.encode_split_names( ['train', 'eval']) model = standard_artifacts.Model() baseline_model = standard_artifacts.Model() model.uri = os.path.join(source_data_dir, 'trainer/current') baseline_model.uri = os.path.join(source_data_dir, 'trainer/previous/') input_dict = { executor.EXAMPLES_KEY: [examples], executor.MODEL_KEY: [model], executor.BASELINE_MODEL_KEY: [baseline_model], } # Create output dict. eval_output = standard_artifacts.ModelEvaluation() eval_output.uri = os.path.join(output_data_dir, 'eval_output') output_dict = { executor.EVALUATION_KEY: [eval_output], } # Run executor. evaluator = executor.Executor() evaluator.Do(input_dict, output_dict, exec_properties) # Check evaluator outputs. self.assertTrue( tf.io.gfile.exists( os.path.join(eval_output.uri, 'eval_config.json'))) self.assertTrue( tf.io.gfile.exists(os.path.join(eval_output.uri, 'metrics'))) self.assertTrue( tf.io.gfile.exists(os.path.join(eval_output.uri, 'plots'))) @absl.testing.parameterized.named_parameters(('legacy_feature_slicing', { 'feature_slicing_spec': json_format.MessageToJson(evaluator_pb2.FeatureSlicingSpec(specs=[ evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_hour']), evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_day', 'trip_miles']), ]), preserving_proto_field_name=True), })) def testDoLegacySingleEvalSavedModelWFairness(self, exec_properties): source_data_dir = os.path.join( os.path.dirname(os.path.dirname(__file__)), 'testdata') output_data_dir = os.path.join( os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()), self._testMethodName) # Create input dict. 
    examples = standard_artifacts.Examples()
    examples.uri = os.path.join(source_data_dir, 'csv_example_gen')
    examples.split_names = artifact_utils.encode_split_names(
        ['train', 'eval'])
    model = standard_artifacts.Model()
    baseline_model = standard_artifacts.Model()
    model.uri = os.path.join(source_data_dir, 'trainer/current')
    baseline_model.uri = os.path.join(source_data_dir, 'trainer/previous/')
    input_dict = {
        executor.EXAMPLES_KEY: [examples],
        executor.MODEL_KEY: [model],
    }

    # Create output dict.
    eval_output = standard_artifacts.ModelEvaluation()
    eval_output.uri = os.path.join(output_data_dir, 'eval_output')
    output_dict = {executor.EVALUATION_KEY: [eval_output]}

    try:
      # Need to import the following module so that the fairness indicator
      # post-export metric is registered. This may raise an ImportError if the
      # currently-installed version of TFMA does not support fairness
      # indicators.
      import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
      exec_properties['fairness_indicator_thresholds'] = [
          0.1, 0.3, 0.5, 0.7, 0.9
      ]
    except ImportError:
      absl.logging.warning(
          'Not testing fairness indicators because a compatible TFMA version '
          'is not installed.')

    # Run executor.
    evaluator = executor.Executor()
    evaluator.Do(input_dict, output_dict, exec_properties)

    # Check evaluator outputs.
    self.assertTrue(
        tf.io.gfile.exists(
            os.path.join(eval_output.uri, 'eval_config.json')))
    self.assertTrue(
        tf.io.gfile.exists(os.path.join(eval_output.uri, 'metrics')))
    self.assertTrue(
        tf.io.gfile.exists(os.path.join(eval_output.uri, 'plots')))
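# A minimal sketch of the usual entry point for a test module like the one
# above, assuming the standard TFX convention of ending executor tests with
# tf.test.main(); this block is an assumption, not part of the original file.
if __name__ == '__main__':
  tf.test.main()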
class ExecutorTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(('evaluation_w_eval_config', {
      'eval_config':
          proto_utils.proto_to_json(
              tfma.EvalConfig(slicing_specs=[
                  tfma.SlicingSpec(feature_keys=['trip_start_hour']),
                  tfma.SlicingSpec(
                      feature_keys=['trip_start_day', 'trip_miles']),
              ]))
  }), ('evaluation_w_module_file', {
      'eval_config':
          proto_utils.proto_to_json(
              tfma.EvalConfig(slicing_specs=[
                  tfma.SlicingSpec(feature_keys=['trip_start_hour']),
                  tfma.SlicingSpec(
                      feature_keys=['trip_start_day', 'trip_miles']),
              ])),
      'module_file': None
  }), ('evaluation_w_module_path', {
      'eval_config':
          proto_utils.proto_to_json(
              tfma.EvalConfig(slicing_specs=[
                  tfma.SlicingSpec(feature_keys=['trip_start_hour']),
                  tfma.SlicingSpec(
                      feature_keys=['trip_start_day', 'trip_miles']),
              ])),
      'module_path': evaluator_module.__name__,
  }))
  def testEvaluation(self, exec_properties):
    source_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')
    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    # Create input dict.
    examples = standard_artifacts.Examples()
    examples.uri = os.path.join(source_data_dir, 'csv_example_gen')
    examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])
    model = standard_artifacts.Model()
    baseline_model = standard_artifacts.Model()
    model.uri = os.path.join(source_data_dir, 'trainer/current')
    baseline_model.uri = os.path.join(source_data_dir, 'trainer/previous/')
    schema = standard_artifacts.Schema()
    schema.uri = os.path.join(source_data_dir, 'schema_gen')
    input_dict = {
        constants.EXAMPLES_KEY: [examples],
        constants.MODEL_KEY: [model],
        constants.SCHEMA_KEY: [schema],
    }

    # Create output dict.
    eval_output = standard_artifacts.ModelEvaluation()
    eval_output.uri = os.path.join(output_data_dir, 'eval_output')
    blessing_output = standard_artifacts.ModelBlessing()
    blessing_output.uri = os.path.join(output_data_dir, 'blessing_output')
    output_dict = {
        constants.EVALUATION_KEY: [eval_output],
        constants.BLESSING_KEY: [blessing_output],
    }

    # Test multiple splits.
    exec_properties[constants.EXAMPLE_SPLITS_KEY] = json_utils.dumps(
        ['train', 'eval'])

    if 'module_file' in exec_properties:
      exec_properties['module_file'] = os.path.join(source_data_dir,
                                                    'module_file',
                                                    'evaluator_module.py')

    # Run executor.
    evaluator = executor.Executor()
    evaluator.Do(input_dict, output_dict, exec_properties)

    # Check evaluator outputs.
    self.assertTrue(
        fileio.exists(os.path.join(eval_output.uri, 'eval_config.json')))
    self.assertTrue(fileio.exists(os.path.join(eval_output.uri, 'metrics')))
    self.assertTrue(fileio.exists(os.path.join(eval_output.uri, 'plots')))
    self.assertFalse(
        fileio.exists(os.path.join(blessing_output.uri, 'BLESSED')))

  @parameterized.named_parameters(('legacy_feature_slicing', {
      'feature_slicing_spec':
          proto_utils.proto_to_json(
              evaluator_pb2.FeatureSlicingSpec(specs=[
                  evaluator_pb2.SingleSlicingSpec(
                      column_for_slicing=['trip_start_hour']),
                  evaluator_pb2.SingleSlicingSpec(
                      column_for_slicing=['trip_start_day', 'trip_miles']),
              ])),
  }))
  def testDoLegacySingleEvalSavedModelWFairness(self, exec_properties):
    source_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')
    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    # Create input dict.
    examples = standard_artifacts.Examples()
    examples.uri = os.path.join(source_data_dir, 'csv_example_gen')
    examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])
    model = standard_artifacts.Model()
    model.uri = os.path.join(source_data_dir, 'trainer/current')
    input_dict = {
        constants.EXAMPLES_KEY: [examples],
        constants.MODEL_KEY: [model],
    }

    # Create output dict.
    eval_output = standard_artifacts.ModelEvaluation()
    eval_output.uri = os.path.join(output_data_dir, 'eval_output')
    blessing_output = standard_artifacts.ModelBlessing()
    blessing_output.uri = os.path.join(output_data_dir, 'blessing_output')
    output_dict = {
        constants.EVALUATION_KEY: [eval_output],
        constants.BLESSING_KEY: [blessing_output],
    }

    try:
      # Need to import the following module so that the fairness indicator
      # post-export metric is registered. This may raise an ImportError if the
      # currently-installed version of TFMA does not support fairness
      # indicators.
      import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
      exec_properties['fairness_indicator_thresholds'] = [
          0.1, 0.3, 0.5, 0.7, 0.9
      ]
    except ImportError:
      logging.warning(
          'Not testing fairness indicators because a compatible TFMA version '
          'is not installed.')

    # List needs to be serialized before being passed into Do function.
    exec_properties[constants.EXAMPLE_SPLITS_KEY] = json_utils.dumps(None)

    # Run executor.
    evaluator = executor.Executor()
    evaluator.Do(input_dict, output_dict, exec_properties)

    # Check evaluator outputs.
    self.assertTrue(
        fileio.exists(os.path.join(eval_output.uri, 'eval_config.json')))
    self.assertTrue(fileio.exists(os.path.join(eval_output.uri, 'metrics')))
    self.assertTrue(fileio.exists(os.path.join(eval_output.uri, 'plots')))
    self.assertFalse(
        fileio.exists(os.path.join(blessing_output.uri, 'BLESSED')))

  @parameterized.named_parameters(
      (
          'eval_config_w_validation',
          {
              'eval_config':
                  proto_utils.proto_to_json(
                      tfma.EvalConfig(
                          model_specs=[
                              tfma.ModelSpec(label_key='tips'),
                          ],
                          metrics_specs=[
                              tfma.MetricsSpec(metrics=[
                                  tfma.config.MetricConfig(
                                      class_name='ExampleCount',
                                      # Count > 0, OK.
                                      threshold=tfma.config.MetricThreshold(
                                          value_threshold=tfma
                                          .GenericValueThreshold(
                                              lower_bound={'value': 0}))),
                              ]),
                          ],
                          slicing_specs=[tfma.SlicingSpec()]))
          },
          True,
          True),
      (
          'eval_config_w_validation_fail',
          {
              'eval_config':
                  proto_utils.proto_to_json(
                      tfma.EvalConfig(
                          model_specs=[
                              tfma.ModelSpec(
                                  name='baseline1',
                                  label_key='tips',
                                  is_baseline=True),
                              tfma.ModelSpec(
                                  name='candidate1', label_key='tips'),
                          ],
                          metrics_specs=[
                              tfma.MetricsSpec(metrics=[
                                  tfma.config.MetricConfig(
                                      class_name='ExampleCount',
                                      # Count < -1, NOT OK.
                                      threshold=tfma.config.MetricThreshold(
                                          value_threshold=tfma
                                          .GenericValueThreshold(
                                              upper_bound={'value': -1}))),
                              ]),
                          ],
                          slicing_specs=[tfma.SlicingSpec()]))
          },
          False,
          True),
      (
          'no_baseline_model_ignore_change_threshold_validation_pass',
          {
              'eval_config':
                  proto_utils.proto_to_json(
                      tfma.EvalConfig(
                          model_specs=[
                              tfma.ModelSpec(
                                  name='baseline',
                                  label_key='tips',
                                  is_baseline=True),
                              tfma.ModelSpec(
                                  name='candidate', label_key='tips'),
                          ],
                          metrics_specs=[
                              tfma.MetricsSpec(metrics=[
                                  tfma.config.MetricConfig(
                                      class_name='ExampleCount',
                                      # Count > 0, OK.
                                      threshold=tfma.config.MetricThreshold(
                                          value_threshold=tfma
                                          .GenericValueThreshold(
                                              lower_bound={'value': 0}))),
                                  tfma.config.MetricConfig(
                                      class_name='Accuracy',
                                      # Should be ignored due to no baseline.
                                      threshold=tfma.config.MetricThreshold(
                                          change_threshold=tfma
                                          .GenericChangeThreshold(
                                              relative={'value': 0},
                                              direction=tfma.MetricDirection
                                              .LOWER_IS_BETTER))),
                              ]),
                          ],
                          slicing_specs=[tfma.SlicingSpec()]))
          },
          True,
          False))
  def testDoValidation(self, exec_properties, blessed, has_baseline):
    source_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')
    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)

    # Create input dict.
    examples = standard_artifacts.Examples()
    examples.uri = os.path.join(source_data_dir, 'csv_example_gen')
    examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])
    model = standard_artifacts.Model()
    baseline_model = standard_artifacts.Model()
    model.uri = os.path.join(source_data_dir, 'trainer/current')
    baseline_model.uri = os.path.join(source_data_dir, 'trainer/previous/')
    blessing_output = standard_artifacts.ModelBlessing()
    blessing_output.uri = os.path.join(output_data_dir, 'blessing_output')
    schema = standard_artifacts.Schema()
    schema.uri = os.path.join(source_data_dir, 'schema_gen')
    input_dict = {
        constants.EXAMPLES_KEY: [examples],
        constants.MODEL_KEY: [model],
        constants.SCHEMA_KEY: [schema],
    }
    if has_baseline:
      input_dict[constants.BASELINE_MODEL_KEY] = [baseline_model]

    # Create output dict.
    eval_output = standard_artifacts.ModelEvaluation()
    eval_output.uri = os.path.join(output_data_dir, 'eval_output')
    blessing_output = standard_artifacts.ModelBlessing()
    blessing_output.uri = os.path.join(output_data_dir, 'blessing_output')
    output_dict = {
        constants.EVALUATION_KEY: [eval_output],
        constants.BLESSING_KEY: [blessing_output],
    }

    # List needs to be serialized before being passed into Do function.
    exec_properties[constants.EXAMPLE_SPLITS_KEY] = json_utils.dumps(None)

    # Run executor.
    evaluator = executor.Executor()
    evaluator.Do(input_dict, output_dict, exec_properties)

    # Check evaluator outputs.
    self.assertTrue(
        fileio.exists(os.path.join(eval_output.uri, 'eval_config.json')))
    self.assertTrue(fileio.exists(os.path.join(eval_output.uri, 'metrics')))
    self.assertTrue(fileio.exists(os.path.join(eval_output.uri, 'plots')))
    self.assertTrue(
        fileio.exists(os.path.join(eval_output.uri, 'validations')))
    if blessed:
      self.assertTrue(
          fileio.exists(os.path.join(blessing_output.uri, 'BLESSED')))
    else:
      self.assertTrue(
          fileio.exists(os.path.join(blessing_output.uri, 'NOT_BLESSED')))
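# A hedged sketch of the pipeline-level counterpart to the executor-level
# validation test above: constructing the Evaluator component with the same
# EvalConfig. This assumes a TFX version whose Evaluator accepts eval_config;
# example_gen and trainer stand in for upstream components.
evaluator = Evaluator(
    examples=example_gen.outputs['examples'],
    model=trainer.outputs['model'],
    eval_config=tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='tips')],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.config.MetricConfig(
                    class_name='ExampleCount',
                    # Count > 0 blesses the model, mirroring the test case.
                    threshold=tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0}))),
            ]),
        ],
        slicing_specs=[tfma.SlicingSpec()]))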
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     user_schema_path: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Import user-provided schema.
  user_schema_importer = ImporterNode(
      instance_name='import_user_schema',
      source_uri=user_schema_path,
      artifact_type=Schema)

  # Generates a schema based on statistics files. Even though downstream
  # components use the user-provided schema, we still want to generate the
  # schema of the newest data so that the user can compare it and optionally
  # update the schema in use.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=user_schema_importer.outputs['result'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=user_schema_importer.outputs['result'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=user_schema_importer.outputs['result'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model_exports=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if the check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, user_schema_importer, infer_schema,
          validate_stats, transform, trainer, model_analyzer, model_validator,
          pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
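# A minimal sketch of how a pipeline like _create_pipeline above is typically
# launched locally with the Beam orchestrator; every argument value below is a
# hypothetical placeholder, not taken from the original module.
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

if __name__ == '__main__':
  BeamDagRunner().run(
      _create_pipeline(
          pipeline_name='chicago_taxi_user_schema',     # hypothetical
          pipeline_root='/tmp/pipelines/chicago_taxi',  # hypothetical
          data_root='/tmp/data/simple',                 # hypothetical
          user_schema_path='/tmp/user_schema',          # hypothetical
          module_file='/tmp/taxi_utils.py',             # hypothetical
          serving_model_dir='/tmp/serving_model',       # hypothetical
          metadata_path='/tmp/metadata.db',             # hypothetical
          direct_num_workers=1))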
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     module_file: Text,
                     presto_config: presto_config_pb2.PrestoConnConfig,
                     query: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = PrestoExampleGen(presto_config, query=query)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(stats=statistics_gen.outputs['output'])

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs['output'],
      schema=infer_schema.outputs['output'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      input_data=example_gen.outputs['examples'],
      schema=infer_schema.outputs['output'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['output'],
      transform_output=transform.outputs['transform_output'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model_exports=trainer.outputs['output'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['output'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if the check passed.
  pusher = Pusher(
      model_export=trainer.outputs['output'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      additional_pipeline_args={},
  )
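# A hedged sketch of constructing the PrestoConnConfig consumed by the
# pipeline above; the host/port field names are assumptions about
# presto_config_pb2 and should be checked against the actual proto definition.
presto_config = presto_config_pb2.PrestoConnConfig(
    host='localhost',  # assumed field name and placeholder value
    port=8080)         # assumed field name and placeholder value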
def _create_pipeline(
    pipeline_name: Text, pipeline_root: Text, query: Text, module_file: Text,
    beam_pipeline_args: List[Text], ai_platform_training_args: Dict[Text, Text],
    bigquery_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines."""
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = BigQueryExampleGen(query=query)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  schema_gen = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=True)

  # Performs anomaly detection based on statistics and data schema.
  example_validator = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=module_file)

  # Uses a user-provided Python function that implements a model using TF-Learn
  # to train a model on Google Cloud AI Platform.
  trainer = Trainer(
      custom_executor_spec=executor_spec.ExecutorClassSpec(
          ai_platform_trainer_executor.Executor),
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
      custom_config={'ai_platform_training_args': ai_platform_training_args})

  # Uses TFMA to compute evaluation statistics over features of a model.
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to Google Cloud BigQuery ML if the check passed.
  pusher = Pusher(
      custom_executor_spec=executor_spec.ExecutorClassSpec(
          bigquery_ml_pusher_executor.Executor),
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      custom_config={'bigquery_serving_args': bigquery_serving_args})

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, example_validator, transform,
          trainer, evaluator, model_validator, pusher
      ],
      beam_pipeline_args=beam_pipeline_args,
  )
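# A minimal sketch of compiling the Kubeflow variant above into a pipeline
# package; KubeflowDagRunner is the standard TFX compiler for Kubeflow
# Pipelines, while all argument values below are hypothetical placeholders.
from tfx.orchestration.kubeflow import kubeflow_dag_runner

if __name__ == '__main__':
  kubeflow_dag_runner.KubeflowDagRunner().run(
      _create_pipeline(
          pipeline_name='chicago_taxi_bigquery',       # hypothetical
          pipeline_root='gs://my-bucket/pipelines',    # hypothetical
          query='SELECT * FROM `my.table`',            # hypothetical
          module_file='gs://my-bucket/taxi_utils.py',  # hypothetical
          beam_pipeline_args=[],                       # hypothetical
          ai_platform_training_args={},                # hypothetical
          bigquery_serving_args={}))                   # hypothetical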