def _create_pipeline(): """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.""" examples = csv_input(_data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics']) return pipeline.Pipeline( pipeline_name='chicago_taxi_pipeline_kubeflow', pipeline_root=_pipeline_root, components=[example_gen, statistics_gen, infer_schema], additional_pipeline_args={ 'beam_pipeline_args': [ '--runner=DataflowRunner', '--experiments=shuffle_mode=auto', '--project=' + _project_id, '--temp_location=' + os.path.join(_output_dir, 'tmp'), '--region=' + _gcp_region, ], }, log_root='/var/tmp/tfx/logs', )
def create_pipeline():
  # Read data in; can split data here
  examples = csv_input(DATA_DIR)
  example_gen = CsvExampleGen(input_base=examples, name='iris_example')

  # Generate feature statistics
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)

  # Infer schema for data
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)

  # Identify anomalies in training and serving data
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output,
      schema=infer_schema.outputs.output)

  # Performs feature engineering; emits a SavedModel that does preprocessing
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=TRANSFORM_MODULE_FILE)

  # Trains a model
  trainer = Trainer(
      module_file=MODEL_MODULE_FILE,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Evaluates the model on different slices of the data (bias detection?!)
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output)

  # Compares new model against a baseline; both models evaluated on a dataset
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Pushes a blessed model to a deployment target (tfserving)
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=SERVING_DIR)))

  return pipeline.Pipeline(
      pipeline_name=PIPELINE_NAME,
      pipeline_root=DAGS_DIR,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats,
          transform, trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_db_root=METADATA_DIR,
      additional_pipeline_args={
          'logger_args': {
              'log_root': LOGS_DIR,
              'log_level': logging.INFO
          }
      })
def _create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_input(_data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics']) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=infer_schema.outputs['schema']) return pipeline.Pipeline( pipeline_name='chicago_taxi_simple', pipeline_root=_pipeline_root, components=[example_gen, statistics_gen, infer_schema, validate_stats], enable_cache=True, metadata_db_root=_metadata_db_root, )
def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
                          taxi_module_file: Text, output_bucket: Text,
                          enable_cache: bool):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    output_bucket: The bucket under which the serving model is pushed.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  examples = csv_input(csv_input_location)
  example_gen = CsvExampleGen(input_base=examples)
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
  infer_schema = SchemaGen(
      stats=statistics_gen.outputs.output, infer_feature_shape=False)
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(output_bucket, 'model_serving'))))

  return pipeline.Pipeline(
      pipeline_name='chicago_taxi_pipeline_simple',
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats,
          transform, trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=enable_cache,
  )
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
  """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
  examples = dsl_utils.csv_input(csv_input_location)
  example_gen = CsvExampleGen(input=examples)
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=transform_module)
  trainer = Trainer(
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
      module_file=trainer_module,
  )
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'], model=trainer.outputs['model'])
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(pipeline_root, 'model_serving'))))

  return [
      example_gen, statistics_gen, infer_schema, validate_stats, transform,
      trainer, model_analyzer, model_validator, pusher
  ]
def testCsvExampleGenOnDataflowRunner(self):
  """CsvExampleGen-only test pipeline on DataflowRunner invocation."""
  pipeline_name = 'kubeflow-csv-example-gen-dataflow-test-{}'.format(
      self._random_id())
  pipeline = self._create_dataflow_pipeline(pipeline_name, [
      CsvExampleGen(input=dsl_utils.csv_input(self._data_root)),
  ])
  self._compile_and_run_pipeline(pipeline)
def _create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_input(_data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Generates schema based on statistics files. infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator( stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output) # Performs transformations and feature engineering in training and serving. transform = Transform( input_data=example_gen.outputs.examples, schema=infer_schema.outputs.output, module_file=_taxi_module_file) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=_taxi_module_file, transformed_examples=transform.outputs.transformed_examples, schema=infer_schema.outputs.output, transform_output=transform.outputs.transform_output, train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Uses TFMA to compute a evaluation statistics over features of a model. model_analyzer = Evaluator( examples=example_gen.outputs.examples, model_exports=trainer.outputs.output, feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_hour']) ])) # Performs quality validation of a candidate model (compared to a baseline). model_validator = ModelValidator( examples=example_gen.outputs.examples, model=trainer.outputs.output) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher( model_export=trainer.outputs.output, model_blessing=model_validator.outputs.blessing, push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=_serving_model_dir))) return [ example_gen, statistics_gen, infer_schema, validate_stats, transform, trainer, model_analyzer, model_validator, pusher ]
def create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_input(data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=examples) # Computes statistics over data for visualization and example validation. # statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Step 3 # Generates schema based on statistics files. # schema_gen = SchemaGen(stats=statistics_gen.outputs.output) # Step 3 # Performs anomaly detection based on statistics and data schema. # validate_stats = ExampleValidator( # Step 3 # stats=statistics_gen.outputs.output, # Step 3 # schema=schema_gen.outputs.output) # Step 3 # Performs transformations and feature engineering in training and serving. # transform = Transform( # Step 4 # input_data=example_gen.outputs.examples, # Step 4 # schema=schema_gen.outputs.output, # Step 4 # module_file=taxi_module_file) # Step 4 # Uses user-provided Python function that implements a model using TF-Learn. # trainer = Trainer( # Step 5 # module_file=taxi_module_file, # Step 5 # transformed_examples=transform.outputs.transformed_examples, # Step 5 # schema=schema_gen.outputs.output, # Step 5 # transform_output=transform.outputs.transform_output, # Step 5 # train_steps=10000, # Step 5 # eval_steps=5000, # Step 5 # warm_starting=True) # Step 5 # Uses TFMA to compute a evaluation statistics over features of a model. # model_analyzer = Evaluator( # Step 6 # examples=example_gen.outputs.examples, # Step 6 # model_exports=trainer.outputs.output) # Step 6 # Performs quality validation of a candidate model (compared to a baseline). # model_validator = ModelValidator( # Step 7 # examples=example_gen.outputs.examples, model=trainer.outputs.output) # Step 7 # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. # pusher = Pusher( # Step 7 # model_export=trainer.outputs.output, # Step 7 # model_blessing=model_validator.outputs.blessing, # Step 7 # serving_model_dir=serving_model_dir) # Step 7 return [ example_gen, # statistics_gen, schema_gen, validate_stats, # Step 3 # transform, # Step 4 # trainer, # Step 5 # model_analyzer, # Step 6 # model_validator, pusher # Step 7 ]
def create_pipeline(): """Implements the titanic taxi pipeline with TFX.""" examples = csv_input(data_dir) # Brings data into the pipeline example_gen = CsvExampleGen(input_base=examples) return [example_gen]
def create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_input(os.path.join(data_root, 'simple')) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Generates schema based on statistics files. infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator( stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output) # Performs transformations and feature engineering in training and serving. transform = Transform( input_data=example_gen.outputs.examples, schema=infer_schema.outputs.output, module_file=taxi_module_file) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=taxi_module_file, transformed_examples=transform.outputs.transformed_examples, schema=infer_schema.outputs.output, transform_output=transform.outputs.transform_output, train_steps=10000, eval_steps=5000, warm_starting=True) # Uses TFMA to compute a evaluation statistics over features of a model. model_analyzer = Evaluator( examples=example_gen.outputs.examples, model_exports=trainer.outputs.output) # Performs quality validation of a candidate model (compared to a baseline). model_validator = ModelValidator( examples=example_gen.outputs.examples, model=trainer.outputs.output) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher( model_export=trainer.outputs.output, model_blessing=model_validator.outputs.blessing, serving_model_dir=serving_model_dir) return [ example_gen, statistics_gen, infer_schema, validate_stats, transform, trainer, model_analyzer, model_validator, pusher ]
def __init__(self, base_dir, csvname):
  self.base_dir = base_dir
  self.csvname = csvname
  self.components = []

  examples = csv_input(os.path.join(self.base_dir, self.csvname))
  self.example_gen = tfx.components.example_gen.csv_example_gen.component.CsvExampleGen(
      input=examples)
  self.statistics_gen = StatisticsGen(
      self.example_gen.outputs['examples'],
      instance_name=self.csvname + '_statistics_gen')
  self.scheme_gen = SchemaGen(
      statistics=self.statistics_gen.outputs['statistics'])
  self.valid_stats = ExampleValidator(
      statistics=self.statistics_gen.outputs['statistics'],
      schema=self.scheme_gen.outputs['schema'])
def _create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_input(_data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Generates schema based on statistics files. infer_schema = SchemaGen(stats=statistics_gen.outputs.output) return pipeline.Pipeline( pipeline_name='chicago_taxi_simple', pipeline_root=_pipeline_root, components=[example_gen, statistics_gen, infer_schema], enable_cache=True, metadata_db_root=_metadata_db_root, )
def testCsvInput(self):
  # Named with the 'test' prefix so the unittest runner discovers it.
  [csv] = dsl_utils.csv_input(uri='path')
  self.assertEqual('ExternalPath', csv.type_name)
  self.assertEqual('path', csv.uri)
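# For reference, a usage sketch of the helper exercised by the test above,
# grounded in its assertions: dsl_utils.csv_input wraps a directory of CSV
# files into a one-element list holding an 'ExternalPath' artifact, which the
# legacy CsvExampleGen signatures in these snippets accept directly. The path
# below is illustrative only.
from tfx.utils import dsl_utils

examples = dsl_utils.csv_input(uri='/path/to/csv_dir')
example_gen = CsvExampleGen(input=examples)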
def _create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_input(_data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Generates schema based on statistics files. infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator(stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output) # Performs transformations and feature engineering in training and serving. transform = Transform(input_data=example_gen.outputs.examples, schema=infer_schema.outputs.output, module_file=_taxi_module_file) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=_taxi_module_file, transformed_examples=transform.outputs.transformed_examples, schema=infer_schema.outputs.output, transform_output=transform.outputs.transform_output, train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Uses TFMA to compute a evaluation statistics over features of a model. model_analyzer = Evaluator( examples=example_gen.outputs.examples, model_exports=trainer.outputs.output, feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_hour']) ])) # Performs quality validation of a candidate model (compared to a baseline). model_validator = ModelValidator(examples=example_gen.outputs.examples, model=trainer.outputs.output) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher(model_export=trainer.outputs.output, model_blessing=model_validator.outputs.blessing, push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=_serving_model_dir))) return pipeline.Pipeline( pipeline_name=_pipeline_name, pipeline_root=_pipeline_root, enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( _metadata_db_root), components=[ example_gen, statistics_gen, infer_schema, validate_stats, transform, trainer, model_analyzer, model_validator, pusher ], additional_pipeline_args={ # LINT.IfChange 'beam_pipeline_args': [ # ----- Beam Args -----. '--runner=PortableRunner', # Points to the job server started in # setup_beam_on_(flink|spark).sh '--job_endpoint=localhost:8099', '--environment_type=LOOPBACK', # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment. # pylint: disable=g-bad-todo # TODO(BEAM-5167): Use concurrency information from SDK Harness. # pylint: disable=g-bad-todo # Note; We use 100 worker threads to mitigate the issue with # scheduling work between the Beam runner and SDK harness. Flink # and Spark can process unlimited work items concurrently while # SdkHarness can only process 1 work item per worker thread. # Having 100 threads will let 100 tasks execute concurrently # avoiding scheduling issue in most cases. In case the threads are # exhausted, beam print the relevant message in the log. '--experiments=worker_threads=100', # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all. # pylint: disable=g-bad-todo '--experiments=pre_optimize=all', # ----- Flink runner-specific Args -----. 
# TODO(b/126725506): Set the task parallelism based on cpu cores. # TODO(FLINK-10672): Obviate setting BATCH_FORCED. '--execution_mode_for_batch=BATCH_FORCED', ], # LINT.ThenChange(tfx/examples/chicago_taxi/setup_beam_on_portable_beam.sh) }, )
def _create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_input(_data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Generates schema based on statistics files. infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator( stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output) # Performs transformations and feature engineering in training and serving. transform = Transform( input_data=example_gen.outputs.examples, schema=infer_schema.outputs.output, module_file=_taxi_module_file) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=_taxi_module_file, transformed_examples=transform.outputs.transformed_examples, schema=infer_schema.outputs.output, transform_output=transform.outputs.transform_output, train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Uses TFMA to compute a evaluation statistics over features of a model. model_analyzer = Evaluator( examples=example_gen.outputs.examples, model_exports=trainer.outputs.output, feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_hour']) ])) # Performs quality validation of a candidate model (compared to a baseline). model_validator = ModelValidator( examples=example_gen.outputs.examples, model=trainer.outputs.output) # This custom component serves as a bridge between pipeline and human model # reviewers to enable review-and-push workflow in model development cycle. It # utilizes Slack API to send message to user-defined Slack channel with model # URI info and wait for go / no-go decision from the same Slack channel: # * To approve the model, users need to reply the thread sent out by the bot # started by SlackComponent with 'lgtm' or 'approve'. # * To reject the model, users need to reply the thread sent out by the bot # started by SlackComponent with 'decline' or 'reject'. slack_validator = SlackComponent( model_export=trainer.outputs.output, model_blessing=model_validator.outputs.blessing, slack_token=_slack_token, channel_id=_channel_id, timeout_sec=3600, ) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher( model_export=trainer.outputs.output, model_blessing=slack_validator.outputs.slack_blessing, push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=_serving_model_dir))) return [ example_gen, statistics_gen, infer_schema, validate_stats, transform, trainer, model_analyzer, model_validator, slack_validator, pusher ]
def _create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_input(_data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=examples) # Computes statistics over data for visualization and example validation. # pylint: disable=line-too-long # statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Step 3 # pylint: enable=line-too-long # Generates schema based on statistics files. # infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Step 3 # Performs anomaly detection based on statistics and data schema. # validate_stats = ExampleValidator( # Step 3 # stats=statistics_gen.outputs.output, # Step 3 # schema=infer_schema.outputs.output) # Step 3 # Performs transformations and feature engineering in training and serving. # transform = Transform( # Step 4 # input_data=example_gen.outputs.examples, # Step 4 # schema=infer_schema.outputs.output, # Step 4 # module_file=_taxi_module_file) # Step 4 # Uses user-provided Python function that implements a model using TF-Learn. # trainer = Trainer( # Step 5 # module_file=_taxi_module_file, # Step 5 # transformed_examples=transform.outputs.transformed_examples, # Step 5 # schema=infer_schema.outputs.output, # Step 5 # transform_output=transform.outputs.transform_output, # Step 5 # train_args=trainer_pb2.TrainArgs(num_steps=10000), # Step 5 # eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Step 5 # Uses TFMA to compute a evaluation statistics over features of a model. # model_analyzer = Evaluator( # Step 6 # examples=example_gen.outputs.examples, # Step 6 # model_exports=trainer.outputs.output, # Step 6 # feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ # Step 6 # evaluator_pb2.SingleSlicingSpec( # Step 6 # column_for_slicing=['trip_start_hour']) # Step 6 # ])) # Step 6 # Performs quality validation of a candidate model (compared to a baseline). # model_validator = ModelValidator( # Step 7 # examples=example_gen.outputs.examples, # Step 7 # model=trainer.outputs.output) # Step 7 # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. # pusher = Pusher( # Step 7 # model_export=trainer.outputs.output, # Step 7 # model_blessing=model_validator.outputs.blessing, # Step 7 # push_destination=pusher_pb2.PushDestination( # Step 7 # filesystem=pusher_pb2.PushDestination.Filesystem( # Step 7 # base_directory=_serving_model_dir))) # Step 7 return pipeline.Pipeline( pipeline_name='taxi', pipeline_root=_pipeline_root, components=[ example_gen, # statistics_gen, infer_schema, validate_stats, # Step 3 # transform, # Step 4 # trainer, # Step 5 # model_analyzer, # Step 6 # model_validator, pusher # Step 7 ], enable_cache=True, metadata_db_root=_metadata_db_root, additional_pipeline_args={'logger_args': logger_overrides}, )
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
  """Creates components for a simple Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
  examples = dsl_utils.csv_input(csv_input_location)
  example_gen = CsvExampleGen(input=examples)
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  schema_gen = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)
  example_validator = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=transform_module)
  latest_model_resolver = ResolverNode(
      instance_name='latest_model_resolver',
      resolver_class=latest_artifacts_resolver.LatestArtifactsResolver,
      latest_model=Channel(type=Model))
  trainer = Trainer(
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      base_model=latest_model_resolver.outputs['latest_model'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
      module_file=trainer_module,
  )

  # Set the TFMA config for Model Evaluation and Validation.
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      metrics_specs=[
          tfma.MetricsSpec(
              metrics=[tfma.MetricConfig(class_name='ExampleCount')],
              thresholds={
                  'binary_accuracy':
                      tfma.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={'value': 0.5}),
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': -1e-10}))
              })
      ],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ])
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      eval_config=eval_config)

  infra_validator = InfraValidator(
      model=trainer.outputs['model'],
      examples=example_gen.outputs['examples'],
      serving_spec=infra_validator_pb2.ServingSpec(
          tensorflow_serving=infra_validator_pb2.TensorFlowServing(
              tags=['latest']),
          kubernetes=infra_validator_pb2.KubernetesConfig()),
      request_spec=infra_validator_pb2.RequestSpec(
          tensorflow_serving=infra_validator_pb2
          .TensorFlowServingRequestSpec()))

  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(pipeline_root, 'model_serving'))))

  return [
      example_gen,
      statistics_gen,
      schema_gen,
      example_validator,
      transform,
      latest_model_resolver,
      trainer,
      evaluator,
      infra_validator,
      pusher,
  ]
def _create_test_pipeline(pipeline_name: Text, pipeline_root: Text,
                          csv_input_location: Text, taxi_module_file: Text,
                          container_image: Text):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline for testing.

  Args:
    pipeline_name: The name of the pipeline.
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    container_image: The container image to use.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  examples = dsl_utils.csv_input(csv_input_location)
  example_gen = CsvExampleGen(input_base=examples)
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
  infer_schema = SchemaGen(stats=statistics_gen.outputs.output)
  validate_stats = ExampleValidator(  # pylint: disable=unused-variable
      stats=statistics_gen.outputs.output,
      schema=infer_schema.outputs.output)
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  model_analyzer = Evaluator(  # pylint: disable=unused-variable
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)
  pusher = Pusher(  # pylint: disable=unused-variable
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(pipeline_root, 'model_serving'))))

  return tfx_pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats,
          transform, trainer, model_analyzer, model_validator, pusher
      ],
      log_root='/var/tmp/tfx/logs',
      additional_pipeline_args={
          'tfx_image': container_image,
      },
  )
def _create_test_pipeline(pipeline_root: Text, csv_input_location: Text,
                          taxi_module_file: Text, enable_cache: bool):
  """Creates a simple Kubeflow-based Chicago Taxi TFX pipeline.

  Args:
    pipeline_root: The root of the pipeline output.
    csv_input_location: The location of the input data directory.
    taxi_module_file: The location of the module file for Transform/Trainer.
    enable_cache: Whether to enable cache or not.

  Returns:
    A logical TFX pipeline.Pipeline object.
  """
  examples = csv_input(csv_input_location)
  example_gen = CsvExampleGen(input_base=examples)
  statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples)
  infer_schema = SchemaGen(
      stats=statistics_gen.outputs.output, infer_feature_shape=False)
  validate_stats = ExampleValidator(
      stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output)
  transform = Transform(
      input_data=example_gen.outputs.examples,
      schema=infer_schema.outputs.output,
      module_file=taxi_module_file)
  trainer = Trainer(
      module_file=taxi_module_file,
      transformed_examples=transform.outputs.transformed_examples,
      schema=infer_schema.outputs.output,
      transform_output=transform.outputs.transform_output,
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5))
  model_analyzer = Evaluator(
      examples=example_gen.outputs.examples,
      model_exports=trainer.outputs.output,
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))
  model_validator = ModelValidator(
      examples=example_gen.outputs.examples, model=trainer.outputs.output)

  # Hack: ensures push_destination can be correctly parameterized and
  # interpreted. The pipeline root will be specified as a dsl.PipelineParam
  # with the name pipeline-root, see:
  # https://github.com/tensorflow/tfx/blob/1c670e92143c7856f67a866f721b8a9368ede385/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L226
  _pipeline_root_param = dsl.PipelineParam(name='pipeline-root')
  pusher = Pusher(
      model_export=trainer.outputs.output,
      model_blessing=model_validator.outputs.blessing,
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=os.path.join(str(_pipeline_root_param),
                                          'model_serving'))))

  return pipeline.Pipeline(
      pipeline_name='parameterized_tfx_oss',
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats,
          transform, trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=enable_cache,
  )