def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Assembles the Iris flowers TFX pipeline.

  Args:
    pipeline_name: Name given to the returned pipeline.
    pipeline_root: Root location handed to the pipeline.
    data_root: Location wrapped by `external_input` and fed to CsvExampleGen.
    module_file: User module file passed to the Trainer.
    serving_model_dir: Filesystem base directory the Pusher pushes into.
    metadata_path: Path used to build the SQLite metadata connection config.
    direct_num_workers: Value formatted into the `--direct_num_workers` Beam
      argument.

  Returns:
    A `pipeline.Pipeline` wiring together the components built below.
  """
  # Ingestion: bring the CSV data into the pipeline.
  example_gen = CsvExampleGen(input=external_input(data_root))
  raw_examples = example_gen.outputs['examples']

  # Statistics feed both schema inference and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  # Schema inferred from the statistics.
  schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
  schema = schema_gen.outputs['schema']

  # Anomaly detection against the inferred schema.
  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  # Training runs the user code found in `module_file`.
  trainer = Trainer(
      module_file=module_file,
      examples=raw_examples,
      schema=schema,
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  model = trainer.outputs['model']

  # TFMA-based evaluation of the trained model.
  evaluator = Evaluator(examples=raw_examples, model=model)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=model)

  # The model is pushed to the filesystem only if it has been blessed.
  serving_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=serving_destination)

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, example_validator, trainer,
          evaluator, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/142684737): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
  )
def create_pipeline():
  """Builds the component list of the Chicago taxi pipeline.

  `base_dir`, `taxi_pipeline_utils` and `serving_model_dir` are not
  parameters; they are read from the surrounding scope.

  Returns:
    The list of components, in the order they are constructed.
  """
  # Ingestion: CSV data under `<base_dir>/no_split/span_1`.
  csv_examples = csv_inputs(os.path.join(base_dir, 'no_split/span_1'))
  example_gen = CsvExampleGen(input_data=csv_examples)
  raw_examples = example_gen.outputs.examples

  # Statistics for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=raw_examples)
  statistics = statistics_gen.outputs.output

  # Schema generated from the statistics.
  schema_gen = SchemaGen(stats=statistics)
  schema = schema_gen.outputs.output

  # Anomaly detection based on statistics and schema.
  example_validator = ExampleValidator(stats=statistics, schema=schema)

  # Transformations / feature engineering for training and serving.
  transform = Transform(
      input_data=raw_examples,
      schema=schema,
      module_file=taxi_pipeline_utils)

  # Training with the user-provided module.
  trainer = Trainer(
      module_file=taxi_pipeline_utils,
      transformed_examples=transform.outputs.transformed_examples,
      schema=schema,
      transform_output=transform.outputs.transform_output,
      train_steps=10000,
      eval_steps=5000,
      warm_starting=True)
  trained_model = trainer.outputs.output

  # TFMA evaluation of the trained model.
  evaluator = Evaluator(examples=raw_examples, model_exports=trained_model)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=trained_model)

  # Pushes the model to the serving directory if validation blessed it.
  pusher = Pusher(
      model_export=trained_model,
      model_blessing=model_validator.outputs.blessing,
      serving_model_dir=serving_model_dir)

  return [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      trainer, evaluator, model_validator, pusher
  ]
def testTaxiPipelineNewStyleCompatibility(self):
  """Checks that paired input/output keys resolve to the same channel.

  Each component of a taxi-style pipeline is constructed and, for every pair
  of key names checked below, both keys must return the identical object.
  """
  fake_module_file = '/tmp/fake/module/file'

  # ExampleGen: 'input' <-> 'input_base'.
  example_gen = CsvExampleGen(input=external_input('/tmp/fake/path'))
  self.assertIs(example_gen.inputs['input'], example_gen.inputs['input_base'])
  raw_examples = example_gen.outputs['examples']

  # StatisticsGen: 'examples' <-> 'input_data'.
  statistics_gen = StatisticsGen(examples=raw_examples)
  self.assertIs(statistics_gen.inputs['examples'],
                statistics_gen.inputs['input_data'])
  statistics = statistics_gen.outputs['statistics']

  # SchemaGen: 'statistics' <-> 'stats', 'schema' <-> 'output'.
  schema_gen = SchemaGen(statistics=statistics)
  self.assertIs(schema_gen.inputs['statistics'], schema_gen.inputs['stats'])
  self.assertIs(schema_gen.outputs['schema'], schema_gen.outputs['output'])
  schema = schema_gen.outputs['schema']

  # ExampleValidator: 'statistics' <-> 'stats', 'anomalies' <-> 'output'.
  example_validator = ExampleValidator(statistics=statistics, schema=schema)
  self.assertIs(example_validator.inputs['statistics'],
                example_validator.inputs['stats'])
  self.assertIs(example_validator.outputs['anomalies'],
                example_validator.outputs['output'])

  # Transform: 'examples' <-> 'input_data',
  # 'transform_graph' <-> 'transform_output'.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=fake_module_file)
  self.assertIs(transform.inputs['examples'], transform.inputs['input_data'])
  self.assertIs(transform.outputs['transform_graph'],
                transform.outputs['transform_output'])

  # Trainer: 'transform_graph' <-> 'transform_output', 'model' <-> 'output'.
  trainer = Trainer(
      module_file=fake_module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  self.assertIs(trainer.inputs['transform_graph'],
                trainer.inputs['transform_output'])
  self.assertIs(trainer.outputs['model'], trainer.outputs['output'])

  # Evaluator: 'model' <-> 'model_exports', 'evaluation' <-> 'output'.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  evaluator = Evaluator(
      examples=raw_examples,
      model=trainer.outputs['model'],
      feature_slicing_spec=slicing_spec)
  self.assertIs(evaluator.inputs['model'], evaluator.inputs['model_exports'])
  self.assertIs(evaluator.outputs['evaluation'], evaluator.outputs['output'])

  model_validator = ModelValidator(
      examples=raw_examples, model=trainer.outputs['model'])

  # Pusher: 'model' <-> 'model_export', 'pushed_model' <-> 'model_push'.
  # The model is deliberately looked up through the 'output' key here.
  serving_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory='/fake/serving/dir'))
  pusher = Pusher(
      model=trainer.outputs['output'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=serving_destination)
  self.assertIs(pusher.inputs['model'], pusher.inputs['model_export'])
  self.assertIs(pusher.outputs['pushed_model'], pusher.outputs['model_push'])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX on a portable Beam runner.

  Args:
    pipeline_name: Name given to the returned pipeline.
    pipeline_root: Root location handed to the pipeline.
    data_root: Location wrapped by `external_input` and fed to CsvExampleGen.
    module_file: User module file shared by Transform and Trainer.
    serving_model_dir: Filesystem base directory the Pusher pushes into.
    metadata_path: Path used to build the SQLite metadata connection config.

  Returns:
    A `pipeline.Pipeline` whose Beam arguments select the PortableRunner and
    a job endpoint at localhost:8099.
  """
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  # The trained model is passed through the `model` argument, matching the
  # key names used by every other component in this pipeline.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # LINT.IfChange
      beam_pipeline_args=[
          # ----- Beam Args -----.
          '--runner=PortableRunner',
          # Points to the job server started in
          # setup_beam_on_(flink|spark).sh
          '--job_endpoint=localhost:8099',
          '--environment_type=LOOPBACK',
          # TODO(BEAM-6754): Utilize multicore in LOOPBACK environment.  # pylint: disable=g-bad-todo
          # TODO(BEAM-5167): Use concurrency information from SDK Harness.  # pylint: disable=g-bad-todo
          # Note: We use 100 worker threads to mitigate the issue with
          # scheduling work between the Beam runner and SDK harness. Flink
          # and Spark can process unlimited work items concurrently while
          # SdkHarness can only process 1 work item per worker thread.
          # Having 100 threads will let 100 tasks execute concurrently,
          # avoiding scheduling issues in most cases. In case the threads are
          # exhausted, Beam prints the relevant message in the log.
          '--experiments=worker_threads=100',
          # TODO(BEAM-7199): Obviate the need for setting pre_optimize=all.  # pylint: disable=g-bad-todo
          '--experiments=pre_optimize=all',
          # ----- Flink runner-specific Args -----.
          # TODO(b/126725506): Set the task parallelism based on cpu cores.
          # TODO(FLINK-10672): Obviate setting BATCH_FORCED.
          '--execution_mode_for_batch=BATCH_FORCED',
      ],
      # LINT.ThenChange(setup/setup_beam_on_spark.sh)
      # LINT.ThenChange(setup/setup_beam_on_flink.sh)
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Assembles the Chicago taxi pipeline with a HelloComponent inserted.

  Every component downstream of ExampleGen that consumes examples reads them
  from the HelloComponent's `output_data` output instead of directly from
  ExampleGen.

  Args:
    pipeline_name: Name given to the returned pipeline.
    pipeline_root: Root location handed to the pipeline.
    data_root: Location wrapped by `external_input` and fed to CsvExampleGen.
    module_file: User module file shared by Transform and Trainer.
    serving_model_dir: Filesystem base directory the Pusher pushes into.
    metadata_path: Path used to build the SQLite metadata connection config.

  Returns:
    A `pipeline.Pipeline` wiring together the components built below.
  """
  # Ingestion.
  example_gen = CsvExampleGen(input_base=external_input(data_root))

  # Custom component sitting between ingestion and the rest of the pipeline.
  hello = HelloComponent(
      input_data=example_gen.outputs['examples'], name='HelloWorld')
  hello_examples = hello.outputs['output_data']

  # Statistics for visualization and example validation.
  statistics_gen = StatisticsGen(examples=hello_examples)
  statistics = statistics_gen.outputs['statistics']

  # Schema generated from the statistics.
  schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = schema_gen.outputs['schema']

  # Anomaly detection based on statistics and schema.
  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  # Transformations / feature engineering for training and serving.
  transform = Transform(
      examples=hello_examples, schema=schema, module_file=module_file)

  # Training with the user-provided module.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  model = trainer.outputs['model']

  # TFMA evaluation, sliced by trip start hour.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  evaluator = Evaluator(
      examples=hello_examples, model=model, feature_slicing_spec=slicing_spec)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=hello_examples, model=model)

  # The model is pushed to the filesystem only if it has been blessed.
  serving_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=serving_destination)

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, hello, statistics_gen, schema_gen, example_validator,
          transform, trainer, evaluator, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path))
def _create_pipeline():
  """Implements the chicago taxi pipeline with TFX and a Slack review step.

  All configuration (`_data_root`, `_taxi_transformer_func`,
  `_taxi_trainer_func`, `_slack_token`, `_slack_channel_id`,
  `_serving_model_dir`, `_pipeline_name`, `_pipeline_root`) is read from
  names defined outside this function.

  Returns:
    A `pipeline.Pipeline` in which the Pusher consumes the blessing emitted by
    the SlackComponent rather than the one from ModelValidator.
  """
  examples = csv_input(_data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'])

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      preprocessing_fn=_taxi_transformer_func)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      trainer_fn=_taxi_trainer_func,
      examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  # The trained model is passed through the `model` argument, matching the
  # key names used by every other component in this pipeline.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  # This custom component serves as a bridge between pipeline and human model
  # reviewers to enable review-and-push workflow in model development cycle. It
  # utilizes Slack API to send message to user-defined Slack channel with model
  # URI info and wait for go / no-go decision from the same Slack channel:
  #   * To approve the model, users need to reply the thread sent out by the bot
  #     started by SlackComponent with 'lgtm' or 'approve'.
  #   * To reject the model, users need to reply the thread sent out by the bot
  #     started by SlackComponent with 'decline' or 'reject'.
  slack_validator = SlackComponent(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      slack_token=_slack_token,
      slack_channel_id=_slack_channel_id,
      timeout_sec=3600,
  )

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=slack_validator.outputs['slack_blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=_serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=_pipeline_name,
      pipeline_root=_pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, slack_validator, pusher
      ],
      enable_cache=True,
  )
def _create_pipeline(
    pipeline_name: Text, pipeline_root: Text, query: Text, module_file: Text,
    beam_pipeline_args: List[Text], ai_platform_training_args: Dict[Text, Text],
    bigquery_serving_args: Dict[Text, Text]) -> pipeline.Pipeline:
  """Assembles the Chicago taxi pipeline fed from BigQuery.

  The Trainer runs with the AI Platform trainer executor and the Pusher with
  the BigQuery ML pusher executor.

  Args:
    pipeline_name: Name given to the returned pipeline.
    pipeline_root: Root location handed to the pipeline.
    query: Query string passed to BigQueryExampleGen.
    module_file: User module file shared by Transform and Trainer.
    beam_pipeline_args: Beam arguments forwarded to the pipeline unchanged.
    ai_platform_training_args: Stored under the 'ai_platform_training_args'
      key of the Trainer's custom config.
    bigquery_serving_args: Stored under the 'bigquery_serving_args' key of the
      Pusher's custom config.

  Returns:
    A `pipeline.Pipeline` wiring together the components built below.
  """
  # Ingestion from BigQuery.
  example_gen = BigQueryExampleGen(query=query)
  raw_examples = example_gen.outputs['examples']

  # Statistics for visualization and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  # Schema generated from the statistics.
  schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
  schema = schema_gen.outputs['schema']

  # Anomaly detection based on statistics and schema.
  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  # Transformations / feature engineering for training and serving.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=module_file)

  # Training with the user module, executed by the AI Platform executor.
  training_executor = executor_spec.ExecutorClassSpec(
      ai_platform_trainer_executor.Executor)
  trainer = Trainer(
      custom_executor_spec=training_executor,
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
      custom_config={'ai_platform_training_args': ai_platform_training_args})
  model = trainer.outputs['model']

  # TFMA evaluation, sliced by trip start hour.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  evaluator = Evaluator(
      examples=raw_examples, model=model, feature_slicing_spec=slicing_spec)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=model)

  # Pushing is delegated to the BigQuery ML pusher executor and only happens
  # for a blessed model.
  pushing_executor = executor_spec.ExecutorClassSpec(
      bigquery_ml_pusher_executor.Executor)
  pusher = Pusher(
      custom_executor_spec=pushing_executor,
      model=model,
      model_blessing=model_validator.outputs['blessing'],
      custom_config={'bigquery_serving_args': bigquery_serving_args})

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, schema_gen, example_validator,
          transform, trainer, evaluator, model_validator, pusher
      ],
      beam_pipeline_args=beam_pipeline_args,
  )
def create_pipeline():
  """Builds the component list of the Chicago taxi pipeline fed from BigQuery.

  `taxi_utils`, `cmle_training_args` and `serving_model_dir` are not
  parameters; they are read from the surrounding scope.

  Returns:
    The list of components, in the order they are constructed.
  """
  # Assembled from adjacent literals; the resulting string is a single line.
  query = (
      ' SELECT pickup_community_area, fare, '
      'EXTRACT(MONTH FROM trip_start_timestamp) AS trip_start_month, '
      'EXTRACT(HOUR FROM trip_start_timestamp) AS trip_start_hour, '
      'EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS trip_start_day, '
      'UNIX_SECONDS(trip_start_timestamp) AS trip_start_timestamp, '
      'pickup_latitude, pickup_longitude, '
      'dropoff_latitude, dropoff_longitude, '
      'trip_miles, pickup_census_tract, dropoff_census_tract, '
      'payment_type, company, trip_seconds, '
      'dropoff_community_area, tips '
      'FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` '
      'WHERE MOD(FARM_FINGERPRINT(unique_key), 3) = 0 '
      'LIMIT 20000000')

  # Ingestion from BigQuery.
  example_gen = BigQueryExampleGen(query=query)
  raw_examples = example_gen.outputs.examples

  # Statistics for visualization and example validation.
  statistics_gen = StatisticsGen(input_data=raw_examples)
  statistics = statistics_gen.outputs.output

  # Schema generated from the statistics.
  schema_gen = SchemaGen(stats=statistics)
  schema = schema_gen.outputs.output

  # Anomaly detection based on statistics and schema.
  example_validator = ExampleValidator(stats=statistics, schema=schema)

  # Transformations / feature engineering for training and serving.
  transform = Transform(
      input_data=raw_examples, schema=schema, module_file=taxi_utils)

  # Training with the user-provided module; the training arguments travel in
  # the custom config under 'cmle_training_args'.
  trainer = Trainer(
      module_file=taxi_utils,
      transformed_examples=transform.outputs.transformed_examples,
      schema=schema,
      transform_output=transform.outputs.transform_output,
      train_steps=10000,
      eval_steps=5000,
      custom_config={'cmle_training_args': cmle_training_args},
      warm_starting=True)
  trained_model = trainer.outputs.output

  # TFMA evaluation of the trained model.
  evaluator = Evaluator(examples=raw_examples, model_exports=trained_model)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=trained_model)

  # Pushes the model to the serving directory if validation blessed it.
  pusher = Pusher(
      model_export=trained_model,
      model_blessing=model_validator.outputs.blessing,
      serving_model_dir=serving_model_dir)

  return [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      trainer, evaluator, model_validator, pusher
  ]
def create_e2e_components(
    pipeline_root: Text,
    csv_input_location: Text,
    transform_module: Text,
    trainer_module: Text,
) -> List[BaseComponent]:
  """Builds the components of a small Chicago Taxi pipeline used in tests.

  Args:
    pipeline_root: The root of the pipeline output; the Pusher pushes into its
      'model_serving' subdirectory.
    csv_input_location: The location of the input data directory.
    transform_module: The location of the transform module file.
    trainer_module: The location of the trainer module file.

  Returns:
    A list of TFX components that constitutes an end-to-end test pipeline.
  """
  # Ingestion.
  example_gen = CsvExampleGen(input=dsl_utils.csv_input(csv_input_location))
  raw_examples = example_gen.outputs['examples']

  # Statistics, schema and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']
  schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = schema_gen.outputs['schema']
  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  # Feature engineering.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=transform_module)

  # Resolves a Model channel that the Trainer receives as its base model.
  latest_model_resolver = ResolverNode(
      instance_name='latest_model_resolver',
      resolver_class=latest_artifacts_resolver.LatestArtifactsResolver,
      latest_model=Channel(type=Model))

  # Training uses deliberately small step counts.
  trainer = Trainer(
      module_file=trainer_module,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      base_model=latest_model_resolver.outputs['latest_model'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
  )
  model = trainer.outputs['model']

  # Evaluation sliced by trip start hour, then model validation.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  evaluator = Evaluator(
      examples=raw_examples, model=model, feature_slicing_spec=slicing_spec)
  model_validator = ModelValidator(examples=raw_examples, model=model)

  # The model is pushed to the filesystem only if it has been blessed.
  serving_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=os.path.join(pipeline_root, 'model_serving')))
  pusher = Pusher(
      model=model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=serving_destination)

  return [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      latest_model_resolver, trainer, evaluator, model_validator, pusher
  ]
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Assembles the Iris flowers TFX pipeline with a Tuner step.

  Args:
    pipeline_name: Name given to the returned pipeline.
    pipeline_root: Root location handed to the pipeline.
    data_root: Location wrapped by `external_input` and fed to CsvExampleGen.
    module_file: User module file shared by Tuner and Trainer.
    serving_model_dir: Filesystem base directory the Pusher pushes into.
    metadata_path: Path used to build the SQLite metadata connection config.

  Returns:
    A `pipeline.Pipeline` wiring together the components built below.
  """
  # Ingestion.
  example_gen = CsvExampleGen(input=external_input(data_root))
  raw_examples = example_gen.outputs['examples']

  # Statistics for visualization and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  # Schema generated from the statistics.
  schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=True)
  schema = schema_gen.outputs['schema']

  # Anomaly detection based on statistics and schema.
  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  # Hyperparameter tuning driven by `module_file`.
  tuner = Tuner(examples=raw_examples, schema=schema, module_file=module_file)

  # Training consumes the Tuner's best hyperparameters.
  # TODO(jyzhao): example for importing a hyperparameters file generated not in
  #               currently run, e.g., by previous pipeline run with Tuner.
  # TODO(jyzhao): consider supporting warmstart from tuner's model for trainer.
  trainer = Trainer(
      module_file=module_file,
      examples=raw_examples,
      schema=schema,
      hyperparameters=tuner.outputs['best_hyperparameters'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  model = trainer.outputs['model']

  # TFMA evaluation of the trained model.
  evaluator = Evaluator(examples=raw_examples, model=model)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=model)

  # The model is pushed to the filesystem only if it has been blessed.
  serving_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=serving_destination)

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen,
          statistics_gen,
          schema_gen,
          example_validator,
          tuner,
          trainer,
          evaluator,
          model_validator,
          pusher,
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Implements the cifar10 pipeline with TFX.

  Args:
    pipeline_name: Name given to the returned pipeline.
    pipeline_root: Root location handed to the pipeline.
    data_root: Location wrapped by `external_input` and fed to
      ImportExampleGen; the 'train' split matches 'train.tfrecord' and the
      'eval' split matches 'test.tfrecord'.
    module_file: User module file shared by Transform and Trainer.
    serving_model_dir: Filesystem base directory the Pusher pushes into.
    metadata_path: Path used to build the SQLite metadata connection config.

  Returns:
    A `pipeline.Pipeline` wiring together the components built below.
  """
  examples = external_input(data_root)

  # Maps the pre-split TFRecord files onto the train and eval splits.
  input_split = example_gen_pb2.Input(splits=[
      example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
      example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord')
  ])
  example_gen = ImportExampleGen(input=examples, input_config=input_split)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=True)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=1000),
      eval_args=trainer_pb2.EvalArgs(num_steps=500))

  # Uses TFMA to compute evaluation statistics over features of a model.
  # The trained model is passed through the `model` argument, matching the
  # key names used by every other component in this pipeline.
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
          specs=[evaluator_pb2.SingleSlicingSpec()]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, evaluator, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
  )
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) BigQuery is enabled as the data source.
    query: Text,
    preprocessing_fn: Text,
    trainer_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
  """Assembles the Chicago taxi pipeline of the template.

  All template steps are active: examples come from BigQuery and every
  component from StatisticsGen through Pusher is part of the pipeline.

  Args:
    pipeline_name: Name given to the returned pipeline.
    pipeline_root: Root location handed to the pipeline.
    data_path: Not used while BigQuery is the data source.
    query: Query string passed to BigQueryExampleGen.
    preprocessing_fn: Passed to Transform as `preprocessing_fn`.
    trainer_fn: Passed to Trainer as `trainer_fn`.
    train_args: Training arguments passed to Trainer.
    eval_args: Evaluation arguments passed to Trainer.
    serving_model_dir: Filesystem base directory of the push destination.
    metadata_connection_config: Forwarded to the pipeline unchanged.
    beam_pipeline_args: Forwarded to the pipeline unchanged.
    ai_platform_training_args: When not None, Trainer runs with the AI Platform
      trainer executor and receives these arguments in its custom config.
    ai_platform_serving_args: When not None, Pusher runs with the AI Platform
      pusher executor and receives these arguments in its custom config.

  Returns:
    A `pipeline.Pipeline` containing the components in construction order.
  """
  # Ingestion. The CSV alternative would be
  # `CsvExampleGen(input=external_input(data_path))`.
  example_gen = BigQueryExampleGen(query=query)
  raw_examples = example_gen.outputs['examples']

  # Statistics for visualization and example validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  statistics = statistics_gen.outputs['statistics']

  # Schema generated from the statistics.
  schema_gen = SchemaGen(statistics=statistics, infer_feature_shape=False)
  schema = schema_gen.outputs['schema']

  # Anomaly detection based on statistics and schema.
  example_validator = ExampleValidator(statistics=statistics, schema=schema)

  # Transformations / feature engineering for training and serving.
  transform = Transform(
      examples=raw_examples, schema=schema, preprocessing_fn=preprocessing_fn)

  # Training; the AI Platform executor is used only when training arguments
  # for it were supplied.
  trainer_kwargs = dict(
      trainer_fn=trainer_fn,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=train_args,
      eval_args=eval_args,
  )
  if ai_platform_training_args is not None:
    trainer_kwargs['custom_executor_spec'] = executor_spec.ExecutorClassSpec(
        ai_platform_trainer_executor.Executor)
    trainer_kwargs['custom_config'] = {
        ai_platform_trainer_executor.TRAINING_ARGS_KEY:
            ai_platform_training_args
    }
  trainer = Trainer(**trainer_kwargs)
  model = trainer.outputs['model']

  # TFMA evaluation, sliced by trip start hour.
  slicing_spec = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  evaluator = Evaluator(
      examples=raw_examples, model=model, feature_slicing_spec=slicing_spec)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=model)

  # Pushing; the AI Platform executor is used only when serving arguments for
  # it were supplied. The filesystem destination is always configured.
  pusher_kwargs = dict(
      model=model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)),
  )
  if ai_platform_serving_args is not None:
    pusher_kwargs['custom_executor_spec'] = executor_spec.ExecutorClassSpec(
        ai_platform_pusher_executor.Executor)
    pusher_kwargs['custom_config'] = {
        ai_platform_pusher_executor.SERVING_ARGS_KEY: ai_platform_serving_args
    }
  pusher = Pusher(**pusher_kwargs)

  components = [
      example_gen, statistics_gen, schema_gen, example_validator, transform,
      trainer, evaluator, model_validator, pusher
  ]

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=True,
      metadata_connection_config=metadata_connection_config,
      beam_pipeline_args=beam_pipeline_args,
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX and Kubeflow Pipelines.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root forwarded to `pipeline.Pipeline`.
    data_root: Location handed to `external_input` as the CSV data source.
    module_file: User module file passed to both Transform and Trainer.
    serving_model_dir: Filesystem base directory used as the Pusher's push
      destination.
    direct_num_workers: Value formatted into the `--direct_num_workers` Beam
      pipeline argument.

  Returns:
    A `pipeline.Pipeline` containing the nine components built below.
  """
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  # No custom executor is configured here, so the component's default
  # executor is used.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000),
  )

  # Uses TFMA to compute evaluation statistics over features of a model.
  # `model=` replaces the deprecated `model_exports=` alias so the Evaluator
  # is wired the same way as ModelValidator and Pusher below.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to the `serving_model_dir` filesystem destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%s' % direct_num_workers])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX.

  Trains and validates a model on the training data, then runs offline batch
  inference over a separate inference data set.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root forwarded to `pipeline.Pipeline`.
    training_data_root: Location handed to `external_input` for the training
      CSV data.
    inference_data_root: Location handed to `external_input` for the CSV data
      to run bulk inference on.
    module_file: User module file passed to both Transform and Trainer.
    metadata_path: Path passed to `metadata.sqlite_metadata_connection_config`.
    direct_num_workers: Value formatted into the `--direct_num_workers` Beam
      pipeline argument.

  Returns:
    A `pipeline.Pipeline` containing the ten components built below.
  """
  training_examples = external_input(training_data_root)

  # Brings training data into the pipeline or otherwise joins/converts
  # training data. `input=` replaces the deprecated `input_base=` alias.
  training_example_gen = CsvExampleGen(
      input=training_examples, instance_name='training_example_gen')

  # Computes statistics over data for visualization and example validation.
  # `examples=` replaces the deprecated `input_data=` alias.
  statistics_gen = StatisticsGen(
      examples=training_example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=training_example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  # `model=` replaces the deprecated `model_exports=` alias.
  model_analyzer = Evaluator(
      examples=training_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=training_example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  inference_examples = external_input(inference_data_root)

  # Brings inference data into the pipeline. The output config declares a
  # single split named 'unlabelled'.
  inference_example_gen = CsvExampleGen(
      input=inference_examples,
      output_config=example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(splits=[
              example_gen_pb2.SplitConfig.Split(
                  name='unlabelled', hash_buckets=100)
          ])),
      instance_name='inference_example_gen')

  # Performs offline batch inference over inference examples.
  bulk_inferrer = BulkInferrer(
      examples=inference_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      # Empty data_spec.example_splits will result in using all splits.
      data_spec=bulk_inferrer_pb2.DataSpec(),
      model_spec=bulk_inferrer_pb2.ModelSpec())

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          training_example_gen, inference_example_gen, statistics_gen,
          infer_schema, validate_stats, transform, trainer, model_analyzer,
          model_validator, bulk_inferrer
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     user_schema_path: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX.

  Downstream components consume a user-provided schema imported from
  `user_schema_path`; SchemaGen still runs, but its output is not wired into
  any other component here.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root forwarded to `pipeline.Pipeline`.
    data_root: Location handed to `external_input` as the CSV data source.
    user_schema_path: Source URI of the schema imported by the ImporterNode.
    module_file: User module file passed to both Transform and Trainer.
    serving_model_dir: Filesystem base directory used as the Pusher's push
      destination.
    metadata_path: Path passed to `metadata.sqlite_metadata_connection_config`.
    direct_num_workers: Value formatted into the `--direct_num_workers` Beam
      pipeline argument.

  Returns:
    A `pipeline.Pipeline` containing the ten components built below.
  """
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Import user-provided schema.
  user_schema_importer = ImporterNode(
      instance_name='import_user_schema',
      source_uri=user_schema_path,
      artifact_type=Schema)

  # Generates schema based on statistics files. Even though we use the
  # user-provided schema in downstream components, we still want to generate
  # the schema of the newest data so that the user can compare and optionally
  # update the schema to use.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=user_schema_importer.outputs['result'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=user_schema_importer.outputs['result'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=user_schema_importer.outputs['result'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  # `model=` replaces the deprecated `model_exports=` alias so the Evaluator
  # is wired the same way as ModelValidator and Pusher below.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, user_schema_importer, infer_schema,
          validate_stats, transform, trainer, model_analyzer, model_validator,
          pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def _create_pipeline(pipeline_name: str, pipeline_root: str, module_file: str,
                     presto_config: presto_config_pb2.PrestoConnConfig,
                     query: str, serving_model_dir: str,
                     metadata_path: str) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX, reading from Presto.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root forwarded to `pipeline.Pipeline`.
    module_file: User module file passed to both Transform and Trainer.
    presto_config: Connection config passed to `PrestoExampleGen`.
    query: Query string passed to `PrestoExampleGen`.
    serving_model_dir: Filesystem base directory used as the Pusher's push
      destination.
    metadata_path: Path passed to `metadata.sqlite_metadata_connection_config`.

  Returns:
    A `pipeline.Pipeline` containing the nine components built below.
  """
  # Ingestion: the example channel is shared by every consumer below.
  example_gen = PrestoExampleGen(presto_config, query=query)
  raw_examples = example_gen.outputs['examples']

  # Statistics over the ingested examples, for visualization and validation.
  statistics_gen = StatisticsGen(examples=raw_examples)
  stats = statistics_gen.outputs['statistics']

  # Schema generated from the statistics.
  schema_gen = SchemaGen(statistics=stats)
  schema = schema_gen.outputs['schema']

  # Anomaly detection of the statistics against the schema.
  example_validator = ExampleValidator(statistics=stats, schema=schema)

  # Feature engineering shared by training and serving.
  transform = Transform(
      examples=raw_examples, schema=schema, module_file=module_file)

  # Model training driven by the user-provided module file.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema,
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))
  trained_model = trainer.outputs['model']

  # TFMA evaluation, sliced by the hour the trip started.
  hourly_slices = evaluator_pb2.FeatureSlicingSpec(specs=[
      evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
  ])
  evaluator = Evaluator(
      examples=raw_examples,
      model=trained_model,
      feature_slicing_spec=hourly_slices)

  # Quality validation of the candidate model (compared to a baseline).
  model_validator = ModelValidator(examples=raw_examples, model=trained_model)

  # Push to the serving directory, gated on the validator's blessing.
  serving_destination = pusher_pb2.PushDestination(
      filesystem=pusher_pb2.PushDestination.Filesystem(
          base_directory=serving_model_dir))
  pusher = Pusher(
      model=trained_model,
      model_blessing=model_validator.outputs['blessing'],
      push_destination=serving_destination)

  components = [
      example_gen,
      statistics_gen,
      schema_gen,
      example_validator,
      transform,
      trainer,
      evaluator,
      model_validator,
      pusher,
  ]
  metadata_config = metadata.sqlite_metadata_connection_config(metadata_path)
  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=components,
      enable_cache=True,
      metadata_connection_config=metadata_config,
  )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text, worker_parallelism: int,
                     runner: Text) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX.

  Args:
    pipeline_name: Name forwarded to `pipeline.Pipeline`.
    pipeline_root: Root forwarded to `pipeline.Pipeline`.
    data_root: Location handed to `external_input` as the CSV data source.
    module_file: User module file passed to both Transform and Trainer.
    serving_model_dir: Filesystem base directory used as the Pusher's push
      destination.
    metadata_path: Path passed to `metadata.sqlite_metadata_connection_config`.
    worker_parallelism: Value formatted into `--sdk_worker_parallelism` and,
      for the flink runner, `--parallelism`.
    runner: Which Beam runner arguments to emit; must be 'flink' or 'spark'.

  Returns:
    A `pipeline.Pipeline` containing the nine components built below.

  Raises:
    ValueError: If `runner` is neither 'flink' nor 'spark'.
  """
  examples = external_input(data_root)

  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = CsvExampleGen(input=examples)

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      push_destination=pusher_pb2.PushDestination(
          filesystem=pusher_pb2.PushDestination.Filesystem(
              base_directory=serving_model_dir)))

  beam_common_args = [
      '--environment_type=LOOPBACK',
      '--sdk_worker_parallelism=%d' % worker_parallelism,
      '--experiments=use_loopback_process_worker=True',

      # Setting environment_cache_millis to practically infinity enables
      # continual reuse of Beam SDK workers, improving performance.
      '--environment_cache_millis=1000000',

      # Note: We use 100 worker threads to mitigate the issue with
      # scheduling work between the Beam runner and SDK harness. Flink
      # and Spark can process unlimited work items concurrently while
      # SdkHarness can only process 1 work item per worker thread.
      # Having 100 threads will let 100 tasks execute concurrently
      # avoiding scheduling issue in most cases. In case the threads are
      # exhausted, beam prints the relevant message in the log.
      # TODO(BEAM-8151) Remove worker_threads=100 after we start using a  # pylint: disable=g-bad-todo
      # virtually unlimited thread pool by default.
      '--experiments=worker_threads=100',
  ]
  if runner == 'flink':
    beam_pipeline_args = beam_common_args + [
        '--runner=FlinkRunner',
        '--flink_master=localhost:8081',
        '--flink_submit_uber_jar',

        # TODO(ibzib) move these to flink.conf
        '--parallelism=%d' % worker_parallelism,

        # TODO(FLINK-10672): Obviate setting BATCH_FORCED.  # pylint: disable=g-bad-todo
        '--execution_mode_for_batch=BATCH_FORCED',
    ]
  elif runner == 'spark':
    beam_pipeline_args = beam_common_args + [
        # TODO(ibzib)
    ]
  else:
    # Without this branch `beam_pipeline_args` would be unbound and the return
    # statement below would fail with an opaque UnboundLocalError.
    raise ValueError(
        "Unsupported runner %r; expected 'flink' or 'spark'." % (runner,))

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          example_gen, statistics_gen, infer_schema, validate_stats, transform,
          trainer, model_analyzer, model_validator, pusher
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # LINT.IfChange
      beam_pipeline_args=beam_pipeline_args,
      # LINT.ThenChange(setup/setup_beam_on_spark.sh)
      # LINT.ThenChange(setup/setup_beam_on_flink.sh)
  )