def testHasTaskDependency(self): example_gen = CsvExampleGen(input_base="data_path") statistics_gen = StatisticsGen( examples=example_gen.outputs["examples"]) p1 = pipeline.Pipeline(pipeline_name="fake_name", pipeline_root="fake_root", components=[example_gen, statistics_gen]) self.assertFalse(compiler_utils.has_task_dependency(p1)) a = EmptyComponent(name="a").with_id("a") statistics_gen.add_downstream_node(a) p2 = pipeline.Pipeline(pipeline_name="fake_name", pipeline_root="fake_root", components=[example_gen, statistics_gen, a]) self.assertTrue(compiler_utils.has_task_dependency(p2))
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir: Text, metadata_path: Text, direct_num_workers: int) -> pipeline.Pipeline: """Implements the imdb sentiment analysis pipline with TFX.""" examples = external_input(data_root) # Brings data in to the pipline example_gen = CsvExampleGen(input=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, schema_gen, transform, ], metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), enable_cache=True, beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers], )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir: Text, metadata_path: Text, direct_num_workers: int) -> pipeline.Pipeline: """Implements the Iris flowers pipeline with TFX.""" examples = external_input(data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. infer_schema = SchemaGen( statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=infer_schema.outputs['schema']) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=module_file, examples=example_gen.outputs['examples'], schema=infer_schema.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Uses TFMA to compute a evaluation statistics over features of a model. model_analyzer = Evaluator( examples=example_gen.outputs['examples'], model=trainer.outputs['model']) # Performs quality validation of a candidate model (compared to a baseline). model_validator = ModelValidator( examples=example_gen.outputs['examples'], model=trainer.outputs['model']) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher( model=trainer.outputs['model'], model_blessing=model_validator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir))) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, infer_schema, validate_stats, trainer, model_analyzer, model_validator, pusher ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), # TODO(b/142684737): The multi-processing API might change. beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers], )
def _create_pipeline(pipeline_name: str, pipeline_root: str, data_root_1: str, data_root_2: str) -> pipeline.Pipeline: """Implements a pipeline with channel.union().""" # Brings data into the pipeline or otherwise joins/converts training data. example_gen_1 = CsvExampleGen( input_base=data_root_1).with_id('example_gen_1') example_gen_2 = CsvExampleGen( input_base=data_root_2).with_id('example_gen_2') # pylint: disable=no-value-for-parameter channel_union = ChannelUnionComponent(input_data=channel.union( [example_gen_1.outputs['examples'], example_gen_2.outputs['examples']]), name='channel_union_input') # Get the latest channel. latest_artifacts_resolver = resolver.Resolver( strategy_class=latest_artifact_strategy.LatestArtifactStrategy, resolved_channels=channel.union([ example_gen_1.outputs['examples'], channel_union.outputs['output_data'] ])).with_id('latest_artifacts_resolver') # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen( examples=latest_artifacts_resolver.outputs['resolved_channels']) return pipeline.Pipeline(pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen_1, example_gen_2, channel_union, latest_artifacts_resolver, statistics_gen ])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir: Text, metadata_path: Text, beam_pipeline_args: List[Text]) -> pipeline.Pipeline: """Implements the Penguin pipeline with TFX.""" # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=data_root) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen( statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # TODO(humichael): Handle applying transformation component in Milestone 3. # Uses user-provided Python function that trains a model using TF-Learn. # Num_steps is not provided during evaluation because the scikit-learn model # loads and evaluates the entire test set at once. # TODO(b/159470716): Make schema optional in Trainer. trainer = Trainer( module_file=module_file, custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor), examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=2000), eval_args=trainer_pb2.EvalArgs()) # TODO(humichael): Add Evaluator once it's decided how to proceed with # Milestone 2. pusher = Pusher( model=trainer.outputs['model'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir))) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, schema_gen, example_validator, trainer, pusher, ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), beam_pipeline_args=beam_pipeline_args, )
def create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir: Text, metadata_path: Text, direct_num_workers: int) -> pipeline.Pipeline: output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig( splits=[ example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=3), example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1) ])) examples = tfrecord_input(data_root) example_gen = ImportExampleGen(input=examples, output_config=output) statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) validate_stats = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=infer_schema.outputs['schema']) transform = Transform(examples=example_gen.outputs['examples'], schema=infer_schema.outputs['schema'], module_file=module_file) trainer = Trainer(module_file=module_file, examples=transform.outputs['transformed_examples'], transform_graph=transform.outputs['transform_graph'], schema=infer_schema.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=100), eval_args=trainer_pb2.EvalArgs(num_steps=50)) eval_config = tfma.EvalConfig(slicing_specs=[tfma.SlicingSpec()]) model_analyzer = Evaluator(examples=example_gen.outputs['examples'], model=trainer.outputs['model'], eval_config=eval_config) model_validator = ModelValidator(examples=example_gen.outputs['examples'], model=trainer.outputs['model']) pusher = Pusher(model=trainer.outputs['model'], model_blessing=model_analyzer.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir))) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, infer_schema, validate_stats, transform, trainer, model_analyzer, model_validator, pusher, ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir: Text, metadata_path: Text, direct_num_workers: int) -> pipeline.Pipeline: """Implements the imdb sentiment analysis pipline with TFX.""" examples = tfrecord_input(data_root) # Brings data in to the pipline example_gen = ImportExampleGen(input=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) # Uses user-provided Python function that trains a model using TF-Learn. trainer = Trainer( module_file=module_file, custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor), examples=transform.outputs['transformed_examples'], transform_graph=transform.outputs['transform_graph'], schema=schema_gen.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=200), eval_args=trainer_pb2.EvalArgs(num_steps=100)) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, #model_resolver, #valuator, #pusher, ], metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), enable_cache=True, beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers], )
def generate_statistics(self): """Constructs a StatisticsGen component on the example_gen output files Returns: self """ args = {'examples': self.example_gen.outputs['examples']} if self.user_schema_importer: args['schema'] = self.user_schema_importer.outputs['result'] self.statistics_gen = StatisticsGen(**args) return self.statistics_gen
def create_pipeline(): """Implements the chicago taxi pipeline with TFX.""" examples = csv_inputs(os.path.join(base_dir, 'no_split/span_1')) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_data=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(input_data=example_gen.outputs.examples) # Generates schema based on statistics files. infer_schema = SchemaGen(stats=statistics_gen.outputs.output) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator(stats=statistics_gen.outputs.output, schema=infer_schema.outputs.output) # Performs transformations and feature engineering in training and serving. transform = Transform(input_data=example_gen.outputs.examples, schema=infer_schema.outputs.output, module_file=taxi_pipeline_utils) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=taxi_pipeline_utils, transformed_examples=transform.outputs.transformed_examples, schema=infer_schema.outputs.output, transform_output=transform.outputs.transform_output, train_steps=10000, eval_steps=5000, warm_starting=True) # Uses TFMA to compute a evaluation statistics over features of a model. model_analyzer = Evaluator(examples=example_gen.outputs.examples, model_exports=trainer.outputs.output) # Performs quality validation of a candidate model (compared to a baseline). model_validator = ModelValidator(examples=example_gen.outputs.examples, model=trainer.outputs.output) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher(model_export=trainer.outputs.output, model_blessing=model_validator.outputs.blessing, serving_model_dir=serving_model_dir) return [ example_gen, statistics_gen, infer_schema, validate_stats, transform, trainer, model_analyzer, model_validator, pusher ]
def re_pipe(self, csvname): self.csvname = csvname examples = csv_inputs(os.path.join(self.base_dir, self.csvname)) self.example_gen = tfx.components.example_gen.csv_example_gen.component.CsvExampleGen( input=examples) self.statistics_gen = StatisticsGen( self.example_gen.outputs['examples'], instance_name=self.csvname + "_statistics_gen") self.scheme_gen = SchemaGen( statistics=self.statistics_gen.outputs['statistics']) self.valid_stats = ExampleValidator( statistics=self.statistics_gen.outputs["statistics"], schema=self.scheme_gen.outputs["schema"]) return "done"
def create_test_pipeline(): """Creates a sample pipeline with ForEach context.""" example_gen = CsvExampleGen(input_base='/data/mydummy_dataset') with for_each.ForEach(example_gen.outputs['examples']) as each_example: statistics_gen = StatisticsGen(examples=each_example) latest_stats_resolver = resolver.Resolver( statistics=statistics_gen.outputs['statistics'], strategy_class=latest_artifact_strategy.LatestArtifactStrategy, ).with_id('latest_stats_resolver') schema_gen = SchemaGen( statistics=latest_stats_resolver.outputs['statistics']) with for_each.ForEach(example_gen.outputs['examples']) as each_example: trainer = Trainer( module_file='/src/train.py', examples=each_example, schema=schema_gen.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=2000), ) with for_each.ForEach(trainer.outputs['model']) as each_model: pusher = Pusher( model=each_model, push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory='/models')), ) return pipeline.Pipeline( pipeline_name='foreach', pipeline_root='/tfx/pipelines/foreach', components=[ example_gen, statistics_gen, latest_stats_resolver, schema_gen, trainer, pusher, ], enable_cache=True, execution_mode=pipeline.ExecutionMode.SYNC, )
def create_pipe(): instance = ExternalArtifact() instance.uri = str(CSV_PATH) channels = channel_utils.as_channel([instance]) example_gen = CsvExampleGen(input=channels) statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics']) example_val = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=infer_schema.outputs['schema']) transform = Transform(examples=example_gen.outputs['examples'], schema=infer_schema.outputs['schema'], module_file='/root/airflow/dags/infer.py') trainer = Trainer( module_file='/root/airflow/dags/infer.py', custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor), transformed_examples=transform.outputs['transformed_examples'], schema=infer_schema.outputs['schema'], transform_graph=transform.outputs['transform_graph'], train_args=trainer_pb2.TrainArgs(num_steps=200), eval_args=trainer_pb2.EvalArgs(num_steps=100)) # Database config metadata_config = metadata.sqlite_metadata_connection_config('teste_1.db') pipe = pipeline.Pipeline( pipeline_name='teste_1', pipeline_root='/root/', components=[ example_gen, statistics_gen, infer_schema, example_val, transform, trainer ], metadata_connection_config=metadata_config, ) return pipe
def create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, beam_pipeline_args: Text) -> pipeline.Pipeline: """Custom component demo pipeline.""" examples = external_input(data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input=examples) hello = component.HelloComponent( input_data=example_gen.outputs['examples'], name=u'HelloWorld') # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=hello.outputs['output_data']) return pipeline.Pipeline(pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[example_gen, hello, statistics_gen], enable_cache=True, beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, metadata_path: Text) -> pipeline.Pipeline: """Implements the chicago taxi pipeline with TFX.""" # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input_base=data_root) hello = component.HelloComponent( input_data=example_gen.outputs['examples'], name=u'HelloWorld') # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=hello.outputs['output_data']) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[example_gen, hello, statistics_gen], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path))
def build(self, context: Context) -> BaseNode: from tfx.components import StatisticsGen statistics_artifact = standard_artifacts.ExampleStatistics() statistics_artifact.split_names = artifact_utils.encode_split_names( splits_or_example_defaults(self._config.params.split_names)) output = Channel(type=standard_artifacts.ExampleStatistics, artifacts=[statistics_artifact]) examples = context.get(self._config.inputs.examples) component = StatisticsGen( examples=examples, stats_options=None, output=output, instance_name=context.abs_current_url_friendly) put_outputs_to_context(context, self._config.outputs, component) return component
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, metadata_path: Text) -> pipeline.Pipeline: """Implements the Iris flowers pipeline with TFX.""" examples = external_input(data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=infer_schema.outputs['schema']) # Hyperparameter tuning based on the tuner_fn in module_file. tuner = Tuner(examples=example_gen.outputs['examples'], schema=infer_schema.outputs['schema'], module_file=module_file) # TODO(jyzhao): support trainer and following components. return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, infer_schema, validate_stats, tuner, ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), )
def create_e2e_components(csv_input_location: Text, ) -> List[BaseComponent]: """Creates components for a simple Chicago Taxi TFX pipeline for testing. Because we don't need to run whole pipeline, we will make a very short toy pipeline. Args: csv_input_location: The location of the input data directory. Returns: A list of TFX components that constitutes an end-to-end test pipeline. """ examples = dsl_utils.external_input(csv_input_location) example_gen = CsvExampleGen(input=examples) statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=False) return [example_gen, statistics_gen, schema_gen]
def generate_pipeline(pipeline_name, pipeline_root, data_root, train_steps, eval_steps): examples = external_input(data_root) example_gen = CsvExampleGen(input=examples) statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=False) trainer = Trainer( examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file='util.py', # util.py is a file in the same folder train_args=trainer_pb2.TrainArgs(num_steps=train_steps), eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps)) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[example_gen, statistics_gen, schema_gen, trainer], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( os.path.join(pipeline_root, 'metadata.sqlite')))
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, module_file_lite: Text, serving_model_dir: Text, serving_model_dir_lite: Text, metadata_path: Text, beam_pipeline_args: List[Text]) -> pipeline.Pipeline: """Implements the handwritten digit classification example using TFX.""" # Brings data into the pipeline. example_gen = ImportExampleGen(input_base=data_root) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) def _create_trainer(module_file, instance_name): return Trainer(module_file=module_file, custom_executor_spec=executor_spec.ExecutorClassSpec( GenericExecutor), examples=transform.outputs['transformed_examples'], transform_graph=transform.outputs['transform_graph'], schema=schema_gen.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=5000), eval_args=trainer_pb2.EvalArgs(num_steps=100), instance_name=instance_name) # Uses user-provided Python function that trains a Keras model. trainer = _create_trainer(module_file, 'mnist') # Trains the same model as the one above, but converts it into a TFLite one. trainer_lite = _create_trainer(module_file_lite, 'mnist_lite') # TODO(b/150949276): Add resolver back once it supports two trainers. # Uses TFMA to compute an evaluation statistics over features of a model and # performs quality validation of a candidate model. eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(label_key='image_class')], slicing_specs=[tfma.SlicingSpec()], metrics_specs=[ tfma.MetricsSpec(metrics=[ tfma.MetricConfig( class_name='SparseCategoricalAccuracy', threshold=tfma.config.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': 0.8}))) ]) ]) eval_config_lite = tfma.EvalConfig() eval_config_lite.CopyFrom(eval_config) # Informs the evaluator that the model is a TFLite model. eval_config_lite.model_specs[0].model_type = 'tf_lite' # Uses TFMA to compute the evaluation statistics over features of a model. evaluator = Evaluator(examples=example_gen.outputs['examples'], model=trainer.outputs['model'], eval_config=eval_config, instance_name='mnist') # Uses TFMA to compute the evaluation statistics over features of a TFLite # model. evaluator_lite = Evaluator(examples=example_gen.outputs['examples'], model=trainer_lite.outputs['model'], eval_config=eval_config_lite, instance_name='mnist_lite') # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher(model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir)), instance_name='mnist') # Checks whether the TFLite model passed the validation steps and pushes the # model to a file destination if check passed. pusher_lite = Pusher(model=trainer_lite.outputs['model'], model_blessing=evaluator_lite.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir_lite)), instance_name='mnist_lite') return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, trainer_lite, evaluator, evaluator_lite, pusher, pusher_lite, ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), beam_pipeline_args=beam_pipeline_args)
def create_pipeline( pipeline_name: Text, pipeline_root: Text, data_path: Text, # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source. # query: Text, preprocessing_fn: Text, run_fn: Text, train_args: trainer_pb2.TrainArgs, eval_args: trainer_pb2.EvalArgs, eval_accuracy_threshold: float, serving_model_dir: Text, metadata_connection_config: Optional[ metadata_store_pb2.ConnectionConfig] = None, beam_pipeline_args: Optional[List[Text]] = None, ai_platform_training_args: Optional[Dict[Text, Text]] = None, ai_platform_serving_args: Optional[Dict[Text, Any]] = None, ) -> pipeline.Pipeline: """Implements the chicago taxi pipeline with TFX.""" components = [] # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input=external_input(data_path)) # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source. # example_gen = BigQueryExampleGen(query=query) components.append(example_gen) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # TODO(step 5): Uncomment here to add StatisticsGen to the pipeline. # components.append(statistics_gen) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=False) # TODO(step 5): Uncomment here to add SchemaGen to the pipeline. # components.append(schema_gen) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( # pylint: disable=unused-variable statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # TODO(step 5): Uncomment here to add ExampleValidator to the pipeline. # components.append(example_validator) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], preprocessing_fn=preprocessing_fn) # TODO(step 6): Uncomment here to add Transform to the pipeline. # components.append(transform) # Uses user-provided Python function that implements a model using TF-Learn. trainer_args = { 'run_fn': run_fn, 'transformed_examples': transform.outputs['transformed_examples'], 'schema': schema_gen.outputs['schema'], 'transform_graph': transform.outputs['transform_graph'], 'train_args': train_args, 'eval_args': eval_args, 'custom_executor_spec': executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor), } if ai_platform_training_args is not None: trainer_args.update({ 'custom_executor_spec': executor_spec.ExecutorClassSpec( ai_platform_trainer_executor.GenericExecutor), 'custom_config': { ai_platform_trainer_executor.TRAINING_ARGS_KEY: ai_platform_training_args, } }) trainer = Trainer(**trainer_args) # TODO(step 6): Uncomment here to add Trainer to the pipeline. # components.append(trainer) # Get the latest blessed model for model validation. model_resolver = ResolverNode( instance_name='latest_blessed_model_resolver', resolver_class=latest_blessed_model_resolver. LatestBlessedModelResolver, model=Channel(type=Model), model_blessing=Channel(type=ModelBlessing)) # TODO(step 6): Uncomment here to add ResolverNode to the pipeline. # components.append(model_resolver) # Uses TFMA to compute a evaluation statistics over features of a model and # perform quality validation of a candidate model (compared to a baseline). eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(label_key='tips')], slicing_specs=[tfma.SlicingSpec()], metrics_specs=[ tfma.MetricsSpec(metrics=[ tfma.MetricConfig( class_name='BinaryAccuracy', threshold=tfma.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': eval_accuracy_threshold}), change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-10}))) ]) ]) evaluator = Evaluator( examples=example_gen.outputs['examples'], model=trainer.outputs['model'], baseline_model=model_resolver.outputs['model'], # Change threshold will be ignored if there is no baseline (first run). eval_config=eval_config) # TODO(step 6): Uncomment here to add Evaluator to the pipeline. # components.append(evaluator) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher_args = { 'model': trainer.outputs['model'], 'model_blessing': evaluator.outputs['blessing'], 'push_destination': pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir)), } if ai_platform_serving_args is not None: pusher_args.update({ 'custom_executor_spec': executor_spec.ExecutorClassSpec( ai_platform_pusher_executor.Executor), 'custom_config': { ai_platform_pusher_executor.SERVING_ARGS_KEY: ai_platform_serving_args }, }) pusher = Pusher(**pusher_args) # pylint: disable=unused-variable # TODO(step 6): Uncomment here to add Pusher to the pipeline. # components.append(pusher) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=components, # TODO(step 8): Change this value to control caching of execution results. enable_cache=True, metadata_connection_config=metadata_connection_config, beam_pipeline_args=beam_pipeline_args, )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir: Text, metadata_path: Text, beam_pipeline_args: List[Text]) -> pipeline.Pipeline: """Implements the chicago taxi pipeline with TFX.""" #examples = external_input(data_root) # Brings data into the pipeline or otherwise joins/converts training data. #example_gen = CsvExampleGen(input=examples) example_gen = CsvExampleGen(input_base=data_root) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen( statistics=statistics_gen.outputs['statistics'], infer_feature_shape=False) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform( examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=module_file, transformed_examples=transform.outputs['transformed_examples'], schema=schema_gen.outputs['schema'], transform_graph=transform.outputs['transform_graph'], train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Get the latest blessed model for model validation. model_resolver = ResolverNode( instance_name='latest_blessed_model_resolver', resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver, model=Channel(type=Model), model_blessing=Channel(type=ModelBlessing)) # Uses TFMA to compute a evaluation statistics over features of a model and # perform quality validation of a candidate model (compared to a baseline). eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(signature_name='eval')], slicing_specs=[ tfma.SlicingSpec(), tfma.SlicingSpec(feature_keys=['trip_start_hour']) ], metrics_specs=[ tfma.MetricsSpec( thresholds={ 'accuracy': tfma.config.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': 0.6}), change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-10})) }) ]) evaluator = Evaluator( examples=example_gen.outputs['examples'], model=trainer.outputs['model'], baseline_model=model_resolver.outputs['model'], # Change threshold will be ignored if there is no baseline (first run). eval_config=eval_config) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher( model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir))) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, model_resolver, evaluator, pusher ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), beam_pipeline_args=beam_pipeline_args)
def generate_pipeline(pipeline_name, pipeline_root, train_data, test_data, train_steps, eval_steps, pusher_target, runner): module_file = 'util.py' # util.py is a file in the same folder # RuntimeParameter is only supported on KubeflowDagRunner currently if runner == 'kubeflow': pipeline_root_param = os.path.join('gs://{{kfp-default-bucket}}', pipeline_name, '{{workflow.uid}}') train_data_param = data_types.RuntimeParameter( name='train-data', default= 'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/train', ptype=Text) test_data_param = data_types.RuntimeParameter( name='test-data', default= 'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/test', ptype=Text) pusher_target_param = data_types.RuntimeParameter( name='pusher-destination', default= 'gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/serving', ptype=Text) else: pipeline_root_param = pipeline_root train_data_param = train_data test_data_param = test_data pusher_target_param = pusher_target examples = external_input(train_data_param) example_gen = CsvExampleGen(input=examples, instance_name="train") test_examples = external_input(test_data_param) test_example_gen = CsvExampleGen(input=test_examples, output_config={ 'split_config': { 'splits': [{ 'name': 'test', 'hash_buckets': 1 }] } }, instance_name="test") statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True ) # infer_feature_shape controls sparse or dense # Transform is too slow in my side. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) trainer = Trainer( custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor), examples=transform.outputs['transformed_examples'], transform_graph=transform.outputs['transform_graph'], schema=schema_gen.outputs['schema'], module_file=module_file, train_args=trainer_pb2.TrainArgs(num_steps=train_steps), eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps), instance_name="train", enable_cache=False) # Get the latest blessed model for model validation. model_resolver = ResolverNode( instance_name='latest_blessed_model_resolver', resolver_class=latest_blessed_model_resolver. LatestBlessedModelResolver, model=Channel(type=Model), model_blessing=Channel(type=ModelBlessing)) # Uses TFMA to compute a evaluation statistics over features of a model and # perform quality validation of a candidate model (compared to a baseline). eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(label_key='target')], # tfma.SlicingSpec(feature_keys=['var_0', 'var_1']) when add more, Evaluator can't ouptput BLESSED status. It should be a bug in TFMA. slicing_specs=[tfma.SlicingSpec()], metrics_specs=[ tfma.MetricsSpec( thresholds={ 'binary_accuracy': tfma.config.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': 0.4}), change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-10})) }) ]) evaluator = Evaluator( examples=example_gen.outputs['examples'], model=trainer.outputs['model'], # baseline_model=model_resolver.outputs['model'], # Change threshold will be ignored if there is no baseline (first run). eval_config=eval_config, instance_name="eval5") # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher(model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], push_destination={ 'filesystem': { 'base_directory': pusher_target_param } }) bulk_inferrer = BulkInferrer( examples=test_example_gen.outputs['examples'], model=trainer.outputs['model'], # model_blessing=evaluator.outputs['blessing'], data_spec=bulk_inferrer_pb2.DataSpec(), model_spec=bulk_inferrer_pb2.ModelSpec(), instance_name="bulkInferrer") hello = component.HelloComponent( input_data=bulk_inferrer.outputs['inference_result'], instance_name='csvGen') return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root_param, components=[ example_gen, statistics_gen, schema_gen, transform, trainer, model_resolver, evaluator, pusher, hello, test_example_gen, bulk_inferrer ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( os.path.join(pipeline_root, 'metadata.sqlite')), beam_pipeline_args=['--direct_num_workers=0'])
def _create_pipeline(pipeline_name: str, pipeline_root: str, module_file: str, presto_config: presto_config_pb2.PrestoConnConfig, query: str, serving_model_dir: str, metadata_path: str) -> pipeline.Pipeline: """Implements the chicago taxi pipeline with TFX.""" # Brings data into the pipeline or otherwise joins/converts training data example_gen = PrestoExampleGen(presto_config, query=query) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics']) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) # Uses user-provided Python function that implements a model. trainer = Trainer( module_file=module_file, transformed_examples=transform.outputs['transformed_examples'], schema=schema_gen.outputs['schema'], transform_graph=transform.outputs['transform_graph'], train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Uses TFMA to compute a evaluation statistics over features of a model. evaluator = Evaluator( examples=example_gen.outputs['examples'], model=trainer.outputs['model'], feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_hour']) ])) # Performs quality validation of a candidate model (compared to a baseline). model_validator = ModelValidator(examples=example_gen.outputs['examples'], model=trainer.outputs['model']) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher(model=trainer.outputs['model'], model_blessing=model_validator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir))) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, evaluator, model_validator, pusher ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, training_data_root: Text, inference_data_root: Text, module_file: Text, metadata_path: Text, beam_pipeline_args: List[Text]) -> pipeline.Pipeline: """Implements the chicago taxi pipeline with TFX.""" # Brings training data into the pipeline or otherwise joins/converts # training data. training_example_gen = CsvExampleGen(input_base=training_data_root, instance_name='training_example_gen') # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen( input_data=training_example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=False) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=training_example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=module_file, transformed_examples=transform.outputs['transformed_examples'], schema=schema_gen.outputs['schema'], transform_graph=transform.outputs['transform_graph'], train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Get the latest blessed model for model validation. model_resolver = ResolverNode( instance_name='latest_blessed_model_resolver', resolver_class=latest_blessed_model_resolver. LatestBlessedModelResolver, model=Channel(type=Model), model_blessing=Channel(type=ModelBlessing)) # Uses TFMA to compute a evaluation statistics over features of a model and # perform quality validation of a candidate model (compared to a baseline). eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(signature_name='eval')], slicing_specs=[ tfma.SlicingSpec(), tfma.SlicingSpec(feature_keys=['trip_start_hour']) ], metrics_specs=[ tfma.MetricsSpec( thresholds={ 'accuracy': tfma.config.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': 0.6}), # Change threshold will be ignored if there is no # baseline model resolved from MLMD (first run). change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-10})) }) ]) evaluator = Evaluator(examples=training_example_gen.outputs['examples'], model=trainer.outputs['model'], baseline_model=model_resolver.outputs['model'], eval_config=eval_config) # Brings inference data into the pipeline. inference_example_gen = CsvExampleGen( input_base=inference_data_root, output_config=example_gen_pb2.Output( split_config=example_gen_pb2.SplitConfig(splits=[ example_gen_pb2.SplitConfig.Split(name='unlabelled', hash_buckets=100) ])), instance_name='inference_example_gen') # Performs offline batch inference over inference examples. bulk_inferrer = BulkInferrer( examples=inference_example_gen.outputs['examples'], model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], # Empty data_spec.example_splits will result in using all splits. data_spec=bulk_inferrer_pb2.DataSpec(), model_spec=bulk_inferrer_pb2.ModelSpec()) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ training_example_gen, inference_example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, model_resolver, evaluator, bulk_inferrer ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, training_data_root: Text, inference_data_root: Text, module_file: Text, metadata_path: Text, direct_num_workers: int) -> pipeline.Pipeline: """Implements the chicago taxi pipeline with TFX.""" training_examples = external_input(training_data_root) # Brings training data into the pipeline or otherwise joins/converts # training data. training_example_gen = CsvExampleGen( input_base=training_examples, instance_name='training_example_gen') # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen( input_data=training_example_gen.outputs['examples']) # Generates schema based on statistics files. infer_schema = SchemaGen( statistics=statistics_gen.outputs['statistics'], infer_feature_shape=False) # Performs anomaly detection based on statistics and data schema. validate_stats = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=infer_schema.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform( examples=training_example_gen.outputs['examples'], schema=infer_schema.outputs['schema'], module_file=module_file) # Uses user-provided Python function that implements a model using TF-Learn. trainer = Trainer( module_file=module_file, transformed_examples=transform.outputs['transformed_examples'], schema=infer_schema.outputs['schema'], transform_graph=transform.outputs['transform_graph'], train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) # Uses TFMA to compute a evaluation statistics over features of a model. model_analyzer = Evaluator( examples=training_example_gen.outputs['examples'], model_exports=trainer.outputs['model'], feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_hour']) ])) # Performs quality validation of a candidate model (compared to a baseline). model_validator = ModelValidator( examples=training_example_gen.outputs['examples'], model=trainer.outputs['model']) inference_examples = external_input(inference_data_root) # Brings inference data into the pipeline. inference_example_gen = CsvExampleGen( input_base=inference_examples, output_config=example_gen_pb2.Output( split_config=example_gen_pb2.SplitConfig( splits=[example_gen_pb2.SplitConfig.Split( name='unlabelled', hash_buckets=100)])), instance_name='inference_example_gen') # Performs offline batch inference over inference examples. bulk_inferrer = BulkInferrer( examples=inference_example_gen.outputs['examples'], model=trainer.outputs['model'], model_blessing=model_validator.outputs['blessing'], # Empty data_spec.example_splits will result in using all splits. data_spec=bulk_inferrer_pb2.DataSpec(), model_spec=bulk_inferrer_pb2.ModelSpec()) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ training_example_gen, inference_example_gen, statistics_gen, infer_schema, validate_stats, transform, trainer, model_analyzer, model_validator, bulk_inferrer ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), # TODO(b/141578059): The multi-processing API might change. beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir: Text, metadata_path: Text, beam_pipeline_args: List[Text]) -> pipeline.Pipeline: """Implements the imdb sentiment analysis pipline with TFX.""" output = example_gen_pb2.Output(split_config=example_gen_pb2.SplitConfig( splits=[ example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=9), example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1) ])) # Brings data in to the pipline example_gen = CsvExampleGen(input_base=data_root, output_config=output) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) # Uses user-provided Python function that trains a model. trainer = Trainer(module_file=module_file, examples=transform.outputs['transformed_examples'], transform_graph=transform.outputs['transform_graph'], schema=schema_gen.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=500), eval_args=trainer_pb2.EvalArgs(num_steps=200)) # Get the latest blessed model for model validation. model_resolver = ResolverNode( instance_name='latest_blessed_model_resolver', resolver_class=latest_blessed_model_resolver. LatestBlessedModelResolver, model=Channel(type=Model), model_blessing=Channel(type=ModelBlessing)) # Uses TFMA to compute evaluation statistics over features of a model and # perform quality validation of a candidate model (compared to a baseline). eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(label_key='label')], slicing_specs=[tfma.SlicingSpec()], metrics_specs=[ tfma.MetricsSpec(metrics=[ tfma.MetricConfig( class_name='BinaryAccuracy', threshold=tfma.MetricThreshold( value_threshold=tfma.GenericValueThreshold( # Increase this threshold when training on complete # dataset. lower_bound={'value': 0.01}), # Change threshold will be ignored if there is no # baseline model resolved from MLMD (first run). change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-2}))) ]) ]) evaluator = Evaluator(examples=example_gen.outputs['examples'], model=trainer.outputs['model'], baseline_model=model_resolver.outputs['model'], eval_config=eval_config) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher(model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir))) components = [ example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, model_resolver, evaluator, pusher, ] return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=components, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), enable_cache=True, beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir: Text, metadata_path: Text, direct_num_workers: int) -> pipeline.Pipeline: """Implements the Iris flowers pipeline with TFX.""" examples = external_input(data_root) # Brings data into the pipeline or otherwise joins/converts training data. example_gen = CsvExampleGen(input=examples) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) # Uses user-provided Python function that trains a model using TF-Learn. trainer = Trainer( module_file=module_file, custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor), examples=transform.outputs['transformed_examples'], transform_graph=transform.outputs['transform_graph'], schema=schema_gen.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=2000), eval_args=trainer_pb2.EvalArgs(num_steps=5)) # Get the latest blessed model for model validation. model_resolver = ResolverNode( instance_name='latest_blessed_model_resolver', resolver_class=latest_blessed_model_resolver. LatestBlessedModelResolver, model=Channel(type=Model), model_blessing=Channel(type=ModelBlessing)) # Uses TFMA to compute an evaluation statistics over features of a model and # perform quality validation of a candidate model (compared to a baseline). eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(label_key='variety')], slicing_specs=[tfma.SlicingSpec()], metrics_specs=[ tfma.MetricsSpec(metrics=[ tfma.MetricConfig( class_name='SparseCategoricalAccuracy', threshold=tfma.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': 0.6}), change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-10}))) ]) ]) evaluator = Evaluator( examples=example_gen.outputs['examples'], model=trainer.outputs['model'], baseline_model=model_resolver.outputs['model'], # Change threshold will be ignored if there is no baseline (first run). eval_config=eval_config) # Performs infra validation of a candidate model to prevent unservable model # from being pushed. This config will launch a model server of the latest # TensorFlow Serving image in a local docker engine. infra_validator = InfraValidator( model=trainer.outputs['model'], examples=example_gen.outputs['examples'], serving_spec=infra_validator_pb2.ServingSpec( tensorflow_serving=infra_validator_pb2.TensorFlowServing( tags=['latest']), local_docker=infra_validator_pb2.LocalDockerConfig()), request_spec=infra_validator_pb2.RequestSpec( tensorflow_serving=infra_validator_pb2. TensorFlowServingRequestSpec())) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher(model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], infra_blessing=infra_validator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir))) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=[ example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, model_resolver, evaluator, infra_validator, pusher, ], enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), # TODO(b/142684737): The multi-processing API might change. beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers], )
def create_pipeline( pipeline_name: Text, pipeline_root: Text, data_path: Text, preprocessing_fn: Text, run_fn: Text, train_args: trainer_pb2.TrainArgs, eval_args: trainer_pb2.EvalArgs, eval_accuracy_threshold: float, serving_model_dir: Text, metadata_connection_config: Optional[ metadata_store_pb2.ConnectionConfig] = None, beam_pipeline_args: Optional[List[Text]] = None, ) -> pipeline.Pipeline: """Implements the penguin pipeline with TFX.""" components = [] # Brings data into the pipeline or otherwise joins/converts training data. # TODO(step 2): Might use another ExampleGen class for your data. example_gen = CsvExampleGen(input_base=data_path) components.append(example_gen) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) components.append(statistics_gen) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) components.append(schema_gen) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( # pylint: disable=unused-variable statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) components.append(example_validator) # Performs transformations and feature engineering in training and serving. transform = Transform( # pylint: disable=unused-variable examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], preprocessing_fn=preprocessing_fn) # TODO(step 3): Uncomment here to add Transform to the pipeline. # components.append(transform) # Uses user-provided Python function that implements a model using Tensorflow. trainer = Trainer( run_fn=run_fn, examples=example_gen.outputs['examples'], # Use outputs of Transform as training inputs if Transform is used. # examples=transform.outputs['transformed_examples'], # transform_graph=transform.outputs['transform_graph'], schema=schema_gen.outputs['schema'], train_args=train_args, eval_args=eval_args) # TODO(step 4): Uncomment here to add Trainer to the pipeline. # components.append(trainer) # Get the latest blessed model for model validation. model_resolver = resolver.Resolver( strategy_class=latest_blessed_model_resolver. LatestBlessedModelResolver, model=Channel(type=Model), model_blessing=Channel( type=ModelBlessing)).with_id('latest_blessed_model_resolver') # TODO(step 5): Uncomment here to add Resolver to the pipeline. # components.append(model_resolver) # Uses TFMA to compute a evaluation statistics over features of a model and # perform quality validation of a candidate model (compared to a baseline). eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(label_key=features.LABEL_KEY)], slicing_specs=[tfma.SlicingSpec()], metrics_specs=[ tfma.MetricsSpec(metrics=[ tfma.MetricConfig( class_name='SparseCategoricalAccuracy', threshold=tfma.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': eval_accuracy_threshold}), change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-10}))) ]) ]) evaluator = Evaluator( # pylint: disable=unused-variable examples=example_gen.outputs['examples'], model=trainer.outputs['model'], baseline_model=model_resolver.outputs['model'], # Change threshold will be ignored if there is no baseline (first run). eval_config=eval_config) # TODO(step 5): Uncomment here to add Evaluator to the pipeline. # components.append(evaluator) # Pushes the model to a file destination if check passed. pusher = Pusher( # pylint: disable=unused-variable model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir))) # TODO(step 5): Uncomment here to add Pusher to the pipeline. # components.append(pusher) return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=components, # Change this value to control caching of execution results. Default value # is `False`. # enable_cache=True, metadata_connection_config=metadata_connection_config, beam_pipeline_args=beam_pipeline_args, )
def testTaxiPipelineNewStyleCompatibility(self): examples = external_input('/tmp/fake/path') example_gen = CsvExampleGen(input=examples) self.assertIs(example_gen.inputs['input'], example_gen.inputs['input_base']) statistics_gen = StatisticsGen( examples=example_gen.outputs['examples']) self.assertIs(statistics_gen.inputs['examples'], statistics_gen.inputs['input_data']) infer_schema = SchemaGen( statistics=statistics_gen.outputs['statistics']) self.assertIs(infer_schema.inputs['statistics'], infer_schema.inputs['stats']) self.assertIs(infer_schema.outputs['schema'], infer_schema.outputs['output']) validate_examples = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=infer_schema.outputs['schema']) self.assertIs(validate_examples.inputs['statistics'], validate_examples.inputs['stats']) self.assertIs(validate_examples.outputs['anomalies'], validate_examples.outputs['output']) transform = Transform(examples=example_gen.outputs['examples'], schema=infer_schema.outputs['schema'], module_file='/tmp/fake/module/file') self.assertIs(transform.inputs['examples'], transform.inputs['input_data']) self.assertIs(transform.outputs['transform_graph'], transform.outputs['transform_output']) trainer = Trainer( module_file='/tmp/fake/module/file', transformed_examples=transform.outputs['transformed_examples'], schema=infer_schema.outputs['schema'], transform_graph=transform.outputs['transform_graph'], train_args=trainer_pb2.TrainArgs(num_steps=10000), eval_args=trainer_pb2.EvalArgs(num_steps=5000)) self.assertIs(trainer.inputs['transform_graph'], trainer.inputs['transform_output']) self.assertIs(trainer.outputs['model'], trainer.outputs['output']) evaluator = Evaluator( examples=example_gen.outputs['examples'], model=trainer.outputs['model'], feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[ evaluator_pb2.SingleSlicingSpec( column_for_slicing=['trip_start_hour']) ])) self.assertIs(evaluator.inputs['model'], evaluator.inputs['model_exports']) self.assertIs(evaluator.outputs['evaluation'], evaluator.outputs['output']) model_validator = ModelValidator( examples=example_gen.outputs['examples'], model=trainer.outputs['model']) pusher = Pusher(model=trainer.outputs['output'], model_blessing=model_validator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory='/fake/serving/dir'))) self.assertIs(pusher.inputs['model'], pusher.inputs['model_export']) self.assertIs(pusher.outputs['pushed_model'], pusher.outputs['model_push'])
def create_e2e_components( pipeline_root: Text, csv_input_location: Text, transform_module: Text, trainer_module: Text, ) -> List[BaseComponent]: """Creates components for a simple Chicago Taxi TFX pipeline for testing. Args: pipeline_root: The root of the pipeline output. csv_input_location: The location of the input data directory. transform_module: The location of the transform module file. trainer_module: The location of the trainer module file. Returns: A list of TFX components that constitutes an end-to-end test pipeline. """ example_gen = CsvExampleGen(input_base=csv_input_location) statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics']) example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=transform_module) latest_model_resolver = resolver.Resolver( strategy_class=latest_artifact_strategy.LatestArtifactStrategy, latest_model=Channel(type=Model)).with_id('latest_model_resolver') trainer = Trainer( transformed_examples=transform.outputs['transformed_examples'], schema=schema_gen.outputs['schema'], base_model=latest_model_resolver.outputs['latest_model'], transform_graph=transform.outputs['transform_graph'], train_args=trainer_pb2.TrainArgs(num_steps=10), eval_args=trainer_pb2.EvalArgs(num_steps=5), module_file=trainer_module, ) # Set the TFMA config for Model Evaluation and Validation. eval_config = tfma.EvalConfig( model_specs=[tfma.ModelSpec(signature_name='eval')], metrics_specs=[ tfma.MetricsSpec( metrics=[tfma.MetricConfig(class_name='ExampleCount')], thresholds={ 'accuracy': tfma.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': 0.5}), change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-10})) }) ], slicing_specs=[ tfma.SlicingSpec(), tfma.SlicingSpec(feature_keys=['trip_start_hour']) ]) evaluator = Evaluator(examples=example_gen.outputs['examples'], model=trainer.outputs['model'], eval_config=eval_config) infra_validator = InfraValidator( model=trainer.outputs['model'], examples=example_gen.outputs['examples'], serving_spec=infra_validator_pb2.ServingSpec( tensorflow_serving=infra_validator_pb2.TensorFlowServing( tags=['latest']), kubernetes=infra_validator_pb2.KubernetesConfig()), request_spec=infra_validator_pb2.RequestSpec( tensorflow_serving=infra_validator_pb2. TensorFlowServingRequestSpec())) pusher = Pusher( model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=os.path.join(pipeline_root, 'model_serving')))) return [ example_gen, statistics_gen, schema_gen, example_validator, transform, latest_model_resolver, trainer, evaluator, infra_validator, pusher, ]