input_config=input_config) return span_example # %% if __name__ == '__main__': context = InteractiveContext(pipeline_root=config.PIPELINE_ROOT) # %% complaint_df = pd.read_csv(config.DATA_FILE_PATH, encoding='utf-8') # %% #ImportExampleGen with TFRecord complaint_tfrecord = tfrecord_data_writer(file_path=config.DATA_FILE_PATH) example_gen = ImportExampleGen(input_base=config.RECORD_DIR_PATH) context.run(example_gen) # %% #Plain simple csv file for CsvExampleGen example_gen = CsvExampleGen(input_base=config.DATA_DIR_PATH) context.run(example_gen) # %% #Data Split split_example_gen = data_split(file_path=config.DATA_SPLITS_DIR_PATH) context.run(split_example_gen) # %% #Existing Data Split #Won't run through as there is no train folder
def build_pipeline(timestamp: str) -> pipeline.Pipeline:
    """Assemble the TFX components into a Beam-orchestrated pipeline.

    Args:
        timestamp: Run timestamp appended to the serving-model and
            pipeline-root paths so each run writes to a fresh location.

    Returns:
        The fully wired ``pipeline.Pipeline`` instance.

    NOTE(review): this function mutates the module-level ``conf`` dict in
    place; calling it more than once per process appends the
    ``/beam/OL653374/...`` suffixes repeatedly. Call it at most once.
    """
    # Namespace all output locations under a fixed order-line id + timestamp.
    # (The id was previously written as the f-string constant `OL{653374}`,
    # which evaluates to the same literal.)
    conf['beam']['serving_model_dir'] = (
        f"{conf['beam']['serving_model_dir']}/beam/OL653374/{timestamp}")
    conf['beam']['pipeline_root_dir'] = (
        f"{conf['beam']['pipeline_root_dir']}/beam/OL653374/{timestamp}")
    # Metadata is not namespaced by timestamp, so runs share one MLMD store.
    conf['beam']['metadata_path'] = (
        f"{conf['beam']['metadata_path']}/beam/OL653374")
    logging.info("Serving model dir is now %s",
                 conf['beam']['serving_model_dir'])

    # Ingest pre-materialized TFRecord examples.
    example_gen = ImportExampleGen(input_base=conf['train_data'])
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs['statistics'],
        infer_feature_shape=False)
    transform = Transform(
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'],
        module_file=conf['trainer_module_file'])
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    trainer = Trainer(
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        module_file=conf['trainer_module_file'],
        # GenericExecutor makes Trainer use run_fn instead of trainer_fn.
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        train_args=trainer_pb2.TrainArgs(num_steps=conf['train_args_steps']),
        eval_args=trainer_pb2.EvalArgs(num_steps=50))

    # Metrics the Evaluator computes over the eval split.
    metrics = [
        tfma.metrics.ExampleCount(name='example_count'),
        tfma.metrics.WeightedExampleCount(name='weighted_example_count'),
        tf.keras.metrics.BinaryCrossentropy(name='binary_crossentropy'),
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc', num_thresholds=10),
        tf.keras.metrics.AUC(
            name='auc_precision_recall', curve='PR', num_thresholds=100),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tfma.metrics.MeanLabel(name='mean_label'),
        tfma.metrics.MeanPrediction(name='mean_prediction'),
        tfma.metrics.Calibration(name='calibration'),
        tfma.metrics.ConfusionMatrixPlot(name='confusion_matrix_plot'),
        tfma.metrics.CalibrationPlot(name='calibration_plot')
    ]
    my_metrics_specs = tfma.metrics.specs_from_metrics(metrics)
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='label')],
        metrics_specs=my_metrics_specs,
        slicing_specs=[
            tfma.SlicingSpec(),
        ])

    # Resolves the latest blessed model to serve as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        eval_config=eval_config)
    # Pushes to the serving dir only when the Evaluator blesses the model.
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=conf['beam']['serving_model_dir'])))

    components = [
        example_gen, statistics_gen, schema_gen, transform, example_validator,
        trainer, model_resolver, evaluator, pusher
    ]
    tfx_pipeline = pipeline.Pipeline(
        pipeline_name=conf['beam']['pipeline_name'],
        pipeline_root=conf['beam']['pipeline_root_dir'],
        components=components,
        enable_cache=False,
        metadata_connection_config=(
            metadata.sqlite_metadata_connection_config(
                conf['beam']['metadata_path'])
        )
    )
    return tfx_pipeline
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, module_file_lite: Text,
                     serving_model_dir: Text, serving_model_dir_lite: Text,
                     metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the handwritten digit classification example using TFX.

    Builds two parallel train/evaluate/push paths from the same ingested
    data: a regular Keras model and a TFLite-converted variant.

    Args:
        pipeline_name: Name registered for this pipeline.
        pipeline_root: Root directory for pipeline artifacts.
        data_root: Directory containing the input TFRecord examples.
        module_file: User module for the regular model.
        module_file_lite: User module for the TFLite variant.
        serving_model_dir: Push destination for the regular model.
        serving_model_dir_lite: Push destination for the TFLite model.
        metadata_path: Path of the SQLite ML Metadata database.
        beam_pipeline_args: Extra arguments forwarded to Beam.

    Returns:
        The configured pipeline.Pipeline.
    """
    # Brings data into the pipeline.
    example_gen = ImportExampleGen(input_base=data_root)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    def _create_trainer(module_file, instance_name):
        # Shared Trainer factory: both models consume the same transformed
        # examples; only the user module and instance name differ.
        return Trainer(module_file=module_file,
                       custom_executor_spec=executor_spec.ExecutorClassSpec(
                           GenericExecutor),
                       examples=transform.outputs['transformed_examples'],
                       transform_graph=transform.outputs['transform_graph'],
                       schema=schema_gen.outputs['schema'],
                       train_args=trainer_pb2.TrainArgs(num_steps=5000),
                       eval_args=trainer_pb2.EvalArgs(num_steps=100),
                       instance_name=instance_name)

    # Uses user-provided Python function that trains a Keras model.
    trainer = _create_trainer(module_file, 'mnist')

    # Trains the same model as the one above, but converts it into a TFLite one.
    trainer_lite = _create_trainer(module_file_lite, 'mnist_lite')

    # TODO(b/150949276): Add resolver back once it supports two trainers.

    # Uses TFMA to compute an evaluation statistics over features of a model and
    # performs quality validation of a candidate model.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='image_class')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.8})))
            ])
        ])

    # Clone the eval config for the TFLite path; only the model type differs.
    eval_config_lite = tfma.EvalConfig()
    eval_config_lite.CopyFrom(eval_config)
    # Informs the evaluator that the model is a TFLite model.
    eval_config_lite.model_specs[0].model_type = 'tf_lite'

    # Uses TFMA to compute the evaluation statistics over features of a model.
    evaluator = Evaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          eval_config=eval_config,
                          instance_name='mnist')

    # Uses TFMA to compute the evaluation statistics over features of a TFLite
    # model.
    evaluator_lite = Evaluator(examples=example_gen.outputs['examples'],
                               model=trainer_lite.outputs['model'],
                               eval_config=eval_config_lite,
                               instance_name='mnist_lite')

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)),
                    instance_name='mnist')

    # Checks whether the TFLite model passed the validation steps and pushes the
    # model to a file destination if check passed.
    pusher_lite = Pusher(model=trainer_lite.outputs['model'],
                         model_blessing=evaluator_lite.outputs['blessing'],
                         push_destination=pusher_pb2.PushDestination(
                             filesystem=pusher_pb2.PushDestination.Filesystem(
                                 base_directory=serving_model_dir_lite)),
                         instance_name='mnist_lite')

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            trainer,
            trainer_lite,
            evaluator,
            evaluator_lite,
            pusher,
            pusher_lite,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Creates a pipeline that trains/evaluates on proto-payload examples.

    Examples are ingested with FORMAT_PROTO and decoded downstream through a
    DataView (provider + binder); Transform materialization is disabled.

    Args:
        pipeline_name: Name registered for this pipeline.
        pipeline_root: Base directory; artifacts go under
            ``<pipeline_root>/pipelines/<pipeline_name>``.
        data_root: Directory with the raw proto-payload examples.
        module_file: User module supplying ``make_decoder``, the
            preprocessing, and the trainer functions.
        serving_model_dir: Push destination for blessed models.
        metadata_path: Path of the SQLite ML Metadata database.
        beam_pipeline_args: Extra arguments forwarded to Beam.

    Returns:
        The configured pipeline.Pipeline.
    """
    pipeline_root = os.path.join(pipeline_root, 'pipelines', pipeline_name)
    example_gen = ImportExampleGen(
        input_base=data_root,
        # IMPORTANT: must set FORMAT_PROTO
        payload_format=example_gen_pb2.FORMAT_PROTO)

    # Exports the user module's `make_decoder` as a TF-graph DataView so the
    # proto payload can be interpreted by downstream components.
    data_view_provider = provider_component.TfGraphDataViewProvider(
        module_file=module_file,
        create_decoder_func='make_decoder')
    # Binds the DataView to the generated examples.
    data_view_binder = binder_component.DataViewBinder(
        example_gen.outputs['examples'],
        data_view_provider.outputs['data_view'])

    statistics_gen = StatisticsGen(
        examples=data_view_binder.outputs['output_examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'])
    transform = Transform(
        examples=data_view_binder.outputs['output_examples'],
        schema=schema_gen.outputs['schema'],
        module_file=module_file,
        # important: must disable Transform materialization.
        materialize=False)
    trainer = Trainer(
        examples=data_view_binder.outputs['output_examples'],
        transform_graph=transform.outputs['transform_graph'],
        module_file=module_file,
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        schema=schema_gen.outputs['schema'],
        eval_args=trainer_pb2.EvalArgs(num_steps=10))

    eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(
                signature_name='',
                label_key='relevance',
                # -1.0 float padding for labels and predictions; presumably to
                # rectangularize ragged per-query lists — TODO confirm against
                # the serving signature.
                padding_options=tfma.config.PaddingOptions(
                    label_float_padding=-1.0,
                    prediction_float_padding=-1.0))
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
            tfma.SlicingSpec(feature_keys=['query_tokens']),
        ],
        metrics_specs=[
            tfma.MetricsSpec(
                per_slice_thresholds={
                    'metric/ndcg_10':
                        tfma.config.PerSliceMetricThresholds(thresholds=[
                            tfma.PerSliceMetricThreshold(
                                # The overall slice.
                                slicing_specs=[tfma.SlicingSpec()],
                                threshold=tfma.MetricThreshold(
                                    value_threshold=tfma.GenericValueThreshold(
                                        lower_bound={'value': 0.6})))
                        ])
                })
        ])
    evaluator = Evaluator(
        examples=data_view_binder.outputs['output_examples'],
        model=trainer.outputs['model'],
        eval_config=eval_config,
        schema=schema_gen.outputs['schema'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            data_view_provider,
            data_view_binder,
            statistics_gen,
            schema_gen,
            transform,
            trainer,
            evaluator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, module_file: Text, serving_model_dir_lite: Text, metadata_path: Text, labels_path: Text, beam_pipeline_args: List[Text]) -> pipeline.Pipeline: """Implements the CIFAR10 image classification pipeline using TFX.""" # This is needed for datasets with pre-defined splits # Change the pattern argument to train_whole/* and test_whole/* to train # on the whole CIFAR-10 dataset input_config = example_gen_pb2.Input(splits=[ example_gen_pb2.Input.Split(name='train', pattern='train/*'), example_gen_pb2.Input.Split(name='eval', pattern='test/*') ]) # Brings data into the pipeline. example_gen = ImportExampleGen(input_base=data_root, input_config=input_config) # Computes statistics over data for visualization and example validation. statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) # Generates schema based on statistics files. schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) # Performs anomaly detection based on statistics and data schema. example_validator = ExampleValidator( statistics=statistics_gen.outputs['statistics'], schema=schema_gen.outputs['schema']) # Performs transformations and feature engineering in training and serving. transform = Transform(examples=example_gen.outputs['examples'], schema=schema_gen.outputs['schema'], module_file=module_file) # Uses user-provided Python function that trains a model. # When traning on the whole dataset, use 18744 for train steps, 156 for eval # steps. 18744 train steps correspond to 24 epochs on the whole train set, and # 156 eval steps correspond to 1 epoch on the whole test set. The # configuration below is for training on the dataset we provided in the data # folder, which has 128 train and 128 test samples. The 160 train steps # correspond to 40 epochs on this tiny train set, and 4 eval steps correspond # to 1 epoch on this tiny test set. 
trainer = Trainer(module_file=module_file, examples=transform.outputs['transformed_examples'], transform_graph=transform.outputs['transform_graph'], schema=schema_gen.outputs['schema'], train_args=trainer_pb2.TrainArgs(num_steps=160), eval_args=trainer_pb2.EvalArgs(num_steps=4), custom_config={'labels_path': labels_path}) # Get the latest blessed model for model validation. model_resolver = ResolverNode( resolver_class=latest_blessed_model_resolver. LatestBlessedModelResolver, model=Channel(type=Model), model_blessing=Channel( type=ModelBlessing)).with_id('latest_blessed_model_resolver') # Uses TFMA to compute evaluation statistics over features of a model and # perform quality validation of a candidate model (compare to a baseline). eval_config = tfma.EvalConfig( model_specs=[ tfma.ModelSpec(label_key='label_xf', model_type='tf_lite') ], slicing_specs=[tfma.SlicingSpec()], metrics_specs=[ tfma.MetricsSpec(metrics=[ tfma.MetricConfig( class_name='SparseCategoricalAccuracy', threshold=tfma.MetricThreshold( value_threshold=tfma.GenericValueThreshold( lower_bound={'value': 0.55}), # Change threshold will be ignored if there is no # baseline model resolved from MLMD (first run). change_threshold=tfma.GenericChangeThreshold( direction=tfma.MetricDirection.HIGHER_IS_BETTER, absolute={'value': -1e-3}))) ]) ]) # Uses TFMA to compute the evaluation statistics over features of a model. # We evaluate using the materialized examples that are output by Transform # because # 1. the decoding_png function currently performed within Transform are not # compatible with TFLite. # 2. MLKit requires deserialized (float32) tensor image inputs # Note that for deployment, the same logic that is performed within Transform # must be reproduced client-side. 
evaluator = Evaluator(examples=transform.outputs['transformed_examples'], model=trainer.outputs['model'], baseline_model=model_resolver.outputs['model'], eval_config=eval_config) # Checks whether the model passed the validation steps and pushes the model # to a file destination if check passed. pusher = Pusher(model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=serving_model_dir_lite))) components = [ example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, model_resolver, evaluator, pusher ] return pipeline.Pipeline( pipeline_name=pipeline_name, pipeline_root=pipeline_root, components=components, enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( metadata_path), beam_pipeline_args=beam_pipeline_args)
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Path,
    module_file: Path,
    serving_model_path: Path,
    metadata_path: Path,
    data_path: Path,
) -> pipeline.Pipeline:
    """Builds a pipeline over the Gta1 dataset builder's TFRecords.

    Downloads/prepares the dataset via the ``Gta1`` builder, re-splits the
    train shards 9:1 into train/eval by hash bucket, then runs statistics,
    schema inference, transform, training, and an unconditional push.

    Args:
        pipeline_name: Name registered for this pipeline.
        pipeline_root: Root directory for pipeline artifacts.
        module_file: User module with preprocessing and trainer functions.
        serving_model_path: Push destination for the trained model.
        metadata_path: Path of the SQLite ML Metadata database.
        data_path: NOTE(review): currently unused — the data location comes
            from ``builder.data_dir`` instead. Confirm whether this parameter
            should feed the builder or be removed at the call sites.

    Returns:
        The configured pipeline.Pipeline.
    """
    builder = Gta1()
    # Side effect: downloads/prepares the dataset under builder.data_dir.
    builder.download_and_prepare()

    input_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name="train", pattern="*.tfrecord-[0-9]*"),
    ], )
    # 90/10 train/eval re-split by hash bucket.
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=9),
            example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=1),
        ], ), )

    # Bring the data in to the pipeline.
    example_gen = ImportExampleGen(
        input_base=builder.data_dir,
        input_config=input_config,
        output_config=output_config,
    )

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(
        statistics=statistics_gen.outputs["statistics"],
        infer_feature_shape=True,
    )

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(
        module_file=str(module_file),
        examples=example_gen.outputs["examples"],
        schema=schema_gen.outputs["schema"],
        materialize=True,
    )

    # Uses user-provided Python function that trains a model.
    trainer = Trainer(
        module_file=str(module_file),
        examples=transform.outputs["transformed_examples"],
        transform_graph=transform.outputs["transform_graph"],
        schema=schema_gen.outputs["schema"],
        train_args=trainer_pb2.TrainArgs(num_steps=10_000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500),
    )

    # NOTE(review): no Evaluator in this pipeline and model_blessing is
    # commented out, so every trained model is pushed unconditionally.
    pusher = Pusher(
        model=trainer.outputs["model"],
        # model_blessing=evaluator.outputs["blessing"],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=str(serving_model_path),
            ),
        ),
    )

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=str(pipeline_root),
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            transform,
            trainer,
            pusher,
        ],
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            str(metadata_path),
        ),
        enable_cache=True,
    )
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    enable_cache: bool,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
    trainer_custom_config: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Builds an ingest/validate/transform/train pipeline (pusher disabled).

    Args:
        pipeline_name: Name registered for this pipeline.
        pipeline_root: Root directory for pipeline artifacts.
        data_path: Directory containing the input examples.
        enable_cache: Whether to reuse cached component executions.
        preprocessing_fn: Fully-qualified name of the Transform function.
        run_fn: Fully-qualified name of the trainer entry point.
        train_args: Trainer training arguments proto.
        eval_args: Trainer evaluation arguments proto.
        serving_model_dir: NOTE(review): currently unused — only referenced
            inside the commented-out pusher block below.
        metadata_connection_config: Optional MLMD connection config.
        beam_pipeline_args: Optional extra arguments forwarded to Beam.
        ai_platform_training_args: If set, train on AI Platform instead of
            locally (overrides the Trainer executor and custom_config).
        ai_platform_serving_args: NOTE(review): currently unused — pusher is
            disabled (see TODO below).
        trainer_custom_config: Extra config passed to the local Trainer.

    Returns:
        The configured pipeline.Pipeline.
    """
    components = []

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = ImportExampleGen(input=external_input(data_path))
    components.append(example_gen)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    components.append(statistics_gen)

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)
    components.append(schema_gen)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(  # pylint: disable=unused-variable
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    components.append(example_validator)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          preprocessing_fn=preprocessing_fn)
    components.append(transform)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer_args = {
        'run_fn': run_fn,
        'transformed_examples': transform.outputs['transformed_examples'],
        'schema': schema_gen.outputs['schema'],
        'transform_graph': transform.outputs['transform_graph'],
        'train_args': train_args,
        'eval_args': eval_args,
        'custom_executor_spec':
            executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
        'custom_config': trainer_custom_config,
    }
    # When AI Platform training is requested, swap the executor and replace
    # custom_config with the AI Platform training arguments.
    if ai_platform_training_args is not None:
        trainer_args.update({
            'custom_executor_spec':
                executor_spec.ExecutorClassSpec(
                    ai_platform_trainer_executor.GenericExecutor),
            'custom_config': {
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                    ai_platform_training_args,
            }
        })
    trainer = Trainer(**trainer_args)
    components.append(trainer)

    # TODO in TFX <= 2.22.0 we need a workaround to enable the pusher. Pusher
    # is disabled until we move sample to > TFX==2.22.00
    #
    # pusher_args = {
    #     'model': trainer.outputs['model'],
    #     'model_blessing': blessing_importer.outputs['result'],
    #     'push_destination': pusher_pb2.PushDestination(
    #         filesystem=pusher_pb2.PushDestination.Filesystem(
    #             base_directory=serving_model_dir)),
    # }
    # if ai_platform_serving_args is not None:
    #     pusher_args.update({
    #         'custom_executor_spec': executor_spec.ExecutorClassSpec(
    #             ai_platform_pusher_executor.Executor),
    #         'custom_config': {
    #             ai_platform_pusher_executor.SERVING_ARGS_KEY:
    #                 ai_platform_serving_args
    #         },
    #     })
    # pusher = Pusher(**pusher_args)  # pylint: disable=unused-variable
    # Temporary disable pusher.
    # components.append(pusher)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=enable_cache,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
    """Implements the cifar10 pipeline with TFX."""
    # Ingestion: the dataset ships as pre-split TFRecord files.
    split_config = example_gen_pb2.Input(splits=[
        example_gen_pb2.Input.Split(name='train', pattern='train.tfrecord'),
        example_gen_pb2.Input.Split(name='eval', pattern='test.tfrecord'),
    ])
    example_gen = ImportExampleGen(
        input=external_input(data_root), input_config=split_config)

    # Data understanding: statistics, inferred schema, anomaly checks.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(
        infer_feature_shape=True,
        statistics=statistics_gen.outputs['statistics'])
    example_validator = ExampleValidator(
        schema=schema_gen.outputs['schema'],
        statistics=statistics_gen.outputs['statistics'])

    # Feature engineering shared between training and serving.
    transform = Transform(
        module_file=module_file,
        examples=example_gen.outputs['examples'],
        schema=schema_gen.outputs['schema'])

    # Model training on the transformed examples.
    trainer = Trainer(
        module_file=module_file,
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=1000),
        eval_args=trainer_pb2.EvalArgs(num_steps=500))

    # Model analysis on the raw examples over a single overall slice.
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model_exports=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(
            specs=[evaluator_pb2.SingleSlicingSpec()]))

    # Validates the candidate model; produces the blessing the Pusher gates on.
    model_validator = ModelValidator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])

    # Exports the model for serving only when the blessing check passed.
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=model_validator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, evaluator, model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, serving_model_dir: Text,
                     metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the handwritten digit classification example using TFX.

    Args:
        pipeline_name: Name registered for this pipeline.
        pipeline_root: Root directory for pipeline artifacts.
        data_root: Directory containing the input TFRecord examples.
        module_file: User module with preprocessing and trainer functions.
        serving_model_dir: Push destination for blessed models.
        metadata_path: Path of the SQLite ML Metadata database.
        direct_num_workers: Worker count passed to Beam's direct runner.

    Returns:
        The configured pipeline.Pipeline.
    """
    examples = external_input(data_root)

    # Brings data into the pipeline.
    example_gen = ImportExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=infer_schema.outputs['schema'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=infer_schema.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that trains a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        # GenericExecutor makes Trainer use run_fn instead of trainer_fn.
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=infer_schema.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=5000),
        eval_args=trainer_pb2.EvalArgs(num_steps=100))

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Uses TFMA to compute an evaluation statistics over features of a model
    # and perform quality validation of a candidate model (compared to a
    # baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='image_class')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(
                thresholds={
                    'sparse_categorical_accuracy':
                        tfma.config.MetricThreshold(
                            value_threshold=tfma.GenericValueThreshold(
                                lower_bound={'value': 0.8}),
                            change_threshold=tfma.GenericChangeThreshold(
                                direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                                absolute={'value': -1e-10}))
                })
        ])

    # Uses TFMA to compute a evaluation statistics over features of a model.
    model_analyzer = Evaluator(examples=example_gen.outputs['examples'],
                               model=trainer.outputs['model'],
                               baseline_model=model_resolver.outputs['model'],
                               eval_config=eval_config)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_analyzer.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            infer_schema,
            validate_stats,
            transform,
            trainer,
            model_resolver,
            model_analyzer,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/142684737): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
    )
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # query: Text,
    # preprocessing_fn: Text,
    # run_fn: Text,
    module_file: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX.

    Args:
        pipeline_name: Name registered for this pipeline.
        pipeline_root: Root directory for pipeline artifacts.
        data_path: Directory containing the input examples.
        module_file: User module with preprocessing and trainer functions.
        train_args: Trainer training arguments proto.
        eval_args: Trainer evaluation arguments proto.
        eval_accuracy_threshold: Lower bound on BinaryAccuracy a candidate
            model must meet to be blessed.
        serving_model_dir: Push destination for blessed models.
        metadata_connection_config: Optional MLMD connection config.
        beam_pipeline_args: Optional extra arguments forwarded to Beam.
        ai_platform_training_args: If set, train on AI Platform instead of
            locally.
        ai_platform_serving_args: If set, push via the AI Platform pusher
            executor instead of to the local filesystem.

    Returns:
        The configured pipeline.Pipeline.
    """
    components = []

    # Brings data into the pipeline or otherwise joins/converts training data.
    # example_gen = CsvExampleGen(input=external_input(data_path))
    example_gen = ImportExampleGen(input=external_input(data_path))
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # example_gen = BigQueryExampleGen(query=query)
    components.append(example_gen)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    # TODO(step 5): Uncomment here to add StatisticsGen to the pipeline.
    components.append(statistics_gen)

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)
    # TODO(step 5): Uncomment here to add SchemaGen to the pipeline.
    components.append(schema_gen)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(  # pylint: disable=unused-variable
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])
    components.append(example_validator)

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)
    components.append(transform)

    # Uses user-provided Python function that implements a model using TF-Learn.
    trainer_args = {
        'module_file': module_file,
        # 'examples': example_gen.outputs['examples'],
        'transformed_examples': transform.outputs['transformed_examples'],
        'schema': schema_gen.outputs['schema'],
        'transform_graph': transform.outputs['transform_graph'],
        'train_args': train_args,
        'eval_args': eval_args,
        'custom_executor_spec':
            executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
    }
    # When AI Platform training is requested, swap the executor and supply the
    # AI Platform training arguments through custom_config.
    if ai_platform_training_args is not None:
        trainer_args.update({
            'custom_executor_spec':
                executor_spec.ExecutorClassSpec(
                    ai_platform_trainer_executor.GenericExecutor),
            'custom_config': {
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                    ai_platform_training_args,
            }
        })
    trainer = Trainer(**trainer_args)
    # TODO(step 6): Uncomment here to add Trainer to the pipeline.
    components.append(trainer)

    # Get the latest blessed model for model validation.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))
    # TODO(step 6): Uncomment here to add ResolverNode to the pipeline.
    components.append(model_resolver)

    # Uses TFMA to compute a evaluation statistics over features of a model and
    # perform quality validation of a candidate model (compared to a baseline).
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='label')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='BinaryAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': eval_accuracy_threshold}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-10})))
            ])
        ])
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        baseline_model=model_resolver.outputs['model'],
        # Change threshold will be ignored if there is no baseline (first run).
        eval_config=eval_config)
    # TODO(step 6): Uncomment here to add Evaluator to the pipeline.
    components.append(evaluator)

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher_args = {
        'model':
            trainer.outputs['model'],
        'model_blessing':
            evaluator.outputs['blessing'],
        'push_destination':
            pusher_pb2.PushDestination(
                filesystem=pusher_pb2.PushDestination.Filesystem(
                    base_directory=serving_model_dir)),
    }
    # When AI Platform serving is requested, swap the pusher executor and
    # forward the serving arguments through custom_config.
    if ai_platform_serving_args is not None:
        pusher_args.update({
            'custom_executor_spec':
                executor_spec.ExecutorClassSpec(
                    ai_platform_pusher_executor.Executor),
            'custom_config': {
                ai_platform_pusher_executor.SERVING_ARGS_KEY:
                    ai_platform_serving_args
            },
        })
    pusher = Pusher(**pusher_args)  # pylint: disable=unused-variable
    # TODO(step 6): Uncomment here to add Pusher to the pipeline.
    components.append(pusher)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=components,
        # TODO(step 8): Change this value to control caching of execution
        # results.
        enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     module_file: Text, metadata_path: Text) -> pipeline.Pipeline:
    """Builds the handwritten-digit classification TFX pipeline.

    Args:
        pipeline_name: Name under which the pipeline is registered.
        pipeline_root: Root directory for pipeline artifacts.
        data_root: Directory containing the input TFRecord data.
        module_file: Python module providing the user preprocessing/training fns.
        metadata_path: Path of the SQLite ML Metadata database.

    Returns:
        A configured `pipeline.Pipeline` with execution caching enabled.
    """
    # Bring the TFRecord data into the pipeline.
    raw_examples = external_input(data_root)
    ingest = ImportExampleGen(input=raw_examples)

    # Compute statistics for visualization and example validation.
    stats = StatisticsGen(examples=ingest.outputs['examples'])

    # Infer a schema from the computed statistics.
    schema = SchemaGen(statistics=stats.outputs['statistics'],
                       infer_feature_shape=True)

    # Flag anomalies by checking the data against the inferred schema.
    validator = ExampleValidator(statistics=stats.outputs['statistics'],
                                 schema=schema.outputs['schema'])

    # Feature engineering shared by training and serving.
    preprocess = Transform(examples=ingest.outputs['examples'],
                           schema=schema.outputs['schema'],
                           module_file=module_file)

    # Train a model via the user-supplied run_fn (GenericExecutor).
    model_trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=preprocess.outputs['transformed_examples'],
        transform_graph=preprocess.outputs['transform_graph'],
        schema=schema.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=5000),
        eval_args=trainer_pb2.EvalArgs(num_steps=100))

    # Compute evaluation statistics over features of the trained model.
    analyzer = Evaluator(
        examples=ingest.outputs['examples'],
        model=model_trainer.outputs['model'],
        eval_config=tfma.EvalConfig(
            model_specs=[tfma.ModelSpec(label_key='image/class')],
            slicing_specs=[tfma.SlicingSpec()]))

    # TODO(ananthr): support infra validator, model validation in evaluator,
    # and pusher component.

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            ingest,
            stats,
            schema,
            validator,
            preprocess,
            model_trainer,
            analyzer,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
    )
def create_pipeline(pipeline_name: Text,
                    pipeline_root: Text,
                    data_root: Text,
                    test_data_root: Text,
                    module_file: Text,
                    serving_model_dir: Text,
                    enable_cache: bool,
                    metadata_connection_config: Optional[
                        metadata_store_pb2.ConnectionConfig] = None,
                    beam_pipeline_args: Optional[List[Text]] = None):
    """Creates the TFX training pipeline with a separate test-set evaluator.

    Args:
        pipeline_name: Pipeline name.
        pipeline_root: Pipeline root path.
        data_root: Input (training) data path.
        test_data_root: Held-out test data path.
        module_file: Python module file injecting customized logic into the
            TFX components.
        serving_model_dir: Output directory for the pushed model.
        enable_cache: Whether to use the execution cache or not.
        metadata_connection_config: Optional ML Metadata connection config.
        beam_pipeline_args: Optional arguments forwarded to Beam.

    Returns:
        A configured `pipeline.Pipeline`.
    """
    # Split the training TFRecords 8:2 into train / eval.
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(splits=[
            example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=8),
            example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=2),
        ]))

    # Load training and held-out test data into the pipeline.
    example_gen = ImportExampleGen(input_base=data_root,
                                   output_config=output_config,
                                   instance_name="train_data")
    test_example_gen = ImportExampleGen(input_base=test_data_root,
                                        instance_name="test_data")

    # Statistics over the training data.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Schema generated from the StatisticsGen output.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Check the data for missing values and other anomalies.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Feature engineering shared by training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Train the model via the user-supplied run_fn (GenericExecutor).
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=160),
        eval_args=trainer_pb2.EvalArgs(num_steps=4),
    )

    # Latest blessed model, used as the evaluation baseline.
    model_resolver = ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.
        LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # https://github.com/tensorflow/tfx/issues/3016
    eval_config = tfma.EvalConfig(
        model_specs=[
            tfma.ModelSpec(label_key='label',
                           model_type='tf_keras',
                           signature_name="serving_default")
        ],
        slicing_specs=[
            tfma.SlicingSpec(),
        ],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.2}),
                        change_threshold=tfma.GenericChangeThreshold(
                            direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                            absolute={'value': -1e-3})))
            ])
        ])

    # Evaluate on the independent test set against the baseline model.
    evaluator = Evaluator(examples=test_example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          baseline_model=model_resolver.outputs['model'],
                          eval_config=eval_config)

    # Push blessed models to the serving directory.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=evaluator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, test_example_gen, statistics_gen, schema_gen,
            example_validator, transform, trainer, model_resolver, evaluator,
            pusher
        ],
        enable_cache=enable_cache,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    query: Optional[Text] = None,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Builds a TFX pipeline whose active stages end at Transform.

    Only example_gen .. transform are registered as pipeline components; the
    Trainer is constructed but not yet enlisted, and the model_resolver /
    evaluator / pusher stages are currently disabled.

    NOTE(review): `eval_accuracy_threshold`, `serving_model_dir` and
    `ai_platform_serving_args` are consumed only by the disabled
    evaluator/pusher stages and are presently unused.
    """
    # Data source: BigQuery when a query is given, on-disk TFRecords otherwise.
    if query:
        example_gen = BigQueryExampleGen(query=query)
    else:
        # example_gen = CsvExampleGen(input=external_input(data_path))
        example_gen = ImportExampleGen(input=external_input(data_path))

    # Statistics, schema inference and anomaly detection.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Feature engineering via the user-supplied preprocessing_fn.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          preprocessing_fn=preprocessing_fn)

    # Trainer configuration; swapped to the AI Platform executor when
    # ai_platform_training_args is provided.
    trainer_args = {
        'run_fn': run_fn,
        'transformed_examples': transform.outputs['transformed_examples'],
        'schema': schema_gen.outputs['schema'],
        'transform_graph': transform.outputs['transform_graph'],
        'train_args': train_args,
        'eval_args': eval_args,
        'custom_executor_spec':
            executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
    }
    if ai_platform_training_args:
        trainer_args.update({
            'custom_executor_spec':
                executor_spec.ExecutorClassSpec(
                    ai_platform_trainer_executor.GenericExecutor),
            'custom_config': {
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                    ai_platform_training_args,
            }
        })
    trainer = Trainer(**trainer_args)  # built but not added to components yet

    # Disabled stages (model_resolver / evaluator / pusher): reconstruct them
    # and append to the components list below to re-enable.

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            # trainer,
            # model_resolver,
            # evaluator,
            # pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     custom_config: Dict[Text, Any], module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
    """Implements the handwritten digit classification example using TFX.

    Fix vs. original: the Filter component was bound to a local named
    `filter`, shadowing the `filter` builtin; renamed to `digit_filter`.

    Args:
        pipeline_name: Name under which the pipeline is registered.
        pipeline_root: Root directory for pipeline artifacts.
        data_root: Directory containing the input TFRecord data.
        custom_config: Run configuration stored alongside the pipeline run so
            results can be reproduced.
        module_file: Python module providing the user training logic.
        serving_model_dir: Output directory for the pushed model.
        metadata_path: Path of the SQLite ML Metadata database.
        beam_pipeline_args: Arguments forwarded to Beam.

    Returns:
        A configured `pipeline.Pipeline` with execution caching enabled.
    """
    # Store the configuration along with the pipeline run so results can be
    # reproduced.
    pipeline_configuration = FromCustomConfig(custom_config=custom_config)

    # Brings data into the pipeline.
    example_gen = ImportExampleGen(input_base=data_root)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Generates schema based on statistics files.
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Performs anomaly detection based on statistics and data schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Create a filtered dataset - today we only want a model for small digits.
    digit_filter = Filter(examples=example_gen.outputs['examples'],
                          pipeline_configuration=pipeline_configuration.
                          outputs['pipeline_configuration'],
                          splits_to_transform=['train', 'eval'],
                          splits_to_copy=[])

    # Create a stratified dataset for evaluation.
    stratified_examples = StratifiedSampler(
        examples=digit_filter.outputs['filtered_examples'],
        pipeline_configuration=pipeline_configuration.
        outputs['pipeline_configuration'],
        samples_per_key=1200,
        splits_to_transform=['eval'],
        splits_to_copy=['train'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=digit_filter.outputs['filtered_examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=module_file)

    # Uses user-provided Python function that trains a Keras model.
    trainer = Trainer(
        module_file=module_file,
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        custom_config=custom_config,
        examples=transform.outputs['transformed_examples'],
        transform_graph=transform.outputs['transform_graph'],
        schema=schema_gen.outputs['schema'],
        train_args=trainer_pb2.TrainArgs(num_steps=5000),
        eval_args=trainer_pb2.EvalArgs(num_steps=100)).with_id('trainer')

    # Uses TFMA to compute evaluation statistics over features of a model and
    # performs quality validation of a candidate model.
    eval_config = tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='image_class')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.config.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': 0.8})))
            ])
        ])

    # Evaluates on the stratified sample.
    evaluator = Evaluator(
        examples=stratified_examples.outputs['stratified_examples'],
        model=trainer.outputs['model'],
        eval_config=eval_config).with_id('evaluator')

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if check passed.
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=serving_model_dir))).with_id('pusher')

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            pipeline_configuration,
            example_gen,
            digit_filter,
            stratified_examples,
            statistics_gen,
            schema_gen,
            example_validator,
            transform,
            trainer,
            evaluator,
            pusher,
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_pipeline_args)
def create_pipeline(pipeline_name: Text, pipeline_root: Text,
                    metadata_path: Text) -> Pipeline:
    """Assembles the full TFX pipeline: ingest, validate, train, eval, push.

    Args:
        pipeline_name: Name of the pipeline.
        pipeline_root: Root directory for pipeline artifacts.
        metadata_path: Path of the SQLite ML Metadata database.

    Returns:
        A `Pipeline` with caching enabled and direct-runner Beam args.
    """
    # Ingest the TFRecord dataset and split it 4:1 into train / eval.
    example_gen = ImportExampleGen(
        input=tfrecord_input(DATA_PATH),
        output_config=example_gen_pb2.Output(
            split_config=example_gen_pb2.SplitConfig(splits=[
                example_gen_pb2.SplitConfig.Split(name='train',
                                                  hash_buckets=4),
                example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
            ])))

    # Dataset statistics and a schema inferred from them.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=True)

    # Anomaly detection against the inferred schema.
    example_validator = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=schema_gen.outputs['schema'])

    # Feature engineering.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=schema_gen.outputs['schema'],
                          module_file=TRANSFORM_MODULE)

    # Model training via the user-supplied run_fn (GenericExecutor).
    trainer = Trainer(
        module_file=TRAINER_MODULE,
        examples=transform.outputs['transformed_examples'],
        schema=schema_gen.outputs['schema'],
        transform_graph=transform.outputs['transform_graph'],
        custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
        train_args=trainer_pb2.TrainArgs(num_steps=200),
        eval_args=trainer_pb2.EvalArgs(num_steps=35))

    # Bless the model only while mean absolute error stays below 0.7.
    evaluator = Evaluator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        eval_config=tfma.EvalConfig(
            model_specs=[tfma.ModelSpec(label_key=LABEL_KEY)],
            slicing_specs=[tfma.SlicingSpec()],
            metrics_specs=[
                tfma.MetricsSpec(metrics=[
                    tfma.MetricConfig(
                        class_name='MeanAbsoluteError',
                        threshold=tfma.MetricThreshold(
                            value_threshold=tfma.GenericValueThreshold(
                                upper_bound={'value': 0.7})))
                ])
            ]))

    # Push blessed models to the serving directory.
    pusher = Pusher(
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        push_destination=pusher_pb2.PushDestination(
            filesystem=pusher_pb2.PushDestination.Filesystem(
                base_directory=SERVING_MODEL_DIR)))

    return Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        components=[
            example_gen, statistics_gen, schema_gen, example_validator,
            transform, trainer, evaluator, pusher
        ],
        enable_cache=True,
        beam_pipeline_args=['--direct_num_workers=0'])
def create_pipeline(
    pipeline_name: Text,
    pipeline_root: Text,
    data_path: Text,
    # TODO(step 7): (Optional) Uncomment here to use BigQuery as a data source.
    # query: Text,
    preprocessing_fn: Text,
    run_fn: Text,
    train_args: trainer_pb2.TrainArgs,
    eval_args: trainer_pb2.EvalArgs,
    eval_accuracy_threshold: float,
    serving_model_dir: Text,
    metadata_connection_config: Optional[
        metadata_store_pb2.ConnectionConfig] = None,
    beam_pipeline_args: Optional[List[Text]] = None,
    ai_platform_training_args: Optional[Dict[Text, Text]] = None,
    ai_platform_serving_args: Optional[Dict[Text, Any]] = None,
) -> pipeline.Pipeline:
    """Implements the Centernet pipeline with TFX.

    NOTE(review): `eval_accuracy_threshold`, `serving_model_dir` and
    `ai_platform_serving_args` are not consumed by any active stage here —
    presumably reserved for evaluator/pusher stages; verify against callers.
    """
    # Split examples 3:1 into train/eval, keeping all records of a given
    # image file in the same split (partitioned on 'image/filename').
    output_config = example_gen_pb2.Output(
        split_config=example_gen_pb2.SplitConfig(
            splits=[
                example_gen_pb2.SplitConfig.Split(name='train',
                                                  hash_buckets=3),
                example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=1)
            ],
            partition_feature_name='image/filename'))

    # Ingest the TFRecord data.
    example_gen = ImportExampleGen(input=external_input(data_path),
                                   output_config=output_config)

    # Statistics for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'],
                                   stats_options=STATS_OPTIONS)

    # Auto-inferred schema (validation below uses the imported schema).
    schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                           infer_feature_shape=False)

    # Manually crafted schema imported from GCS.
    importer_node = ImporterNode(
        instance_name='import_user_schema',
        source_uri="gs://raw_data_layer/schema/",
        artifact_type=tfx.types.standard_artifacts.Schema)

    # Anomaly detection against the manually crafted schema.
    example_validator = ExampleValidator(  # pylint: disable=unused-variable
        statistics=statistics_gen.outputs['statistics'],
        schema=importer_node.outputs['result'])

    # Feature engineering for training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=importer_node.outputs['result'],
                          preprocessing_fn=preprocessing_fn)

    # Trainer; runs on AI Platform when ai_platform_training_args is given.
    trainer_args = {
        'run_fn': run_fn,
        'transformed_examples': transform.outputs['transformed_examples'],
        'schema': importer_node.outputs['result'],
        'transform_graph': transform.outputs['transform_graph'],
        'train_args': train_args,
        'eval_args': eval_args,
        'custom_executor_spec':
            executor_spec.ExecutorClassSpec(trainer_executor.GenericExecutor),
    }
    if ai_platform_training_args is not None:
        trainer_args.update({
            'custom_executor_spec':
                executor_spec.ExecutorClassSpec(
                    ai_platform_trainer_executor.GenericExecutor),
            'custom_config': {
                ai_platform_trainer_executor.TRAINING_ARGS_KEY:
                    ai_platform_training_args,
            }
        })
    trainer = Trainer(**trainer_args)

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen,
            statistics_gen,
            schema_gen,
            importer_node,
            example_validator,
            transform,
            trainer,
        ],
        # Change this value to control caching of execution results. Default
        # value is `False`.
        enable_cache=True,
        metadata_connection_config=metadata_connection_config,
        beam_pipeline_args=beam_pipeline_args,
    )