def setUp(self):
    super(KubeflowDataflowIntegrationTest, self).setUp()

    # Example artifacts for testing.
    self.raw_examples_importer = ImporterNode(
        instance_name='raw_examples',
        source_uri=os.path.join(self._testdata_root, 'csv_example_gen'),
        artifact_type=standard_artifacts.Examples,
        reimport=True,
        properties={'split_names': '["train", "eval"]'})

    # Schema artifact for testing.
    self.schema_importer = ImporterNode(
        instance_name='schema',
        source_uri=os.path.join(self._testdata_root, 'schema_gen'),
        artifact_type=standard_artifacts.Schema,
        reimport=True)

    # Model artifact for testing.
    self.model_1_importer = ImporterNode(
        instance_name='model_1',
        source_uri=os.path.join(self._testdata_root, 'trainer', 'previous'),
        artifact_type=standard_artifacts.Model,
        reimport=True)
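# A minimal sketch (not part of the original test) of how one of the importers
# set up above might be exercised: feed the imported Examples artifact into a
# StatisticsGen and run both through the test harness. The helpers
# `self._create_pipeline` and `self._compile_and_run_pipeline` are assumptions
# about the Kubeflow test base class and may differ between TFX versions.
from tfx.components import StatisticsGen

def testRawExamplesImporter(self):
    statistics_gen = StatisticsGen(
        examples=self.raw_examples_importer.outputs['result'])
    pipeline = self._create_pipeline(
        'raw-examples-importer-test',
        [self.raw_examples_importer, statistics_gen])
    self._compile_and_run_pipeline(pipeline)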
def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
    """
    Creates a deployment pipeline out of TFX components.

    The deployment pipeline loads a trained model from `model_uri` and
    pushes it to its serving destination via the Pusher TFX component.

    Args:
        config: Dict. Contains a ZenML configuration used to build the
         data pipeline.

    Returns:
        A list of TFX components making up the pipeline.
    """
    component_list = []

    # Load from model_uri
    model = ImporterNode(
        instance_name=GDPComponent.Trainer.name,
        source_uri=self.model_uri,
        artifact_type=standard_artifacts.Model)
    model_result = model.outputs.result

    # Build the Pusher from the configured deployer step.
    deployer: BaseDeployerStep = \
        self.steps_dict[keys.TrainingSteps.DEPLOYER]
    pusher_config = deployer._build_pusher_args()
    pusher_executor_spec = deployer._get_executor_spec()
    pusher = Pusher(model_export=model_result,
                    custom_executor_spec=pusher_executor_spec,
                    **pusher_config).with_id(GDPComponent.Deployer.name)

    component_list.extend([model, pusher])

    return component_list
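# A minimal sketch (an assumption, not ZenML's actual orchestration code) of
# how the component list returned above could be assembled into a TFX pipeline
# and executed locally with the Beam runner. `deploy_pipeline` stands in for an
# instance of the class defining get_tfx_component_list; the pipeline name and
# paths below are placeholders.
from tfx.orchestration import metadata, pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

components = deploy_pipeline.get_tfx_component_list(config={})
tfx_pipeline = pipeline.Pipeline(
    pipeline_name='zenml-deploy',
    pipeline_root='/tmp/zenml/pipeline_root',
    components=components,
    metadata_connection_config=metadata.sqlite_metadata_connection_config(
        '/tmp/zenml/metadata.db'),
    enable_cache=False)
BeamDagRunner().run(tfx_pipeline)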
def setUp(self):
    super(KubeflowGCPIntegrationTest, self).setUp()

    # Example artifacts for testing.
    self.raw_examples_importer = ImporterNode(
        instance_name='raw_examples',
        source_uri=os.path.join(self._testdata_root, 'csv_example_gen'),
        artifact_type=standard_artifacts.Examples,
        reimport=True,
        properties={'split_names': '["train", "eval"]'})

    # Transformed Example artifacts for testing.
    self.transformed_examples_importer = ImporterNode(
        instance_name='transformed_examples',
        source_uri=os.path.join(self._testdata_root, 'transform',
                                'transformed_examples'),
        artifact_type=standard_artifacts.Examples,
        reimport=True,
        properties={'split_names': '["train", "eval"]'})

    # Schema artifact for testing.
    self.schema_importer = ImporterNode(
        instance_name='schema',
        source_uri=os.path.join(self._testdata_root, 'schema_gen'),
        artifact_type=standard_artifacts.Schema,
        reimport=True)

    # TransformGraph artifact for testing.
    self.transform_graph_importer = ImporterNode(
        instance_name='transform_graph',
        source_uri=os.path.join(self._testdata_root, 'transform',
                                'transform_output'),
        artifact_type=standard_artifacts.TransformGraph,
        reimport=True)

    # Model artifact for testing.
    self.model_1_importer = ImporterNode(
        instance_name='model_1',
        source_uri=os.path.join(self._testdata_root, 'trainer', 'previous'),
        artifact_type=standard_artifacts.Model,
        reimport=True)

    self.model_2_importer = ImporterNode(
        instance_name='model_2',
        source_uri=os.path.join(self._testdata_root, 'trainer', 'current'),
        artifact_type=standard_artifacts.Model,
        reimport=True)

    # ModelBlessing artifact for testing.
    self.model_blessing_importer = ImporterNode(
        instance_name='model_blessing',
        source_uri=os.path.join(self._testdata_root, 'model_validator',
                                'blessed'),
        artifact_type=standard_artifacts.ModelBlessing,
        reimport=True,
        custom_properties={'blessed': 1})
def setUp(self):
    super(KubeflowGCPIntegrationTest, self).setUp()

    # Example artifacts for testing.
    self.raw_examples_importer = ImporterNode(
        instance_name='raw_examples',
        source_uri=[
            os.path.join(self._intermediate_data_root,
                         'csv_example_gen/examples/test-pipeline/train'),
            os.path.join(self._intermediate_data_root,
                         'csv_example_gen/examples/test-pipeline/eval')
        ],
        artifact_type=standard_artifacts.Examples,
        reimport=True,
        split=['train', 'eval'])

    # Transformed Example artifacts for testing.
    self.transformed_examples_importer = ImporterNode(
        instance_name='transformed_examples',
        source_uri=[
            os.path.join(
                self._intermediate_data_root,
                'transform/transformed_examples/test-pipeline/train'),
            os.path.join(
                self._intermediate_data_root,
                'transform/transformed_examples/test-pipeline/eval')
        ],
        artifact_type=standard_artifacts.Examples,
        reimport=True,
        split=['train', 'eval'])

    # Schema artifact for testing.
    self.schema_importer = ImporterNode(
        instance_name='schema',
        source_uri=os.path.join(self._intermediate_data_root,
                                'schema_gen/output/test-pipeline'),
        artifact_type=standard_artifacts.Schema,
        reimport=True)

    # TransformGraph artifact for testing.
    self.transform_graph_importer = ImporterNode(
        instance_name='transform_graph',
        source_uri=os.path.join(self._intermediate_data_root,
                                'transform/transform_output/test-pipeline'),
        artifact_type=standard_artifacts.TransformGraph,
        reimport=True)

    # Model artifact for testing.
    self.model_1_importer = ImporterNode(
        instance_name='model_1',
        source_uri=os.path.join(self._intermediate_data_root,
                                'trainer/output/test-pipeline/1'),
        artifact_type=standard_artifacts.Model,
        reimport=True)

    self.model_2_importer = ImporterNode(
        instance_name='model_2',
        source_uri=os.path.join(self._intermediate_data_root,
                                'trainer/output/test-pipeline/2'),
        artifact_type=standard_artifacts.Model,
        reimport=True)

    # ModelBlessing artifact for testing.
    self.model_blessing_importer = ImporterNode(
        instance_name='model_blessing',
        source_uri=os.path.join(self._intermediate_data_root,
                                'model_validator/blessing/test-pipeline'),
        artifact_type=standard_artifacts.ModelBlessing,
        reimport=True)
def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
    """
    Creates an inference pipeline out of TFX components.

    An inference pipeline is used to run a batch of data through an ML
    model via the BulkInferrer TFX component.

    Args:
        config: Dict. Contains a ZenML configuration used to build the
         data pipeline.

    Returns:
        A list of TFX components making up the data pipeline.
    """
    component_list = []

    data_config = \
        config[keys.GlobalKeys.PIPELINE][keys.PipelineKeys.STEPS][
            keys.InferSteps.DATA]
    data = DataGen(name=self.datasource.name,
                   source=data_config[StepKeys.SOURCE],
                   source_args=data_config[StepKeys.ARGS]).with_id(
        GDPComponent.DataGen.name)
    component_list.extend([data])

    # Handle timeseries
    # TODO: [LOW] Handle timeseries
    # if GlobalKeys. in train_config:
    #     schema = ImporterNode(instance_name='Schema',
    #                           source_uri=spec['schema_uri'],
    #                           artifact_type=standard_artifacts.Schema)
    #
    #     sequence_transform = SequenceTransform(
    #         examples=data.outputs.examples,
    #         schema=schema,
    #         config=train_config,
    #         instance_name=GDPComponent.SequenceTransform.name)
    #     datapoints = sequence_transform.outputs.output
    #     component_list.extend([schema, sequence_transform])

    # Load from model_uri
    model = ImporterNode(instance_name=GDPComponent.Trainer.name,
                         source_uri=self.model_uri,
                         artifact_type=standard_artifacts.Model)
    model_result = model.outputs.result

    infer_cfg = config[keys.GlobalKeys.PIPELINE][keys.PipelineKeys.STEPS][
        keys.InferSteps.INFER]
    bulk_inferrer = BulkInferrer(source=infer_cfg[StepKeys.SOURCE],
                                 source_args=infer_cfg[StepKeys.ARGS],
                                 model=model_result,
                                 examples=data.outputs.examples,
                                 instance_name=GDPComponent.Inferrer.name)

    statistics = StatisticsGen(
        examples=bulk_inferrer.outputs.predictions).with_id(
        GDPComponent.DataStatistics.name)

    schema = SchemaGen(statistics=statistics.outputs.output).with_id(
        GDPComponent.DataSchema.name)

    component_list.extend([model, bulk_inferrer, statistics, schema])

    return component_list
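# The docstring above states that the batch is run through the BulkInferrer
# TFX component. For reference, a minimal sketch of wiring the stock TFX
# BulkInferrer directly with the same model and examples channels; the ZenML
# wrapper's extra `source`/`source_args` handling is omitted, and the empty
# protos below are placeholder defaults.
from tfx.components import BulkInferrer as TfxBulkInferrer
from tfx.proto import bulk_inferrer_pb2

tfx_bulk_inferrer = TfxBulkInferrer(
    examples=data.outputs.examples,
    model=model.outputs.result,
    data_spec=bulk_inferrer_pb2.DataSpec(),
    model_spec=bulk_inferrer_pb2.ModelSpec())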
def setUp(self):
    super(KubeflowGCPIntegrationTest, self).setUp()

    # Transformed Example artifacts for testing.
    self.transformed_examples_importer = ImporterNode(
        instance_name='transformed_examples',
        source_uri=os.path.join(self._testdata_root, 'transform',
                                'transformed_examples'),
        artifact_type=standard_artifacts.Examples,
        reimport=True,
        properties={'split_names': '["train", "eval"]'})

    # Schema artifact for testing.
    self.schema_importer = ImporterNode(
        instance_name='schema',
        source_uri=os.path.join(self._testdata_root, 'schema_gen'),
        artifact_type=standard_artifacts.Schema,
        reimport=True)

    # TransformGraph artifact for testing.
    self.transform_graph_importer = ImporterNode(
        instance_name='transform_graph',
        source_uri=os.path.join(self._testdata_root, 'transform',
                                'transform_graph'),
        artifact_type=standard_artifacts.TransformGraph,
        reimport=True)

    # Model artifact for testing.
    self.model_1_importer = ImporterNode(
        instance_name='model_1',
        source_uri=os.path.join(self._testdata_root, 'trainer', 'previous'),
        artifact_type=standard_artifacts.Model,
        reimport=True)

    self.model_2_importer = ImporterNode(
        instance_name='model_2',
        source_uri=os.path.join(self._testdata_root, 'trainer', 'current'),
        artifact_type=standard_artifacts.Model,
        reimport=True)

    # ModelBlessing artifact for testing.
    self.model_blessing_importer = ImporterNode(
        instance_name='model_blessing',
        source_uri=os.path.join(self._testdata_root, 'model_validator',
                                'blessed'),
        artifact_type=standard_artifacts.ModelBlessing,
        reimport=True,
        custom_properties={'blessed': 1})

    ### Test data and modules for native Keras trainer and tuner.
    self._penguin_tuner_module = os.path.join(self._MODULE_ROOT,
                                              'tuner_module.py')
    self.penguin_examples_importer = ImporterNode(
        instance_name='penguin_examples',
        source_uri=os.path.join(self._testdata_root, 'penguin', 'data'),
        artifact_type=standard_artifacts.Examples,
        reimport=True,
        properties={'split_names': '["train", "eval"]'})
    self.penguin_schema_importer = ImporterNode(
        instance_name='penguin_schema',
        source_uri=os.path.join(self._testdata_root, 'penguin', 'schema'),
        artifact_type=standard_artifacts.Schema,
        reimport=True)
def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     user_schema_path: Text, module_file: Text,
                     serving_model_dir: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
    """Implements the chicago taxi pipeline with TFX."""
    examples = external_input(data_root)

    # Brings data into the pipeline or otherwise joins/converts training data.
    example_gen = CsvExampleGen(input=examples)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])

    # Import user-provided schema.
    user_schema_importer = ImporterNode(
        instance_name='import_user_schema',
        source_uri=user_schema_path,
        artifact_type=Schema)

    # Generates schema based on statistics files. Even though we use the
    # user-provided schema in downstream components, we still want to generate
    # the schema of the newest data so that the user can compare it and
    # optionally update the schema in use.
    infer_schema = SchemaGen(statistics=statistics_gen.outputs['statistics'],
                             infer_feature_shape=False)

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = ExampleValidator(
        statistics=statistics_gen.outputs['statistics'],
        schema=user_schema_importer.outputs['result'])

    # Performs transformations and feature engineering in training and serving.
    transform = Transform(examples=example_gen.outputs['examples'],
                          schema=user_schema_importer.outputs['result'],
                          module_file=module_file)

    # Uses a user-provided Python function that implements a model using TF-Learn.
    trainer = Trainer(
        module_file=module_file,
        transformed_examples=transform.outputs['transformed_examples'],
        schema=user_schema_importer.outputs['result'],
        transform_graph=transform.outputs['transform_graph'],
        train_args=trainer_pb2.TrainArgs(num_steps=10000),
        eval_args=trainer_pb2.EvalArgs(num_steps=5000))

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_analyzer = Evaluator(
        examples=example_gen.outputs['examples'],
        model_exports=trainer.outputs['model'],
        feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
            evaluator_pb2.SingleSlicingSpec(
                column_for_slicing=['trip_start_hour'])
        ]))

    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(examples=example_gen.outputs['examples'],
                                     model=trainer.outputs['model'])

    # Checks whether the model passed the validation steps and pushes the model
    # to a file destination if the check passed.
    pusher = Pusher(model=trainer.outputs['model'],
                    model_blessing=model_validator.outputs['blessing'],
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=serving_model_dir)))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, user_schema_importer, infer_schema,
            validate_stats, transform, trainer, model_analyzer,
            model_validator, pusher
        ],
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        # TODO(b/141578059): The multi-processing API might change.
        beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers],
        additional_pipeline_args={},
    )
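# A usage sketch (not part of the original module) showing how this pipeline
# builder might be invoked locally with the Beam runner; every literal path
# below is a placeholder.
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

if __name__ == '__main__':
    BeamDagRunner().run(
        _create_pipeline(
            pipeline_name='chicago_taxi_user_schema',
            pipeline_root='/tmp/tfx/pipelines/chicago_taxi_user_schema',
            data_root='/tmp/tfx/data/simple',
            user_schema_path='/tmp/tfx/user_schema',
            module_file='/tmp/tfx/taxi_utils.py',
            serving_model_dir='/tmp/tfx/serving_model',
            metadata_path='/tmp/tfx/metadata.db',
            direct_num_workers=1))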
def create_pipeline(pipeline_name: Text,
                    pipeline_root: Text,
                    dataset_name: Text,
                    train_steps: data_types.RuntimeParameter,
                    eval_steps: data_types.RuntimeParameter,
                    accuracy_threshold: data_types.RuntimeParameter,
                    ai_platform_training_args: Dict[Text, Text],
                    ai_platform_serving_args: Dict[Text, Text],
                    beam_pipeline_args: List[Text],
                    model_regisrty_uri: Text,
                    enable_cache: Optional[bool] = False) -> pipeline.Pipeline:
    """Implements the online news pipeline with TFX."""

    # Dataset, table and/or 'where conditions' can be passed as pipeline args.
    query = sql_utils.generate_source_query(dataset_name=dataset_name)

    # Brings data into the pipeline from BigQuery.
    example_gen = tfx.components.BigQueryExampleGen(query=query)

    # Computes statistics over data for visualization and example validation.
    statistics_gen = tfx.components.StatisticsGen(
        input_data=example_gen.outputs.examples)

    # Import schema from local directory.
    schema_importer = ImporterNode(
        instance_name='RawSchemaImporter',
        source_uri=RAW_SCHEMA_DIR,
        artifact_type=Schema,
    )

    # Performs anomaly detection based on statistics and data schema.
    validate_stats = tfx.components.ExampleValidator(
        stats=statistics_gen.outputs.output,
        schema=schema_importer.outputs.result)

    # Performs transformations and feature engineering in training and serving.
    transform = tfx.components.Transform(
        input_data=example_gen.outputs.examples,
        schema=schema_importer.outputs.result,
        module_file=TRANSFORM_MODULE_FILE)

    # Get the latest blessed model for model validation.
    latest_model_resolver = tfx.components.ResolverNode(
        instance_name='latest_blessed_model_resolver',
        resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
        model=Channel(type=Model),
        model_blessing=Channel(type=ModelBlessing))

    # Train and save model for evaluation and serving.
    trainer = tfx.components.Trainer(
        # custom_executor_spec=executor_spec.ExecutorClassSpec(
        #     ai_platform_trainer_executor.GenericExecutor),
        custom_executor_spec=executor_spec.ExecutorClassSpec(
            trainer_executor.GenericExecutor),
        module_file=TRAIN_MODULE_FILE,
        transformed_examples=transform.outputs.transformed_examples,
        schema=schema_importer.outputs.result,
        transform_output=transform.outputs.transform_output,
        base_model=latest_model_resolver.outputs.model,
        train_args={'num_steps': train_steps},
        eval_args={'num_steps': eval_steps},
        custom_config={'ai_platform_training_args': ai_platform_training_args})

    # Uses TFMA to compute evaluation statistics over features of a model.
    model_evaluator = tfx.components.Evaluator(
        examples=example_gen.outputs.examples,
        model=trainer.outputs.model,
        baseline_model=latest_model_resolver.outputs.model,
        eval_config=helper.get_eval_config())

    # Use a custom AccuracyModelValidator component to validate the model.
    model_validator = AccuracyModelValidator(
        eval_results=model_evaluator.outputs.output,
        model=trainer.outputs.model,
        accuracy_threshold=accuracy_threshold,
        slice_accuracy_tolerance=0.15,
    )

    # # Checks whether the model passed the validation steps and pushes the
    # # model to its destination if the check passed.
    # pusher = tfx.components.Pusher(
    #     custom_executor_spec=executor_spec.ExecutorClassSpec(
    #         ai_platform_pusher_executor.Executor),
    #     model_export=trainer.outputs.output,
    #     model_blessing=model_evaluator.outputs.blessing,
    #     # model_blessing=model_validator.outputs.blessing,
    #     custom_config={'ai_platform_serving_args': ai_platform_serving_args}
    # )

    register = tfx.components.Pusher(
        model=trainer.outputs.model,
        model_blessing=model_validator.outputs.blessing,
        # model_blessing=model_evaluator.outputs.blessing,
        push_destination=tfx.proto.pusher_pb2.PushDestination(
            filesystem=tfx.proto.pusher_pb2.PushDestination.Filesystem(
                base_directory=os.path.join(model_regisrty_uri,
                                            pipeline_name))))

    return pipeline.Pipeline(
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        components=[
            example_gen, statistics_gen, schema_importer, validate_stats,
            latest_model_resolver, transform, trainer, model_evaluator,
            model_validator,
            # pusher
            register
        ],
        enable_cache=enable_cache,
        beam_pipeline_args=beam_pipeline_args)
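# A hedged sketch (not part of the original module; every literal value below
# is a placeholder) of how this pipeline might be compiled for Kubeflow
# Pipelines, where the RuntimeParameter arguments become values that can be
# set at run time.
from tfx.orchestration import data_types
from tfx.orchestration.kubeflow import kubeflow_dag_runner

kubeflow_dag_runner.KubeflowDagRunner().run(
    create_pipeline(
        pipeline_name='online-news',
        pipeline_root='gs://my-bucket/tfx-pipelines/online-news',
        dataset_name='my_bq_dataset',
        train_steps=data_types.RuntimeParameter(
            name='train_steps', default=10000, ptype=int),
        eval_steps=data_types.RuntimeParameter(
            name='eval_steps', default=1000, ptype=int),
        accuracy_threshold=data_types.RuntimeParameter(
            name='accuracy_threshold', default=0.75, ptype=float),
        ai_platform_training_args={'project': 'my-gcp-project',
                                   'region': 'us-central1'},
        ai_platform_serving_args={'model_name': 'online_news',
                                  'project_id': 'my-gcp-project'},
        beam_pipeline_args=['--project=my-gcp-project',
                            '--temp_location=gs://my-bucket/tmp'],
        model_regisrty_uri='gs://my-bucket/model-registry',
        enable_cache=True))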